(decode_coding_sjis): Handle 4th charset (typically JISX0212).
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
317
318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
320 Lisp_Object Qstart_process, Qopen_network_stream;
321 Lisp_Object Qtarget_idx;
322
323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
324 Lisp_Object Qinterrupted, Qinsufficient_memory;
325
326 int coding_system_require_warning;
327
328 Lisp_Object Vselect_safe_coding_system_function;
329
330 /* Mnemonic string for each format of end-of-line. */
331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
332 /* Mnemonic string to indicate format of end-of-line is not yet
333 decided. */
334 Lisp_Object eol_mnemonic_undecided;
335
336 #ifdef emacs
337
338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
339
340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
341
342 /* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344 Lisp_Object Qemacs_mule, Qraw_text;
345 Lisp_Object Qutf_8_emacs;
346
347 /* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */
349 /* Coding-system for reading files and receiving data from process. */
350 Lisp_Object Vcoding_system_for_read;
351 /* Coding-system for writing files and sending data to process. */
352 Lisp_Object Vcoding_system_for_write;
353 /* Coding-system actually used in the latest I/O. */
354 Lisp_Object Vlast_coding_system_used;
355 /* Set to non-nil when an error is detected while code conversion. */
356 Lisp_Object Vlast_code_conversion_error;
357 /* A vector of length 256 which contains information about special
358 Latin codes (especially for dealing with Microsoft codes). */
359 Lisp_Object Vlatin_extra_code_table;
360
361 /* Flag to inhibit code conversion of end-of-line format. */
362 int inhibit_eol_conversion;
363
364 /* Flag to inhibit ISO2022 escape sequence detection. */
365 int inhibit_iso_escape_detection;
366
367 /* Flag to make buffer-file-coding-system inherit from process-coding. */
368 int inherit_process_coding_system;
369
370 /* Coding system to be used to encode text for terminal display. */
371 struct coding_system terminal_coding;
372
373 /* Coding system to be used to encode text for terminal display when
374 terminal coding system is nil. */
375 struct coding_system safe_terminal_coding;
376
377 /* Coding system of what is sent from terminal keyboard. */
378 struct coding_system keyboard_coding;
379
380 Lisp_Object Vfile_coding_system_alist;
381 Lisp_Object Vprocess_coding_system_alist;
382 Lisp_Object Vnetwork_coding_system_alist;
383
384 Lisp_Object Vlocale_coding_system;
385
386 #endif /* emacs */
387
388 /* Flag to tell if we look up translation table on character code
389 conversion. */
390 Lisp_Object Venable_character_translation;
391 /* Standard translation table to look up on decoding (reading). */
392 Lisp_Object Vstandard_translation_table_for_decode;
393 /* Standard translation table to look up on encoding (writing). */
394 Lisp_Object Vstandard_translation_table_for_encode;
395
396 Lisp_Object Qtranslation_table;
397 Lisp_Object Qtranslation_table_id;
398 Lisp_Object Qtranslation_table_for_decode;
399 Lisp_Object Qtranslation_table_for_encode;
400
401 /* Alist of charsets vs revision number. */
402 static Lisp_Object Vcharset_revision_table;
403
404 /* Default coding systems used for process I/O. */
405 Lisp_Object Vdefault_process_coding_system;
406
407 /* Char table for translating Quail and self-inserting input. */
408 Lisp_Object Vtranslation_table_for_input;
409
410 /* Two special coding systems. */
411 Lisp_Object Vsjis_coding_system;
412 Lisp_Object Vbig5_coding_system;
413
414 static void record_conversion_result (struct coding_system *coding,
415 enum coding_result_code result);
416 static int detect_coding_utf_8 P_ ((struct coding_system *,
417 struct coding_detection_info *info));
418 static void decode_coding_utf_8 P_ ((struct coding_system *));
419 static int encode_coding_utf_8 P_ ((struct coding_system *));
420
421 static int detect_coding_utf_16 P_ ((struct coding_system *,
422 struct coding_detection_info *info));
423 static void decode_coding_utf_16 P_ ((struct coding_system *));
424 static int encode_coding_utf_16 P_ ((struct coding_system *));
425
426 static int detect_coding_iso_2022 P_ ((struct coding_system *,
427 struct coding_detection_info *info));
428 static void decode_coding_iso_2022 P_ ((struct coding_system *));
429 static int encode_coding_iso_2022 P_ ((struct coding_system *));
430
431 static int detect_coding_emacs_mule P_ ((struct coding_system *,
432 struct coding_detection_info *info));
433 static void decode_coding_emacs_mule P_ ((struct coding_system *));
434 static int encode_coding_emacs_mule P_ ((struct coding_system *));
435
436 static int detect_coding_sjis P_ ((struct coding_system *,
437 struct coding_detection_info *info));
438 static void decode_coding_sjis P_ ((struct coding_system *));
439 static int encode_coding_sjis P_ ((struct coding_system *));
440
441 static int detect_coding_big5 P_ ((struct coding_system *,
442 struct coding_detection_info *info));
443 static void decode_coding_big5 P_ ((struct coding_system *));
444 static int encode_coding_big5 P_ ((struct coding_system *));
445
446 static int detect_coding_ccl P_ ((struct coding_system *,
447 struct coding_detection_info *info));
448 static void decode_coding_ccl P_ ((struct coding_system *));
449 static int encode_coding_ccl P_ ((struct coding_system *));
450
451 static void decode_coding_raw_text P_ ((struct coding_system *));
452 static int encode_coding_raw_text P_ ((struct coding_system *));
453
454
455 /* ISO2022 section */
456
457 #define CODING_ISO_INITIAL(coding, reg) \
458 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
459 coding_attr_iso_initial), \
460 reg)))
461
462
463 #define CODING_ISO_REQUEST(coding, charset_id) \
464 ((charset_id <= (coding)->max_charset_id \
465 ? (coding)->safe_charsets[charset_id] \
466 : -1))
467
468
469 #define CODING_ISO_FLAGS(coding) \
470 ((coding)->spec.iso_2022.flags)
471 #define CODING_ISO_DESIGNATION(coding, reg) \
472 ((coding)->spec.iso_2022.current_designation[reg])
473 #define CODING_ISO_INVOCATION(coding, plane) \
474 ((coding)->spec.iso_2022.current_invocation[plane])
475 #define CODING_ISO_SINGLE_SHIFTING(coding) \
476 ((coding)->spec.iso_2022.single_shifting)
477 #define CODING_ISO_BOL(coding) \
478 ((coding)->spec.iso_2022.bol)
479 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
480 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
481
482 /* Control characters of ISO2022. */
483 /* code */ /* function */
484 #define ISO_CODE_LF 0x0A /* line-feed */
485 #define ISO_CODE_CR 0x0D /* carriage-return */
486 #define ISO_CODE_SO 0x0E /* shift-out */
487 #define ISO_CODE_SI 0x0F /* shift-in */
488 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
489 #define ISO_CODE_ESC 0x1B /* escape */
490 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
491 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
492 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
493
494 /* All code (1-byte) of ISO2022 is classified into one of the
495 followings. */
496 enum iso_code_class_type
497 {
498 ISO_control_0, /* Control codes in the range
499 0x00..0x1F and 0x7F, except for the
500 following 5 codes. */
501 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
502 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
503 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
504 ISO_escape, /* ISO_CODE_SO (0x1B) */
505 ISO_control_1, /* Control codes in the range
506 0x80..0x9F, except for the
507 following 3 codes. */
508 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
509 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
510 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
511 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
512 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
513 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
514 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
515 };
516
517 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
518 `iso-flags' attribute of an iso2022 coding system. */
519
520 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
521 instead of the correct short-form sequence (e.g. ESC $ A). */
522 #define CODING_ISO_FLAG_LONG_FORM 0x0001
523
524 /* If set, reset graphic planes and registers at end-of-line to the
525 initial state. */
526 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
527
528 /* If set, reset graphic planes and registers before any control
529 characters to the initial state. */
530 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
531
532 /* If set, encode by 7-bit environment. */
533 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
534
535 /* If set, use locking-shift function. */
536 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
537
538 /* If set, use single-shift function. Overwrite
539 CODING_ISO_FLAG_LOCKING_SHIFT. */
540 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
541
542 /* If set, use designation escape sequence. */
543 #define CODING_ISO_FLAG_DESIGNATION 0x0040
544
545 /* If set, produce revision number sequence. */
546 #define CODING_ISO_FLAG_REVISION 0x0080
547
548 /* If set, produce ISO6429's direction specifying sequence. */
549 #define CODING_ISO_FLAG_DIRECTION 0x0100
550
551 /* If set, assume designation states are reset at beginning of line on
552 output. */
553 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
554
555 /* If set, designation sequence should be placed at beginning of line
556 on output. */
557 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
558
559 /* If set, do not encode unsafe charactes on output. */
560 #define CODING_ISO_FLAG_SAFE 0x0800
561
562 /* If set, extra latin codes (128..159) are accepted as a valid code
563 on input. */
564 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
565
566 #define CODING_ISO_FLAG_COMPOSITION 0x2000
567
568 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
569
570 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
571
572 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
573
574 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
575
576 /* A character to be produced on output if encoding of the original
577 character is prohibited by CODING_ISO_FLAG_SAFE. */
578 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
579
580
581 /* UTF-16 section */
582 #define CODING_UTF_16_BOM(coding) \
583 ((coding)->spec.utf_16.bom)
584
585 #define CODING_UTF_16_ENDIAN(coding) \
586 ((coding)->spec.utf_16.endian)
587
588 #define CODING_UTF_16_SURROGATE(coding) \
589 ((coding)->spec.utf_16.surrogate)
590
591
592 /* CCL section */
593 #define CODING_CCL_DECODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
595 #define CODING_CCL_ENCODER(coding) \
596 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
597 #define CODING_CCL_VALIDS(coding) \
598 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
599
600 /* Index for each coding category in `coding_categories' */
601
602 enum coding_category
603 {
604 coding_category_iso_7,
605 coding_category_iso_7_tight,
606 coding_category_iso_8_1,
607 coding_category_iso_8_2,
608 coding_category_iso_7_else,
609 coding_category_iso_8_else,
610 coding_category_utf_8,
611 coding_category_utf_16_auto,
612 coding_category_utf_16_be,
613 coding_category_utf_16_le,
614 coding_category_utf_16_be_nosig,
615 coding_category_utf_16_le_nosig,
616 coding_category_charset,
617 coding_category_sjis,
618 coding_category_big5,
619 coding_category_ccl,
620 coding_category_emacs_mule,
621 /* All above are targets of code detection. */
622 coding_category_raw_text,
623 coding_category_undecided,
624 coding_category_max
625 };
626
627 /* Definitions of flag bits used in detect_coding_XXXX. */
628 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
629 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
630 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
631 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
632 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
633 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
634 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
635 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
636 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
637 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
638 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
639 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
640 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
641 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
642 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
643 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
644 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
645 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
646
647 /* This value is returned if detect_coding_mask () find nothing other
648 than ASCII characters. */
649 #define CATEGORY_MASK_ANY \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_8_1 \
653 | CATEGORY_MASK_ISO_8_2 \
654 | CATEGORY_MASK_ISO_7_ELSE \
655 | CATEGORY_MASK_ISO_8_ELSE \
656 | CATEGORY_MASK_UTF_8 \
657 | CATEGORY_MASK_UTF_16_BE \
658 | CATEGORY_MASK_UTF_16_LE \
659 | CATEGORY_MASK_UTF_16_BE_NOSIG \
660 | CATEGORY_MASK_UTF_16_LE_NOSIG \
661 | CATEGORY_MASK_CHARSET \
662 | CATEGORY_MASK_SJIS \
663 | CATEGORY_MASK_BIG5 \
664 | CATEGORY_MASK_CCL \
665 | CATEGORY_MASK_EMACS_MULE)
666
667
668 #define CATEGORY_MASK_ISO_7BIT \
669 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
670
671 #define CATEGORY_MASK_ISO_8BIT \
672 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
673
674 #define CATEGORY_MASK_ISO_ELSE \
675 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
676
677 #define CATEGORY_MASK_ISO_ESCAPE \
678 (CATEGORY_MASK_ISO_7 \
679 | CATEGORY_MASK_ISO_7_TIGHT \
680 | CATEGORY_MASK_ISO_7_ELSE \
681 | CATEGORY_MASK_ISO_8_ELSE)
682
683 #define CATEGORY_MASK_ISO \
684 ( CATEGORY_MASK_ISO_7BIT \
685 | CATEGORY_MASK_ISO_8BIT \
686 | CATEGORY_MASK_ISO_ELSE)
687
688 #define CATEGORY_MASK_UTF_16 \
689 (CATEGORY_MASK_UTF_16_BE \
690 | CATEGORY_MASK_UTF_16_LE \
691 | CATEGORY_MASK_UTF_16_BE_NOSIG \
692 | CATEGORY_MASK_UTF_16_LE_NOSIG)
693
694
695 /* List of symbols `coding-category-xxx' ordered by priority. This
696 variable is exposed to Emacs Lisp. */
697 static Lisp_Object Vcoding_category_list;
698
699 /* Table of coding categories (Lisp symbols). This variable is for
700 internal use oly. */
701 static Lisp_Object Vcoding_category_table;
702
703 /* Table of coding-categories ordered by priority. */
704 static enum coding_category coding_priorities[coding_category_max];
705
706 /* Nth element is a coding context for the coding system bound to the
707 Nth coding category. */
708 static struct coding_system coding_categories[coding_category_max];
709
710 /*** Commonly used macros and functions ***/
711
712 #ifndef min
713 #define min(a, b) ((a) < (b) ? (a) : (b))
714 #endif
715 #ifndef max
716 #define max(a, b) ((a) > (b) ? (a) : (b))
717 #endif
718
719 #define CODING_GET_INFO(coding, attrs, charset_list) \
720 do { \
721 (attrs) = CODING_ID_ATTRS ((coding)->id); \
722 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
723 } while (0)
724
725
726 /* Safely get one byte from the source text pointed by SRC which ends
727 at SRC_END, and set C to that byte. If there are not enough bytes
728 in the source, it jumps to `no_more_source'. If multibytep is
729 nonzero, and a multibyte character is found at SRC, set C to the
730 negative value of the character code. The caller should declare
731 and set these variables appropriately in advance:
732 src, src_end, multibytep */
733
734 #define ONE_MORE_BYTE(c) \
735 do { \
736 if (src == src_end) \
737 { \
738 if (src_base < src) \
739 record_conversion_result \
740 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
741 goto no_more_source; \
742 } \
743 c = *src++; \
744 if (multibytep && (c & 0x80)) \
745 { \
746 if ((c & 0xFE) == 0xC0) \
747 c = ((c & 1) << 6) | *src++; \
748 else \
749 { \
750 c = - string_char (--src, &src, NULL); \
751 record_conversion_result \
752 (coding, CODING_RESULT_INVALID_SRC); \
753 } \
754 } \
755 consumed_chars++; \
756 } while (0)
757
758
759 #define ONE_MORE_BYTE_NO_CHECK(c) \
760 do { \
761 c = *src++; \
762 if (multibytep && (c & 0x80)) \
763 { \
764 if ((c & 0xFE) == 0xC0) \
765 c = ((c & 1) << 6) | *src++; \
766 else \
767 { \
768 c = - string_char (--src, &src, NULL); \
769 record_conversion_result \
770 (coding, CODING_RESULT_INVALID_SRC); \
771 } \
772 } \
773 consumed_chars++; \
774 } while (0)
775
776
777 /* Store a byte C in the place pointed by DST and increment DST to the
778 next free point, and increment PRODUCED_CHARS. The caller should
779 assure that C is 0..127, and declare and set the variable `dst'
780 appropriately in advance.
781 */
782
783
784 #define EMIT_ONE_ASCII_BYTE(c) \
785 do { \
786 produced_chars++; \
787 *dst++ = (c); \
788 } while (0)
789
790
791 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
792
793 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
794 do { \
795 produced_chars += 2; \
796 *dst++ = (c1), *dst++ = (c2); \
797 } while (0)
798
799
800 /* Store a byte C in the place pointed by DST and increment DST to the
801 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
802 nonzero, store in an appropriate multibyte from. The caller should
803 declare and set the variables `dst' and `multibytep' appropriately
804 in advance. */
805
806 #define EMIT_ONE_BYTE(c) \
807 do { \
808 produced_chars++; \
809 if (multibytep) \
810 { \
811 int ch = (c); \
812 if (ch >= 0x80) \
813 ch = BYTE8_TO_CHAR (ch); \
814 CHAR_STRING_ADVANCE (ch, dst); \
815 } \
816 else \
817 *dst++ = (c); \
818 } while (0)
819
820
821 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
822
823 #define EMIT_TWO_BYTES(c1, c2) \
824 do { \
825 produced_chars += 2; \
826 if (multibytep) \
827 { \
828 int ch; \
829 \
830 ch = (c1); \
831 if (ch >= 0x80) \
832 ch = BYTE8_TO_CHAR (ch); \
833 CHAR_STRING_ADVANCE (ch, dst); \
834 ch = (c2); \
835 if (ch >= 0x80) \
836 ch = BYTE8_TO_CHAR (ch); \
837 CHAR_STRING_ADVANCE (ch, dst); \
838 } \
839 else \
840 { \
841 *dst++ = (c1); \
842 *dst++ = (c2); \
843 } \
844 } while (0)
845
846
847 #define EMIT_THREE_BYTES(c1, c2, c3) \
848 do { \
849 EMIT_ONE_BYTE (c1); \
850 EMIT_TWO_BYTES (c2, c3); \
851 } while (0)
852
853
854 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
855 do { \
856 EMIT_TWO_BYTES (c1, c2); \
857 EMIT_TWO_BYTES (c3, c4); \
858 } while (0)
859
860
861 static void
862 record_conversion_result (struct coding_system *coding,
863 enum coding_result_code result)
864 {
865 coding->result = result;
866 switch (result)
867 {
868 case CODING_RESULT_INSUFFICIENT_SRC:
869 Vlast_code_conversion_error = Qinsufficient_source;
870 break;
871 case CODING_RESULT_INCONSISTENT_EOL:
872 Vlast_code_conversion_error = Qinconsistent_eol;
873 break;
874 case CODING_RESULT_INVALID_SRC:
875 Vlast_code_conversion_error = Qinvalid_source;
876 break;
877 case CODING_RESULT_INTERRUPT:
878 Vlast_code_conversion_error = Qinterrupted;
879 break;
880 case CODING_RESULT_INSUFFICIENT_MEM:
881 Vlast_code_conversion_error = Qinsufficient_memory;
882 break;
883 }
884 }
885
886 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
887 do { \
888 charset_map_loaded = 0; \
889 c = DECODE_CHAR (charset, code); \
890 if (charset_map_loaded) \
891 { \
892 const unsigned char *orig = coding->source; \
893 EMACS_INT offset; \
894 \
895 coding_set_source (coding); \
896 offset = coding->source - orig; \
897 src += offset; \
898 src_base += offset; \
899 src_end += offset; \
900 } \
901 } while (0)
902
903
904 #define ASSURE_DESTINATION(bytes) \
905 do { \
906 if (dst + (bytes) >= dst_end) \
907 { \
908 int more_bytes = charbuf_end - charbuf + (bytes); \
909 \
910 dst = alloc_destination (coding, more_bytes, dst); \
911 dst_end = coding->destination + coding->dst_bytes; \
912 } \
913 } while (0)
914
915
916
917 static void
918 coding_set_source (coding)
919 struct coding_system *coding;
920 {
921 if (BUFFERP (coding->src_object))
922 {
923 struct buffer *buf = XBUFFER (coding->src_object);
924
925 if (coding->src_pos < 0)
926 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
927 else
928 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
929 }
930 else if (STRINGP (coding->src_object))
931 {
932 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
933 }
934 else
935 /* Otherwise, the source is C string and is never relocated
936 automatically. Thus we don't have to update anything. */
937 ;
938 }
939
940 static void
941 coding_set_destination (coding)
942 struct coding_system *coding;
943 {
944 if (BUFFERP (coding->dst_object))
945 {
946 if (coding->src_pos < 0)
947 {
948 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
949 coding->dst_bytes = (GAP_END_ADDR
950 - (coding->src_bytes - coding->consumed)
951 - coding->destination);
952 }
953 else
954 {
955 /* We are sure that coding->dst_pos_byte is before the gap
956 of the buffer. */
957 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
958 + coding->dst_pos_byte - 1);
959 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
960 - coding->destination);
961 }
962 }
963 else
964 /* Otherwise, the destination is C string and is never relocated
965 automatically. Thus we don't have to update anything. */
966 ;
967 }
968
969
970 static void
971 coding_alloc_by_realloc (coding, bytes)
972 struct coding_system *coding;
973 EMACS_INT bytes;
974 {
975 coding->destination = (unsigned char *) xrealloc (coding->destination,
976 coding->dst_bytes + bytes);
977 coding->dst_bytes += bytes;
978 }
979
980 static void
981 coding_alloc_by_making_gap (coding, bytes)
982 struct coding_system *coding;
983 EMACS_INT bytes;
984 {
985 if (BUFFERP (coding->dst_object)
986 && EQ (coding->src_object, coding->dst_object))
987 {
988 EMACS_INT add = coding->src_bytes - coding->consumed;
989
990 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
991 make_gap (bytes);
992 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
993 }
994 else
995 {
996 Lisp_Object this_buffer;
997
998 this_buffer = Fcurrent_buffer ();
999 set_buffer_internal (XBUFFER (coding->dst_object));
1000 make_gap (bytes);
1001 set_buffer_internal (XBUFFER (this_buffer));
1002 }
1003 }
1004
1005
1006 static unsigned char *
1007 alloc_destination (coding, nbytes, dst)
1008 struct coding_system *coding;
1009 EMACS_INT nbytes;
1010 unsigned char *dst;
1011 {
1012 EMACS_INT offset = dst - coding->destination;
1013
1014 if (BUFFERP (coding->dst_object))
1015 coding_alloc_by_making_gap (coding, nbytes);
1016 else
1017 coding_alloc_by_realloc (coding, nbytes);
1018 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1019 coding_set_destination (coding);
1020 dst = coding->destination + offset;
1021 return dst;
1022 }
1023
1024 /** Macros for annotations. */
1025
1026 /* Maximum length of annotation data (sum of annotations for
1027 composition and charset). */
1028 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1029
1030 /* An annotation data is stored in the array coding->charbuf in this
1031 format:
1032 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1033 LENGTH is the number of elements in the annotation.
1034 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1035 FROM and TO specify the range of text annotated. They are relative
1036 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1037
1038 The format of the following elements depend on ANNOTATION_MASK.
1039
1040 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1041 follows:
1042 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1043 METHOD is one of enum composition_method.
1044 Optionnal COMPOSITION-COMPONENTS are characters and composition
1045 rules.
1046
1047 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1048 follows. */
1049
1050 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1051 do { \
1052 *(buf)++ = -(len); \
1053 *(buf)++ = (mask); \
1054 *(buf)++ = (from); \
1055 *(buf)++ = (to); \
1056 coding->annotated = 1; \
1057 } while (0);
1058
1059 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1060 do { \
1061 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1062 *buf++ = method; \
1063 } while (0)
1064
1065
1066 #define ADD_CHARSET_DATA(buf, from, to, id) \
1067 do { \
1068 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1069 *buf++ = id; \
1070 } while (0)
1071
1072 \f
1073 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1074
1075
1076
1077 \f
1078 /*** 3. UTF-8 ***/
1079
1080 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1081 Check if a text is encoded in UTF-8. If it is, return 1, else
1082 return 0. */
1083
1084 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1085 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1086 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1087 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1088 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1089 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1090
1091 static int
1092 detect_coding_utf_8 (coding, detect_info)
1093 struct coding_system *coding;
1094 struct coding_detection_info *detect_info;
1095 {
1096 const unsigned char *src = coding->source, *src_base;
1097 const unsigned char *src_end = coding->source + coding->src_bytes;
1098 int multibytep = coding->src_multibyte;
1099 int consumed_chars = 0;
1100 int found = 0;
1101
1102 detect_info->checked |= CATEGORY_MASK_UTF_8;
1103 /* A coding system of this category is always ASCII compatible. */
1104 src += coding->head_ascii;
1105
1106 while (1)
1107 {
1108 int c, c1, c2, c3, c4;
1109
1110 src_base = src;
1111 ONE_MORE_BYTE (c);
1112 if (c < 0 || UTF_8_1_OCTET_P (c))
1113 continue;
1114 ONE_MORE_BYTE (c1);
1115 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1116 break;
1117 if (UTF_8_2_OCTET_LEADING_P (c))
1118 {
1119 found = CATEGORY_MASK_UTF_8;
1120 continue;
1121 }
1122 ONE_MORE_BYTE (c2);
1123 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1124 break;
1125 if (UTF_8_3_OCTET_LEADING_P (c))
1126 {
1127 found = CATEGORY_MASK_UTF_8;
1128 continue;
1129 }
1130 ONE_MORE_BYTE (c3);
1131 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1132 break;
1133 if (UTF_8_4_OCTET_LEADING_P (c))
1134 {
1135 found = CATEGORY_MASK_UTF_8;
1136 continue;
1137 }
1138 ONE_MORE_BYTE (c4);
1139 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1140 break;
1141 if (UTF_8_5_OCTET_LEADING_P (c))
1142 {
1143 found = CATEGORY_MASK_UTF_8;
1144 continue;
1145 }
1146 break;
1147 }
1148 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1149 return 0;
1150
1151 no_more_source:
1152 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1153 {
1154 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1155 return 0;
1156 }
1157 detect_info->found |= found;
1158 return 1;
1159 }
1160
1161
1162 static void
1163 decode_coding_utf_8 (coding)
1164 struct coding_system *coding;
1165 {
1166 const unsigned char *src = coding->source + coding->consumed;
1167 const unsigned char *src_end = coding->source + coding->src_bytes;
1168 const unsigned char *src_base;
1169 int *charbuf = coding->charbuf;
1170 int *charbuf_end = charbuf + coding->charbuf_size;
1171 int consumed_chars = 0, consumed_chars_base;
1172 int multibytep = coding->src_multibyte;
1173 Lisp_Object attr, charset_list;
1174
1175 CODING_GET_INFO (coding, attr, charset_list);
1176
1177 while (1)
1178 {
1179 int c, c1, c2, c3, c4, c5;
1180
1181 src_base = src;
1182 consumed_chars_base = consumed_chars;
1183
1184 if (charbuf >= charbuf_end)
1185 break;
1186
1187 ONE_MORE_BYTE (c1);
1188 if (c1 < 0)
1189 {
1190 c = - c1;
1191 }
1192 else if (UTF_8_1_OCTET_P(c1))
1193 {
1194 c = c1;
1195 }
1196 else
1197 {
1198 ONE_MORE_BYTE (c2);
1199 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1200 goto invalid_code;
1201 if (UTF_8_2_OCTET_LEADING_P (c1))
1202 {
1203 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1204 /* Reject overlong sequences here and below. Encoders
1205 producing them are incorrect, they can be misleading,
1206 and they mess up read/write invariance. */
1207 if (c < 128)
1208 goto invalid_code;
1209 }
1210 else
1211 {
1212 ONE_MORE_BYTE (c3);
1213 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1214 goto invalid_code;
1215 if (UTF_8_3_OCTET_LEADING_P (c1))
1216 {
1217 c = (((c1 & 0xF) << 12)
1218 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1219 if (c < 0x800
1220 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1221 goto invalid_code;
1222 }
1223 else
1224 {
1225 ONE_MORE_BYTE (c4);
1226 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227 goto invalid_code;
1228 if (UTF_8_4_OCTET_LEADING_P (c1))
1229 {
1230 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1231 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1232 if (c < 0x10000)
1233 goto invalid_code;
1234 }
1235 else
1236 {
1237 ONE_MORE_BYTE (c5);
1238 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1239 goto invalid_code;
1240 if (UTF_8_5_OCTET_LEADING_P (c1))
1241 {
1242 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1243 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1244 | (c5 & 0x3F));
1245 if ((c > MAX_CHAR) || (c < 0x200000))
1246 goto invalid_code;
1247 }
1248 else
1249 goto invalid_code;
1250 }
1251 }
1252 }
1253 }
1254
1255 *charbuf++ = c;
1256 continue;
1257
1258 invalid_code:
1259 src = src_base;
1260 consumed_chars = consumed_chars_base;
1261 ONE_MORE_BYTE (c);
1262 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1263 coding->errors++;
1264 }
1265
1266 no_more_source:
1267 coding->consumed_char += consumed_chars_base;
1268 coding->consumed = src_base - coding->source;
1269 coding->charbuf_used = charbuf - coding->charbuf;
1270 }
1271
1272
1273 static int
1274 encode_coding_utf_8 (coding)
1275 struct coding_system *coding;
1276 {
1277 int multibytep = coding->dst_multibyte;
1278 int *charbuf = coding->charbuf;
1279 int *charbuf_end = charbuf + coding->charbuf_used;
1280 unsigned char *dst = coding->destination + coding->produced;
1281 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1282 int produced_chars = 0;
1283 int c;
1284
1285 if (multibytep)
1286 {
1287 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1288
1289 while (charbuf < charbuf_end)
1290 {
1291 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1292
1293 ASSURE_DESTINATION (safe_room);
1294 c = *charbuf++;
1295 if (CHAR_BYTE8_P (c))
1296 {
1297 c = CHAR_TO_BYTE8 (c);
1298 EMIT_ONE_BYTE (c);
1299 }
1300 else
1301 {
1302 CHAR_STRING_ADVANCE (c, pend);
1303 for (p = str; p < pend; p++)
1304 EMIT_ONE_BYTE (*p);
1305 }
1306 }
1307 }
1308 else
1309 {
1310 int safe_room = MAX_MULTIBYTE_LENGTH;
1311
1312 while (charbuf < charbuf_end)
1313 {
1314 ASSURE_DESTINATION (safe_room);
1315 c = *charbuf++;
1316 dst += CHAR_STRING (c, dst);
1317 produced_chars++;
1318 }
1319 }
1320 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1321 coding->produced_char += produced_chars;
1322 coding->produced = dst - coding->destination;
1323 return 0;
1324 }
1325
1326
1327 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1328 Check if a text is encoded in one of UTF-16 based coding systems.
1329 If it is, return 1, else return 0. */
1330
1331 #define UTF_16_HIGH_SURROGATE_P(val) \
1332 (((val) & 0xFC00) == 0xD800)
1333
1334 #define UTF_16_LOW_SURROGATE_P(val) \
1335 (((val) & 0xFC00) == 0xDC00)
1336
1337 #define UTF_16_INVALID_P(val) \
1338 (((val) == 0xFFFE) \
1339 || ((val) == 0xFFFF) \
1340 || UTF_16_LOW_SURROGATE_P (val))
1341
1342
1343 static int
1344 detect_coding_utf_16 (coding, detect_info)
1345 struct coding_system *coding;
1346 struct coding_detection_info *detect_info;
1347 {
1348 const unsigned char *src = coding->source, *src_base = src;
1349 const unsigned char *src_end = coding->source + coding->src_bytes;
1350 int multibytep = coding->src_multibyte;
1351 int consumed_chars = 0;
1352 int c1, c2;
1353
1354 detect_info->checked |= CATEGORY_MASK_UTF_16;
1355 if (coding->mode & CODING_MODE_LAST_BLOCK
1356 && (coding->src_chars & 1))
1357 {
1358 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1359 return 0;
1360 }
1361
1362 ONE_MORE_BYTE (c1);
1363 ONE_MORE_BYTE (c2);
1364 if ((c1 == 0xFF) && (c2 == 0xFE))
1365 {
1366 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1367 | CATEGORY_MASK_UTF_16_AUTO);
1368 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1369 | CATEGORY_MASK_UTF_16_BE_NOSIG
1370 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1371 }
1372 else if ((c1 == 0xFE) && (c2 == 0xFF))
1373 {
1374 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1375 | CATEGORY_MASK_UTF_16_AUTO);
1376 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1377 | CATEGORY_MASK_UTF_16_BE_NOSIG
1378 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1379 }
1380 else if (c1 >= 0 && c2 >= 0)
1381 {
1382 unsigned char b1[256], b2[256];
1383 int b1_variants = 1, b2_variants = 1;
1384 int n;
1385
1386 bzero (b1, 256), bzero (b2, 256);
1387 b1[c1]++, b2[c2]++;
1388 for (n = 0; n < 256 && src < src_end; n++)
1389 {
1390 src_base = src;
1391 ONE_MORE_BYTE (c1);
1392 ONE_MORE_BYTE (c2);
1393 if (c1 < 0 || c2 < 0)
1394 break;
1395 if (! b1[c1++]) b1_variants++;
1396 if (! b2[c2++]) b2_variants++;
1397 }
1398 if (b1_variants < b2_variants)
1399 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1400 else
1401 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1402 detect_info->rejected
1403 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1404 }
1405 no_more_source:
1406 return 1;
1407 }
1408
1409 static void
1410 decode_coding_utf_16 (coding)
1411 struct coding_system *coding;
1412 {
1413 const unsigned char *src = coding->source + coding->consumed;
1414 const unsigned char *src_end = coding->source + coding->src_bytes;
1415 const unsigned char *src_base;
1416 int *charbuf = coding->charbuf;
1417 int *charbuf_end = charbuf + coding->charbuf_size;
1418 int consumed_chars = 0, consumed_chars_base;
1419 int multibytep = coding->src_multibyte;
1420 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1421 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1422 int surrogate = CODING_UTF_16_SURROGATE (coding);
1423 Lisp_Object attr, charset_list;
1424
1425 CODING_GET_INFO (coding, attr, charset_list);
1426
1427 if (bom == utf_16_with_bom)
1428 {
1429 int c, c1, c2;
1430
1431 src_base = src;
1432 ONE_MORE_BYTE (c1);
1433 ONE_MORE_BYTE (c2);
1434 c = (c1 << 8) | c2;
1435
1436 if (endian == utf_16_big_endian
1437 ? c != 0xFEFF : c != 0xFFFE)
1438 {
1439 /* The first two bytes are not BOM. Treat them as bytes
1440 for a normal character. */
1441 src = src_base;
1442 coding->errors++;
1443 }
1444 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1445 }
1446 else if (bom == utf_16_detect_bom)
1447 {
1448 /* We have already tried to detect BOM and failed in
1449 detect_coding. */
1450 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1451 }
1452
1453 while (1)
1454 {
1455 int c, c1, c2;
1456
1457 src_base = src;
1458 consumed_chars_base = consumed_chars;
1459
1460 if (charbuf + 2 >= charbuf_end)
1461 break;
1462
1463 ONE_MORE_BYTE (c1);
1464 if (c1 < 0)
1465 {
1466 *charbuf++ = -c1;
1467 continue;
1468 }
1469 ONE_MORE_BYTE (c2);
1470 if (c2 < 0)
1471 {
1472 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1473 *charbuf++ = -c2;
1474 continue;
1475 }
1476 c = (endian == utf_16_big_endian
1477 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1478 if (surrogate)
1479 {
1480 if (! UTF_16_LOW_SURROGATE_P (c))
1481 {
1482 if (endian == utf_16_big_endian)
1483 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1484 else
1485 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1486 *charbuf++ = c1;
1487 *charbuf++ = c2;
1488 coding->errors++;
1489 if (UTF_16_HIGH_SURROGATE_P (c))
1490 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1491 else
1492 *charbuf++ = c;
1493 }
1494 else
1495 {
1496 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1497 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1498 *charbuf++ = c;
1499 }
1500 }
1501 else
1502 {
1503 if (UTF_16_HIGH_SURROGATE_P (c))
1504 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1505 else
1506 *charbuf++ = c;
1507 }
1508 }
1509
1510 no_more_source:
1511 coding->consumed_char += consumed_chars_base;
1512 coding->consumed = src_base - coding->source;
1513 coding->charbuf_used = charbuf - coding->charbuf;
1514 }
1515
1516 static int
1517 encode_coding_utf_16 (coding)
1518 struct coding_system *coding;
1519 {
1520 int multibytep = coding->dst_multibyte;
1521 int *charbuf = coding->charbuf;
1522 int *charbuf_end = charbuf + coding->charbuf_used;
1523 unsigned char *dst = coding->destination + coding->produced;
1524 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1525 int safe_room = 8;
1526 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1527 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1528 int produced_chars = 0;
1529 Lisp_Object attrs, charset_list;
1530 int c;
1531
1532 CODING_GET_INFO (coding, attrs, charset_list);
1533
1534 if (bom != utf_16_without_bom)
1535 {
1536 ASSURE_DESTINATION (safe_room);
1537 if (big_endian)
1538 EMIT_TWO_BYTES (0xFE, 0xFF);
1539 else
1540 EMIT_TWO_BYTES (0xFF, 0xFE);
1541 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1542 }
1543
1544 while (charbuf < charbuf_end)
1545 {
1546 ASSURE_DESTINATION (safe_room);
1547 c = *charbuf++;
1548 if (c >= MAX_UNICODE_CHAR)
1549 c = coding->default_char;
1550
1551 if (c < 0x10000)
1552 {
1553 if (big_endian)
1554 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1555 else
1556 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1557 }
1558 else
1559 {
1560 int c1, c2;
1561
1562 c -= 0x10000;
1563 c1 = (c >> 10) + 0xD800;
1564 c2 = (c & 0x3FF) + 0xDC00;
1565 if (big_endian)
1566 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1567 else
1568 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1569 }
1570 }
1571 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1572 coding->produced = dst - coding->destination;
1573 coding->produced_char += produced_chars;
1574 return 0;
1575 }
1576
1577 \f
1578 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1579
1580 /* Emacs' internal format for representation of multiple character
1581 sets is a kind of multi-byte encoding, i.e. characters are
1582 represented by variable-length sequences of one-byte codes.
1583
1584 ASCII characters and control characters (e.g. `tab', `newline') are
1585 represented by one-byte sequences which are their ASCII codes, in
1586 the range 0x00 through 0x7F.
1587
1588 8-bit characters of the range 0x80..0x9F are represented by
1589 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1590 code + 0x20).
1591
1592 8-bit characters of the range 0xA0..0xFF are represented by
1593 one-byte sequences which are their 8-bit code.
1594
1595 The other characters are represented by a sequence of `base
1596 leading-code', optional `extended leading-code', and one or two
1597 `position-code's. The length of the sequence is determined by the
1598 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1599 whereas extended leading-code and position-code take the range 0xA0
1600 through 0xFF. See `charset.h' for more details about leading-code
1601 and position-code.
1602
1603 --- CODE RANGE of Emacs' internal format ---
1604 character set range
1605 ------------- -----
1606 ascii 0x00..0x7F
1607 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1608 eight-bit-graphic 0xA0..0xBF
1609 ELSE 0x81..0x9D + [0xA0..0xFF]+
1610 ---------------------------------------------
1611
1612 As this is the internal character representation, the format is
1613 usually not used externally (i.e. in a file or in a data sent to a
1614 process). But, it is possible to have a text externally in this
1615 format (i.e. by encoding by the coding system `emacs-mule').
1616
1617 In that case, a sequence of one-byte codes has a slightly different
1618 form.
1619
1620 At first, all characters in eight-bit-control are represented by
1621 one-byte sequences which are their 8-bit code.
1622
1623 Next, character composition data are represented by the byte
1624 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1625 where,
1626 METHOD is 0xF0 plus one of composition method (enum
1627 composition_method),
1628
1629 BYTES is 0xA0 plus a byte length of this composition data,
1630
1631 CHARS is 0x20 plus a number of characters composed by this
1632 data,
1633
1634 COMPONENTs are characters of multibye form or composition
1635 rules encoded by two-byte of ASCII codes.
1636
1637 In addition, for backward compatibility, the following formats are
1638 also recognized as composition data on decoding.
1639
1640 0x80 MSEQ ...
1641 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1642
1643 Here,
1644 MSEQ is a multibyte form but in these special format:
1645 ASCII: 0xA0 ASCII_CODE+0x80,
1646 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1647 RULE is a one byte code of the range 0xA0..0xF0 that
1648 represents a composition rule.
1649 */
1650
1651 char emacs_mule_bytes[256];
1652
1653 int
1654 emacs_mule_char (coding, src, nbytes, nchars, id)
1655 struct coding_system *coding;
1656 const unsigned char *src;
1657 int *nbytes, *nchars, *id;
1658 {
1659 const unsigned char *src_end = coding->source + coding->src_bytes;
1660 const unsigned char *src_base = src;
1661 int multibytep = coding->src_multibyte;
1662 struct charset *charset;
1663 unsigned code;
1664 int c;
1665 int consumed_chars = 0;
1666
1667 ONE_MORE_BYTE (c);
1668 if (c < 0)
1669 {
1670 c = -c;
1671 charset = emacs_mule_charset[0];
1672 }
1673 else
1674 {
1675 switch (emacs_mule_bytes[c])
1676 {
1677 case 2:
1678 if (! (charset = emacs_mule_charset[c]))
1679 goto invalid_code;
1680 ONE_MORE_BYTE (c);
1681 if (c < 0)
1682 goto invalid_code;
1683 code = c & 0x7F;
1684 break;
1685
1686 case 3:
1687 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1688 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1689 {
1690 ONE_MORE_BYTE (c);
1691 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1692 goto invalid_code;
1693 ONE_MORE_BYTE (c);
1694 if (c < 0)
1695 goto invalid_code;
1696 code = c & 0x7F;
1697 }
1698 else
1699 {
1700 if (! (charset = emacs_mule_charset[c]))
1701 goto invalid_code;
1702 ONE_MORE_BYTE (c);
1703 if (c < 0)
1704 goto invalid_code;
1705 code = (c & 0x7F) << 8;
1706 ONE_MORE_BYTE (c);
1707 if (c < 0)
1708 goto invalid_code;
1709 code |= c & 0x7F;
1710 }
1711 break;
1712
1713 case 4:
1714 ONE_MORE_BYTE (c);
1715 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1716 goto invalid_code;
1717 ONE_MORE_BYTE (c);
1718 if (c < 0)
1719 goto invalid_code;
1720 code = (c & 0x7F) << 8;
1721 ONE_MORE_BYTE (c);
1722 if (c < 0)
1723 goto invalid_code;
1724 code |= c & 0x7F;
1725 break;
1726
1727 case 1:
1728 code = c;
1729 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1730 ? charset_ascii : charset_eight_bit);
1731 break;
1732
1733 default:
1734 abort ();
1735 }
1736 c = DECODE_CHAR (charset, code);
1737 if (c < 0)
1738 goto invalid_code;
1739 }
1740 *nbytes = src - src_base;
1741 *nchars = consumed_chars;
1742 if (id)
1743 *id = charset->id;
1744 return c;
1745
1746 no_more_source:
1747 return -2;
1748
1749 invalid_code:
1750 return -1;
1751 }
1752
1753
1754 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1755 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1756 else return 0. */
1757
1758 static int
1759 detect_coding_emacs_mule (coding, detect_info)
1760 struct coding_system *coding;
1761 struct coding_detection_info *detect_info;
1762 {
1763 const unsigned char *src = coding->source, *src_base;
1764 const unsigned char *src_end = coding->source + coding->src_bytes;
1765 int multibytep = coding->src_multibyte;
1766 int consumed_chars = 0;
1767 int c;
1768 int found = 0;
1769
1770 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1771 /* A coding system of this category is always ASCII compatible. */
1772 src += coding->head_ascii;
1773
1774 while (1)
1775 {
1776 src_base = src;
1777 ONE_MORE_BYTE (c);
1778 if (c < 0)
1779 continue;
1780 if (c == 0x80)
1781 {
1782 /* Perhaps the start of composite character. We simple skip
1783 it because analyzing it is too heavy for detecting. But,
1784 at least, we check that the composite character
1785 constitues of more than 4 bytes. */
1786 const unsigned char *src_base;
1787
1788 repeat:
1789 src_base = src;
1790 do
1791 {
1792 ONE_MORE_BYTE (c);
1793 }
1794 while (c >= 0xA0);
1795
1796 if (src - src_base <= 4)
1797 break;
1798 found = CATEGORY_MASK_EMACS_MULE;
1799 if (c == 0x80)
1800 goto repeat;
1801 }
1802
1803 if (c < 0x80)
1804 {
1805 if (c < 0x20
1806 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1807 break;
1808 }
1809 else
1810 {
1811 const unsigned char *src_base = src - 1;
1812
1813 do
1814 {
1815 ONE_MORE_BYTE (c);
1816 }
1817 while (c >= 0xA0);
1818 if (src - src_base != emacs_mule_bytes[*src_base])
1819 break;
1820 found = CATEGORY_MASK_EMACS_MULE;
1821 }
1822 }
1823 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1824 return 0;
1825
1826 no_more_source:
1827 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1828 {
1829 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1830 return 0;
1831 }
1832 detect_info->found |= found;
1833 return 1;
1834 }
1835
1836
1837 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1838
1839 /* Decode a character represented as a component of composition
1840 sequence of Emacs 20/21 style at SRC. Set C to that character and
1841 update SRC to the head of next character (or an encoded composition
1842 rule). If SRC doesn't points a composition component, set C to -1.
1843 If SRC points an invalid byte sequence, global exit by a return
1844 value 0. */
1845
1846 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1847 if (1) \
1848 { \
1849 int c; \
1850 int nbytes, nchars; \
1851 \
1852 if (src == src_end) \
1853 break; \
1854 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1855 if (c < 0) \
1856 { \
1857 if (c == -2) \
1858 break; \
1859 goto invalid_code; \
1860 } \
1861 *buf++ = c; \
1862 src += nbytes; \
1863 consumed_chars += nchars; \
1864 } \
1865 else
1866
1867
1868 /* Decode a composition rule represented as a component of composition
1869 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1870 and increment BUF. If SRC points an invalid byte sequence, set C
1871 to -1. */
1872
1873 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1874 do { \
1875 int c, gref, nref; \
1876 \
1877 if (src >= src_end) \
1878 goto invalid_code; \
1879 ONE_MORE_BYTE_NO_CHECK (c); \
1880 c -= 0x20; \
1881 if (c < 0 || c >= 81) \
1882 goto invalid_code; \
1883 \
1884 gref = c / 9, nref = c % 9; \
1885 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1886 } while (0)
1887
1888
1889 /* Decode a composition rule represented as a component of composition
1890 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1891 and increment BUF. If SRC points an invalid byte sequence, set C
1892 to -1. */
1893
1894 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1895 do { \
1896 int gref, nref; \
1897 \
1898 if (src + 1>= src_end) \
1899 goto invalid_code; \
1900 ONE_MORE_BYTE_NO_CHECK (gref); \
1901 gref -= 0x20; \
1902 ONE_MORE_BYTE_NO_CHECK (nref); \
1903 nref -= 0x20; \
1904 if (gref < 0 || gref >= 81 \
1905 || nref < 0 || nref >= 81) \
1906 goto invalid_code; \
1907 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1908 } while (0)
1909
1910
1911 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1912 do { \
1913 /* Emacs 21 style format. The first three bytes at SRC are \
1914 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1915 the byte length of this composition information, CHARS is the \
1916 number of characters composed by this composition. */ \
1917 enum composition_method method = c - 0xF2; \
1918 int *charbuf_base = charbuf; \
1919 int from, to; \
1920 int consumed_chars_limit; \
1921 int nbytes, nchars; \
1922 \
1923 ONE_MORE_BYTE (c); \
1924 if (c < 0) \
1925 goto invalid_code; \
1926 nbytes = c - 0xA0; \
1927 if (nbytes < 3) \
1928 goto invalid_code; \
1929 ONE_MORE_BYTE (c); \
1930 if (c < 0) \
1931 goto invalid_code; \
1932 nchars = c - 0xA0; \
1933 from = coding->produced + char_offset; \
1934 to = from + nchars; \
1935 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1936 consumed_chars_limit = consumed_chars_base + nbytes; \
1937 if (method != COMPOSITION_RELATIVE) \
1938 { \
1939 int i = 0; \
1940 while (consumed_chars < consumed_chars_limit) \
1941 { \
1942 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1943 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1944 else \
1945 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1946 i++; \
1947 } \
1948 if (consumed_chars < consumed_chars_limit) \
1949 goto invalid_code; \
1950 charbuf_base[0] -= i; \
1951 } \
1952 } while (0)
1953
1954
1955 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1956 do { \
1957 /* Emacs 20 style format for relative composition. */ \
1958 /* Store multibyte form of characters to be composed. */ \
1959 enum composition_method method = COMPOSITION_RELATIVE; \
1960 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1961 int *buf = components; \
1962 int i, j; \
1963 int from, to; \
1964 \
1965 src = src_base; \
1966 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1967 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1968 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1969 if (i < 2) \
1970 goto invalid_code; \
1971 from = coding->produced_char + char_offset; \
1972 to = from + i; \
1973 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1974 for (j = 0; j < i; j++) \
1975 *charbuf++ = components[j]; \
1976 } while (0)
1977
1978
1979 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1980 do { \
1981 /* Emacs 20 style format for rule-base composition. */ \
1982 /* Store multibyte form of characters to be composed. */ \
1983 enum composition_method method = COMPOSITION_WITH_RULE; \
1984 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1985 int *buf = components; \
1986 int i, j; \
1987 int from, to; \
1988 \
1989 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1990 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1991 { \
1992 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1993 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1994 } \
1995 if (i < 1 || (buf - components) % 2 == 0) \
1996 goto invalid_code; \
1997 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1998 goto no_more_source; \
1999 from = coding->produced_char + char_offset; \
2000 to = from + i; \
2001 ADD_COMPOSITION_DATA (buf, from, to, method); \
2002 for (j = 0; j < i; j++) \
2003 *charbuf++ = components[j]; \
2004 for (j = 0; j < i; j += 2) \
2005 *charbuf++ = components[j]; \
2006 } while (0)
2007
2008
2009 static void
2010 decode_coding_emacs_mule (coding)
2011 struct coding_system *coding;
2012 {
2013 const unsigned char *src = coding->source + coding->consumed;
2014 const unsigned char *src_end = coding->source + coding->src_bytes;
2015 const unsigned char *src_base;
2016 int *charbuf = coding->charbuf;
2017 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2018 int consumed_chars = 0, consumed_chars_base;
2019 int multibytep = coding->src_multibyte;
2020 Lisp_Object attrs, charset_list;
2021 int char_offset = coding->produced_char;
2022 int last_offset = char_offset;
2023 int last_id = charset_ascii;
2024
2025 CODING_GET_INFO (coding, attrs, charset_list);
2026
2027 while (1)
2028 {
2029 int c;
2030
2031 src_base = src;
2032 consumed_chars_base = consumed_chars;
2033
2034 if (charbuf >= charbuf_end)
2035 break;
2036
2037 ONE_MORE_BYTE (c);
2038 if (c < 0)
2039 {
2040 *charbuf++ = -c;
2041 char_offset++;
2042 }
2043 else if (c < 0x80)
2044 {
2045 *charbuf++ = c;
2046 char_offset++;
2047 }
2048 else if (c == 0x80)
2049 {
2050 ONE_MORE_BYTE (c);
2051 if (c < 0)
2052 goto invalid_code;
2053 if (c - 0xF2 >= COMPOSITION_RELATIVE
2054 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2055 DECODE_EMACS_MULE_21_COMPOSITION (c);
2056 else if (c < 0xC0)
2057 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2058 else if (c == 0xFF)
2059 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2060 else
2061 goto invalid_code;
2062 }
2063 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2064 {
2065 int nbytes, nchars;
2066 int id;
2067
2068 src = src_base;
2069 consumed_chars = consumed_chars_base;
2070 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2071 if (c < 0)
2072 {
2073 if (c == -2)
2074 break;
2075 goto invalid_code;
2076 }
2077 if (last_id != id)
2078 {
2079 if (last_id != charset_ascii)
2080 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2081 last_id = id;
2082 last_offset = char_offset;
2083 }
2084 *charbuf++ = c;
2085 src += nbytes;
2086 consumed_chars += nchars;
2087 char_offset++;
2088 }
2089 continue;
2090
2091 invalid_code:
2092 src = src_base;
2093 consumed_chars = consumed_chars_base;
2094 ONE_MORE_BYTE (c);
2095 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2096 char_offset++;
2097 coding->errors++;
2098 }
2099
2100 no_more_source:
2101 if (last_id != charset_ascii)
2102 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2103 coding->consumed_char += consumed_chars_base;
2104 coding->consumed = src_base - coding->source;
2105 coding->charbuf_used = charbuf - coding->charbuf;
2106 }
2107
2108
2109 #define EMACS_MULE_LEADING_CODES(id, codes) \
2110 do { \
2111 if (id < 0xA0) \
2112 codes[0] = id, codes[1] = 0; \
2113 else if (id < 0xE0) \
2114 codes[0] = 0x9A, codes[1] = id; \
2115 else if (id < 0xF0) \
2116 codes[0] = 0x9B, codes[1] = id; \
2117 else if (id < 0xF5) \
2118 codes[0] = 0x9C, codes[1] = id; \
2119 else \
2120 codes[0] = 0x9D, codes[1] = id; \
2121 } while (0);
2122
2123
2124 static int
2125 encode_coding_emacs_mule (coding)
2126 struct coding_system *coding;
2127 {
2128 int multibytep = coding->dst_multibyte;
2129 int *charbuf = coding->charbuf;
2130 int *charbuf_end = charbuf + coding->charbuf_used;
2131 unsigned char *dst = coding->destination + coding->produced;
2132 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2133 int safe_room = 8;
2134 int produced_chars = 0;
2135 Lisp_Object attrs, charset_list;
2136 int c;
2137 int preferred_charset_id = -1;
2138
2139 CODING_GET_INFO (coding, attrs, charset_list);
2140 if (! EQ (charset_list, Vemacs_mule_charset_list))
2141 {
2142 CODING_ATTR_CHARSET_LIST (attrs)
2143 = charset_list = Vemacs_mule_charset_list;
2144 }
2145
2146 while (charbuf < charbuf_end)
2147 {
2148 ASSURE_DESTINATION (safe_room);
2149 c = *charbuf++;
2150
2151 if (c < 0)
2152 {
2153 /* Handle an annotation. */
2154 switch (*charbuf)
2155 {
2156 case CODING_ANNOTATE_COMPOSITION_MASK:
2157 /* Not yet implemented. */
2158 break;
2159 case CODING_ANNOTATE_CHARSET_MASK:
2160 preferred_charset_id = charbuf[3];
2161 if (preferred_charset_id >= 0
2162 && NILP (Fmemq (make_number (preferred_charset_id),
2163 charset_list)))
2164 preferred_charset_id = -1;
2165 break;
2166 default:
2167 abort ();
2168 }
2169 charbuf += -c - 1;
2170 continue;
2171 }
2172
2173 if (ASCII_CHAR_P (c))
2174 EMIT_ONE_ASCII_BYTE (c);
2175 else if (CHAR_BYTE8_P (c))
2176 {
2177 c = CHAR_TO_BYTE8 (c);
2178 EMIT_ONE_BYTE (c);
2179 }
2180 else
2181 {
2182 struct charset *charset;
2183 unsigned code;
2184 int dimension;
2185 int emacs_mule_id;
2186 unsigned char leading_codes[2];
2187
2188 if (preferred_charset_id >= 0)
2189 {
2190 charset = CHARSET_FROM_ID (preferred_charset_id);
2191 if (! CHAR_CHARSET_P (c, charset))
2192 charset = char_charset (c, charset_list, NULL);
2193 }
2194 else
2195 charset = char_charset (c, charset_list, &code);
2196 if (! charset)
2197 {
2198 c = coding->default_char;
2199 if (ASCII_CHAR_P (c))
2200 {
2201 EMIT_ONE_ASCII_BYTE (c);
2202 continue;
2203 }
2204 charset = char_charset (c, charset_list, &code);
2205 }
2206 dimension = CHARSET_DIMENSION (charset);
2207 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2208 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2209 EMIT_ONE_BYTE (leading_codes[0]);
2210 if (leading_codes[1])
2211 EMIT_ONE_BYTE (leading_codes[1]);
2212 if (dimension == 1)
2213 EMIT_ONE_BYTE (code | 0x80);
2214 else
2215 {
2216 code |= 0x8080;
2217 EMIT_ONE_BYTE (code >> 8);
2218 EMIT_ONE_BYTE (code & 0xFF);
2219 }
2220 }
2221 }
2222 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2223 coding->produced_char += produced_chars;
2224 coding->produced = dst - coding->destination;
2225 return 0;
2226 }
2227
2228 \f
2229 /*** 7. ISO2022 handlers ***/
2230
2231 /* The following note describes the coding system ISO2022 briefly.
2232 Since the intention of this note is to help understand the
2233 functions in this file, some parts are NOT ACCURATE or are OVERLY
2234 SIMPLIFIED. For thorough understanding, please refer to the
2235 original document of ISO2022. This is equivalent to the standard
2236 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2237
2238 ISO2022 provides many mechanisms to encode several character sets
2239 in 7-bit and 8-bit environments. For 7-bit environments, all text
2240 is encoded using bytes less than 128. This may make the encoded
2241 text a little bit longer, but the text passes more easily through
2242 several types of gateway, some of which strip off the MSB (Most
2243 Significant Bit).
2244
2245 There are two kinds of character sets: control character sets and
2246 graphic character sets. The former contain control characters such
2247 as `newline' and `escape' to provide control functions (control
2248 functions are also provided by escape sequences). The latter
2249 contain graphic characters such as 'A' and '-'. Emacs recognizes
2250 two control character sets and many graphic character sets.
2251
2252 Graphic character sets are classified into one of the following
2253 four classes, according to the number of bytes (DIMENSION) and
2254 number of characters in one dimension (CHARS) of the set:
2255 - DIMENSION1_CHARS94
2256 - DIMENSION1_CHARS96
2257 - DIMENSION2_CHARS94
2258 - DIMENSION2_CHARS96
2259
2260 In addition, each character set is assigned an identification tag,
2261 unique for each set, called the "final character" (denoted as <F>
2262 hereafter). The <F> of each character set is decided by ECMA(*)
2263 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2264 (0x30..0x3F are for private use only).
2265
2266 Note (*): ECMA = European Computer Manufacturers Association
2267
2268 Here are examples of graphic character sets [NAME(<F>)]:
2269 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2270 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2271 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2272 o DIMENSION2_CHARS96 -- none for the moment
2273
2274 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2275 C0 [0x00..0x1F] -- control character plane 0
2276 GL [0x20..0x7F] -- graphic character plane 0
2277 C1 [0x80..0x9F] -- control character plane 1
2278 GR [0xA0..0xFF] -- graphic character plane 1
2279
2280 A control character set is directly designated and invoked to C0 or
2281 C1 by an escape sequence. The most common case is that:
2282 - ISO646's control character set is designated/invoked to C0, and
2283 - ISO6429's control character set is designated/invoked to C1,
2284 and usually these designations/invocations are omitted in encoded
2285 text. In a 7-bit environment, only C0 can be used, and a control
2286 character for C1 is encoded by an appropriate escape sequence to
2287 fit into the environment. All control characters for C1 are
2288 defined to have corresponding escape sequences.
2289
2290 A graphic character set is at first designated to one of four
2291 graphic registers (G0 through G3), then these graphic registers are
2292 invoked to GL or GR. These designations and invocations can be
2293 done independently. The most common case is that G0 is invoked to
2294 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2295 these invocations and designations are omitted in encoded text.
2296 In a 7-bit environment, only GL can be used.
2297
2298 When a graphic character set of CHARS94 is invoked to GL, codes
2299 0x20 and 0x7F of the GL area work as control characters SPACE and
2300 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2301 be used.
2302
2303 There are two ways of invocation: locking-shift and single-shift.
2304 With locking-shift, the invocation lasts until the next different
2305 invocation, whereas with single-shift, the invocation affects the
2306 following character only and doesn't affect the locking-shift
2307 state. Invocations are done by the following control characters or
2308 escape sequences:
2309
2310 ----------------------------------------------------------------------
2311 abbrev function cntrl escape seq description
2312 ----------------------------------------------------------------------
2313 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2314 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2315 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2316 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2317 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2318 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2319 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2320 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2321 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2322 ----------------------------------------------------------------------
2323 (*) These are not used by any known coding system.
2324
2325 Control characters for these functions are defined by macros
2326 ISO_CODE_XXX in `coding.h'.
2327
2328 Designations are done by the following escape sequences:
2329 ----------------------------------------------------------------------
2330 escape sequence description
2331 ----------------------------------------------------------------------
2332 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2333 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2334 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2335 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2336 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2337 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2338 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2339 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2340 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2341 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2342 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2343 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2344 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2345 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2346 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2347 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2348 ----------------------------------------------------------------------
2349
2350 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2351 of dimension 1, chars 94, and final character <F>, etc...
2352
2353 Note (*): Although these designations are not allowed in ISO2022,
2354 Emacs accepts them on decoding, and produces them on encoding
2355 CHARS96 character sets in a coding system which is characterized as
2356 7-bit environment, non-locking-shift, and non-single-shift.
2357
2358 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2359 '(' must be omitted. We refer to this as "short-form" hereafter.
2360
2361 Now you may notice that there are a lot of ways of encoding the
2362 same multilingual text in ISO2022. Actually, there exist many
2363 coding systems such as Compound Text (used in X11's inter client
2364 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2365 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2366 localized platforms), and all of these are variants of ISO2022.
2367
2368 In addition to the above, Emacs handles two more kinds of escape
2369 sequences: ISO6429's direction specification and Emacs' private
2370 sequence for specifying character composition.
2371
2372 ISO6429's direction specification takes the following form:
2373 o CSI ']' -- end of the current direction
2374 o CSI '0' ']' -- end of the current direction
2375 o CSI '1' ']' -- start of left-to-right text
2376 o CSI '2' ']' -- start of right-to-left text
2377 The control character CSI (0x9B: control sequence introducer) is
2378 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2379
2380 Character composition specification takes the following form:
2381 o ESC '0' -- start relative composition
2382 o ESC '1' -- end composition
2383 o ESC '2' -- start rule-base composition (*)
2384 o ESC '3' -- start relative composition with alternate chars (**)
2385 o ESC '4' -- start rule-base composition with alternate chars (**)
2386 Since these are not standard escape sequences of any ISO standard,
2387 the use of them with these meanings is restricted to Emacs only.
2388
2389 (*) This form is used only in Emacs 20.7 and older versions,
2390 but newer versions can safely decode it.
2391 (**) This form is used only in Emacs 21.1 and newer versions,
2392 and older versions can't decode it.
2393
2394 Here's a list of example usages of these composition escape
2395 sequences (categorized by `enum composition_method').
2396
2397 COMPOSITION_RELATIVE:
2398 ESC 0 CHAR [ CHAR ] ESC 1
2399 COMPOSITION_WITH_RULE:
2400 ESC 2 CHAR [ RULE CHAR ] ESC 1
2401 COMPOSITION_WITH_ALTCHARS:
2402 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2403 COMPOSITION_WITH_RULE_ALTCHARS:
2404 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2405
2406 enum iso_code_class_type iso_code_class[256];
2407
2408 #define SAFE_CHARSET_P(coding, id) \
2409 ((id) <= (coding)->max_charset_id \
2410 && (coding)->safe_charsets[id] >= 0)
2411
2412
2413 #define SHIFT_OUT_OK(category) \
2414 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2415
2416 static void
2417 setup_iso_safe_charsets (attrs)
2418 Lisp_Object attrs;
2419 {
2420 Lisp_Object charset_list, safe_charsets;
2421 Lisp_Object request;
2422 Lisp_Object reg_usage;
2423 Lisp_Object tail;
2424 int reg94, reg96;
2425 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2426 int max_charset_id;
2427
2428 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2429 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2430 && ! EQ (charset_list, Viso_2022_charset_list))
2431 {
2432 CODING_ATTR_CHARSET_LIST (attrs)
2433 = charset_list = Viso_2022_charset_list;
2434 ASET (attrs, coding_attr_safe_charsets, Qnil);
2435 }
2436
2437 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2438 return;
2439
2440 max_charset_id = 0;
2441 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2442 {
2443 int id = XINT (XCAR (tail));
2444 if (max_charset_id < id)
2445 max_charset_id = id;
2446 }
2447
2448 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2449 make_number (255));
2450 request = AREF (attrs, coding_attr_iso_request);
2451 reg_usage = AREF (attrs, coding_attr_iso_usage);
2452 reg94 = XINT (XCAR (reg_usage));
2453 reg96 = XINT (XCDR (reg_usage));
2454
2455 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2456 {
2457 Lisp_Object id;
2458 Lisp_Object reg;
2459 struct charset *charset;
2460
2461 id = XCAR (tail);
2462 charset = CHARSET_FROM_ID (XINT (id));
2463 reg = Fcdr (Fassq (id, request));
2464 if (! NILP (reg))
2465 SSET (safe_charsets, XINT (id), XINT (reg));
2466 else if (charset->iso_chars_96)
2467 {
2468 if (reg96 < 4)
2469 SSET (safe_charsets, XINT (id), reg96);
2470 }
2471 else
2472 {
2473 if (reg94 < 4)
2474 SSET (safe_charsets, XINT (id), reg94);
2475 }
2476 }
2477 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2478 }
2479
2480
2481 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2482 Check if a text is encoded in one of ISO-2022 based codig systems.
2483 If it is, return 1, else return 0. */
2484
2485 static int
2486 detect_coding_iso_2022 (coding, detect_info)
2487 struct coding_system *coding;
2488 struct coding_detection_info *detect_info;
2489 {
2490 const unsigned char *src = coding->source, *src_base = src;
2491 const unsigned char *src_end = coding->source + coding->src_bytes;
2492 int multibytep = coding->src_multibyte;
2493 int single_shifting = 0;
2494 int id;
2495 int c, c1;
2496 int consumed_chars = 0;
2497 int i;
2498 int rejected = 0;
2499 int found = 0;
2500
2501 detect_info->checked |= CATEGORY_MASK_ISO;
2502
2503 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2504 {
2505 struct coding_system *this = &(coding_categories[i]);
2506 Lisp_Object attrs, val;
2507
2508 attrs = CODING_ID_ATTRS (this->id);
2509 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2510 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2511 setup_iso_safe_charsets (attrs);
2512 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2513 this->max_charset_id = SCHARS (val) - 1;
2514 this->safe_charsets = (char *) SDATA (val);
2515 }
2516
2517 /* A coding system of this category is always ASCII compatible. */
2518 src += coding->head_ascii;
2519
2520 while (rejected != CATEGORY_MASK_ISO)
2521 {
2522 src_base = src;
2523 ONE_MORE_BYTE (c);
2524 switch (c)
2525 {
2526 case ISO_CODE_ESC:
2527 if (inhibit_iso_escape_detection)
2528 break;
2529 single_shifting = 0;
2530 ONE_MORE_BYTE (c);
2531 if (c >= '(' && c <= '/')
2532 {
2533 /* Designation sequence for a charset of dimension 1. */
2534 ONE_MORE_BYTE (c1);
2535 if (c1 < ' ' || c1 >= 0x80
2536 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2537 /* Invalid designation sequence. Just ignore. */
2538 break;
2539 }
2540 else if (c == '$')
2541 {
2542 /* Designation sequence for a charset of dimension 2. */
2543 ONE_MORE_BYTE (c);
2544 if (c >= '@' && c <= 'B')
2545 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2546 id = iso_charset_table[1][0][c];
2547 else if (c >= '(' && c <= '/')
2548 {
2549 ONE_MORE_BYTE (c1);
2550 if (c1 < ' ' || c1 >= 0x80
2551 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2552 /* Invalid designation sequence. Just ignore. */
2553 break;
2554 }
2555 else
2556 /* Invalid designation sequence. Just ignore it. */
2557 break;
2558 }
2559 else if (c == 'N' || c == 'O')
2560 {
2561 /* ESC <Fe> for SS2 or SS3. */
2562 single_shifting = 1;
2563 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2564 break;
2565 }
2566 else if (c >= '0' && c <= '4')
2567 {
2568 /* ESC <Fp> for start/end composition. */
2569 found |= CATEGORY_MASK_ISO;
2570 break;
2571 }
2572 else
2573 {
2574 /* Invalid escape sequence. Just ignore it. */
2575 break;
2576 }
2577
2578 /* We found a valid designation sequence for CHARSET. */
2579 rejected |= CATEGORY_MASK_ISO_8BIT;
2580 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2581 id))
2582 found |= CATEGORY_MASK_ISO_7;
2583 else
2584 rejected |= CATEGORY_MASK_ISO_7;
2585 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2586 id))
2587 found |= CATEGORY_MASK_ISO_7_TIGHT;
2588 else
2589 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2590 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2591 id))
2592 found |= CATEGORY_MASK_ISO_7_ELSE;
2593 else
2594 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2595 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2596 id))
2597 found |= CATEGORY_MASK_ISO_8_ELSE;
2598 else
2599 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2600 break;
2601
2602 case ISO_CODE_SO:
2603 case ISO_CODE_SI:
2604 /* Locking shift out/in. */
2605 if (inhibit_iso_escape_detection)
2606 break;
2607 single_shifting = 0;
2608 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2609 found |= CATEGORY_MASK_ISO_ELSE;
2610 break;
2611
2612 case ISO_CODE_CSI:
2613 /* Control sequence introducer. */
2614 single_shifting = 0;
2615 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2616 found |= CATEGORY_MASK_ISO_8_ELSE;
2617 goto check_extra_latin;
2618
2619
2620 case ISO_CODE_SS2:
2621 case ISO_CODE_SS3:
2622 /* Single shift. */
2623 if (inhibit_iso_escape_detection)
2624 break;
2625 single_shifting = 1;
2626 rejected |= CATEGORY_MASK_ISO_7BIT;
2627 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2628 & CODING_ISO_FLAG_SINGLE_SHIFT)
2629 found |= CATEGORY_MASK_ISO_8_1;
2630 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2631 & CODING_ISO_FLAG_SINGLE_SHIFT)
2632 found |= CATEGORY_MASK_ISO_8_2;
2633 goto check_extra_latin;
2634
2635 default:
2636 if (c < 0)
2637 continue;
2638 if (c < 0x80)
2639 {
2640 single_shifting = 0;
2641 break;
2642 }
2643 if (c >= 0xA0)
2644 {
2645 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2646 found |= CATEGORY_MASK_ISO_8_1;
2647 /* Check the length of succeeding codes of the range
2648 0xA0..0FF. If the byte length is even, we include
2649 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2650 only when we are not single shifting. */
2651 if (! single_shifting
2652 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2653 {
2654 int i = 1;
2655 while (src < src_end)
2656 {
2657 ONE_MORE_BYTE (c);
2658 if (c < 0xA0)
2659 break;
2660 i++;
2661 }
2662
2663 if (i & 1 && src < src_end)
2664 rejected |= CATEGORY_MASK_ISO_8_2;
2665 else
2666 found |= CATEGORY_MASK_ISO_8_2;
2667 }
2668 break;
2669 }
2670 check_extra_latin:
2671 single_shifting = 0;
2672 if (! VECTORP (Vlatin_extra_code_table)
2673 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2674 {
2675 rejected = CATEGORY_MASK_ISO;
2676 break;
2677 }
2678 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2679 & CODING_ISO_FLAG_LATIN_EXTRA)
2680 found |= CATEGORY_MASK_ISO_8_1;
2681 else
2682 rejected |= CATEGORY_MASK_ISO_8_1;
2683 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2684 & CODING_ISO_FLAG_LATIN_EXTRA)
2685 found |= CATEGORY_MASK_ISO_8_2;
2686 else
2687 rejected |= CATEGORY_MASK_ISO_8_2;
2688 }
2689 }
2690 detect_info->rejected |= CATEGORY_MASK_ISO;
2691 return 0;
2692
2693 no_more_source:
2694 detect_info->rejected |= rejected;
2695 detect_info->found |= (found & ~rejected);
2696 return 1;
2697 }
2698
2699
2700 /* Set designation state into CODING. */
2701 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2702 do { \
2703 int id, prev; \
2704 \
2705 if (final < '0' || final >= 128 \
2706 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2707 || !SAFE_CHARSET_P (coding, id)) \
2708 { \
2709 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2710 goto invalid_code; \
2711 } \
2712 prev = CODING_ISO_DESIGNATION (coding, reg); \
2713 if (id == charset_jisx0201_roman) \
2714 { \
2715 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2716 id = charset_ascii; \
2717 } \
2718 else if (id == charset_jisx0208_1978) \
2719 { \
2720 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2721 id = charset_jisx0208; \
2722 } \
2723 CODING_ISO_DESIGNATION (coding, reg) = id; \
2724 /* If there was an invalid designation to REG previously, and this \
2725 designation is ASCII to REG, we should keep this designation \
2726 sequence. */ \
2727 if (prev == -2 && id == charset_ascii) \
2728 goto invalid_code; \
2729 } while (0)
2730
2731
2732 #define MAYBE_FINISH_COMPOSITION() \
2733 do { \
2734 int i; \
2735 if (composition_state == COMPOSING_NO) \
2736 break; \
2737 /* It is assured that we have enough room for producing \
2738 characters stored in the table `components'. */ \
2739 if (charbuf + component_idx > charbuf_end) \
2740 goto no_more_source; \
2741 composition_state = COMPOSING_NO; \
2742 if (method == COMPOSITION_RELATIVE \
2743 || method == COMPOSITION_WITH_ALTCHARS) \
2744 { \
2745 for (i = 0; i < component_idx; i++) \
2746 *charbuf++ = components[i]; \
2747 char_offset += component_idx; \
2748 } \
2749 else \
2750 { \
2751 for (i = 0; i < component_idx; i += 2) \
2752 *charbuf++ = components[i]; \
2753 char_offset += (component_idx / 2) + 1; \
2754 } \
2755 } while (0)
2756
2757
2758 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2759 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2760 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2761 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2762 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2763 */
2764
2765 #define DECODE_COMPOSITION_START(c1) \
2766 do { \
2767 if (c1 == '0' \
2768 && composition_state == COMPOSING_COMPONENT_RULE) \
2769 { \
2770 component_len = component_idx; \
2771 composition_state = COMPOSING_CHAR; \
2772 } \
2773 else \
2774 { \
2775 const unsigned char *p; \
2776 \
2777 MAYBE_FINISH_COMPOSITION (); \
2778 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2779 goto no_more_source; \
2780 for (p = src; p < src_end - 1; p++) \
2781 if (*p == ISO_CODE_ESC && p[1] == '1') \
2782 break; \
2783 if (p == src_end - 1) \
2784 { \
2785 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2786 goto invalid_code; \
2787 goto no_more_source; \
2788 } \
2789 \
2790 /* This is surely the start of a composition. */ \
2791 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2792 : c1 == '2' ? COMPOSITION_WITH_RULE \
2793 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2794 : COMPOSITION_WITH_RULE_ALTCHARS); \
2795 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2796 : COMPOSING_COMPONENT_CHAR); \
2797 component_idx = component_len = 0; \
2798 } \
2799 } while (0)
2800
2801
2802 /* Handle compositoin end sequence ESC 1. */
2803
2804 #define DECODE_COMPOSITION_END() \
2805 do { \
2806 int nchars = (component_len > 0 ? component_idx - component_len \
2807 : method == COMPOSITION_RELATIVE ? component_idx \
2808 : (component_idx + 1) / 2); \
2809 int i; \
2810 int *saved_charbuf = charbuf; \
2811 int from = char_offset; \
2812 int to = from + nchars; \
2813 \
2814 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2815 if (method != COMPOSITION_RELATIVE) \
2816 { \
2817 if (component_len == 0) \
2818 for (i = 0; i < component_idx; i++) \
2819 *charbuf++ = components[i]; \
2820 else \
2821 for (i = 0; i < component_len; i++) \
2822 *charbuf++ = components[i]; \
2823 *saved_charbuf = saved_charbuf - charbuf; \
2824 } \
2825 if (method == COMPOSITION_WITH_RULE) \
2826 for (i = 0; i < component_idx; i += 2, char_offset++) \
2827 *charbuf++ = components[i]; \
2828 else \
2829 for (i = component_len; i < component_idx; i++, char_offset++) \
2830 *charbuf++ = components[i]; \
2831 coding->annotated = 1; \
2832 composition_state = COMPOSING_NO; \
2833 } while (0)
2834
2835
2836 /* Decode a composition rule from the byte C1 (and maybe one more byte
2837 from SRC) and store one encoded composition rule in
2838 coding->cmp_data. */
2839
2840 #define DECODE_COMPOSITION_RULE(c1) \
2841 do { \
2842 (c1) -= 32; \
2843 if (c1 < 81) /* old format (before ver.21) */ \
2844 { \
2845 int gref = (c1) / 9; \
2846 int nref = (c1) % 9; \
2847 if (gref == 4) gref = 10; \
2848 if (nref == 4) nref = 10; \
2849 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2850 } \
2851 else if (c1 < 93) /* new format (after ver.21) */ \
2852 { \
2853 ONE_MORE_BYTE (c2); \
2854 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2855 } \
2856 else \
2857 c1 = 0; \
2858 } while (0)
2859
2860
2861 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2862
2863 static void
2864 decode_coding_iso_2022 (coding)
2865 struct coding_system *coding;
2866 {
2867 const unsigned char *src = coding->source + coding->consumed;
2868 const unsigned char *src_end = coding->source + coding->src_bytes;
2869 const unsigned char *src_base;
2870 int *charbuf = coding->charbuf;
2871 int *charbuf_end
2872 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2873 int consumed_chars = 0, consumed_chars_base;
2874 int multibytep = coding->src_multibyte;
2875 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2876 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2877 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2878 struct charset *charset;
2879 int c;
2880 /* For handling composition sequence. */
2881 #define COMPOSING_NO 0
2882 #define COMPOSING_CHAR 1
2883 #define COMPOSING_RULE 2
2884 #define COMPOSING_COMPONENT_CHAR 3
2885 #define COMPOSING_COMPONENT_RULE 4
2886
2887 int composition_state = COMPOSING_NO;
2888 enum composition_method method;
2889 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2890 int component_idx;
2891 int component_len;
2892 Lisp_Object attrs, charset_list;
2893 int char_offset = coding->produced_char;
2894 int last_offset = char_offset;
2895 int last_id = charset_ascii;
2896
2897 CODING_GET_INFO (coding, attrs, charset_list);
2898 setup_iso_safe_charsets (attrs);
2899
2900 while (1)
2901 {
2902 int c1, c2;
2903
2904 src_base = src;
2905 consumed_chars_base = consumed_chars;
2906
2907 if (charbuf >= charbuf_end)
2908 break;
2909
2910 ONE_MORE_BYTE (c1);
2911 if (c1 < 0)
2912 goto invalid_code;
2913
2914 /* We produce at most one character. */
2915 switch (iso_code_class [c1])
2916 {
2917 case ISO_0x20_or_0x7F:
2918 if (composition_state != COMPOSING_NO)
2919 {
2920 if (composition_state == COMPOSING_RULE
2921 || composition_state == COMPOSING_COMPONENT_RULE)
2922 {
2923 DECODE_COMPOSITION_RULE (c1);
2924 components[component_idx++] = c1;
2925 composition_state--;
2926 continue;
2927 }
2928 }
2929 if (charset_id_0 < 0
2930 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2931 /* This is SPACE or DEL. */
2932 charset = CHARSET_FROM_ID (charset_ascii);
2933 else
2934 charset = CHARSET_FROM_ID (charset_id_0);
2935 break;
2936
2937 case ISO_graphic_plane_0:
2938 if (composition_state != COMPOSING_NO)
2939 {
2940 if (composition_state == COMPOSING_RULE
2941 || composition_state == COMPOSING_COMPONENT_RULE)
2942 {
2943 DECODE_COMPOSITION_RULE (c1);
2944 components[component_idx++] = c1;
2945 composition_state--;
2946 continue;
2947 }
2948 }
2949 charset = CHARSET_FROM_ID (charset_id_0);
2950 break;
2951
2952 case ISO_0xA0_or_0xFF:
2953 if (charset_id_1 < 0
2954 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2955 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2956 goto invalid_code;
2957 /* This is a graphic character, we fall down ... */
2958
2959 case ISO_graphic_plane_1:
2960 if (charset_id_1 < 0)
2961 goto invalid_code;
2962 charset = CHARSET_FROM_ID (charset_id_1);
2963 break;
2964
2965 case ISO_control_0:
2966 MAYBE_FINISH_COMPOSITION ();
2967 charset = CHARSET_FROM_ID (charset_ascii);
2968 break;
2969
2970 case ISO_control_1:
2971 MAYBE_FINISH_COMPOSITION ();
2972 goto invalid_code;
2973
2974 case ISO_shift_out:
2975 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2976 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2977 goto invalid_code;
2978 CODING_ISO_INVOCATION (coding, 0) = 1;
2979 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2980 continue;
2981
2982 case ISO_shift_in:
2983 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2984 goto invalid_code;
2985 CODING_ISO_INVOCATION (coding, 0) = 0;
2986 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2987 continue;
2988
2989 case ISO_single_shift_2_7:
2990 case ISO_single_shift_2:
2991 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2992 goto invalid_code;
2993 /* SS2 is handled as an escape sequence of ESC 'N' */
2994 c1 = 'N';
2995 goto label_escape_sequence;
2996
2997 case ISO_single_shift_3:
2998 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2999 goto invalid_code;
3000 /* SS2 is handled as an escape sequence of ESC 'O' */
3001 c1 = 'O';
3002 goto label_escape_sequence;
3003
3004 case ISO_control_sequence_introducer:
3005 /* CSI is handled as an escape sequence of ESC '[' ... */
3006 c1 = '[';
3007 goto label_escape_sequence;
3008
3009 case ISO_escape:
3010 ONE_MORE_BYTE (c1);
3011 label_escape_sequence:
3012 /* Escape sequences handled here are invocation,
3013 designation, direction specification, and character
3014 composition specification. */
3015 switch (c1)
3016 {
3017 case '&': /* revision of following character set */
3018 ONE_MORE_BYTE (c1);
3019 if (!(c1 >= '@' && c1 <= '~'))
3020 goto invalid_code;
3021 ONE_MORE_BYTE (c1);
3022 if (c1 != ISO_CODE_ESC)
3023 goto invalid_code;
3024 ONE_MORE_BYTE (c1);
3025 goto label_escape_sequence;
3026
3027 case '$': /* designation of 2-byte character set */
3028 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3029 goto invalid_code;
3030 ONE_MORE_BYTE (c1);
3031 if (c1 >= '@' && c1 <= 'B')
3032 { /* designation of JISX0208.1978, GB2312.1980,
3033 or JISX0208.1980 */
3034 DECODE_DESIGNATION (0, 2, 0, c1);
3035 }
3036 else if (c1 >= 0x28 && c1 <= 0x2B)
3037 { /* designation of DIMENSION2_CHARS94 character set */
3038 ONE_MORE_BYTE (c2);
3039 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3040 }
3041 else if (c1 >= 0x2C && c1 <= 0x2F)
3042 { /* designation of DIMENSION2_CHARS96 character set */
3043 ONE_MORE_BYTE (c2);
3044 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3045 }
3046 else
3047 goto invalid_code;
3048 /* We must update these variables now. */
3049 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3050 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3051 continue;
3052
3053 case 'n': /* invocation of locking-shift-2 */
3054 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3055 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3056 goto invalid_code;
3057 CODING_ISO_INVOCATION (coding, 0) = 2;
3058 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3059 continue;
3060
3061 case 'o': /* invocation of locking-shift-3 */
3062 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3063 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3064 goto invalid_code;
3065 CODING_ISO_INVOCATION (coding, 0) = 3;
3066 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3067 continue;
3068
3069 case 'N': /* invocation of single-shift-2 */
3070 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3071 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3072 goto invalid_code;
3073 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3074 ONE_MORE_BYTE (c1);
3075 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3076 goto invalid_code;
3077 break;
3078
3079 case 'O': /* invocation of single-shift-3 */
3080 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3081 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3082 goto invalid_code;
3083 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3084 ONE_MORE_BYTE (c1);
3085 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3086 goto invalid_code;
3087 break;
3088
3089 case '0': case '2': case '3': case '4': /* start composition */
3090 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3091 goto invalid_code;
3092 DECODE_COMPOSITION_START (c1);
3093 continue;
3094
3095 case '1': /* end composition */
3096 if (composition_state == COMPOSING_NO)
3097 goto invalid_code;
3098 DECODE_COMPOSITION_END ();
3099 continue;
3100
3101 case '[': /* specification of direction */
3102 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3103 goto invalid_code;
3104 /* For the moment, nested direction is not supported.
3105 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3106 left-to-right, and nozero means right-to-left. */
3107 ONE_MORE_BYTE (c1);
3108 switch (c1)
3109 {
3110 case ']': /* end of the current direction */
3111 coding->mode &= ~CODING_MODE_DIRECTION;
3112
3113 case '0': /* end of the current direction */
3114 case '1': /* start of left-to-right direction */
3115 ONE_MORE_BYTE (c1);
3116 if (c1 == ']')
3117 coding->mode &= ~CODING_MODE_DIRECTION;
3118 else
3119 goto invalid_code;
3120 break;
3121
3122 case '2': /* start of right-to-left direction */
3123 ONE_MORE_BYTE (c1);
3124 if (c1 == ']')
3125 coding->mode |= CODING_MODE_DIRECTION;
3126 else
3127 goto invalid_code;
3128 break;
3129
3130 default:
3131 goto invalid_code;
3132 }
3133 continue;
3134
3135 case '%':
3136 ONE_MORE_BYTE (c1);
3137 if (c1 == '/')
3138 {
3139 /* CTEXT extended segment:
3140 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3141 We keep these bytes as is for the moment.
3142 They may be decoded by post-read-conversion. */
3143 int dim, M, L;
3144 int size;
3145
3146 ONE_MORE_BYTE (dim);
3147 ONE_MORE_BYTE (M);
3148 ONE_MORE_BYTE (L);
3149 size = ((M - 128) * 128) + (L - 128);
3150 if (charbuf + 8 + size > charbuf_end)
3151 goto break_loop;
3152 *charbuf++ = ISO_CODE_ESC;
3153 *charbuf++ = '%';
3154 *charbuf++ = '/';
3155 *charbuf++ = dim;
3156 *charbuf++ = BYTE8_TO_CHAR (M);
3157 *charbuf++ = BYTE8_TO_CHAR (L);
3158 while (size-- > 0)
3159 {
3160 ONE_MORE_BYTE (c1);
3161 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3162 }
3163 }
3164 else if (c1 == 'G')
3165 {
3166 /* XFree86 extension for embedding UTF-8 in CTEXT:
3167 ESC % G --UTF-8-BYTES-- ESC % @
3168 We keep these bytes as is for the moment.
3169 They may be decoded by post-read-conversion. */
3170 int *p = charbuf;
3171
3172 if (p + 6 > charbuf_end)
3173 goto break_loop;
3174 *p++ = ISO_CODE_ESC;
3175 *p++ = '%';
3176 *p++ = 'G';
3177 while (p < charbuf_end)
3178 {
3179 ONE_MORE_BYTE (c1);
3180 if (c1 == ISO_CODE_ESC
3181 && src + 1 < src_end
3182 && src[0] == '%'
3183 && src[1] == '@')
3184 break;
3185 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3186 }
3187 if (p + 3 > charbuf_end)
3188 goto break_loop;
3189 *p++ = ISO_CODE_ESC;
3190 *p++ = '%';
3191 *p++ = '@';
3192 charbuf = p;
3193 }
3194 else
3195 goto invalid_code;
3196 continue;
3197 break;
3198
3199 default:
3200 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3201 goto invalid_code;
3202 if (c1 >= 0x28 && c1 <= 0x2B)
3203 { /* designation of DIMENSION1_CHARS94 character set */
3204 ONE_MORE_BYTE (c2);
3205 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3206 }
3207 else if (c1 >= 0x2C && c1 <= 0x2F)
3208 { /* designation of DIMENSION1_CHARS96 character set */
3209 ONE_MORE_BYTE (c2);
3210 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3211 }
3212 else
3213 goto invalid_code;
3214 /* We must update these variables now. */
3215 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3216 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3217 continue;
3218 }
3219 }
3220
3221 if (charset->id != charset_ascii
3222 && last_id != charset->id)
3223 {
3224 if (last_id != charset_ascii)
3225 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3226 last_id = charset->id;
3227 last_offset = char_offset;
3228 }
3229
3230 /* Now we know CHARSET and 1st position code C1 of a character.
3231 Produce a decoded character while getting 2nd position code
3232 C2 if necessary. */
3233 c1 &= 0x7F;
3234 if (CHARSET_DIMENSION (charset) > 1)
3235 {
3236 ONE_MORE_BYTE (c2);
3237 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3238 /* C2 is not in a valid range. */
3239 goto invalid_code;
3240 c1 = (c1 << 8) | (c2 & 0x7F);
3241 if (CHARSET_DIMENSION (charset) > 2)
3242 {
3243 ONE_MORE_BYTE (c2);
3244 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3245 /* C2 is not in a valid range. */
3246 goto invalid_code;
3247 c1 = (c1 << 8) | (c2 & 0x7F);
3248 }
3249 }
3250
3251 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3252 if (c < 0)
3253 {
3254 MAYBE_FINISH_COMPOSITION ();
3255 for (; src_base < src; src_base++, char_offset++)
3256 {
3257 if (ASCII_BYTE_P (*src_base))
3258 *charbuf++ = *src_base;
3259 else
3260 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3261 }
3262 }
3263 else if (composition_state == COMPOSING_NO)
3264 {
3265 *charbuf++ = c;
3266 char_offset++;
3267 }
3268 else
3269 {
3270 components[component_idx++] = c;
3271 if (method == COMPOSITION_WITH_RULE
3272 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3273 && composition_state == COMPOSING_COMPONENT_CHAR))
3274 composition_state++;
3275 }
3276 continue;
3277
3278 invalid_code:
3279 MAYBE_FINISH_COMPOSITION ();
3280 src = src_base;
3281 consumed_chars = consumed_chars_base;
3282 ONE_MORE_BYTE (c);
3283 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3284 char_offset++;
3285 coding->errors++;
3286 continue;
3287
3288 break_loop:
3289 break;
3290 }
3291
3292 no_more_source:
3293 if (last_id != charset_ascii)
3294 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3295 coding->consumed_char += consumed_chars_base;
3296 coding->consumed = src_base - coding->source;
3297 coding->charbuf_used = charbuf - coding->charbuf;
3298 }
3299
3300
3301 /* ISO2022 encoding stuff. */
3302
3303 /*
3304 It is not enough to say just "ISO2022" on encoding, we have to
3305 specify more details. In Emacs, each coding system of ISO2022
3306 variant has the following specifications:
3307 1. Initial designation to G0 thru G3.
3308 2. Allows short-form designation?
3309 3. ASCII should be designated to G0 before control characters?
3310 4. ASCII should be designated to G0 at end of line?
3311 5. 7-bit environment or 8-bit environment?
3312 6. Use locking-shift?
3313 7. Use Single-shift?
3314 And the following two are only for Japanese:
3315 8. Use ASCII in place of JIS0201-1976-Roman?
3316 9. Use JISX0208-1983 in place of JISX0208-1978?
3317 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3318 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3319 details.
3320 */
3321
3322 /* Produce codes (escape sequence) for designating CHARSET to graphic
3323 register REG at DST, and increment DST. If <final-char> of CHARSET is
3324 '@', 'A', or 'B' and the coding system CODING allows, produce
3325 designation sequence of short-form. */
3326
3327 #define ENCODE_DESIGNATION(charset, reg, coding) \
3328 do { \
3329 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3330 char *intermediate_char_94 = "()*+"; \
3331 char *intermediate_char_96 = ",-./"; \
3332 int revision = -1; \
3333 int c; \
3334 \
3335 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3336 revision = CHARSET_ISO_REVISION (charset); \
3337 \
3338 if (revision >= 0) \
3339 { \
3340 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3341 EMIT_ONE_BYTE ('@' + revision); \
3342 } \
3343 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3344 if (CHARSET_DIMENSION (charset) == 1) \
3345 { \
3346 if (! CHARSET_ISO_CHARS_96 (charset)) \
3347 c = intermediate_char_94[reg]; \
3348 else \
3349 c = intermediate_char_96[reg]; \
3350 EMIT_ONE_ASCII_BYTE (c); \
3351 } \
3352 else \
3353 { \
3354 EMIT_ONE_ASCII_BYTE ('$'); \
3355 if (! CHARSET_ISO_CHARS_96 (charset)) \
3356 { \
3357 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3358 || reg != 0 \
3359 || final_char < '@' || final_char > 'B') \
3360 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3361 } \
3362 else \
3363 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3364 } \
3365 EMIT_ONE_ASCII_BYTE (final_char); \
3366 \
3367 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3368 } while (0)
3369
3370
3371 /* The following two macros produce codes (control character or escape
3372 sequence) for ISO2022 single-shift functions (single-shift-2 and
3373 single-shift-3). */
3374
3375 #define ENCODE_SINGLE_SHIFT_2 \
3376 do { \
3377 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3378 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3379 else \
3380 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3381 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3382 } while (0)
3383
3384
3385 #define ENCODE_SINGLE_SHIFT_3 \
3386 do { \
3387 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3388 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3389 else \
3390 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3391 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3392 } while (0)
3393
3394
3395 /* The following four macros produce codes (control character or
3396 escape sequence) for ISO2022 locking-shift functions (shift-in,
3397 shift-out, locking-shift-2, and locking-shift-3). */
3398
3399 #define ENCODE_SHIFT_IN \
3400 do { \
3401 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3402 CODING_ISO_INVOCATION (coding, 0) = 0; \
3403 } while (0)
3404
3405
3406 #define ENCODE_SHIFT_OUT \
3407 do { \
3408 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3409 CODING_ISO_INVOCATION (coding, 0) = 1; \
3410 } while (0)
3411
3412
3413 #define ENCODE_LOCKING_SHIFT_2 \
3414 do { \
3415 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3416 CODING_ISO_INVOCATION (coding, 0) = 2; \
3417 } while (0)
3418
3419
3420 #define ENCODE_LOCKING_SHIFT_3 \
3421 do { \
3422 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3423 CODING_ISO_INVOCATION (coding, 0) = 3; \
3424 } while (0)
3425
3426
3427 /* Produce codes for a DIMENSION1 character whose character set is
3428 CHARSET and whose position-code is C1. Designation and invocation
3429 sequences are also produced in advance if necessary. */
3430
3431 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3432 do { \
3433 int id = CHARSET_ID (charset); \
3434 \
3435 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3436 && id == charset_ascii) \
3437 { \
3438 id = charset_jisx0201_roman; \
3439 charset = CHARSET_FROM_ID (id); \
3440 } \
3441 \
3442 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3443 { \
3444 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3445 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3446 else \
3447 EMIT_ONE_BYTE (c1 | 0x80); \
3448 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3449 break; \
3450 } \
3451 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3452 { \
3453 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3454 break; \
3455 } \
3456 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3457 { \
3458 EMIT_ONE_BYTE (c1 | 0x80); \
3459 break; \
3460 } \
3461 else \
3462 /* Since CHARSET is not yet invoked to any graphic planes, we \
3463 must invoke it, or, at first, designate it to some graphic \
3464 register. Then repeat the loop to actually produce the \
3465 character. */ \
3466 dst = encode_invocation_designation (charset, coding, dst, \
3467 &produced_chars); \
3468 } while (1)
3469
3470
3471 /* Produce codes for a DIMENSION2 character whose character set is
3472 CHARSET and whose position-codes are C1 and C2. Designation and
3473 invocation codes are also produced in advance if necessary. */
3474
3475 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3476 do { \
3477 int id = CHARSET_ID (charset); \
3478 \
3479 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3480 && id == charset_jisx0208) \
3481 { \
3482 id = charset_jisx0208_1978; \
3483 charset = CHARSET_FROM_ID (id); \
3484 } \
3485 \
3486 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3487 { \
3488 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3489 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3490 else \
3491 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3492 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3493 break; \
3494 } \
3495 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3496 { \
3497 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3498 break; \
3499 } \
3500 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3501 { \
3502 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3503 break; \
3504 } \
3505 else \
3506 /* Since CHARSET is not yet invoked to any graphic planes, we \
3507 must invoke it, or, at first, designate it to some graphic \
3508 register. Then repeat the loop to actually produce the \
3509 character. */ \
3510 dst = encode_invocation_designation (charset, coding, dst, \
3511 &produced_chars); \
3512 } while (1)
3513
3514
3515 #define ENCODE_ISO_CHARACTER(charset, c) \
3516 do { \
3517 int code = ENCODE_CHAR ((charset),(c)); \
3518 \
3519 if (CHARSET_DIMENSION (charset) == 1) \
3520 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3521 else \
3522 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3523 } while (0)
3524
3525
3526 /* Produce designation and invocation codes at a place pointed by DST
3527 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3528 Return new DST. */
3529
3530 unsigned char *
3531 encode_invocation_designation (charset, coding, dst, p_nchars)
3532 struct charset *charset;
3533 struct coding_system *coding;
3534 unsigned char *dst;
3535 int *p_nchars;
3536 {
3537 int multibytep = coding->dst_multibyte;
3538 int produced_chars = *p_nchars;
3539 int reg; /* graphic register number */
3540 int id = CHARSET_ID (charset);
3541
3542 /* At first, check designations. */
3543 for (reg = 0; reg < 4; reg++)
3544 if (id == CODING_ISO_DESIGNATION (coding, reg))
3545 break;
3546
3547 if (reg >= 4)
3548 {
3549 /* CHARSET is not yet designated to any graphic registers. */
3550 /* At first check the requested designation. */
3551 reg = CODING_ISO_REQUEST (coding, id);
3552 if (reg < 0)
3553 /* Since CHARSET requests no special designation, designate it
3554 to graphic register 0. */
3555 reg = 0;
3556
3557 ENCODE_DESIGNATION (charset, reg, coding);
3558 }
3559
3560 if (CODING_ISO_INVOCATION (coding, 0) != reg
3561 && CODING_ISO_INVOCATION (coding, 1) != reg)
3562 {
3563 /* Since the graphic register REG is not invoked to any graphic
3564 planes, invoke it to graphic plane 0. */
3565 switch (reg)
3566 {
3567 case 0: /* graphic register 0 */
3568 ENCODE_SHIFT_IN;
3569 break;
3570
3571 case 1: /* graphic register 1 */
3572 ENCODE_SHIFT_OUT;
3573 break;
3574
3575 case 2: /* graphic register 2 */
3576 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3577 ENCODE_SINGLE_SHIFT_2;
3578 else
3579 ENCODE_LOCKING_SHIFT_2;
3580 break;
3581
3582 case 3: /* graphic register 3 */
3583 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3584 ENCODE_SINGLE_SHIFT_3;
3585 else
3586 ENCODE_LOCKING_SHIFT_3;
3587 break;
3588 }
3589 }
3590
3591 *p_nchars = produced_chars;
3592 return dst;
3593 }
3594
3595 /* The following three macros produce codes for indicating direction
3596 of text. */
3597 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3598 do { \
3599 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3600 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3601 else \
3602 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3603 } while (0)
3604
3605
3606 #define ENCODE_DIRECTION_R2L() \
3607 do { \
3608 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3609 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3610 } while (0)
3611
3612
3613 #define ENCODE_DIRECTION_L2R() \
3614 do { \
3615 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3616 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3617 } while (0)
3618
3619
3620 /* Produce codes for designation and invocation to reset the graphic
3621 planes and registers to initial state. */
3622 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3623 do { \
3624 int reg; \
3625 struct charset *charset; \
3626 \
3627 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3628 ENCODE_SHIFT_IN; \
3629 for (reg = 0; reg < 4; reg++) \
3630 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3631 && (CODING_ISO_DESIGNATION (coding, reg) \
3632 != CODING_ISO_INITIAL (coding, reg))) \
3633 { \
3634 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3635 ENCODE_DESIGNATION (charset, reg, coding); \
3636 } \
3637 } while (0)
3638
3639
3640 /* Produce designation sequences of charsets in the line started from
3641 SRC to a place pointed by DST, and return updated DST.
3642
3643 If the current block ends before any end-of-line, we may fail to
3644 find all the necessary designations. */
3645
3646 static unsigned char *
3647 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3648 struct coding_system *coding;
3649 int *charbuf, *charbuf_end;
3650 unsigned char *dst;
3651 {
3652 struct charset *charset;
3653 /* Table of charsets to be designated to each graphic register. */
3654 int r[4];
3655 int c, found = 0, reg;
3656 int produced_chars = 0;
3657 int multibytep = coding->dst_multibyte;
3658 Lisp_Object attrs;
3659 Lisp_Object charset_list;
3660
3661 attrs = CODING_ID_ATTRS (coding->id);
3662 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3663 if (EQ (charset_list, Qiso_2022))
3664 charset_list = Viso_2022_charset_list;
3665
3666 for (reg = 0; reg < 4; reg++)
3667 r[reg] = -1;
3668
3669 while (found < 4)
3670 {
3671 int id;
3672
3673 c = *charbuf++;
3674 if (c == '\n')
3675 break;
3676 charset = char_charset (c, charset_list, NULL);
3677 id = CHARSET_ID (charset);
3678 reg = CODING_ISO_REQUEST (coding, id);
3679 if (reg >= 0 && r[reg] < 0)
3680 {
3681 found++;
3682 r[reg] = id;
3683 }
3684 }
3685
3686 if (found)
3687 {
3688 for (reg = 0; reg < 4; reg++)
3689 if (r[reg] >= 0
3690 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3691 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3692 }
3693
3694 return dst;
3695 }
3696
3697 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3698
3699 static int
3700 encode_coding_iso_2022 (coding)
3701 struct coding_system *coding;
3702 {
3703 int multibytep = coding->dst_multibyte;
3704 int *charbuf = coding->charbuf;
3705 int *charbuf_end = charbuf + coding->charbuf_used;
3706 unsigned char *dst = coding->destination + coding->produced;
3707 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3708 int safe_room = 16;
3709 int bol_designation
3710 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3711 && CODING_ISO_BOL (coding));
3712 int produced_chars = 0;
3713 Lisp_Object attrs, eol_type, charset_list;
3714 int ascii_compatible;
3715 int c;
3716 int preferred_charset_id = -1;
3717
3718 CODING_GET_INFO (coding, attrs, charset_list);
3719 eol_type = CODING_ID_EOL_TYPE (coding->id);
3720 if (VECTORP (eol_type))
3721 eol_type = Qunix;
3722
3723 setup_iso_safe_charsets (attrs);
3724 /* Charset list may have been changed. */
3725 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3726 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3727
3728 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3729
3730 while (charbuf < charbuf_end)
3731 {
3732 ASSURE_DESTINATION (safe_room);
3733
3734 if (bol_designation)
3735 {
3736 unsigned char *dst_prev = dst;
3737
3738 /* We have to produce designation sequences if any now. */
3739 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3740 bol_designation = 0;
3741 /* We are sure that designation sequences are all ASCII bytes. */
3742 produced_chars += dst - dst_prev;
3743 }
3744
3745 c = *charbuf++;
3746
3747 if (c < 0)
3748 {
3749 /* Handle an annotation. */
3750 switch (*charbuf)
3751 {
3752 case CODING_ANNOTATE_COMPOSITION_MASK:
3753 /* Not yet implemented. */
3754 break;
3755 case CODING_ANNOTATE_CHARSET_MASK:
3756 preferred_charset_id = charbuf[3];
3757 if (preferred_charset_id >= 0
3758 && NILP (Fmemq (make_number (preferred_charset_id),
3759 charset_list)))
3760 preferred_charset_id = -1;
3761 break;
3762 default:
3763 abort ();
3764 }
3765 charbuf += -c - 1;
3766 continue;
3767 }
3768
3769 /* Now encode the character C. */
3770 if (c < 0x20 || c == 0x7F)
3771 {
3772 if (c == '\n'
3773 || (c == '\r' && EQ (eol_type, Qmac)))
3774 {
3775 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3776 ENCODE_RESET_PLANE_AND_REGISTER ();
3777 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3778 {
3779 int i;
3780
3781 for (i = 0; i < 4; i++)
3782 CODING_ISO_DESIGNATION (coding, i)
3783 = CODING_ISO_INITIAL (coding, i);
3784 }
3785 bol_designation
3786 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3787 }
3788 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3789 ENCODE_RESET_PLANE_AND_REGISTER ();
3790 EMIT_ONE_ASCII_BYTE (c);
3791 }
3792 else if (ASCII_CHAR_P (c))
3793 {
3794 if (ascii_compatible)
3795 EMIT_ONE_ASCII_BYTE (c);
3796 else
3797 {
3798 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3799 ENCODE_ISO_CHARACTER (charset, c);
3800 }
3801 }
3802 else if (CHAR_BYTE8_P (c))
3803 {
3804 c = CHAR_TO_BYTE8 (c);
3805 EMIT_ONE_BYTE (c);
3806 }
3807 else
3808 {
3809 struct charset *charset;
3810
3811 if (preferred_charset_id >= 0)
3812 {
3813 charset = CHARSET_FROM_ID (preferred_charset_id);
3814 if (! CHAR_CHARSET_P (c, charset))
3815 charset = char_charset (c, charset_list, NULL);
3816 }
3817 else
3818 charset = char_charset (c, charset_list, NULL);
3819 if (!charset)
3820 {
3821 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3822 {
3823 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3824 charset = CHARSET_FROM_ID (charset_ascii);
3825 }
3826 else
3827 {
3828 c = coding->default_char;
3829 charset = char_charset (c, charset_list, NULL);
3830 }
3831 }
3832 ENCODE_ISO_CHARACTER (charset, c);
3833 }
3834 }
3835
3836 if (coding->mode & CODING_MODE_LAST_BLOCK
3837 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3838 {
3839 ASSURE_DESTINATION (safe_room);
3840 ENCODE_RESET_PLANE_AND_REGISTER ();
3841 }
3842 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3843 CODING_ISO_BOL (coding) = bol_designation;
3844 coding->produced_char += produced_chars;
3845 coding->produced = dst - coding->destination;
3846 return 0;
3847 }
3848
3849 \f
3850 /*** 8,9. SJIS and BIG5 handlers ***/
3851
3852 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3853 quite widely. So, for the moment, Emacs supports them in the bare
3854 C code. But, in the future, they may be supported only by CCL. */
3855
3856 /* SJIS is a coding system encoding three character sets: ASCII, right
3857 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3858 as is. A character of charset katakana-jisx0201 is encoded by
3859 "position-code + 0x80". A character of charset japanese-jisx0208
3860 is encoded in 2-byte but two position-codes are divided and shifted
3861 so that it fit in the range below.
3862
3863 --- CODE RANGE of SJIS ---
3864 (character set) (range)
3865 ASCII 0x00 .. 0x7F
3866 KATAKANA-JISX0201 0xA0 .. 0xDF
3867 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3868 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3869 -------------------------------
3870
3871 */
3872
3873 /* BIG5 is a coding system encoding two character sets: ASCII and
3874 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3875 character set and is encoded in two-byte.
3876
3877 --- CODE RANGE of BIG5 ---
3878 (character set) (range)
3879 ASCII 0x00 .. 0x7F
3880 Big5 (1st byte) 0xA1 .. 0xFE
3881 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3882 --------------------------
3883
3884 */
3885
3886 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3887 Check if a text is encoded in SJIS. If it is, return
3888 CATEGORY_MASK_SJIS, else return 0. */
3889
3890 static int
3891 detect_coding_sjis (coding, detect_info)
3892 struct coding_system *coding;
3893 struct coding_detection_info *detect_info;
3894 {
3895 const unsigned char *src = coding->source, *src_base;
3896 const unsigned char *src_end = coding->source + coding->src_bytes;
3897 int multibytep = coding->src_multibyte;
3898 int consumed_chars = 0;
3899 int found = 0;
3900 int c;
3901
3902 detect_info->checked |= CATEGORY_MASK_SJIS;
3903 /* A coding system of this category is always ASCII compatible. */
3904 src += coding->head_ascii;
3905
3906 while (1)
3907 {
3908 src_base = src;
3909 ONE_MORE_BYTE (c);
3910 if (c < 0x80)
3911 continue;
3912 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3913 {
3914 ONE_MORE_BYTE (c);
3915 if (c < 0x40 || c == 0x7F || c > 0xFC)
3916 break;
3917 found = CATEGORY_MASK_SJIS;
3918 }
3919 else if (c >= 0xA0 && c < 0xE0)
3920 found = CATEGORY_MASK_SJIS;
3921 else
3922 break;
3923 }
3924 detect_info->rejected |= CATEGORY_MASK_SJIS;
3925 return 0;
3926
3927 no_more_source:
3928 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3929 {
3930 detect_info->rejected |= CATEGORY_MASK_SJIS;
3931 return 0;
3932 }
3933 detect_info->found |= found;
3934 return 1;
3935 }
3936
3937 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3938 Check if a text is encoded in BIG5. If it is, return
3939 CATEGORY_MASK_BIG5, else return 0. */
3940
3941 static int
3942 detect_coding_big5 (coding, detect_info)
3943 struct coding_system *coding;
3944 struct coding_detection_info *detect_info;
3945 {
3946 const unsigned char *src = coding->source, *src_base;
3947 const unsigned char *src_end = coding->source + coding->src_bytes;
3948 int multibytep = coding->src_multibyte;
3949 int consumed_chars = 0;
3950 int found = 0;
3951 int c;
3952
3953 detect_info->checked |= CATEGORY_MASK_BIG5;
3954 /* A coding system of this category is always ASCII compatible. */
3955 src += coding->head_ascii;
3956
3957 while (1)
3958 {
3959 src_base = src;
3960 ONE_MORE_BYTE (c);
3961 if (c < 0x80)
3962 continue;
3963 if (c >= 0xA1)
3964 {
3965 ONE_MORE_BYTE (c);
3966 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3967 return 0;
3968 found = CATEGORY_MASK_BIG5;
3969 }
3970 else
3971 break;
3972 }
3973 detect_info->rejected |= CATEGORY_MASK_BIG5;
3974 return 0;
3975
3976 no_more_source:
3977 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3978 {
3979 detect_info->rejected |= CATEGORY_MASK_BIG5;
3980 return 0;
3981 }
3982 detect_info->found |= found;
3983 return 1;
3984 }
3985
3986 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3987 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3988
3989 static void
3990 decode_coding_sjis (coding)
3991 struct coding_system *coding;
3992 {
3993 const unsigned char *src = coding->source + coding->consumed;
3994 const unsigned char *src_end = coding->source + coding->src_bytes;
3995 const unsigned char *src_base;
3996 int *charbuf = coding->charbuf;
3997 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3998 int consumed_chars = 0, consumed_chars_base;
3999 int multibytep = coding->src_multibyte;
4000 struct charset *charset_roman, *charset_kanji, *charset_kana;
4001 struct charset *charset_kanji2;
4002 Lisp_Object attrs, charset_list, val;
4003 int char_offset = coding->produced_char;
4004 int last_offset = char_offset;
4005 int last_id = charset_ascii;
4006
4007 CODING_GET_INFO (coding, attrs, charset_list);
4008
4009 val = charset_list;
4010 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4011 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4012 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4013 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4014
4015 while (1)
4016 {
4017 int c, c1;
4018 struct charset *charset;
4019
4020 src_base = src;
4021 consumed_chars_base = consumed_chars;
4022
4023 if (charbuf >= charbuf_end)
4024 break;
4025
4026 ONE_MORE_BYTE (c);
4027 if (c < 0)
4028 goto invalid_code;
4029 if (c < 0x80)
4030 charset = charset_roman;
4031 else if (c == 0x80 || c == 0xA0)
4032 goto invalid_code;
4033 else if (c >= 0xA1 && c <= 0xDF)
4034 {
4035 /* SJIS -> JISX0201-Kana */
4036 c &= 0x7F;
4037 charset = charset_kana;
4038 }
4039 else if (c <= 0xEF)
4040 {
4041 /* SJIS -> JISX0208 */
4042 ONE_MORE_BYTE (c1);
4043 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4044 goto invalid_code;
4045 c = (c << 8) | c1;
4046 SJIS_TO_JIS (c);
4047 charset = charset_kanji;
4048 }
4049 else if (c <= 0xFC && charset_kanji2)
4050 {
4051 /* SJIS -> JISX0212 */
4052 ONE_MORE_BYTE (c1);
4053 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4054 goto invalid_code;
4055 c = (c << 8) | c1;
4056 SJIS_TO_JIS2 (c);
4057 charset = charset_kanji2;
4058 }
4059 else
4060 goto invalid_code;
4061 if (charset->id != charset_ascii
4062 && last_id != charset->id)
4063 {
4064 if (last_id != charset_ascii)
4065 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4066 last_id = charset->id;
4067 last_offset = char_offset;
4068 }
4069 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4070 *charbuf++ = c;
4071 char_offset++;
4072 continue;
4073
4074 invalid_code:
4075 src = src_base;
4076 consumed_chars = consumed_chars_base;
4077 ONE_MORE_BYTE (c);
4078 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4079 char_offset++;
4080 coding->errors++;
4081 }
4082
4083 no_more_source:
4084 if (last_id != charset_ascii)
4085 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4086 coding->consumed_char += consumed_chars_base;
4087 coding->consumed = src_base - coding->source;
4088 coding->charbuf_used = charbuf - coding->charbuf;
4089 }
4090
4091 static void
4092 decode_coding_big5 (coding)
4093 struct coding_system *coding;
4094 {
4095 const unsigned char *src = coding->source + coding->consumed;
4096 const unsigned char *src_end = coding->source + coding->src_bytes;
4097 const unsigned char *src_base;
4098 int *charbuf = coding->charbuf;
4099 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4100 int consumed_chars = 0, consumed_chars_base;
4101 int multibytep = coding->src_multibyte;
4102 struct charset *charset_roman, *charset_big5;
4103 Lisp_Object attrs, charset_list, val;
4104 int char_offset = coding->produced_char;
4105 int last_offset = char_offset;
4106 int last_id = charset_ascii;
4107
4108 CODING_GET_INFO (coding, attrs, charset_list);
4109 val = charset_list;
4110 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4111 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4112
4113 while (1)
4114 {
4115 int c, c1;
4116 struct charset *charset;
4117
4118 src_base = src;
4119 consumed_chars_base = consumed_chars;
4120
4121 if (charbuf >= charbuf_end)
4122 break;
4123
4124 ONE_MORE_BYTE (c);
4125
4126 if (c < 0)
4127 goto invalid_code;
4128 if (c < 0x80)
4129 charset = charset_roman;
4130 else
4131 {
4132 /* BIG5 -> Big5 */
4133 if (c < 0xA1 || c > 0xFE)
4134 goto invalid_code;
4135 ONE_MORE_BYTE (c1);
4136 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4137 goto invalid_code;
4138 c = c << 8 | c1;
4139 charset = charset_big5;
4140 }
4141 if (charset->id != charset_ascii
4142 && last_id != charset->id)
4143 {
4144 if (last_id != charset_ascii)
4145 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4146 last_id = charset->id;
4147 last_offset = char_offset;
4148 }
4149 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4150 *charbuf++ = c;
4151 char_offset++;
4152 continue;
4153
4154 invalid_code:
4155 src = src_base;
4156 consumed_chars = consumed_chars_base;
4157 ONE_MORE_BYTE (c);
4158 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4159 char_offset++;
4160 coding->errors++;
4161 }
4162
4163 no_more_source:
4164 if (last_id != charset_ascii)
4165 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4166 coding->consumed_char += consumed_chars_base;
4167 coding->consumed = src_base - coding->source;
4168 coding->charbuf_used = charbuf - coding->charbuf;
4169 }
4170
4171 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4172 This function can encode charsets `ascii', `katakana-jisx0201',
4173 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4174 are sure that all these charsets are registered as official charset
4175 (i.e. do not have extended leading-codes). Characters of other
4176 charsets are produced without any encoding. If SJIS_P is 1, encode
4177 SJIS text, else encode BIG5 text. */
4178
4179 static int
4180 encode_coding_sjis (coding)
4181 struct coding_system *coding;
4182 {
4183 int multibytep = coding->dst_multibyte;
4184 int *charbuf = coding->charbuf;
4185 int *charbuf_end = charbuf + coding->charbuf_used;
4186 unsigned char *dst = coding->destination + coding->produced;
4187 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4188 int safe_room = 4;
4189 int produced_chars = 0;
4190 Lisp_Object attrs, charset_list, val;
4191 int ascii_compatible;
4192 struct charset *charset_roman, *charset_kanji, *charset_kana;
4193 struct charset *charset_kanji2;
4194 int c;
4195
4196 CODING_GET_INFO (coding, attrs, charset_list);
4197 val = charset_list;
4198 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4199 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4200 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4201 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4202
4203 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4204
4205 while (charbuf < charbuf_end)
4206 {
4207 ASSURE_DESTINATION (safe_room);
4208 c = *charbuf++;
4209 /* Now encode the character C. */
4210 if (ASCII_CHAR_P (c) && ascii_compatible)
4211 EMIT_ONE_ASCII_BYTE (c);
4212 else if (CHAR_BYTE8_P (c))
4213 {
4214 c = CHAR_TO_BYTE8 (c);
4215 EMIT_ONE_BYTE (c);
4216 }
4217 else
4218 {
4219 unsigned code;
4220 struct charset *charset = char_charset (c, charset_list, &code);
4221
4222 if (!charset)
4223 {
4224 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4225 {
4226 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4227 charset = CHARSET_FROM_ID (charset_ascii);
4228 }
4229 else
4230 {
4231 c = coding->default_char;
4232 charset = char_charset (c, charset_list, &code);
4233 }
4234 }
4235 if (code == CHARSET_INVALID_CODE (charset))
4236 abort ();
4237 if (charset == charset_kanji)
4238 {
4239 int c1, c2;
4240 JIS_TO_SJIS (code);
4241 c1 = code >> 8, c2 = code & 0xFF;
4242 EMIT_TWO_BYTES (c1, c2);
4243 }
4244 else if (charset == charset_kana)
4245 EMIT_ONE_BYTE (code | 0x80);
4246 else if (charset_kanji2 && charset == charset_kanji2)
4247 {
4248 int c1, c2;
4249
4250 c1 = code >> 8;
4251 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4252 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4253 {
4254 JIS_TO_SJIS2 (code);
4255 c1 = code >> 8, c2 = code & 0xFF;
4256 EMIT_TWO_BYTES (c1, c2);
4257 }
4258 else
4259 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4260 }
4261 else
4262 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4263 }
4264 }
4265 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4266 coding->produced_char += produced_chars;
4267 coding->produced = dst - coding->destination;
4268 return 0;
4269 }
4270
4271 static int
4272 encode_coding_big5 (coding)
4273 struct coding_system *coding;
4274 {
4275 int multibytep = coding->dst_multibyte;
4276 int *charbuf = coding->charbuf;
4277 int *charbuf_end = charbuf + coding->charbuf_used;
4278 unsigned char *dst = coding->destination + coding->produced;
4279 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4280 int safe_room = 4;
4281 int produced_chars = 0;
4282 Lisp_Object attrs, charset_list, val;
4283 int ascii_compatible;
4284 struct charset *charset_roman, *charset_big5;
4285 int c;
4286
4287 CODING_GET_INFO (coding, attrs, charset_list);
4288 val = charset_list;
4289 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4290 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4291 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4292
4293 while (charbuf < charbuf_end)
4294 {
4295 ASSURE_DESTINATION (safe_room);
4296 c = *charbuf++;
4297 /* Now encode the character C. */
4298 if (ASCII_CHAR_P (c) && ascii_compatible)
4299 EMIT_ONE_ASCII_BYTE (c);
4300 else if (CHAR_BYTE8_P (c))
4301 {
4302 c = CHAR_TO_BYTE8 (c);
4303 EMIT_ONE_BYTE (c);
4304 }
4305 else
4306 {
4307 unsigned code;
4308 struct charset *charset = char_charset (c, charset_list, &code);
4309
4310 if (! charset)
4311 {
4312 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4313 {
4314 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4315 charset = CHARSET_FROM_ID (charset_ascii);
4316 }
4317 else
4318 {
4319 c = coding->default_char;
4320 charset = char_charset (c, charset_list, &code);
4321 }
4322 }
4323 if (code == CHARSET_INVALID_CODE (charset))
4324 abort ();
4325 if (charset == charset_big5)
4326 {
4327 int c1, c2;
4328
4329 c1 = code >> 8, c2 = code & 0xFF;
4330 EMIT_TWO_BYTES (c1, c2);
4331 }
4332 else
4333 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4334 }
4335 }
4336 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4337 coding->produced_char += produced_chars;
4338 coding->produced = dst - coding->destination;
4339 return 0;
4340 }
4341
4342 \f
4343 /*** 10. CCL handlers ***/
4344
4345 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4346 Check if a text is encoded in a coding system of which
4347 encoder/decoder are written in CCL program. If it is, return
4348 CATEGORY_MASK_CCL, else return 0. */
4349
4350 static int
4351 detect_coding_ccl (coding, detect_info)
4352 struct coding_system *coding;
4353 struct coding_detection_info *detect_info;
4354 {
4355 const unsigned char *src = coding->source, *src_base;
4356 const unsigned char *src_end = coding->source + coding->src_bytes;
4357 int multibytep = coding->src_multibyte;
4358 int consumed_chars = 0;
4359 int found = 0;
4360 unsigned char *valids = CODING_CCL_VALIDS (coding);
4361 int head_ascii = coding->head_ascii;
4362 Lisp_Object attrs;
4363
4364 detect_info->checked |= CATEGORY_MASK_CCL;
4365
4366 coding = &coding_categories[coding_category_ccl];
4367 attrs = CODING_ID_ATTRS (coding->id);
4368 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4369 src += head_ascii;
4370
4371 while (1)
4372 {
4373 int c;
4374
4375 src_base = src;
4376 ONE_MORE_BYTE (c);
4377 if (c < 0 || ! valids[c])
4378 break;
4379 if ((valids[c] > 1))
4380 found = CATEGORY_MASK_CCL;
4381 }
4382 detect_info->rejected |= CATEGORY_MASK_CCL;
4383 return 0;
4384
4385 no_more_source:
4386 detect_info->found |= found;
4387 return 1;
4388 }
4389
4390 static void
4391 decode_coding_ccl (coding)
4392 struct coding_system *coding;
4393 {
4394 const unsigned char *src = coding->source + coding->consumed;
4395 const unsigned char *src_end = coding->source + coding->src_bytes;
4396 int *charbuf = coding->charbuf;
4397 int *charbuf_end = charbuf + coding->charbuf_size;
4398 int consumed_chars = 0;
4399 int multibytep = coding->src_multibyte;
4400 struct ccl_program ccl;
4401 int source_charbuf[1024];
4402 int source_byteidx[1024];
4403 Lisp_Object attrs, charset_list;
4404
4405 CODING_GET_INFO (coding, attrs, charset_list);
4406 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4407
4408 while (src < src_end)
4409 {
4410 const unsigned char *p = src;
4411 int *source, *source_end;
4412 int i = 0;
4413
4414 if (multibytep)
4415 while (i < 1024 && p < src_end)
4416 {
4417 source_byteidx[i] = p - src;
4418 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4419 }
4420 else
4421 while (i < 1024 && p < src_end)
4422 source_charbuf[i++] = *p++;
4423
4424 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4425 ccl.last_block = 1;
4426
4427 source = source_charbuf;
4428 source_end = source + i;
4429 while (source < source_end)
4430 {
4431 ccl_driver (&ccl, source, charbuf,
4432 source_end - source, charbuf_end - charbuf,
4433 charset_list);
4434 source += ccl.consumed;
4435 charbuf += ccl.produced;
4436 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4437 break;
4438 }
4439 if (source < source_end)
4440 src += source_byteidx[source - source_charbuf];
4441 else
4442 src = p;
4443 consumed_chars += source - source_charbuf;
4444
4445 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4446 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4447 break;
4448 }
4449
4450 switch (ccl.status)
4451 {
4452 case CCL_STAT_SUSPEND_BY_SRC:
4453 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4454 break;
4455 case CCL_STAT_SUSPEND_BY_DST:
4456 break;
4457 case CCL_STAT_QUIT:
4458 case CCL_STAT_INVALID_CMD:
4459 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4460 break;
4461 default:
4462 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4463 break;
4464 }
4465 coding->consumed_char += consumed_chars;
4466 coding->consumed = src - coding->source;
4467 coding->charbuf_used = charbuf - coding->charbuf;
4468 }
4469
4470 static int
4471 encode_coding_ccl (coding)
4472 struct coding_system *coding;
4473 {
4474 struct ccl_program ccl;
4475 int multibytep = coding->dst_multibyte;
4476 int *charbuf = coding->charbuf;
4477 int *charbuf_end = charbuf + coding->charbuf_used;
4478 unsigned char *dst = coding->destination + coding->produced;
4479 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4480 unsigned char *adjusted_dst_end = dst_end - 1;
4481 int destination_charbuf[1024];
4482 int i, produced_chars = 0;
4483 Lisp_Object attrs, charset_list;
4484
4485 CODING_GET_INFO (coding, attrs, charset_list);
4486 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4487
4488 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4489 ccl.dst_multibyte = coding->dst_multibyte;
4490
4491 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4492 {
4493 int dst_bytes = dst_end - dst;
4494 if (dst_bytes > 1024)
4495 dst_bytes = 1024;
4496
4497 ccl_driver (&ccl, charbuf, destination_charbuf,
4498 charbuf_end - charbuf, dst_bytes, charset_list);
4499 charbuf += ccl.consumed;
4500 if (multibytep)
4501 for (i = 0; i < ccl.produced; i++)
4502 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4503 else
4504 {
4505 for (i = 0; i < ccl.produced; i++)
4506 *dst++ = destination_charbuf[i] & 0xFF;
4507 produced_chars += ccl.produced;
4508 }
4509 }
4510
4511 switch (ccl.status)
4512 {
4513 case CCL_STAT_SUSPEND_BY_SRC:
4514 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4515 break;
4516 case CCL_STAT_SUSPEND_BY_DST:
4517 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4518 break;
4519 case CCL_STAT_QUIT:
4520 case CCL_STAT_INVALID_CMD:
4521 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4522 break;
4523 default:
4524 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4525 break;
4526 }
4527
4528 coding->produced_char += produced_chars;
4529 coding->produced = dst - coding->destination;
4530 return 0;
4531 }
4532
4533
4534 \f
4535 /*** 10, 11. no-conversion handlers ***/
4536
4537 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4538
4539 static void
4540 decode_coding_raw_text (coding)
4541 struct coding_system *coding;
4542 {
4543 coding->chars_at_source = 1;
4544 coding->consumed_char = 0;
4545 coding->consumed = 0;
4546 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4547 }
4548
4549 static int
4550 encode_coding_raw_text (coding)
4551 struct coding_system *coding;
4552 {
4553 int multibytep = coding->dst_multibyte;
4554 int *charbuf = coding->charbuf;
4555 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4556 unsigned char *dst = coding->destination + coding->produced;
4557 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4558 int produced_chars = 0;
4559 int c;
4560
4561 if (multibytep)
4562 {
4563 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4564
4565 if (coding->src_multibyte)
4566 while (charbuf < charbuf_end)
4567 {
4568 ASSURE_DESTINATION (safe_room);
4569 c = *charbuf++;
4570 if (ASCII_CHAR_P (c))
4571 EMIT_ONE_ASCII_BYTE (c);
4572 else if (CHAR_BYTE8_P (c))
4573 {
4574 c = CHAR_TO_BYTE8 (c);
4575 EMIT_ONE_BYTE (c);
4576 }
4577 else
4578 {
4579 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4580
4581 CHAR_STRING_ADVANCE (c, p1);
4582 while (p0 < p1)
4583 {
4584 EMIT_ONE_BYTE (*p0);
4585 p0++;
4586 }
4587 }
4588 }
4589 else
4590 while (charbuf < charbuf_end)
4591 {
4592 ASSURE_DESTINATION (safe_room);
4593 c = *charbuf++;
4594 EMIT_ONE_BYTE (c);
4595 }
4596 }
4597 else
4598 {
4599 if (coding->src_multibyte)
4600 {
4601 int safe_room = MAX_MULTIBYTE_LENGTH;
4602
4603 while (charbuf < charbuf_end)
4604 {
4605 ASSURE_DESTINATION (safe_room);
4606 c = *charbuf++;
4607 if (ASCII_CHAR_P (c))
4608 *dst++ = c;
4609 else if (CHAR_BYTE8_P (c))
4610 *dst++ = CHAR_TO_BYTE8 (c);
4611 else
4612 CHAR_STRING_ADVANCE (c, dst);
4613 produced_chars++;
4614 }
4615 }
4616 else
4617 {
4618 ASSURE_DESTINATION (charbuf_end - charbuf);
4619 while (charbuf < charbuf_end && dst < dst_end)
4620 *dst++ = *charbuf++;
4621 produced_chars = dst - (coding->destination + coding->dst_bytes);
4622 }
4623 }
4624 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4625 coding->produced_char += produced_chars;
4626 coding->produced = dst - coding->destination;
4627 return 0;
4628 }
4629
4630 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4631 Check if a text is encoded in a charset-based coding system. If it
4632 is, return 1, else return 0. */
4633
4634 static int
4635 detect_coding_charset (coding, detect_info)
4636 struct coding_system *coding;
4637 struct coding_detection_info *detect_info;
4638 {
4639 const unsigned char *src = coding->source, *src_base;
4640 const unsigned char *src_end = coding->source + coding->src_bytes;
4641 int multibytep = coding->src_multibyte;
4642 int consumed_chars = 0;
4643 Lisp_Object attrs, valids;
4644 int found = 0;
4645
4646 detect_info->checked |= CATEGORY_MASK_CHARSET;
4647
4648 coding = &coding_categories[coding_category_charset];
4649 attrs = CODING_ID_ATTRS (coding->id);
4650 valids = AREF (attrs, coding_attr_charset_valids);
4651
4652 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4653 src += coding->head_ascii;
4654
4655 while (1)
4656 {
4657 int c;
4658
4659 src_base = src;
4660 ONE_MORE_BYTE (c);
4661 if (c < 0)
4662 continue;
4663 if (NILP (AREF (valids, c)))
4664 break;
4665 if (c >= 0x80)
4666 found = CATEGORY_MASK_CHARSET;
4667 }
4668 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4669 return 0;
4670
4671 no_more_source:
4672 detect_info->found |= found;
4673 return 1;
4674 }
4675
4676 static void
4677 decode_coding_charset (coding)
4678 struct coding_system *coding;
4679 {
4680 const unsigned char *src = coding->source + coding->consumed;
4681 const unsigned char *src_end = coding->source + coding->src_bytes;
4682 const unsigned char *src_base;
4683 int *charbuf = coding->charbuf;
4684 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4685 int consumed_chars = 0, consumed_chars_base;
4686 int multibytep = coding->src_multibyte;
4687 Lisp_Object attrs, charset_list, valids;
4688 int char_offset = coding->produced_char;
4689 int last_offset = char_offset;
4690 int last_id = charset_ascii;
4691
4692 CODING_GET_INFO (coding, attrs, charset_list);
4693 valids = AREF (attrs, coding_attr_charset_valids);
4694
4695 while (1)
4696 {
4697 int c;
4698 Lisp_Object val;
4699 struct charset *charset;
4700 int dim;
4701 int len = 1;
4702 unsigned code;
4703
4704 src_base = src;
4705 consumed_chars_base = consumed_chars;
4706
4707 if (charbuf >= charbuf_end)
4708 break;
4709
4710 ONE_MORE_BYTE (c);
4711 if (c < 0)
4712 goto invalid_code;
4713 code = c;
4714
4715 val = AREF (valids, c);
4716 if (NILP (val))
4717 goto invalid_code;
4718 if (INTEGERP (val))
4719 {
4720 charset = CHARSET_FROM_ID (XFASTINT (val));
4721 dim = CHARSET_DIMENSION (charset);
4722 while (len < dim)
4723 {
4724 ONE_MORE_BYTE (c);
4725 code = (code << 8) | c;
4726 len++;
4727 }
4728 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4729 charset, code, c);
4730 }
4731 else
4732 {
4733 /* VAL is a list of charset IDs. It is assured that the
4734 list is sorted by charset dimensions (smaller one
4735 comes first). */
4736 while (CONSP (val))
4737 {
4738 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4739 dim = CHARSET_DIMENSION (charset);
4740 while (len < dim)
4741 {
4742 ONE_MORE_BYTE (c);
4743 code = (code << 8) | c;
4744 len++;
4745 }
4746 CODING_DECODE_CHAR (coding, src, src_base,
4747 src_end, charset, code, c);
4748 if (c >= 0)
4749 break;
4750 val = XCDR (val);
4751 }
4752 }
4753 if (c < 0)
4754 goto invalid_code;
4755 if (charset->id != charset_ascii
4756 && last_id != charset->id)
4757 {
4758 if (last_id != charset_ascii)
4759 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4760 last_id = charset->id;
4761 last_offset = char_offset;
4762 }
4763
4764 *charbuf++ = c;
4765 char_offset++;
4766 continue;
4767
4768 invalid_code:
4769 src = src_base;
4770 consumed_chars = consumed_chars_base;
4771 ONE_MORE_BYTE (c);
4772 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4773 char_offset++;
4774 coding->errors++;
4775 }
4776
4777 no_more_source:
4778 if (last_id != charset_ascii)
4779 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4780 coding->consumed_char += consumed_chars_base;
4781 coding->consumed = src_base - coding->source;
4782 coding->charbuf_used = charbuf - coding->charbuf;
4783 }
4784
4785 static int
4786 encode_coding_charset (coding)
4787 struct coding_system *coding;
4788 {
4789 int multibytep = coding->dst_multibyte;
4790 int *charbuf = coding->charbuf;
4791 int *charbuf_end = charbuf + coding->charbuf_used;
4792 unsigned char *dst = coding->destination + coding->produced;
4793 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4794 int safe_room = MAX_MULTIBYTE_LENGTH;
4795 int produced_chars = 0;
4796 Lisp_Object attrs, charset_list;
4797 int ascii_compatible;
4798 int c;
4799
4800 CODING_GET_INFO (coding, attrs, charset_list);
4801 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4802
4803 while (charbuf < charbuf_end)
4804 {
4805 struct charset *charset;
4806 unsigned code;
4807
4808 ASSURE_DESTINATION (safe_room);
4809 c = *charbuf++;
4810 if (ascii_compatible && ASCII_CHAR_P (c))
4811 EMIT_ONE_ASCII_BYTE (c);
4812 else if (CHAR_BYTE8_P (c))
4813 {
4814 c = CHAR_TO_BYTE8 (c);
4815 EMIT_ONE_BYTE (c);
4816 }
4817 else
4818 {
4819 charset = char_charset (c, charset_list, &code);
4820 if (charset)
4821 {
4822 if (CHARSET_DIMENSION (charset) == 1)
4823 EMIT_ONE_BYTE (code);
4824 else if (CHARSET_DIMENSION (charset) == 2)
4825 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4826 else if (CHARSET_DIMENSION (charset) == 3)
4827 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4828 else
4829 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4830 (code >> 8) & 0xFF, code & 0xFF);
4831 }
4832 else
4833 {
4834 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4835 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4836 else
4837 c = coding->default_char;
4838 EMIT_ONE_BYTE (c);
4839 }
4840 }
4841 }
4842
4843 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4844 coding->produced_char += produced_chars;
4845 coding->produced = dst - coding->destination;
4846 return 0;
4847 }
4848
4849 \f
4850 /*** 7. C library functions ***/
4851
4852 /* Setup coding context CODING from information about CODING_SYSTEM.
4853 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4854 CODING_SYSTEM is invalid, signal an error. */
4855
4856 void
4857 setup_coding_system (coding_system, coding)
4858 Lisp_Object coding_system;
4859 struct coding_system *coding;
4860 {
4861 Lisp_Object attrs;
4862 Lisp_Object eol_type;
4863 Lisp_Object coding_type;
4864 Lisp_Object val;
4865
4866 if (NILP (coding_system))
4867 coding_system = Qno_conversion;
4868
4869 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4870
4871 attrs = CODING_ID_ATTRS (coding->id);
4872 eol_type = CODING_ID_EOL_TYPE (coding->id);
4873
4874 coding->mode = 0;
4875 coding->head_ascii = -1;
4876 coding->common_flags
4877 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4878 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4879 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4880 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4881 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4882 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4883 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4884
4885 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4886 coding->max_charset_id = SCHARS (val) - 1;
4887 coding->safe_charsets = (char *) SDATA (val);
4888 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4889
4890 coding_type = CODING_ATTR_TYPE (attrs);
4891 if (EQ (coding_type, Qundecided))
4892 {
4893 coding->detector = NULL;
4894 coding->decoder = decode_coding_raw_text;
4895 coding->encoder = encode_coding_raw_text;
4896 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4897 }
4898 else if (EQ (coding_type, Qiso_2022))
4899 {
4900 int i;
4901 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4902
4903 /* Invoke graphic register 0 to plane 0. */
4904 CODING_ISO_INVOCATION (coding, 0) = 0;
4905 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4906 CODING_ISO_INVOCATION (coding, 1)
4907 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4908 /* Setup the initial status of designation. */
4909 for (i = 0; i < 4; i++)
4910 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4911 /* Not single shifting initially. */
4912 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4913 /* Beginning of buffer should also be regarded as bol. */
4914 CODING_ISO_BOL (coding) = 1;
4915 coding->detector = detect_coding_iso_2022;
4916 coding->decoder = decode_coding_iso_2022;
4917 coding->encoder = encode_coding_iso_2022;
4918 if (flags & CODING_ISO_FLAG_SAFE)
4919 coding->mode |= CODING_MODE_SAFE_ENCODING;
4920 coding->common_flags
4921 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4922 | CODING_REQUIRE_FLUSHING_MASK);
4923 if (flags & CODING_ISO_FLAG_COMPOSITION)
4924 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4925 if (flags & CODING_ISO_FLAG_DESIGNATION)
4926 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4927 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4928 {
4929 setup_iso_safe_charsets (attrs);
4930 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4931 coding->max_charset_id = SCHARS (val) - 1;
4932 coding->safe_charsets = (char *) SDATA (val);
4933 }
4934 CODING_ISO_FLAGS (coding) = flags;
4935 }
4936 else if (EQ (coding_type, Qcharset))
4937 {
4938 coding->detector = detect_coding_charset;
4939 coding->decoder = decode_coding_charset;
4940 coding->encoder = encode_coding_charset;
4941 coding->common_flags
4942 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4943 }
4944 else if (EQ (coding_type, Qutf_8))
4945 {
4946 coding->detector = detect_coding_utf_8;
4947 coding->decoder = decode_coding_utf_8;
4948 coding->encoder = encode_coding_utf_8;
4949 coding->common_flags
4950 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4951 }
4952 else if (EQ (coding_type, Qutf_16))
4953 {
4954 val = AREF (attrs, coding_attr_utf_16_bom);
4955 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4956 : EQ (val, Qt) ? utf_16_with_bom
4957 : utf_16_without_bom);
4958 val = AREF (attrs, coding_attr_utf_16_endian);
4959 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4960 : utf_16_little_endian);
4961 CODING_UTF_16_SURROGATE (coding) = 0;
4962 coding->detector = detect_coding_utf_16;
4963 coding->decoder = decode_coding_utf_16;
4964 coding->encoder = encode_coding_utf_16;
4965 coding->common_flags
4966 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4967 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4968 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4969 }
4970 else if (EQ (coding_type, Qccl))
4971 {
4972 coding->detector = detect_coding_ccl;
4973 coding->decoder = decode_coding_ccl;
4974 coding->encoder = encode_coding_ccl;
4975 coding->common_flags
4976 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4977 | CODING_REQUIRE_FLUSHING_MASK);
4978 }
4979 else if (EQ (coding_type, Qemacs_mule))
4980 {
4981 coding->detector = detect_coding_emacs_mule;
4982 coding->decoder = decode_coding_emacs_mule;
4983 coding->encoder = encode_coding_emacs_mule;
4984 coding->common_flags
4985 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4986 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4987 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4988 {
4989 Lisp_Object tail, safe_charsets;
4990 int max_charset_id = 0;
4991
4992 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4993 tail = XCDR (tail))
4994 if (max_charset_id < XFASTINT (XCAR (tail)))
4995 max_charset_id = XFASTINT (XCAR (tail));
4996 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4997 make_number (255));
4998 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4999 tail = XCDR (tail))
5000 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5001 coding->max_charset_id = max_charset_id;
5002 coding->safe_charsets = (char *) SDATA (safe_charsets);
5003 }
5004 }
5005 else if (EQ (coding_type, Qshift_jis))
5006 {
5007 coding->detector = detect_coding_sjis;
5008 coding->decoder = decode_coding_sjis;
5009 coding->encoder = encode_coding_sjis;
5010 coding->common_flags
5011 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5012 }
5013 else if (EQ (coding_type, Qbig5))
5014 {
5015 coding->detector = detect_coding_big5;
5016 coding->decoder = decode_coding_big5;
5017 coding->encoder = encode_coding_big5;
5018 coding->common_flags
5019 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5020 }
5021 else /* EQ (coding_type, Qraw_text) */
5022 {
5023 coding->detector = NULL;
5024 coding->decoder = decode_coding_raw_text;
5025 coding->encoder = encode_coding_raw_text;
5026 }
5027
5028 return;
5029 }
5030
5031 /* Return raw-text or one of its subsidiaries that has the same
5032 eol_type as CODING-SYSTEM. */
5033
5034 Lisp_Object
5035 raw_text_coding_system (coding_system)
5036 Lisp_Object coding_system;
5037 {
5038 Lisp_Object spec, attrs;
5039 Lisp_Object eol_type, raw_text_eol_type;
5040
5041 if (NILP (coding_system))
5042 return Qraw_text;
5043 spec = CODING_SYSTEM_SPEC (coding_system);
5044 attrs = AREF (spec, 0);
5045
5046 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5047 return coding_system;
5048
5049 eol_type = AREF (spec, 2);
5050 if (VECTORP (eol_type))
5051 return Qraw_text;
5052 spec = CODING_SYSTEM_SPEC (Qraw_text);
5053 raw_text_eol_type = AREF (spec, 2);
5054 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5055 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5056 : AREF (raw_text_eol_type, 2));
5057 }
5058
5059
5060 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5061 does, return one of the subsidiary that has the same eol-spec as
5062 PARENT. Otherwise, return CODING_SYSTEM. */
5063
5064 Lisp_Object
5065 coding_inherit_eol_type (coding_system, parent)
5066 Lisp_Object coding_system, parent;
5067 {
5068 Lisp_Object spec, eol_type;
5069
5070 if (NILP (coding_system))
5071 coding_system = Qraw_text;
5072 spec = CODING_SYSTEM_SPEC (coding_system);
5073 eol_type = AREF (spec, 2);
5074 if (VECTORP (eol_type)
5075 && ! NILP (parent))
5076 {
5077 Lisp_Object parent_spec;
5078 Lisp_Object parent_eol_type;
5079
5080 parent_spec
5081 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5082 parent_eol_type = AREF (parent_spec, 2);
5083 if (EQ (parent_eol_type, Qunix))
5084 coding_system = AREF (eol_type, 0);
5085 else if (EQ (parent_eol_type, Qdos))
5086 coding_system = AREF (eol_type, 1);
5087 else if (EQ (parent_eol_type, Qmac))
5088 coding_system = AREF (eol_type, 2);
5089 }
5090 return coding_system;
5091 }
5092
5093 /* Emacs has a mechanism to automatically detect a coding system if it
5094 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5095 it's impossible to distinguish some coding systems accurately
5096 because they use the same range of codes. So, at first, coding
5097 systems are categorized into 7, those are:
5098
5099 o coding-category-emacs-mule
5100
5101 The category for a coding system which has the same code range
5102 as Emacs' internal format. Assigned the coding-system (Lisp
5103 symbol) `emacs-mule' by default.
5104
5105 o coding-category-sjis
5106
5107 The category for a coding system which has the same code range
5108 as SJIS. Assigned the coding-system (Lisp
5109 symbol) `japanese-shift-jis' by default.
5110
5111 o coding-category-iso-7
5112
5113 The category for a coding system which has the same code range
5114 as ISO2022 of 7-bit environment. This doesn't use any locking
5115 shift and single shift functions. This can encode/decode all
5116 charsets. Assigned the coding-system (Lisp symbol)
5117 `iso-2022-7bit' by default.
5118
5119 o coding-category-iso-7-tight
5120
5121 Same as coding-category-iso-7 except that this can
5122 encode/decode only the specified charsets.
5123
5124 o coding-category-iso-8-1
5125
5126 The category for a coding system which has the same code range
5127 as ISO2022 of 8-bit environment and graphic plane 1 used only
5128 for DIMENSION1 charset. This doesn't use any locking shift
5129 and single shift functions. Assigned the coding-system (Lisp
5130 symbol) `iso-latin-1' by default.
5131
5132 o coding-category-iso-8-2
5133
5134 The category for a coding system which has the same code range
5135 as ISO2022 of 8-bit environment and graphic plane 1 used only
5136 for DIMENSION2 charset. This doesn't use any locking shift
5137 and single shift functions. Assigned the coding-system (Lisp
5138 symbol) `japanese-iso-8bit' by default.
5139
5140 o coding-category-iso-7-else
5141
5142 The category for a coding system which has the same code range
5143 as ISO2022 of 7-bit environemnt but uses locking shift or
5144 single shift functions. Assigned the coding-system (Lisp
5145 symbol) `iso-2022-7bit-lock' by default.
5146
5147 o coding-category-iso-8-else
5148
5149 The category for a coding system which has the same code range
5150 as ISO2022 of 8-bit environemnt but uses locking shift or
5151 single shift functions. Assigned the coding-system (Lisp
5152 symbol) `iso-2022-8bit-ss2' by default.
5153
5154 o coding-category-big5
5155
5156 The category for a coding system which has the same code range
5157 as BIG5. Assigned the coding-system (Lisp symbol)
5158 `cn-big5' by default.
5159
5160 o coding-category-utf-8
5161
5162 The category for a coding system which has the same code range
5163 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5164 symbol) `utf-8' by default.
5165
5166 o coding-category-utf-16-be
5167
5168 The category for a coding system in which a text has an
5169 Unicode signature (cf. Unicode Standard) in the order of BIG
5170 endian at the head. Assigned the coding-system (Lisp symbol)
5171 `utf-16-be' by default.
5172
5173 o coding-category-utf-16-le
5174
5175 The category for a coding system in which a text has an
5176 Unicode signature (cf. Unicode Standard) in the order of
5177 LITTLE endian at the head. Assigned the coding-system (Lisp
5178 symbol) `utf-16-le' by default.
5179
5180 o coding-category-ccl
5181
5182 The category for a coding system of which encoder/decoder is
5183 written in CCL programs. The default value is nil, i.e., no
5184 coding system is assigned.
5185
5186 o coding-category-binary
5187
5188 The category for a coding system not categorized in any of the
5189 above. Assigned the coding-system (Lisp symbol)
5190 `no-conversion' by default.
5191
5192 Each of them is a Lisp symbol and the value is an actual
5193 `coding-system's (this is also a Lisp symbol) assigned by a user.
5194 What Emacs does actually is to detect a category of coding system.
5195 Then, it uses a `coding-system' assigned to it. If Emacs can't
5196 decide only one possible category, it selects a category of the
5197 highest priority. Priorities of categories are also specified by a
5198 user in a Lisp variable `coding-category-list'.
5199
5200 */
5201
5202 #define EOL_SEEN_NONE 0
5203 #define EOL_SEEN_LF 1
5204 #define EOL_SEEN_CR 2
5205 #define EOL_SEEN_CRLF 4
5206
5207 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5208 SOURCE is encoded. If CATEGORY is one of
5209 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5210 two-byte, else they are encoded by one-byte.
5211
5212 Return one of EOL_SEEN_XXX. */
5213
5214 #define MAX_EOL_CHECK_COUNT 3
5215
5216 static int
5217 detect_eol (source, src_bytes, category)
5218 unsigned char *source;
5219 EMACS_INT src_bytes;
5220 enum coding_category category;
5221 {
5222 unsigned char *src = source, *src_end = src + src_bytes;
5223 unsigned char c;
5224 int total = 0;
5225 int eol_seen = EOL_SEEN_NONE;
5226
5227 if ((1 << category) & CATEGORY_MASK_UTF_16)
5228 {
5229 int msb, lsb;
5230
5231 msb = category == (coding_category_utf_16_le
5232 | coding_category_utf_16_le_nosig);
5233 lsb = 1 - msb;
5234
5235 while (src + 1 < src_end)
5236 {
5237 c = src[lsb];
5238 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5239 {
5240 int this_eol;
5241
5242 if (c == '\n')
5243 this_eol = EOL_SEEN_LF;
5244 else if (src + 3 >= src_end
5245 || src[msb + 2] != 0
5246 || src[lsb + 2] != '\n')
5247 this_eol = EOL_SEEN_CR;
5248 else
5249 this_eol = EOL_SEEN_CRLF;
5250
5251 if (eol_seen == EOL_SEEN_NONE)
5252 /* This is the first end-of-line. */
5253 eol_seen = this_eol;
5254 else if (eol_seen != this_eol)
5255 {
5256 /* The found type is different from what found before. */
5257 eol_seen = EOL_SEEN_LF;
5258 break;
5259 }
5260 if (++total == MAX_EOL_CHECK_COUNT)
5261 break;
5262 }
5263 src += 2;
5264 }
5265 }
5266 else
5267 {
5268 while (src < src_end)
5269 {
5270 c = *src++;
5271 if (c == '\n' || c == '\r')
5272 {
5273 int this_eol;
5274
5275 if (c == '\n')
5276 this_eol = EOL_SEEN_LF;
5277 else if (src >= src_end || *src != '\n')
5278 this_eol = EOL_SEEN_CR;
5279 else
5280 this_eol = EOL_SEEN_CRLF, src++;
5281
5282 if (eol_seen == EOL_SEEN_NONE)
5283 /* This is the first end-of-line. */
5284 eol_seen = this_eol;
5285 else if (eol_seen != this_eol)
5286 {
5287 /* The found type is different from what found before. */
5288 eol_seen = EOL_SEEN_LF;
5289 break;
5290 }
5291 if (++total == MAX_EOL_CHECK_COUNT)
5292 break;
5293 }
5294 }
5295 }
5296 return eol_seen;
5297 }
5298
5299
5300 static Lisp_Object
5301 adjust_coding_eol_type (coding, eol_seen)
5302 struct coding_system *coding;
5303 int eol_seen;
5304 {
5305 Lisp_Object eol_type;
5306
5307 eol_type = CODING_ID_EOL_TYPE (coding->id);
5308 if (eol_seen & EOL_SEEN_LF)
5309 {
5310 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5311 eol_type = Qunix;
5312 }
5313 else if (eol_seen & EOL_SEEN_CRLF)
5314 {
5315 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5316 eol_type = Qdos;
5317 }
5318 else if (eol_seen & EOL_SEEN_CR)
5319 {
5320 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5321 eol_type = Qmac;
5322 }
5323 return eol_type;
5324 }
5325
5326 /* Detect how a text specified in CODING is encoded. If a coding
5327 system is detected, update fields of CODING by the detected coding
5328 system. */
5329
5330 void
5331 detect_coding (coding)
5332 struct coding_system *coding;
5333 {
5334 const unsigned char *src, *src_end;
5335 Lisp_Object attrs, coding_type;
5336
5337 coding->consumed = coding->consumed_char = 0;
5338 coding->produced = coding->produced_char = 0;
5339 coding_set_source (coding);
5340
5341 src_end = coding->source + coding->src_bytes;
5342
5343 /* If we have not yet decided the text encoding type, detect it
5344 now. */
5345 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5346 {
5347 int c, i;
5348
5349 for (i = 0, src = coding->source; src < src_end; i++, src++)
5350 {
5351 c = *src;
5352 if (c & 0x80 || (c < 0x20 && (c == 0
5353 || c == ISO_CODE_ESC
5354 || c == ISO_CODE_SI
5355 || c == ISO_CODE_SO)))
5356 break;
5357 }
5358 coding->head_ascii = src - (coding->source + coding->consumed);
5359
5360 if (coding->head_ascii < coding->src_bytes)
5361 {
5362 struct coding_detection_info detect_info;
5363 enum coding_category category;
5364 struct coding_system *this;
5365
5366 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5367 for (i = 0; i < coding_category_raw_text; i++)
5368 {
5369 category = coding_priorities[i];
5370 this = coding_categories + category;
5371 if (this->id < 0)
5372 {
5373 /* No coding system of this category is defined. */
5374 detect_info.rejected |= (1 << category);
5375 }
5376 else if (category >= coding_category_raw_text)
5377 continue;
5378 else if (detect_info.checked & (1 << category))
5379 {
5380 if (detect_info.found & (1 << category))
5381 break;
5382 }
5383 else if ((*(this->detector)) (coding, &detect_info)
5384 && detect_info.found & (1 << category))
5385 {
5386 if (category == coding_category_utf_16_auto)
5387 {
5388 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5389 category = coding_category_utf_16_le;
5390 else
5391 category = coding_category_utf_16_be;
5392 }
5393 break;
5394 }
5395 }
5396 if (i < coding_category_raw_text)
5397 setup_coding_system (CODING_ID_NAME (this->id), coding);
5398 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5399 setup_coding_system (Qraw_text, coding);
5400 else if (detect_info.rejected)
5401 for (i = 0; i < coding_category_raw_text; i++)
5402 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5403 {
5404 this = coding_categories + coding_priorities[i];
5405 setup_coding_system (CODING_ID_NAME (this->id), coding);
5406 break;
5407 }
5408 }
5409 }
5410 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5411 == coding_category_utf_16_auto)
5412 {
5413 Lisp_Object coding_systems;
5414 struct coding_detection_info detect_info;
5415
5416 coding_systems
5417 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5418 detect_info.found = detect_info.rejected = 0;
5419 if (CONSP (coding_systems)
5420 && detect_coding_utf_16 (coding, &detect_info))
5421 {
5422 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5423 setup_coding_system (XCAR (coding_systems), coding);
5424 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5425 setup_coding_system (XCDR (coding_systems), coding);
5426 }
5427 }
5428 }
5429
5430
5431 static void
5432 decode_eol (coding)
5433 struct coding_system *coding;
5434 {
5435 Lisp_Object eol_type;
5436 unsigned char *p, *pbeg, *pend;
5437
5438 eol_type = CODING_ID_EOL_TYPE (coding->id);
5439 if (EQ (eol_type, Qunix))
5440 return;
5441
5442 if (NILP (coding->dst_object))
5443 pbeg = coding->destination;
5444 else
5445 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5446 pend = pbeg + coding->produced;
5447
5448 if (VECTORP (eol_type))
5449 {
5450 int eol_seen = EOL_SEEN_NONE;
5451
5452 for (p = pbeg; p < pend; p++)
5453 {
5454 if (*p == '\n')
5455 eol_seen |= EOL_SEEN_LF;
5456 else if (*p == '\r')
5457 {
5458 if (p + 1 < pend && *(p + 1) == '\n')
5459 {
5460 eol_seen |= EOL_SEEN_CRLF;
5461 p++;
5462 }
5463 else
5464 eol_seen |= EOL_SEEN_CR;
5465 }
5466 }
5467 if (eol_seen != EOL_SEEN_NONE
5468 && eol_seen != EOL_SEEN_LF
5469 && eol_seen != EOL_SEEN_CRLF
5470 && eol_seen != EOL_SEEN_CR)
5471 eol_seen = EOL_SEEN_LF;
5472 if (eol_seen != EOL_SEEN_NONE)
5473 eol_type = adjust_coding_eol_type (coding, eol_seen);
5474 }
5475
5476 if (EQ (eol_type, Qmac))
5477 {
5478 for (p = pbeg; p < pend; p++)
5479 if (*p == '\r')
5480 *p = '\n';
5481 }
5482 else if (EQ (eol_type, Qdos))
5483 {
5484 int n = 0;
5485
5486 if (NILP (coding->dst_object))
5487 {
5488 for (p = pend - 2; p >= pbeg; p--)
5489 if (*p == '\r')
5490 {
5491 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5492 n++;
5493 }
5494 }
5495 else
5496 {
5497 for (p = pend - 2; p >= pbeg; p--)
5498 if (*p == '\r')
5499 {
5500 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5501 int pos = BYTE_TO_CHAR (pos_byte);
5502
5503 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5504 n++;
5505 }
5506 }
5507 coding->produced -= n;
5508 coding->produced_char -= n;
5509 }
5510 }
5511
5512
5513 /* Return a translation table (or list of them) from coding system
5514 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5515 decoding (ENCODEP is zero). */
5516
5517 static INLINE
5518 get_translation_table (attrs, encodep)
5519 {
5520 Lisp_Object standard, translation_table;
5521
5522 if (encodep)
5523 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5524 standard = Vstandard_translation_table_for_encode;
5525 else
5526 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5527 standard = Vstandard_translation_table_for_decode;
5528 if (NILP (translation_table))
5529 return standard;
5530 if (SYMBOLP (translation_table))
5531 translation_table = Fget (translation_table, Qtranslation_table);
5532 else if (CONSP (translation_table))
5533 {
5534 Lisp_Object val;
5535
5536 translation_table = Fcopy_sequence (translation_table);
5537 for (val = translation_table; CONSP (val); val = XCDR (val))
5538 if (SYMBOLP (XCAR (val)))
5539 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5540 }
5541 if (! NILP (standard))
5542 {
5543 if (CONSP (translation_table))
5544 translation_table = nconc2 (translation_table, Fcons (standard, Qnil));
5545 else
5546 translation_table = Fcons (translation_table, Fcons (standard, Qnil));
5547 }
5548 return translation_table;
5549 }
5550
5551
5552 static void
5553 translate_chars (coding, table)
5554 struct coding_system *coding;
5555 Lisp_Object table;
5556 {
5557 int *charbuf = coding->charbuf;
5558 int *charbuf_end = charbuf + coding->charbuf_used;
5559 int c;
5560
5561 if (coding->chars_at_source)
5562 return;
5563
5564 while (charbuf < charbuf_end)
5565 {
5566 c = *charbuf;
5567 if (c < 0)
5568 charbuf += -c;
5569 else
5570 *charbuf++ = translate_char (table, c);
5571 }
5572 }
5573
5574 static int
5575 produce_chars (coding)
5576 struct coding_system *coding;
5577 {
5578 unsigned char *dst = coding->destination + coding->produced;
5579 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5580 int produced;
5581 int produced_chars = 0;
5582
5583 if (! coding->chars_at_source)
5584 {
5585 /* Characters are in coding->charbuf. */
5586 int *buf = coding->charbuf;
5587 int *buf_end = buf + coding->charbuf_used;
5588 unsigned char *adjusted_dst_end;
5589
5590 if (BUFFERP (coding->src_object)
5591 && EQ (coding->src_object, coding->dst_object))
5592 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5593 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5594
5595 while (buf < buf_end)
5596 {
5597 int c = *buf++;
5598
5599 if (dst >= adjusted_dst_end)
5600 {
5601 dst = alloc_destination (coding,
5602 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5603 dst);
5604 dst_end = coding->destination + coding->dst_bytes;
5605 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5606 }
5607 if (c >= 0)
5608 {
5609 if (coding->dst_multibyte
5610 || ! CHAR_BYTE8_P (c))
5611 CHAR_STRING_ADVANCE (c, dst);
5612 else
5613 *dst++ = CHAR_TO_BYTE8 (c);
5614 produced_chars++;
5615 }
5616 else
5617 /* This is an annotation datum. (-C) is the length of
5618 it. */
5619 buf += -c - 1;
5620 }
5621 }
5622 else
5623 {
5624 const unsigned char *src = coding->source;
5625 const unsigned char *src_end = src + coding->src_bytes;
5626 Lisp_Object eol_type;
5627
5628 eol_type = CODING_ID_EOL_TYPE (coding->id);
5629
5630 if (coding->src_multibyte != coding->dst_multibyte)
5631 {
5632 if (coding->src_multibyte)
5633 {
5634 int multibytep = 1;
5635 int consumed_chars;
5636
5637 while (1)
5638 {
5639 const unsigned char *src_base = src;
5640 int c;
5641
5642 ONE_MORE_BYTE (c);
5643 if (c == '\r')
5644 {
5645 if (EQ (eol_type, Qdos))
5646 {
5647 if (src == src_end)
5648 {
5649 record_conversion_result
5650 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5651 goto no_more_source;
5652 }
5653 if (*src == '\n')
5654 c = *src++;
5655 }
5656 else if (EQ (eol_type, Qmac))
5657 c = '\n';
5658 }
5659 if (dst == dst_end)
5660 {
5661 coding->consumed = src - coding->source;
5662
5663 if (EQ (coding->src_object, coding->dst_object))
5664 dst_end = (unsigned char *) src;
5665 if (dst == dst_end)
5666 {
5667 dst = alloc_destination (coding, src_end - src + 1,
5668 dst);
5669 dst_end = coding->destination + coding->dst_bytes;
5670 coding_set_source (coding);
5671 src = coding->source + coding->consumed;
5672 src_end = coding->source + coding->src_bytes;
5673 }
5674 }
5675 *dst++ = c;
5676 produced_chars++;
5677 }
5678 no_more_source:
5679 ;
5680 }
5681 else
5682 while (src < src_end)
5683 {
5684 int multibytep = 1;
5685 int c = *src++;
5686
5687 if (c == '\r')
5688 {
5689 if (EQ (eol_type, Qdos))
5690 {
5691 if (src < src_end
5692 && *src == '\n')
5693 c = *src++;
5694 }
5695 else if (EQ (eol_type, Qmac))
5696 c = '\n';
5697 }
5698 if (dst >= dst_end - 1)
5699 {
5700 coding->consumed = src - coding->source;
5701
5702 if (EQ (coding->src_object, coding->dst_object))
5703 dst_end = (unsigned char *) src;
5704 if (dst >= dst_end - 1)
5705 {
5706 dst = alloc_destination (coding, src_end - src + 2,
5707 dst);
5708 dst_end = coding->destination + coding->dst_bytes;
5709 coding_set_source (coding);
5710 src = coding->source + coding->consumed;
5711 src_end = coding->source + coding->src_bytes;
5712 }
5713 }
5714 EMIT_ONE_BYTE (c);
5715 }
5716 }
5717 else
5718 {
5719 if (!EQ (coding->src_object, coding->dst_object))
5720 {
5721 int require = coding->src_bytes - coding->dst_bytes;
5722
5723 if (require > 0)
5724 {
5725 EMACS_INT offset = src - coding->source;
5726
5727 dst = alloc_destination (coding, require, dst);
5728 coding_set_source (coding);
5729 src = coding->source + offset;
5730 src_end = coding->source + coding->src_bytes;
5731 }
5732 }
5733 produced_chars = coding->src_chars;
5734 while (src < src_end)
5735 {
5736 int c = *src++;
5737
5738 if (c == '\r')
5739 {
5740 if (EQ (eol_type, Qdos))
5741 {
5742 if (src < src_end
5743 && *src == '\n')
5744 c = *src++;
5745 produced_chars--;
5746 }
5747 else if (EQ (eol_type, Qmac))
5748 c = '\n';
5749 }
5750 *dst++ = c;
5751 }
5752 }
5753 coding->consumed = coding->src_bytes;
5754 coding->consumed_char = coding->src_chars;
5755 }
5756
5757 produced = dst - (coding->destination + coding->produced);
5758 if (BUFFERP (coding->dst_object))
5759 insert_from_gap (produced_chars, produced);
5760 coding->produced += produced;
5761 coding->produced_char += produced_chars;
5762 return produced_chars;
5763 }
5764
5765 /* Compose text in CODING->object according to the annotation data at
5766 CHARBUF. CHARBUF is an array:
5767 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5768 */
5769
5770 static INLINE void
5771 produce_composition (coding, charbuf)
5772 struct coding_system *coding;
5773 int *charbuf;
5774 {
5775 int len;
5776 EMACS_INT from, to;
5777 enum composition_method method;
5778 Lisp_Object components;
5779
5780 len = -charbuf[0];
5781 from = coding->dst_pos + charbuf[2];
5782 to = coding->dst_pos + charbuf[3];
5783 method = (enum composition_method) (charbuf[4]);
5784
5785 if (method == COMPOSITION_RELATIVE)
5786 components = Qnil;
5787 else
5788 {
5789 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5790 int i;
5791
5792 len -= 5;
5793 charbuf += 5;
5794 for (i = 0; i < len; i++)
5795 args[i] = make_number (charbuf[i]);
5796 components = (method == COMPOSITION_WITH_ALTCHARS
5797 ? Fstring (len, args) : Fvector (len, args));
5798 }
5799 compose_text (from, to, components, Qnil, coding->dst_object);
5800 }
5801
5802
5803 /* Put `charset' property on text in CODING->object according to
5804 the annotation data at CHARBUF. CHARBUF is an array:
5805 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5806 */
5807
5808 static INLINE void
5809 produce_charset (coding, charbuf)
5810 struct coding_system *coding;
5811 int *charbuf;
5812 {
5813 EMACS_INT from = coding->dst_pos + charbuf[2];
5814 EMACS_INT to = coding->dst_pos + charbuf[3];
5815 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5816
5817 Fput_text_property (make_number (from), make_number (to),
5818 Qcharset, CHARSET_NAME (charset),
5819 coding->dst_object);
5820 }
5821
5822
5823 #define CHARBUF_SIZE 0x4000
5824
5825 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5826 do { \
5827 int size = CHARBUF_SIZE;; \
5828 \
5829 coding->charbuf = NULL; \
5830 while (size > 1024) \
5831 { \
5832 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5833 if (coding->charbuf) \
5834 break; \
5835 size >>= 1; \
5836 } \
5837 if (! coding->charbuf) \
5838 { \
5839 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5840 return coding->result; \
5841 } \
5842 coding->charbuf_size = size; \
5843 } while (0)
5844
5845
5846 static void
5847 produce_annotation (coding)
5848 struct coding_system *coding;
5849 {
5850 int *charbuf = coding->charbuf;
5851 int *charbuf_end = charbuf + coding->charbuf_used;
5852
5853 if (NILP (coding->dst_object))
5854 return;
5855
5856 while (charbuf < charbuf_end)
5857 {
5858 if (*charbuf >= 0)
5859 charbuf++;
5860 else
5861 {
5862 int len = -*charbuf;
5863 switch (charbuf[1])
5864 {
5865 case CODING_ANNOTATE_COMPOSITION_MASK:
5866 produce_composition (coding, charbuf);
5867 break;
5868 case CODING_ANNOTATE_CHARSET_MASK:
5869 produce_charset (coding, charbuf);
5870 break;
5871 default:
5872 abort ();
5873 }
5874 charbuf += len;
5875 }
5876 }
5877 }
5878
5879 /* Decode the data at CODING->src_object into CODING->dst_object.
5880 CODING->src_object is a buffer, a string, or nil.
5881 CODING->dst_object is a buffer.
5882
5883 If CODING->src_object is a buffer, it must be the current buffer.
5884 In this case, if CODING->src_pos is positive, it is a position of
5885 the source text in the buffer, otherwise, the source text is in the
5886 gap area of the buffer, and CODING->src_pos specifies the offset of
5887 the text from GPT (which must be the same as PT). If this is the
5888 same buffer as CODING->dst_object, CODING->src_pos must be
5889 negative.
5890
5891 If CODING->src_object is a string, CODING->src_pos in an index to
5892 that string.
5893
5894 If CODING->src_object is nil, CODING->source must already point to
5895 the non-relocatable memory area. In this case, CODING->src_pos is
5896 an offset from CODING->source.
5897
5898 The decoded data is inserted at the current point of the buffer
5899 CODING->dst_object.
5900 */
5901
5902 static int
5903 decode_coding (coding)
5904 struct coding_system *coding;
5905 {
5906 Lisp_Object attrs;
5907 Lisp_Object undo_list;
5908 Lisp_Object translation_table;
5909
5910 if (BUFFERP (coding->src_object)
5911 && coding->src_pos > 0
5912 && coding->src_pos < GPT
5913 && coding->src_pos + coding->src_chars > GPT)
5914 move_gap_both (coding->src_pos, coding->src_pos_byte);
5915
5916 undo_list = Qt;
5917 if (BUFFERP (coding->dst_object))
5918 {
5919 if (current_buffer != XBUFFER (coding->dst_object))
5920 set_buffer_internal (XBUFFER (coding->dst_object));
5921 if (GPT != PT)
5922 move_gap_both (PT, PT_BYTE);
5923 undo_list = current_buffer->undo_list;
5924 current_buffer->undo_list = Qt;
5925 }
5926
5927 coding->consumed = coding->consumed_char = 0;
5928 coding->produced = coding->produced_char = 0;
5929 coding->chars_at_source = 0;
5930 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5931 coding->errors = 0;
5932
5933 ALLOC_CONVERSION_WORK_AREA (coding);
5934
5935 attrs = CODING_ID_ATTRS (coding->id);
5936 translation_table = get_translation_table (attrs, 0);
5937
5938 do
5939 {
5940 coding_set_source (coding);
5941 coding->annotated = 0;
5942 (*(coding->decoder)) (coding);
5943 if (!NILP (translation_table))
5944 translate_chars (coding, translation_table);
5945 coding_set_destination (coding);
5946 produce_chars (coding);
5947 if (coding->annotated)
5948 produce_annotation (coding);
5949 }
5950 while (coding->consumed < coding->src_bytes
5951 && ! coding->result);
5952
5953 coding->carryover_bytes = 0;
5954 if (coding->consumed < coding->src_bytes)
5955 {
5956 int nbytes = coding->src_bytes - coding->consumed;
5957 const unsigned char *src;
5958
5959 coding_set_source (coding);
5960 coding_set_destination (coding);
5961 src = coding->source + coding->consumed;
5962
5963 if (coding->mode & CODING_MODE_LAST_BLOCK)
5964 {
5965 /* Flush out unprocessed data as binary chars. We are sure
5966 that the number of data is less than the size of
5967 coding->charbuf. */
5968 coding->charbuf_used = 0;
5969 while (nbytes-- > 0)
5970 {
5971 int c = *src++;
5972
5973 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5974 }
5975 produce_chars (coding);
5976 }
5977 else
5978 {
5979 /* Record unprocessed bytes in coding->carryover. We are
5980 sure that the number of data is less than the size of
5981 coding->carryover. */
5982 unsigned char *p = coding->carryover;
5983
5984 coding->carryover_bytes = nbytes;
5985 while (nbytes-- > 0)
5986 *p++ = *src++;
5987 }
5988 coding->consumed = coding->src_bytes;
5989 }
5990
5991 if (BUFFERP (coding->dst_object))
5992 {
5993 current_buffer->undo_list = undo_list;
5994 record_insert (coding->dst_pos, coding->produced_char);
5995 }
5996 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5997 decode_eol (coding);
5998 return coding->result;
5999 }
6000
6001
6002 /* Extract an annotation datum from a composition starting at POS and
6003 ending before LIMIT of CODING->src_object (buffer or string), store
6004 the data in BUF, set *STOP to a starting position of the next
6005 composition (if any) or to LIMIT, and return the address of the
6006 next element of BUF.
6007
6008 If such an annotation is not found, set *STOP to a starting
6009 position of a composition after POS (if any) or to LIMIT, and
6010 return BUF. */
6011
6012 static INLINE int *
6013 handle_composition_annotation (pos, limit, coding, buf, stop)
6014 EMACS_INT pos, limit;
6015 struct coding_system *coding;
6016 int *buf;
6017 EMACS_INT *stop;
6018 {
6019 EMACS_INT start, end;
6020 Lisp_Object prop;
6021
6022 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6023 || end > limit)
6024 *stop = limit;
6025 else if (start > pos)
6026 *stop = start;
6027 else
6028 {
6029 if (start == pos)
6030 {
6031 /* We found a composition. Store the corresponding
6032 annotation data in BUF. */
6033 int *head = buf;
6034 enum composition_method method = COMPOSITION_METHOD (prop);
6035 int nchars = COMPOSITION_LENGTH (prop);
6036
6037 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
6038 if (method != COMPOSITION_RELATIVE)
6039 {
6040 Lisp_Object components;
6041 int len, i, i_byte;
6042
6043 components = COMPOSITION_COMPONENTS (prop);
6044 if (VECTORP (components))
6045 {
6046 len = XVECTOR (components)->size;
6047 for (i = 0; i < len; i++)
6048 *buf++ = XINT (AREF (components, i));
6049 }
6050 else if (STRINGP (components))
6051 {
6052 len = SCHARS (components);
6053 i = i_byte = 0;
6054 while (i < len)
6055 {
6056 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6057 buf++;
6058 }
6059 }
6060 else if (INTEGERP (components))
6061 {
6062 len = 1;
6063 *buf++ = XINT (components);
6064 }
6065 else if (CONSP (components))
6066 {
6067 for (len = 0; CONSP (components);
6068 len++, components = XCDR (components))
6069 *buf++ = XINT (XCAR (components));
6070 }
6071 else
6072 abort ();
6073 *head -= len;
6074 }
6075 }
6076
6077 if (find_composition (end, limit, &start, &end, &prop,
6078 coding->src_object)
6079 && end <= limit)
6080 *stop = start;
6081 else
6082 *stop = limit;
6083 }
6084 return buf;
6085 }
6086
6087
6088 /* Extract an annotation datum from a text property `charset' at POS of
6089 CODING->src_object (buffer of string), store the data in BUF, set
6090 *STOP to the position where the value of `charset' property changes
6091 (limiting by LIMIT), and return the address of the next element of
6092 BUF.
6093
6094 If the property value is nil, set *STOP to the position where the
6095 property value is non-nil (limiting by LIMIT), and return BUF. */
6096
6097 static INLINE int *
6098 handle_charset_annotation (pos, limit, coding, buf, stop)
6099 EMACS_INT pos, limit;
6100 struct coding_system *coding;
6101 int *buf;
6102 EMACS_INT *stop;
6103 {
6104 Lisp_Object val, next;
6105 int id;
6106
6107 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6108 if (! NILP (val) && CHARSETP (val))
6109 id = XINT (CHARSET_SYMBOL_ID (val));
6110 else
6111 id = -1;
6112 ADD_CHARSET_DATA (buf, 0, 0, id);
6113 next = Fnext_single_property_change (make_number (pos), Qcharset,
6114 coding->src_object,
6115 make_number (limit));
6116 *stop = XINT (next);
6117 return buf;
6118 }
6119
6120
6121 static void
6122 consume_chars (coding)
6123 struct coding_system *coding;
6124 {
6125 int *buf = coding->charbuf;
6126 int *buf_end = coding->charbuf + coding->charbuf_size;
6127 const unsigned char *src = coding->source + coding->consumed;
6128 const unsigned char *src_end = coding->source + coding->src_bytes;
6129 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6130 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6131 int multibytep = coding->src_multibyte;
6132 Lisp_Object eol_type;
6133 int c;
6134 EMACS_INT stop, stop_composition, stop_charset;
6135
6136 eol_type = CODING_ID_EOL_TYPE (coding->id);
6137 if (VECTORP (eol_type))
6138 eol_type = Qunix;
6139
6140 /* Note: composition handling is not yet implemented. */
6141 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6142
6143 if (NILP (coding->src_object))
6144 stop = stop_composition = stop_charset = end_pos;
6145 else
6146 {
6147 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6148 stop = stop_composition = pos;
6149 else
6150 stop = stop_composition = end_pos;
6151 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6152 stop = stop_charset = pos;
6153 else
6154 stop_charset = end_pos;
6155 }
6156
6157 /* Compensate for CRLF and conversion. */
6158 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6159 while (buf < buf_end)
6160 {
6161 if (pos == stop)
6162 {
6163 if (pos == end_pos)
6164 break;
6165 if (pos == stop_composition)
6166 buf = handle_composition_annotation (pos, end_pos, coding,
6167 buf, &stop_composition);
6168 if (pos == stop_charset)
6169 buf = handle_charset_annotation (pos, end_pos, coding,
6170 buf, &stop_charset);
6171 stop = (stop_composition < stop_charset
6172 ? stop_composition : stop_charset);
6173 }
6174
6175 if (! multibytep)
6176 {
6177 EMACS_INT bytes;
6178
6179 if (! CODING_FOR_UNIBYTE (coding)
6180 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6181 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6182 else
6183 c = *src++, pos++;
6184 }
6185 else
6186 c = STRING_CHAR_ADVANCE (src), pos++;
6187 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6188 c = '\n';
6189 if (! EQ (eol_type, Qunix))
6190 {
6191 if (c == '\n')
6192 {
6193 if (EQ (eol_type, Qdos))
6194 *buf++ = '\r';
6195 else
6196 c = '\r';
6197 }
6198 }
6199 *buf++ = c;
6200 }
6201
6202 coding->consumed = src - coding->source;
6203 coding->consumed_char = pos - coding->src_pos;
6204 coding->charbuf_used = buf - coding->charbuf;
6205 coding->chars_at_source = 0;
6206 }
6207
6208
6209 /* Encode the text at CODING->src_object into CODING->dst_object.
6210 CODING->src_object is a buffer or a string.
6211 CODING->dst_object is a buffer or nil.
6212
6213 If CODING->src_object is a buffer, it must be the current buffer.
6214 In this case, if CODING->src_pos is positive, it is a position of
6215 the source text in the buffer, otherwise. the source text is in the
6216 gap area of the buffer, and coding->src_pos specifies the offset of
6217 the text from GPT (which must be the same as PT). If this is the
6218 same buffer as CODING->dst_object, CODING->src_pos must be
6219 negative and CODING should not have `pre-write-conversion'.
6220
6221 If CODING->src_object is a string, CODING should not have
6222 `pre-write-conversion'.
6223
6224 If CODING->dst_object is a buffer, the encoded data is inserted at
6225 the current point of that buffer.
6226
6227 If CODING->dst_object is nil, the encoded data is placed at the
6228 memory area specified by CODING->destination. */
6229
6230 static int
6231 encode_coding (coding)
6232 struct coding_system *coding;
6233 {
6234 Lisp_Object attrs;
6235 Lisp_Object translation_table;
6236
6237 attrs = CODING_ID_ATTRS (coding->id);
6238 translation_table = get_translation_table (attrs, 1);
6239
6240 if (BUFFERP (coding->dst_object))
6241 {
6242 set_buffer_internal (XBUFFER (coding->dst_object));
6243 coding->dst_multibyte
6244 = ! NILP (current_buffer->enable_multibyte_characters);
6245 }
6246
6247 coding->consumed = coding->consumed_char = 0;
6248 coding->produced = coding->produced_char = 0;
6249 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6250 coding->errors = 0;
6251
6252 ALLOC_CONVERSION_WORK_AREA (coding);
6253
6254 do {
6255 coding_set_source (coding);
6256 consume_chars (coding);
6257
6258 if (!NILP (translation_table))
6259 translate_chars (coding, translation_table);
6260
6261 coding_set_destination (coding);
6262 (*(coding->encoder)) (coding);
6263 } while (coding->consumed_char < coding->src_chars);
6264
6265 if (BUFFERP (coding->dst_object))
6266 insert_from_gap (coding->produced_char, coding->produced);
6267
6268 return (coding->result);
6269 }
6270
6271
6272 /* Name (or base name) of work buffer for code conversion. */
6273 static Lisp_Object Vcode_conversion_workbuf_name;
6274
6275 /* A working buffer used by the top level conversion. Once it is
6276 created, it is never destroyed. It has the name
6277 Vcode_conversion_workbuf_name. The other working buffers are
6278 destroyed after the use is finished, and their names are modified
6279 versions of Vcode_conversion_workbuf_name. */
6280 static Lisp_Object Vcode_conversion_reused_workbuf;
6281
6282 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6283 static int reused_workbuf_in_use;
6284
6285
6286 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6287 multibyteness of returning buffer. */
6288
6289 Lisp_Object
6290 make_conversion_work_buffer (multibyte)
6291 {
6292 Lisp_Object name, workbuf;
6293 struct buffer *current;
6294
6295 if (reused_workbuf_in_use++)
6296 {
6297 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6298 workbuf = Fget_buffer_create (name);
6299 }
6300 else
6301 {
6302 name = Vcode_conversion_workbuf_name;
6303 workbuf = Fget_buffer_create (name);
6304 if (NILP (Vcode_conversion_reused_workbuf))
6305 Vcode_conversion_reused_workbuf = workbuf;
6306 }
6307 current = current_buffer;
6308 set_buffer_internal (XBUFFER (workbuf));
6309 Ferase_buffer ();
6310 current_buffer->undo_list = Qt;
6311 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6312 set_buffer_internal (current);
6313 return workbuf;
6314 }
6315
6316
6317 static Lisp_Object
6318 code_conversion_restore (arg)
6319 Lisp_Object arg;
6320 {
6321 Lisp_Object current, workbuf;
6322
6323 current = XCAR (arg);
6324 workbuf = XCDR (arg);
6325 if (! NILP (workbuf))
6326 {
6327 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6328 reused_workbuf_in_use = 0;
6329 else if (! NILP (Fbuffer_live_p (workbuf)))
6330 Fkill_buffer (workbuf);
6331 }
6332 set_buffer_internal (XBUFFER (current));
6333 return Qnil;
6334 }
6335
6336 Lisp_Object
6337 code_conversion_save (with_work_buf, multibyte)
6338 int with_work_buf, multibyte;
6339 {
6340 Lisp_Object workbuf = Qnil;
6341
6342 if (with_work_buf)
6343 workbuf = make_conversion_work_buffer (multibyte);
6344 record_unwind_protect (code_conversion_restore,
6345 Fcons (Fcurrent_buffer (), workbuf));
6346 return workbuf;
6347 }
6348
6349 int
6350 decode_coding_gap (coding, chars, bytes)
6351 struct coding_system *coding;
6352 EMACS_INT chars, bytes;
6353 {
6354 int count = specpdl_ptr - specpdl;
6355 Lisp_Object attrs;
6356
6357 code_conversion_save (0, 0);
6358
6359 coding->src_object = Fcurrent_buffer ();
6360 coding->src_chars = chars;
6361 coding->src_bytes = bytes;
6362 coding->src_pos = -chars;
6363 coding->src_pos_byte = -bytes;
6364 coding->src_multibyte = chars < bytes;
6365 coding->dst_object = coding->src_object;
6366 coding->dst_pos = PT;
6367 coding->dst_pos_byte = PT_BYTE;
6368 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6369 coding->mode |= CODING_MODE_LAST_BLOCK;
6370
6371 if (CODING_REQUIRE_DETECTION (coding))
6372 detect_coding (coding);
6373
6374 decode_coding (coding);
6375
6376 attrs = CODING_ID_ATTRS (coding->id);
6377 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6378 {
6379 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6380 Lisp_Object val;
6381
6382 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6383 val = call1 (CODING_ATTR_POST_READ (attrs),
6384 make_number (coding->produced_char));
6385 CHECK_NATNUM (val);
6386 coding->produced_char += Z - prev_Z;
6387 coding->produced += Z_BYTE - prev_Z_BYTE;
6388 }
6389
6390 unbind_to (count, Qnil);
6391 return coding->result;
6392 }
6393
6394 int
6395 encode_coding_gap (coding, chars, bytes)
6396 struct coding_system *coding;
6397 EMACS_INT chars, bytes;
6398 {
6399 int count = specpdl_ptr - specpdl;
6400
6401 code_conversion_save (0, 0);
6402
6403 coding->src_object = Fcurrent_buffer ();
6404 coding->src_chars = chars;
6405 coding->src_bytes = bytes;
6406 coding->src_pos = -chars;
6407 coding->src_pos_byte = -bytes;
6408 coding->src_multibyte = chars < bytes;
6409 coding->dst_object = coding->src_object;
6410 coding->dst_pos = PT;
6411 coding->dst_pos_byte = PT_BYTE;
6412
6413 encode_coding (coding);
6414
6415 unbind_to (count, Qnil);
6416 return coding->result;
6417 }
6418
6419
6420 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6421 SRC_OBJECT into DST_OBJECT by coding context CODING.
6422
6423 SRC_OBJECT is a buffer, a string, or Qnil.
6424
6425 If it is a buffer, the text is at point of the buffer. FROM and TO
6426 are positions in the buffer.
6427
6428 If it is a string, the text is at the beginning of the string.
6429 FROM and TO are indices to the string.
6430
6431 If it is nil, the text is at coding->source. FROM and TO are
6432 indices to coding->source.
6433
6434 DST_OBJECT is a buffer, Qt, or Qnil.
6435
6436 If it is a buffer, the decoded text is inserted at point of the
6437 buffer. If the buffer is the same as SRC_OBJECT, the source text
6438 is deleted.
6439
6440 If it is Qt, a string is made from the decoded text, and
6441 set in CODING->dst_object.
6442
6443 If it is Qnil, the decoded text is stored at CODING->destination.
6444 The caller must allocate CODING->dst_bytes bytes at
6445 CODING->destination by xmalloc. If the decoded text is longer than
6446 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6447 */
6448
6449 void
6450 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6451 dst_object)
6452 struct coding_system *coding;
6453 Lisp_Object src_object;
6454 EMACS_INT from, from_byte, to, to_byte;
6455 Lisp_Object dst_object;
6456 {
6457 int count = specpdl_ptr - specpdl;
6458 unsigned char *destination;
6459 EMACS_INT dst_bytes;
6460 EMACS_INT chars = to - from;
6461 EMACS_INT bytes = to_byte - from_byte;
6462 Lisp_Object attrs;
6463 Lisp_Object buffer;
6464 int saved_pt = -1, saved_pt_byte;
6465
6466 buffer = Fcurrent_buffer ();
6467
6468 if (NILP (dst_object))
6469 {
6470 destination = coding->destination;
6471 dst_bytes = coding->dst_bytes;
6472 }
6473
6474 coding->src_object = src_object;
6475 coding->src_chars = chars;
6476 coding->src_bytes = bytes;
6477 coding->src_multibyte = chars < bytes;
6478
6479 if (STRINGP (src_object))
6480 {
6481 coding->src_pos = from;
6482 coding->src_pos_byte = from_byte;
6483 }
6484 else if (BUFFERP (src_object))
6485 {
6486 set_buffer_internal (XBUFFER (src_object));
6487 if (from != GPT)
6488 move_gap_both (from, from_byte);
6489 if (EQ (src_object, dst_object))
6490 {
6491 saved_pt = PT, saved_pt_byte = PT_BYTE;
6492 TEMP_SET_PT_BOTH (from, from_byte);
6493 del_range_both (from, from_byte, to, to_byte, 1);
6494 coding->src_pos = -chars;
6495 coding->src_pos_byte = -bytes;
6496 }
6497 else
6498 {
6499 coding->src_pos = from;
6500 coding->src_pos_byte = from_byte;
6501 }
6502 }
6503
6504 if (CODING_REQUIRE_DETECTION (coding))
6505 detect_coding (coding);
6506 attrs = CODING_ID_ATTRS (coding->id);
6507
6508 if (EQ (dst_object, Qt)
6509 || (! NILP (CODING_ATTR_POST_READ (attrs))
6510 && NILP (dst_object)))
6511 {
6512 coding->dst_object = code_conversion_save (1, 1);
6513 coding->dst_pos = BEG;
6514 coding->dst_pos_byte = BEG_BYTE;
6515 coding->dst_multibyte = 1;
6516 }
6517 else if (BUFFERP (dst_object))
6518 {
6519 code_conversion_save (0, 0);
6520 coding->dst_object = dst_object;
6521 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6522 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6523 coding->dst_multibyte
6524 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6525 }
6526 else
6527 {
6528 code_conversion_save (0, 0);
6529 coding->dst_object = Qnil;
6530 coding->dst_multibyte = 1;
6531 }
6532
6533 decode_coding (coding);
6534
6535 if (BUFFERP (coding->dst_object))
6536 set_buffer_internal (XBUFFER (coding->dst_object));
6537
6538 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6539 {
6540 struct gcpro gcpro1, gcpro2;
6541 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6542 Lisp_Object val;
6543
6544 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6545 GCPRO2 (coding->src_object, coding->dst_object);
6546 val = call1 (CODING_ATTR_POST_READ (attrs),
6547 make_number (coding->produced_char));
6548 UNGCPRO;
6549 CHECK_NATNUM (val);
6550 coding->produced_char += Z - prev_Z;
6551 coding->produced += Z_BYTE - prev_Z_BYTE;
6552 }
6553
6554 if (EQ (dst_object, Qt))
6555 {
6556 coding->dst_object = Fbuffer_string ();
6557 }
6558 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6559 {
6560 set_buffer_internal (XBUFFER (coding->dst_object));
6561 if (dst_bytes < coding->produced)
6562 {
6563 destination
6564 = (unsigned char *) xrealloc (destination, coding->produced);
6565 if (! destination)
6566 {
6567 record_conversion_result (coding,
6568 CODING_RESULT_INSUFFICIENT_DST);
6569 unbind_to (count, Qnil);
6570 return;
6571 }
6572 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6573 move_gap_both (BEGV, BEGV_BYTE);
6574 bcopy (BEGV_ADDR, destination, coding->produced);
6575 coding->destination = destination;
6576 }
6577 }
6578
6579 if (saved_pt >= 0)
6580 {
6581 /* This is the case of:
6582 (BUFFERP (src_object) && EQ (src_object, dst_object))
6583 As we have moved PT while replacing the original buffer
6584 contents, we must recover it now. */
6585 set_buffer_internal (XBUFFER (src_object));
6586 if (saved_pt < from)
6587 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6588 else if (saved_pt < from + chars)
6589 TEMP_SET_PT_BOTH (from, from_byte);
6590 else if (! NILP (current_buffer->enable_multibyte_characters))
6591 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6592 saved_pt_byte + (coding->produced - bytes));
6593 else
6594 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6595 saved_pt_byte + (coding->produced - bytes));
6596 }
6597
6598 unbind_to (count, coding->dst_object);
6599 }
6600
6601
6602 void
6603 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6604 dst_object)
6605 struct coding_system *coding;
6606 Lisp_Object src_object;
6607 EMACS_INT from, from_byte, to, to_byte;
6608 Lisp_Object dst_object;
6609 {
6610 int count = specpdl_ptr - specpdl;
6611 EMACS_INT chars = to - from;
6612 EMACS_INT bytes = to_byte - from_byte;
6613 Lisp_Object attrs;
6614 Lisp_Object buffer;
6615 int saved_pt = -1, saved_pt_byte;
6616
6617 buffer = Fcurrent_buffer ();
6618
6619 coding->src_object = src_object;
6620 coding->src_chars = chars;
6621 coding->src_bytes = bytes;
6622 coding->src_multibyte = chars < bytes;
6623
6624 attrs = CODING_ID_ATTRS (coding->id);
6625
6626 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6627 {
6628 coding->src_object = code_conversion_save (1, coding->src_multibyte);
6629 set_buffer_internal (XBUFFER (coding->src_object));
6630 if (STRINGP (src_object))
6631 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6632 else if (BUFFERP (src_object))
6633 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6634 else
6635 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6636
6637 if (EQ (src_object, dst_object))
6638 {
6639 set_buffer_internal (XBUFFER (src_object));
6640 saved_pt = PT, saved_pt_byte = PT_BYTE;
6641 del_range_both (from, from_byte, to, to_byte, 1);
6642 set_buffer_internal (XBUFFER (coding->src_object));
6643 }
6644
6645 call2 (CODING_ATTR_PRE_WRITE (attrs),
6646 make_number (BEG), make_number (Z));
6647 coding->src_object = Fcurrent_buffer ();
6648 if (BEG != GPT)
6649 move_gap_both (BEG, BEG_BYTE);
6650 coding->src_chars = Z - BEG;
6651 coding->src_bytes = Z_BYTE - BEG_BYTE;
6652 coding->src_pos = BEG;
6653 coding->src_pos_byte = BEG_BYTE;
6654 coding->src_multibyte = Z < Z_BYTE;
6655 }
6656 else if (STRINGP (src_object))
6657 {
6658 code_conversion_save (0, 0);
6659 coding->src_pos = from;
6660 coding->src_pos_byte = from_byte;
6661 }
6662 else if (BUFFERP (src_object))
6663 {
6664 code_conversion_save (0, 0);
6665 set_buffer_internal (XBUFFER (src_object));
6666 if (EQ (src_object, dst_object))
6667 {
6668 saved_pt = PT, saved_pt_byte = PT_BYTE;
6669 coding->src_object = del_range_1 (from, to, 1, 1);
6670 coding->src_pos = 0;
6671 coding->src_pos_byte = 0;
6672 }
6673 else
6674 {
6675 if (from < GPT && to >= GPT)
6676 move_gap_both (from, from_byte);
6677 coding->src_pos = from;
6678 coding->src_pos_byte = from_byte;
6679 }
6680 }
6681 else
6682 code_conversion_save (0, 0);
6683
6684 if (BUFFERP (dst_object))
6685 {
6686 coding->dst_object = dst_object;
6687 if (EQ (src_object, dst_object))
6688 {
6689 coding->dst_pos = from;
6690 coding->dst_pos_byte = from_byte;
6691 }
6692 else
6693 {
6694 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6695 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6696 }
6697 coding->dst_multibyte
6698 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6699 }
6700 else if (EQ (dst_object, Qt))
6701 {
6702 coding->dst_object = Qnil;
6703 coding->dst_bytes = coding->src_chars;
6704 if (coding->dst_bytes == 0)
6705 coding->dst_bytes = 1;
6706 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6707 coding->dst_multibyte = 0;
6708 }
6709 else
6710 {
6711 coding->dst_object = Qnil;
6712 coding->dst_multibyte = 0;
6713 }
6714
6715 encode_coding (coding);
6716
6717 if (EQ (dst_object, Qt))
6718 {
6719 if (BUFFERP (coding->dst_object))
6720 coding->dst_object = Fbuffer_string ();
6721 else
6722 {
6723 coding->dst_object
6724 = make_unibyte_string ((char *) coding->destination,
6725 coding->produced);
6726 xfree (coding->destination);
6727 }
6728 }
6729
6730 if (saved_pt >= 0)
6731 {
6732 /* This is the case of:
6733 (BUFFERP (src_object) && EQ (src_object, dst_object))
6734 As we have moved PT while replacing the original buffer
6735 contents, we must recover it now. */
6736 set_buffer_internal (XBUFFER (src_object));
6737 if (saved_pt < from)
6738 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6739 else if (saved_pt < from + chars)
6740 TEMP_SET_PT_BOTH (from, from_byte);
6741 else if (! NILP (current_buffer->enable_multibyte_characters))
6742 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6743 saved_pt_byte + (coding->produced - bytes));
6744 else
6745 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6746 saved_pt_byte + (coding->produced - bytes));
6747 }
6748
6749 unbind_to (count, Qnil);
6750 }
6751
6752
6753 Lisp_Object
6754 preferred_coding_system ()
6755 {
6756 int id = coding_categories[coding_priorities[0]].id;
6757
6758 return CODING_ID_NAME (id);
6759 }
6760
6761 \f
6762 #ifdef emacs
6763 /*** 8. Emacs Lisp library functions ***/
6764
6765 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6766 doc: /* Return t if OBJECT is nil or a coding-system.
6767 See the documentation of `define-coding-system' for information
6768 about coding-system objects. */)
6769 (obj)
6770 Lisp_Object obj;
6771 {
6772 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6773 }
6774
6775 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6776 Sread_non_nil_coding_system, 1, 1, 0,
6777 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6778 (prompt)
6779 Lisp_Object prompt;
6780 {
6781 Lisp_Object val;
6782 do
6783 {
6784 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6785 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6786 }
6787 while (SCHARS (val) == 0);
6788 return (Fintern (val, Qnil));
6789 }
6790
6791 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6792 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6793 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6794 (prompt, default_coding_system)
6795 Lisp_Object prompt, default_coding_system;
6796 {
6797 Lisp_Object val;
6798 if (SYMBOLP (default_coding_system))
6799 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6800 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6801 Qt, Qnil, Qcoding_system_history,
6802 default_coding_system, Qnil);
6803 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6804 }
6805
6806 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6807 1, 1, 0,
6808 doc: /* Check validity of CODING-SYSTEM.
6809 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6810 (coding_system)
6811 Lisp_Object coding_system;
6812 {
6813 CHECK_SYMBOL (coding_system);
6814 if (!NILP (Fcoding_system_p (coding_system)))
6815 return coding_system;
6816 while (1)
6817 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6818 }
6819
6820 \f
6821 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6822 HIGHEST is nonzero, return the coding system of the highest
6823 priority among the detected coding systems. Otherwize return a
6824 list of detected coding systems sorted by their priorities. If
6825 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6826 multibyte form but contains only ASCII and eight-bit chars.
6827 Otherwise, the bytes are raw bytes.
6828
6829 CODING-SYSTEM controls the detection as below:
6830
6831 If it is nil, detect both text-format and eol-format. If the
6832 text-format part of CODING-SYSTEM is already specified
6833 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6834 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6835 detect only text-format. */
6836
6837 Lisp_Object
6838 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6839 coding_system)
6840 const unsigned char *src;
6841 int src_chars, src_bytes, highest;
6842 int multibytep;
6843 Lisp_Object coding_system;
6844 {
6845 const unsigned char *src_end = src + src_bytes;
6846 Lisp_Object attrs, eol_type;
6847 Lisp_Object val;
6848 struct coding_system coding;
6849 int id;
6850 struct coding_detection_info detect_info;
6851 enum coding_category base_category;
6852
6853 if (NILP (coding_system))
6854 coding_system = Qundecided;
6855 setup_coding_system (coding_system, &coding);
6856 attrs = CODING_ID_ATTRS (coding.id);
6857 eol_type = CODING_ID_EOL_TYPE (coding.id);
6858 coding_system = CODING_ATTR_BASE_NAME (attrs);
6859
6860 coding.source = src;
6861 coding.src_chars = src_chars;
6862 coding.src_bytes = src_bytes;
6863 coding.src_multibyte = multibytep;
6864 coding.consumed = 0;
6865 coding.mode |= CODING_MODE_LAST_BLOCK;
6866
6867 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6868
6869 /* At first, detect text-format if necessary. */
6870 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6871 if (base_category == coding_category_undecided)
6872 {
6873 enum coding_category category;
6874 struct coding_system *this;
6875 int c, i;
6876
6877 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6878 for (i = 0; src < src_end; i++, src++)
6879 {
6880 c = *src;
6881 if (c & 0x80 || (c < 0x20 && (c == 0
6882 || c == ISO_CODE_ESC
6883 || c == ISO_CODE_SI
6884 || c == ISO_CODE_SO)))
6885 break;
6886 }
6887 coding.head_ascii = src - coding.source;
6888
6889 if (src < src_end)
6890 for (i = 0; i < coding_category_raw_text; i++)
6891 {
6892 category = coding_priorities[i];
6893 this = coding_categories + category;
6894
6895 if (this->id < 0)
6896 {
6897 /* No coding system of this category is defined. */
6898 detect_info.rejected |= (1 << category);
6899 }
6900 else if (category >= coding_category_raw_text)
6901 continue;
6902 else if (detect_info.checked & (1 << category))
6903 {
6904 if (highest
6905 && (detect_info.found & (1 << category)))
6906 break;
6907 }
6908 else
6909 {
6910 if ((*(this->detector)) (&coding, &detect_info)
6911 && highest
6912 && (detect_info.found & (1 << category)))
6913 {
6914 if (category == coding_category_utf_16_auto)
6915 {
6916 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6917 category = coding_category_utf_16_le;
6918 else
6919 category = coding_category_utf_16_be;
6920 }
6921 break;
6922 }
6923 }
6924 }
6925
6926 if (detect_info.rejected == CATEGORY_MASK_ANY)
6927 {
6928 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6929 id = coding_categories[coding_category_raw_text].id;
6930 val = Fcons (make_number (id), Qnil);
6931 }
6932 else if (! detect_info.rejected && ! detect_info.found)
6933 {
6934 detect_info.found = CATEGORY_MASK_ANY;
6935 id = coding_categories[coding_category_undecided].id;
6936 val = Fcons (make_number (id), Qnil);
6937 }
6938 else if (highest)
6939 {
6940 if (detect_info.found)
6941 {
6942 detect_info.found = 1 << category;
6943 val = Fcons (make_number (this->id), Qnil);
6944 }
6945 else
6946 for (i = 0; i < coding_category_raw_text; i++)
6947 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6948 {
6949 detect_info.found = 1 << coding_priorities[i];
6950 id = coding_categories[coding_priorities[i]].id;
6951 val = Fcons (make_number (id), Qnil);
6952 break;
6953 }
6954 }
6955 else
6956 {
6957 int mask = detect_info.rejected | detect_info.found;
6958 int found = 0;
6959 val = Qnil;
6960
6961 for (i = coding_category_raw_text - 1; i >= 0; i--)
6962 {
6963 category = coding_priorities[i];
6964 if (! (mask & (1 << category)))
6965 {
6966 found |= 1 << category;
6967 id = coding_categories[category].id;
6968 val = Fcons (make_number (id), val);
6969 }
6970 }
6971 for (i = coding_category_raw_text - 1; i >= 0; i--)
6972 {
6973 category = coding_priorities[i];
6974 if (detect_info.found & (1 << category))
6975 {
6976 id = coding_categories[category].id;
6977 val = Fcons (make_number (id), val);
6978 }
6979 }
6980 detect_info.found |= found;
6981 }
6982 }
6983 else if (base_category == coding_category_utf_16_auto)
6984 {
6985 if (detect_coding_utf_16 (&coding, &detect_info))
6986 {
6987 enum coding_category category;
6988 struct coding_system *this;
6989
6990 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6991 this = coding_categories + coding_category_utf_16_le;
6992 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6993 this = coding_categories + coding_category_utf_16_be;
6994 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6995 this = coding_categories + coding_category_utf_16_be_nosig;
6996 else
6997 this = coding_categories + coding_category_utf_16_le_nosig;
6998 val = Fcons (make_number (this->id), Qnil);
6999 }
7000 }
7001 else
7002 {
7003 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7004 val = Fcons (make_number (coding.id), Qnil);
7005 }
7006
7007 /* Then, detect eol-format if necessary. */
7008 {
7009 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7010 Lisp_Object tail;
7011
7012 if (VECTORP (eol_type))
7013 {
7014 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7015 normal_eol = detect_eol (coding.source, src_bytes,
7016 coding_category_raw_text);
7017 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7018 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7019 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7020 coding_category_utf_16_be);
7021 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7022 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7023 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7024 coding_category_utf_16_le);
7025 }
7026 else
7027 {
7028 if (EQ (eol_type, Qunix))
7029 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7030 else if (EQ (eol_type, Qdos))
7031 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7032 else
7033 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7034 }
7035
7036 for (tail = val; CONSP (tail); tail = XCDR (tail))
7037 {
7038 enum coding_category category;
7039 int this_eol;
7040
7041 id = XINT (XCAR (tail));
7042 attrs = CODING_ID_ATTRS (id);
7043 category = XINT (CODING_ATTR_CATEGORY (attrs));
7044 eol_type = CODING_ID_EOL_TYPE (id);
7045 if (VECTORP (eol_type))
7046 {
7047 if (category == coding_category_utf_16_be
7048 || category == coding_category_utf_16_be_nosig)
7049 this_eol = utf_16_be_eol;
7050 else if (category == coding_category_utf_16_le
7051 || category == coding_category_utf_16_le_nosig)
7052 this_eol = utf_16_le_eol;
7053 else
7054 this_eol = normal_eol;
7055
7056 if (this_eol == EOL_SEEN_LF)
7057 XSETCAR (tail, AREF (eol_type, 0));
7058 else if (this_eol == EOL_SEEN_CRLF)
7059 XSETCAR (tail, AREF (eol_type, 1));
7060 else if (this_eol == EOL_SEEN_CR)
7061 XSETCAR (tail, AREF (eol_type, 2));
7062 else
7063 XSETCAR (tail, CODING_ID_NAME (id));
7064 }
7065 else
7066 XSETCAR (tail, CODING_ID_NAME (id));
7067 }
7068 }
7069
7070 return (highest ? XCAR (val) : val);
7071 }
7072
7073
7074 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7075 2, 3, 0,
7076 doc: /* Detect coding system of the text in the region between START and END.
7077 Return a list of possible coding systems ordered by priority.
7078
7079 If only ASCII characters are found, it returns a list of single element
7080 `undecided' or its subsidiary coding system according to a detected
7081 end-of-line format.
7082
7083 If optional argument HIGHEST is non-nil, return the coding system of
7084 highest priority. */)
7085 (start, end, highest)
7086 Lisp_Object start, end, highest;
7087 {
7088 int from, to;
7089 int from_byte, to_byte;
7090
7091 CHECK_NUMBER_COERCE_MARKER (start);
7092 CHECK_NUMBER_COERCE_MARKER (end);
7093
7094 validate_region (&start, &end);
7095 from = XINT (start), to = XINT (end);
7096 from_byte = CHAR_TO_BYTE (from);
7097 to_byte = CHAR_TO_BYTE (to);
7098
7099 if (from < GPT && to >= GPT)
7100 move_gap_both (to, to_byte);
7101
7102 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7103 to - from, to_byte - from_byte,
7104 !NILP (highest),
7105 !NILP (current_buffer
7106 ->enable_multibyte_characters),
7107 Qnil);
7108 }
7109
7110 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7111 1, 2, 0,
7112 doc: /* Detect coding system of the text in STRING.
7113 Return a list of possible coding systems ordered by priority.
7114
7115 If only ASCII characters are found, it returns a list of single element
7116 `undecided' or its subsidiary coding system according to a detected
7117 end-of-line format.
7118
7119 If optional argument HIGHEST is non-nil, return the coding system of
7120 highest priority. */)
7121 (string, highest)
7122 Lisp_Object string, highest;
7123 {
7124 CHECK_STRING (string);
7125
7126 return detect_coding_system (SDATA (string),
7127 SCHARS (string), SBYTES (string),
7128 !NILP (highest), STRING_MULTIBYTE (string),
7129 Qnil);
7130 }
7131
7132
7133 static INLINE int
7134 char_encodable_p (c, attrs)
7135 int c;
7136 Lisp_Object attrs;
7137 {
7138 Lisp_Object tail;
7139 struct charset *charset;
7140 Lisp_Object translation_table;
7141
7142 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7143 if (! NILP (translation_table))
7144 c = translate_char (translation_table, c);
7145 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7146 CONSP (tail); tail = XCDR (tail))
7147 {
7148 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7149 if (CHAR_CHARSET_P (c, charset))
7150 break;
7151 }
7152 return (! NILP (tail));
7153 }
7154
7155
7156 /* Return a list of coding systems that safely encode the text between
7157 START and END. If EXCLUDE is non-nil, it is a list of coding
7158 systems not to check. The returned list doesn't contain any such
7159 coding systems. In any case, if the text contains only ASCII or is
7160 unibyte, return t. */
7161
7162 DEFUN ("find-coding-systems-region-internal",
7163 Ffind_coding_systems_region_internal,
7164 Sfind_coding_systems_region_internal, 2, 3, 0,
7165 doc: /* Internal use only. */)
7166 (start, end, exclude)
7167 Lisp_Object start, end, exclude;
7168 {
7169 Lisp_Object coding_attrs_list, safe_codings;
7170 EMACS_INT start_byte, end_byte;
7171 const unsigned char *p, *pbeg, *pend;
7172 int c;
7173 Lisp_Object tail, elt;
7174
7175 if (STRINGP (start))
7176 {
7177 if (!STRING_MULTIBYTE (start)
7178 || SCHARS (start) == SBYTES (start))
7179 return Qt;
7180 start_byte = 0;
7181 end_byte = SBYTES (start);
7182 }
7183 else
7184 {
7185 CHECK_NUMBER_COERCE_MARKER (start);
7186 CHECK_NUMBER_COERCE_MARKER (end);
7187 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7188 args_out_of_range (start, end);
7189 if (NILP (current_buffer->enable_multibyte_characters))
7190 return Qt;
7191 start_byte = CHAR_TO_BYTE (XINT (start));
7192 end_byte = CHAR_TO_BYTE (XINT (end));
7193 if (XINT (end) - XINT (start) == end_byte - start_byte)
7194 return Qt;
7195
7196 if (XINT (start) < GPT && XINT (end) > GPT)
7197 {
7198 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7199 move_gap_both (XINT (start), start_byte);
7200 else
7201 move_gap_both (XINT (end), end_byte);
7202 }
7203 }
7204
7205 coding_attrs_list = Qnil;
7206 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7207 if (NILP (exclude)
7208 || NILP (Fmemq (XCAR (tail), exclude)))
7209 {
7210 Lisp_Object attrs;
7211
7212 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7213 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7214 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7215 {
7216 ASET (attrs, coding_attr_trans_tbl,
7217 get_translation_table (attrs, 1));
7218 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7219 }
7220 }
7221
7222 if (STRINGP (start))
7223 p = pbeg = SDATA (start);
7224 else
7225 p = pbeg = BYTE_POS_ADDR (start_byte);
7226 pend = p + (end_byte - start_byte);
7227
7228 while (p < pend && ASCII_BYTE_P (*p)) p++;
7229 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7230
7231 while (p < pend)
7232 {
7233 if (ASCII_BYTE_P (*p))
7234 p++;
7235 else
7236 {
7237 c = STRING_CHAR_ADVANCE (p);
7238
7239 charset_map_loaded = 0;
7240 for (tail = coding_attrs_list; CONSP (tail);)
7241 {
7242 elt = XCAR (tail);
7243 if (NILP (elt))
7244 tail = XCDR (tail);
7245 else if (char_encodable_p (c, elt))
7246 tail = XCDR (tail);
7247 else if (CONSP (XCDR (tail)))
7248 {
7249 XSETCAR (tail, XCAR (XCDR (tail)));
7250 XSETCDR (tail, XCDR (XCDR (tail)));
7251 }
7252 else
7253 {
7254 XSETCAR (tail, Qnil);
7255 tail = XCDR (tail);
7256 }
7257 }
7258 if (charset_map_loaded)
7259 {
7260 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7261
7262 if (STRINGP (start))
7263 pbeg = SDATA (start);
7264 else
7265 pbeg = BYTE_POS_ADDR (start_byte);
7266 p = pbeg + p_offset;
7267 pend = pbeg + pend_offset;
7268 }
7269 }
7270 }
7271
7272 safe_codings = Qnil;
7273 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7274 if (! NILP (XCAR (tail)))
7275 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7276
7277 return safe_codings;
7278 }
7279
7280
7281 DEFUN ("unencodable-char-position", Funencodable_char_position,
7282 Sunencodable_char_position, 3, 5, 0,
7283 doc: /*
7284 Return position of first un-encodable character in a region.
7285 START and END specfiy the region and CODING-SYSTEM specifies the
7286 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7287
7288 If optional 4th argument COUNT is non-nil, it specifies at most how
7289 many un-encodable characters to search. In this case, the value is a
7290 list of positions.
7291
7292 If optional 5th argument STRING is non-nil, it is a string to search
7293 for un-encodable characters. In that case, START and END are indexes
7294 to the string. */)
7295 (start, end, coding_system, count, string)
7296 Lisp_Object start, end, coding_system, count, string;
7297 {
7298 int n;
7299 struct coding_system coding;
7300 Lisp_Object attrs, charset_list, translation_table;
7301 Lisp_Object positions;
7302 int from, to;
7303 const unsigned char *p, *stop, *pend;
7304 int ascii_compatible;
7305
7306 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7307 attrs = CODING_ID_ATTRS (coding.id);
7308 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7309 return Qnil;
7310 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7311 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7312 translation_table = get_translation_table (attrs, 1);
7313
7314 if (NILP (string))
7315 {
7316 validate_region (&start, &end);
7317 from = XINT (start);
7318 to = XINT (end);
7319 if (NILP (current_buffer->enable_multibyte_characters)
7320 || (ascii_compatible
7321 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7322 return Qnil;
7323 p = CHAR_POS_ADDR (from);
7324 pend = CHAR_POS_ADDR (to);
7325 if (from < GPT && to >= GPT)
7326 stop = GPT_ADDR;
7327 else
7328 stop = pend;
7329 }
7330 else
7331 {
7332 CHECK_STRING (string);
7333 CHECK_NATNUM (start);
7334 CHECK_NATNUM (end);
7335 from = XINT (start);
7336 to = XINT (end);
7337 if (from > to
7338 || to > SCHARS (string))
7339 args_out_of_range_3 (string, start, end);
7340 if (! STRING_MULTIBYTE (string))
7341 return Qnil;
7342 p = SDATA (string) + string_char_to_byte (string, from);
7343 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7344 if (ascii_compatible && (to - from) == (pend - p))
7345 return Qnil;
7346 }
7347
7348 if (NILP (count))
7349 n = 1;
7350 else
7351 {
7352 CHECK_NATNUM (count);
7353 n = XINT (count);
7354 }
7355
7356 positions = Qnil;
7357 while (1)
7358 {
7359 int c;
7360
7361 if (ascii_compatible)
7362 while (p < stop && ASCII_BYTE_P (*p))
7363 p++, from++;
7364 if (p >= stop)
7365 {
7366 if (p >= pend)
7367 break;
7368 stop = pend;
7369 p = GAP_END_ADDR;
7370 }
7371
7372 c = STRING_CHAR_ADVANCE (p);
7373 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7374 && ! char_charset (translate_char (translation_table, c),
7375 charset_list, NULL))
7376 {
7377 positions = Fcons (make_number (from), positions);
7378 n--;
7379 if (n == 0)
7380 break;
7381 }
7382
7383 from++;
7384 }
7385
7386 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7387 }
7388
7389
7390 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7391 Scheck_coding_systems_region, 3, 3, 0,
7392 doc: /* Check if the region is encodable by coding systems.
7393
7394 START and END are buffer positions specifying the region.
7395 CODING-SYSTEM-LIST is a list of coding systems to check.
7396
7397 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7398 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7399 whole region, POS0, POS1, ... are buffer positions where non-encodable
7400 characters are found.
7401
7402 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7403 value is nil.
7404
7405 START may be a string. In that case, check if the string is
7406 encodable, and the value contains indices to the string instead of
7407 buffer positions. END is ignored. */)
7408 (start, end, coding_system_list)
7409 Lisp_Object start, end, coding_system_list;
7410 {
7411 Lisp_Object list;
7412 EMACS_INT start_byte, end_byte;
7413 int pos;
7414 const unsigned char *p, *pbeg, *pend;
7415 int c;
7416 Lisp_Object tail, elt, attrs;
7417
7418 if (STRINGP (start))
7419 {
7420 if (!STRING_MULTIBYTE (start)
7421 && SCHARS (start) != SBYTES (start))
7422 return Qnil;
7423 start_byte = 0;
7424 end_byte = SBYTES (start);
7425 pos = 0;
7426 }
7427 else
7428 {
7429 CHECK_NUMBER_COERCE_MARKER (start);
7430 CHECK_NUMBER_COERCE_MARKER (end);
7431 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7432 args_out_of_range (start, end);
7433 if (NILP (current_buffer->enable_multibyte_characters))
7434 return Qnil;
7435 start_byte = CHAR_TO_BYTE (XINT (start));
7436 end_byte = CHAR_TO_BYTE (XINT (end));
7437 if (XINT (end) - XINT (start) == end_byte - start_byte)
7438 return Qt;
7439
7440 if (XINT (start) < GPT && XINT (end) > GPT)
7441 {
7442 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7443 move_gap_both (XINT (start), start_byte);
7444 else
7445 move_gap_both (XINT (end), end_byte);
7446 }
7447 pos = XINT (start);
7448 }
7449
7450 list = Qnil;
7451 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7452 {
7453 elt = XCAR (tail);
7454 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7455 ASET (attrs, coding_attr_trans_tbl, get_translation_table (attrs, 1));
7456 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7457 }
7458
7459 if (STRINGP (start))
7460 p = pbeg = SDATA (start);
7461 else
7462 p = pbeg = BYTE_POS_ADDR (start_byte);
7463 pend = p + (end_byte - start_byte);
7464
7465 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7466 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7467
7468 while (p < pend)
7469 {
7470 if (ASCII_BYTE_P (*p))
7471 p++;
7472 else
7473 {
7474 c = STRING_CHAR_ADVANCE (p);
7475
7476 charset_map_loaded = 0;
7477 for (tail = list; CONSP (tail); tail = XCDR (tail))
7478 {
7479 elt = XCDR (XCAR (tail));
7480 if (! char_encodable_p (c, XCAR (elt)))
7481 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7482 }
7483 if (charset_map_loaded)
7484 {
7485 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7486
7487 if (STRINGP (start))
7488 pbeg = SDATA (start);
7489 else
7490 pbeg = BYTE_POS_ADDR (start_byte);
7491 p = pbeg + p_offset;
7492 pend = pbeg + pend_offset;
7493 }
7494 }
7495 pos++;
7496 }
7497
7498 tail = list;
7499 list = Qnil;
7500 for (; CONSP (tail); tail = XCDR (tail))
7501 {
7502 elt = XCAR (tail);
7503 if (CONSP (XCDR (XCDR (elt))))
7504 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7505 list);
7506 }
7507
7508 return list;
7509 }
7510
7511
7512 Lisp_Object
7513 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7514 Lisp_Object start, end, coding_system, dst_object;
7515 int encodep, norecord;
7516 {
7517 struct coding_system coding;
7518 EMACS_INT from, from_byte, to, to_byte;
7519 Lisp_Object src_object;
7520
7521 CHECK_NUMBER_COERCE_MARKER (start);
7522 CHECK_NUMBER_COERCE_MARKER (end);
7523 if (NILP (coding_system))
7524 coding_system = Qno_conversion;
7525 else
7526 CHECK_CODING_SYSTEM (coding_system);
7527 src_object = Fcurrent_buffer ();
7528 if (NILP (dst_object))
7529 dst_object = src_object;
7530 else if (! EQ (dst_object, Qt))
7531 CHECK_BUFFER (dst_object);
7532
7533 validate_region (&start, &end);
7534 from = XFASTINT (start);
7535 from_byte = CHAR_TO_BYTE (from);
7536 to = XFASTINT (end);
7537 to_byte = CHAR_TO_BYTE (to);
7538
7539 setup_coding_system (coding_system, &coding);
7540 coding.mode |= CODING_MODE_LAST_BLOCK;
7541
7542 if (encodep)
7543 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7544 dst_object);
7545 else
7546 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7547 dst_object);
7548 if (! norecord)
7549 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7550
7551 return (BUFFERP (dst_object)
7552 ? make_number (coding.produced_char)
7553 : coding.dst_object);
7554 }
7555
7556
7557 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7558 3, 4, "r\nzCoding system: ",
7559 doc: /* Decode the current region from the specified coding system.
7560 When called from a program, takes four arguments:
7561 START, END, CODING-SYSTEM, and DESTINATION.
7562 START and END are buffer positions.
7563
7564 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7565 If nil, the region between START and END is replace by the decoded text.
7566 If buffer, the decoded text is inserted in the buffer.
7567 If t, the decoded text is returned.
7568
7569 This function sets `last-coding-system-used' to the precise coding system
7570 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7571 not fully specified.)
7572 It returns the length of the decoded text. */)
7573 (start, end, coding_system, destination)
7574 Lisp_Object start, end, coding_system, destination;
7575 {
7576 return code_convert_region (start, end, coding_system, destination, 0, 0);
7577 }
7578
7579 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7580 3, 4, "r\nzCoding system: ",
7581 doc: /* Encode the current region by specified coding system.
7582 When called from a program, takes three arguments:
7583 START, END, and CODING-SYSTEM. START and END are buffer positions.
7584
7585 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7586 If nil, the region between START and END is replace by the encoded text.
7587 If buffer, the encoded text is inserted in the buffer.
7588 If t, the encoded text is returned.
7589
7590 This function sets `last-coding-system-used' to the precise coding system
7591 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7592 not fully specified.)
7593 It returns the length of the encoded text. */)
7594 (start, end, coding_system, destination)
7595 Lisp_Object start, end, coding_system, destination;
7596 {
7597 return code_convert_region (start, end, coding_system, destination, 1, 0);
7598 }
7599
7600 Lisp_Object
7601 code_convert_string (string, coding_system, dst_object,
7602 encodep, nocopy, norecord)
7603 Lisp_Object string, coding_system, dst_object;
7604 int encodep, nocopy, norecord;
7605 {
7606 struct coding_system coding;
7607 EMACS_INT chars, bytes;
7608
7609 CHECK_STRING (string);
7610 if (NILP (coding_system))
7611 {
7612 if (! norecord)
7613 Vlast_coding_system_used = Qno_conversion;
7614 if (NILP (dst_object))
7615 return (nocopy ? Fcopy_sequence (string) : string);
7616 }
7617
7618 if (NILP (coding_system))
7619 coding_system = Qno_conversion;
7620 else
7621 CHECK_CODING_SYSTEM (coding_system);
7622 if (NILP (dst_object))
7623 dst_object = Qt;
7624 else if (! EQ (dst_object, Qt))
7625 CHECK_BUFFER (dst_object);
7626
7627 setup_coding_system (coding_system, &coding);
7628 coding.mode |= CODING_MODE_LAST_BLOCK;
7629 chars = SCHARS (string);
7630 bytes = SBYTES (string);
7631 if (encodep)
7632 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7633 else
7634 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7635 if (! norecord)
7636 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7637
7638 return (BUFFERP (dst_object)
7639 ? make_number (coding.produced_char)
7640 : coding.dst_object);
7641 }
7642
7643
7644 /* Encode or decode STRING according to CODING_SYSTEM.
7645 Do not set Vlast_coding_system_used.
7646
7647 This function is called only from macros DECODE_FILE and
7648 ENCODE_FILE, thus we ignore character composition. */
7649
7650 Lisp_Object
7651 code_convert_string_norecord (string, coding_system, encodep)
7652 Lisp_Object string, coding_system;
7653 int encodep;
7654 {
7655 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7656 }
7657
7658
7659 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7660 2, 4, 0,
7661 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7662
7663 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7664 if the decoding operation is trivial.
7665
7666 Optional fourth arg BUFFER non-nil meant that the decoded text is
7667 inserted in BUFFER instead of returned as a string. In this case,
7668 the return value is BUFFER.
7669
7670 This function sets `last-coding-system-used' to the precise coding system
7671 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7672 not fully specified. */)
7673 (string, coding_system, nocopy, buffer)
7674 Lisp_Object string, coding_system, nocopy, buffer;
7675 {
7676 return code_convert_string (string, coding_system, buffer,
7677 0, ! NILP (nocopy), 0);
7678 }
7679
7680 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7681 2, 4, 0,
7682 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7683
7684 Optional third arg NOCOPY non-nil means it is OK to return STRING
7685 itself if the encoding operation is trivial.
7686
7687 Optional fourth arg BUFFER non-nil meant that the encoded text is
7688 inserted in BUFFER instead of returned as a string. In this case,
7689 the return value is BUFFER.
7690
7691 This function sets `last-coding-system-used' to the precise coding system
7692 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7693 not fully specified.) */)
7694 (string, coding_system, nocopy, buffer)
7695 Lisp_Object string, coding_system, nocopy, buffer;
7696 {
7697 return code_convert_string (string, coding_system, buffer,
7698 1, ! NILP (nocopy), 1);
7699 }
7700
7701 \f
7702 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7703 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7704 Return the corresponding character. */)
7705 (code)
7706 Lisp_Object code;
7707 {
7708 Lisp_Object spec, attrs, val;
7709 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7710 int c;
7711
7712 CHECK_NATNUM (code);
7713 c = XFASTINT (code);
7714 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7715 attrs = AREF (spec, 0);
7716
7717 if (ASCII_BYTE_P (c)
7718 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7719 return code;
7720
7721 val = CODING_ATTR_CHARSET_LIST (attrs);
7722 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7723 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7724 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7725
7726 if (c <= 0x7F)
7727 charset = charset_roman;
7728 else if (c >= 0xA0 && c < 0xDF)
7729 {
7730 charset = charset_kana;
7731 c -= 0x80;
7732 }
7733 else
7734 {
7735 int s1 = c >> 8, s2 = c & 0xFF;
7736
7737 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7738 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7739 error ("Invalid code: %d", code);
7740 SJIS_TO_JIS (c);
7741 charset = charset_kanji;
7742 }
7743 c = DECODE_CHAR (charset, c);
7744 if (c < 0)
7745 error ("Invalid code: %d", code);
7746 return make_number (c);
7747 }
7748
7749
7750 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7751 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7752 Return the corresponding code in SJIS. */)
7753 (ch)
7754 Lisp_Object ch;
7755 {
7756 Lisp_Object spec, attrs, charset_list;
7757 int c;
7758 struct charset *charset;
7759 unsigned code;
7760
7761 CHECK_CHARACTER (ch);
7762 c = XFASTINT (ch);
7763 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7764 attrs = AREF (spec, 0);
7765
7766 if (ASCII_CHAR_P (c)
7767 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7768 return ch;
7769
7770 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7771 charset = char_charset (c, charset_list, &code);
7772 if (code == CHARSET_INVALID_CODE (charset))
7773 error ("Can't encode by shift_jis encoding: %d", c);
7774 JIS_TO_SJIS (code);
7775
7776 return make_number (code);
7777 }
7778
7779 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7780 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7781 Return the corresponding character. */)
7782 (code)
7783 Lisp_Object code;
7784 {
7785 Lisp_Object spec, attrs, val;
7786 struct charset *charset_roman, *charset_big5, *charset;
7787 int c;
7788
7789 CHECK_NATNUM (code);
7790 c = XFASTINT (code);
7791 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7792 attrs = AREF (spec, 0);
7793
7794 if (ASCII_BYTE_P (c)
7795 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7796 return code;
7797
7798 val = CODING_ATTR_CHARSET_LIST (attrs);
7799 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7800 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7801
7802 if (c <= 0x7F)
7803 charset = charset_roman;
7804 else
7805 {
7806 int b1 = c >> 8, b2 = c & 0x7F;
7807 if (b1 < 0xA1 || b1 > 0xFE
7808 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7809 error ("Invalid code: %d", code);
7810 charset = charset_big5;
7811 }
7812 c = DECODE_CHAR (charset, (unsigned )c);
7813 if (c < 0)
7814 error ("Invalid code: %d", code);
7815 return make_number (c);
7816 }
7817
7818 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7819 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7820 Return the corresponding character code in Big5. */)
7821 (ch)
7822 Lisp_Object ch;
7823 {
7824 Lisp_Object spec, attrs, charset_list;
7825 struct charset *charset;
7826 int c;
7827 unsigned code;
7828
7829 CHECK_CHARACTER (ch);
7830 c = XFASTINT (ch);
7831 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7832 attrs = AREF (spec, 0);
7833 if (ASCII_CHAR_P (c)
7834 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7835 return ch;
7836
7837 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7838 charset = char_charset (c, charset_list, &code);
7839 if (code == CHARSET_INVALID_CODE (charset))
7840 error ("Can't encode by Big5 encoding: %d", c);
7841
7842 return make_number (code);
7843 }
7844
7845 \f
7846 DEFUN ("set-terminal-coding-system-internal",
7847 Fset_terminal_coding_system_internal,
7848 Sset_terminal_coding_system_internal, 1, 1, 0,
7849 doc: /* Internal use only. */)
7850 (coding_system)
7851 Lisp_Object coding_system;
7852 {
7853 CHECK_SYMBOL (coding_system);
7854 setup_coding_system (Fcheck_coding_system (coding_system),
7855 &terminal_coding);
7856
7857 /* We had better not send unsafe characters to terminal. */
7858 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7859 /* Characer composition should be disabled. */
7860 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7861 terminal_coding.src_multibyte = 1;
7862 terminal_coding.dst_multibyte = 0;
7863 return Qnil;
7864 }
7865
7866 DEFUN ("set-safe-terminal-coding-system-internal",
7867 Fset_safe_terminal_coding_system_internal,
7868 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7869 doc: /* Internal use only. */)
7870 (coding_system)
7871 Lisp_Object coding_system;
7872 {
7873 CHECK_SYMBOL (coding_system);
7874 setup_coding_system (Fcheck_coding_system (coding_system),
7875 &safe_terminal_coding);
7876 /* Characer composition should be disabled. */
7877 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7878 safe_terminal_coding.src_multibyte = 1;
7879 safe_terminal_coding.dst_multibyte = 0;
7880 return Qnil;
7881 }
7882
7883 DEFUN ("terminal-coding-system",
7884 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7885 doc: /* Return coding system specified for terminal output. */)
7886 ()
7887 {
7888 return CODING_ID_NAME (terminal_coding.id);
7889 }
7890
7891 DEFUN ("set-keyboard-coding-system-internal",
7892 Fset_keyboard_coding_system_internal,
7893 Sset_keyboard_coding_system_internal, 1, 1, 0,
7894 doc: /* Internal use only. */)
7895 (coding_system)
7896 Lisp_Object coding_system;
7897 {
7898 CHECK_SYMBOL (coding_system);
7899 setup_coding_system (Fcheck_coding_system (coding_system),
7900 &keyboard_coding);
7901 /* Characer composition should be disabled. */
7902 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7903 return Qnil;
7904 }
7905
7906 DEFUN ("keyboard-coding-system",
7907 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7908 doc: /* Return coding system specified for decoding keyboard input. */)
7909 ()
7910 {
7911 return CODING_ID_NAME (keyboard_coding.id);
7912 }
7913
7914 \f
7915 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7916 Sfind_operation_coding_system, 1, MANY, 0,
7917 doc: /* Choose a coding system for an operation based on the target name.
7918 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7919 DECODING-SYSTEM is the coding system to use for decoding
7920 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7921 for encoding (in case OPERATION does encoding).
7922
7923 The first argument OPERATION specifies an I/O primitive:
7924 For file I/O, `insert-file-contents' or `write-region'.
7925 For process I/O, `call-process', `call-process-region', or `start-process'.
7926 For network I/O, `open-network-stream'.
7927
7928 The remaining arguments should be the same arguments that were passed
7929 to the primitive. Depending on which primitive, one of those arguments
7930 is selected as the TARGET. For example, if OPERATION does file I/O,
7931 whichever argument specifies the file name is TARGET.
7932
7933 TARGET has a meaning which depends on OPERATION:
7934 For file I/O, TARGET is a file name.
7935 For process I/O, TARGET is a process name.
7936 For network I/O, TARGET is a service name or a port number
7937
7938 This function looks up what specified for TARGET in,
7939 `file-coding-system-alist', `process-coding-system-alist',
7940 or `network-coding-system-alist' depending on OPERATION.
7941 They may specify a coding system, a cons of coding systems,
7942 or a function symbol to call.
7943 In the last case, we call the function with one argument,
7944 which is a list of all the arguments given to this function.
7945
7946 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7947 (nargs, args)
7948 int nargs;
7949 Lisp_Object *args;
7950 {
7951 Lisp_Object operation, target_idx, target, val;
7952 register Lisp_Object chain;
7953
7954 if (nargs < 2)
7955 error ("Too few arguments");
7956 operation = args[0];
7957 if (!SYMBOLP (operation)
7958 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7959 error ("Invalid first arguement");
7960 if (nargs < 1 + XINT (target_idx))
7961 error ("Too few arguments for operation: %s",
7962 SDATA (SYMBOL_NAME (operation)));
7963 target = args[XINT (target_idx) + 1];
7964 if (!(STRINGP (target)
7965 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7966 error ("Invalid %dth argument", XINT (target_idx) + 1);
7967
7968 chain = ((EQ (operation, Qinsert_file_contents)
7969 || EQ (operation, Qwrite_region))
7970 ? Vfile_coding_system_alist
7971 : (EQ (operation, Qopen_network_stream)
7972 ? Vnetwork_coding_system_alist
7973 : Vprocess_coding_system_alist));
7974 if (NILP (chain))
7975 return Qnil;
7976
7977 for (; CONSP (chain); chain = XCDR (chain))
7978 {
7979 Lisp_Object elt;
7980
7981 elt = XCAR (chain);
7982 if (CONSP (elt)
7983 && ((STRINGP (target)
7984 && STRINGP (XCAR (elt))
7985 && fast_string_match (XCAR (elt), target) >= 0)
7986 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7987 {
7988 val = XCDR (elt);
7989 /* Here, if VAL is both a valid coding system and a valid
7990 function symbol, we return VAL as a coding system. */
7991 if (CONSP (val))
7992 return val;
7993 if (! SYMBOLP (val))
7994 return Qnil;
7995 if (! NILP (Fcoding_system_p (val)))
7996 return Fcons (val, val);
7997 if (! NILP (Ffboundp (val)))
7998 {
7999 val = call1 (val, Flist (nargs, args));
8000 if (CONSP (val))
8001 return val;
8002 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8003 return Fcons (val, val);
8004 }
8005 return Qnil;
8006 }
8007 }
8008 return Qnil;
8009 }
8010
8011 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8012 Sset_coding_system_priority, 0, MANY, 0,
8013 doc: /* Assign higher priority to the coding systems given as arguments.
8014 If multiple coding systems belongs to the same category,
8015 all but the first one are ignored.
8016
8017 usage: (set-coding-system-priority ...) */)
8018 (nargs, args)
8019 int nargs;
8020 Lisp_Object *args;
8021 {
8022 int i, j;
8023 int changed[coding_category_max];
8024 enum coding_category priorities[coding_category_max];
8025
8026 bzero (changed, sizeof changed);
8027
8028 for (i = j = 0; i < nargs; i++)
8029 {
8030 enum coding_category category;
8031 Lisp_Object spec, attrs;
8032
8033 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8034 attrs = AREF (spec, 0);
8035 category = XINT (CODING_ATTR_CATEGORY (attrs));
8036 if (changed[category])
8037 /* Ignore this coding system because a coding system of the
8038 same category already had a higher priority. */
8039 continue;
8040 changed[category] = 1;
8041 priorities[j++] = category;
8042 if (coding_categories[category].id >= 0
8043 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8044 setup_coding_system (args[i], &coding_categories[category]);
8045 Fset (AREF (Vcoding_category_table, category), args[i]);
8046 }
8047
8048 /* Now we have decided top J priorities. Reflect the order of the
8049 original priorities to the remaining priorities. */
8050
8051 for (i = j, j = 0; i < coding_category_max; i++, j++)
8052 {
8053 while (j < coding_category_max
8054 && changed[coding_priorities[j]])
8055 j++;
8056 if (j == coding_category_max)
8057 abort ();
8058 priorities[i] = coding_priorities[j];
8059 }
8060
8061 bcopy (priorities, coding_priorities, sizeof priorities);
8062
8063 /* Update `coding-category-list'. */
8064 Vcoding_category_list = Qnil;
8065 for (i = coding_category_max - 1; i >= 0; i--)
8066 Vcoding_category_list
8067 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8068 Vcoding_category_list);
8069
8070 return Qnil;
8071 }
8072
8073 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8074 Scoding_system_priority_list, 0, 1, 0,
8075 doc: /* Return a list of coding systems ordered by their priorities.
8076 HIGHESTP non-nil means just return the highest priority one. */)
8077 (highestp)
8078 Lisp_Object highestp;
8079 {
8080 int i;
8081 Lisp_Object val;
8082
8083 for (i = 0, val = Qnil; i < coding_category_max; i++)
8084 {
8085 enum coding_category category = coding_priorities[i];
8086 int id = coding_categories[category].id;
8087 Lisp_Object attrs;
8088
8089 if (id < 0)
8090 continue;
8091 attrs = CODING_ID_ATTRS (id);
8092 if (! NILP (highestp))
8093 return CODING_ATTR_BASE_NAME (attrs);
8094 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8095 }
8096 return Fnreverse (val);
8097 }
8098
8099 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8100
8101 static Lisp_Object
8102 make_subsidiaries (base)
8103 Lisp_Object base;
8104 {
8105 Lisp_Object subsidiaries;
8106 int base_name_len = SBYTES (SYMBOL_NAME (base));
8107 char *buf = (char *) alloca (base_name_len + 6);
8108 int i;
8109
8110 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8111 subsidiaries = Fmake_vector (make_number (3), Qnil);
8112 for (i = 0; i < 3; i++)
8113 {
8114 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8115 ASET (subsidiaries, i, intern (buf));
8116 }
8117 return subsidiaries;
8118 }
8119
8120
8121 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8122 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8123 doc: /* For internal use only.
8124 usage: (define-coding-system-internal ...) */)
8125 (nargs, args)
8126 int nargs;
8127 Lisp_Object *args;
8128 {
8129 Lisp_Object name;
8130 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8131 Lisp_Object attrs; /* Vector of attributes. */
8132 Lisp_Object eol_type;
8133 Lisp_Object aliases;
8134 Lisp_Object coding_type, charset_list, safe_charsets;
8135 enum coding_category category;
8136 Lisp_Object tail, val;
8137 int max_charset_id = 0;
8138 int i;
8139
8140 if (nargs < coding_arg_max)
8141 goto short_args;
8142
8143 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8144
8145 name = args[coding_arg_name];
8146 CHECK_SYMBOL (name);
8147 CODING_ATTR_BASE_NAME (attrs) = name;
8148
8149 val = args[coding_arg_mnemonic];
8150 if (! STRINGP (val))
8151 CHECK_CHARACTER (val);
8152 CODING_ATTR_MNEMONIC (attrs) = val;
8153
8154 coding_type = args[coding_arg_coding_type];
8155 CHECK_SYMBOL (coding_type);
8156 CODING_ATTR_TYPE (attrs) = coding_type;
8157
8158 charset_list = args[coding_arg_charset_list];
8159 if (SYMBOLP (charset_list))
8160 {
8161 if (EQ (charset_list, Qiso_2022))
8162 {
8163 if (! EQ (coding_type, Qiso_2022))
8164 error ("Invalid charset-list");
8165 charset_list = Viso_2022_charset_list;
8166 }
8167 else if (EQ (charset_list, Qemacs_mule))
8168 {
8169 if (! EQ (coding_type, Qemacs_mule))
8170 error ("Invalid charset-list");
8171 charset_list = Vemacs_mule_charset_list;
8172 }
8173 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8174 if (max_charset_id < XFASTINT (XCAR (tail)))
8175 max_charset_id = XFASTINT (XCAR (tail));
8176 }
8177 else
8178 {
8179 charset_list = Fcopy_sequence (charset_list);
8180 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8181 {
8182 struct charset *charset;
8183
8184 val = Fcar (tail);
8185 CHECK_CHARSET_GET_CHARSET (val, charset);
8186 if (EQ (coding_type, Qiso_2022)
8187 ? CHARSET_ISO_FINAL (charset) < 0
8188 : EQ (coding_type, Qemacs_mule)
8189 ? CHARSET_EMACS_MULE_ID (charset) < 0
8190 : 0)
8191 error ("Can't handle charset `%s'",
8192 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8193
8194 XSETCAR (tail, make_number (charset->id));
8195 if (max_charset_id < charset->id)
8196 max_charset_id = charset->id;
8197 }
8198 }
8199 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8200
8201 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8202 make_number (255));
8203 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8204 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8205 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8206
8207 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8208
8209 val = args[coding_arg_decode_translation_table];
8210 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8211 CHECK_SYMBOL (val);
8212 CODING_ATTR_DECODE_TBL (attrs) = val;
8213
8214 val = args[coding_arg_encode_translation_table];
8215 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8216 CHECK_SYMBOL (val);
8217 CODING_ATTR_ENCODE_TBL (attrs) = val;
8218
8219 val = args[coding_arg_post_read_conversion];
8220 CHECK_SYMBOL (val);
8221 CODING_ATTR_POST_READ (attrs) = val;
8222
8223 val = args[coding_arg_pre_write_conversion];
8224 CHECK_SYMBOL (val);
8225 CODING_ATTR_PRE_WRITE (attrs) = val;
8226
8227 val = args[coding_arg_default_char];
8228 if (NILP (val))
8229 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8230 else
8231 {
8232 CHECK_CHARACTER (val);
8233 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8234 }
8235
8236 val = args[coding_arg_for_unibyte];
8237 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8238
8239 val = args[coding_arg_plist];
8240 CHECK_LIST (val);
8241 CODING_ATTR_PLIST (attrs) = val;
8242
8243 if (EQ (coding_type, Qcharset))
8244 {
8245 /* Generate a lisp vector of 256 elements. Each element is nil,
8246 integer, or a list of charset IDs.
8247
8248 If Nth element is nil, the byte code N is invalid in this
8249 coding system.
8250
8251 If Nth element is a number NUM, N is the first byte of a
8252 charset whose ID is NUM.
8253
8254 If Nth element is a list of charset IDs, N is the first byte
8255 of one of them. The list is sorted by dimensions of the
8256 charsets. A charset of smaller dimension comes firtst. */
8257 val = Fmake_vector (make_number (256), Qnil);
8258
8259 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8260 {
8261 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8262 int dim = CHARSET_DIMENSION (charset);
8263 int idx = (dim - 1) * 4;
8264
8265 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8266 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8267
8268 for (i = charset->code_space[idx];
8269 i <= charset->code_space[idx + 1]; i++)
8270 {
8271 Lisp_Object tmp, tmp2;
8272 int dim2;
8273
8274 tmp = AREF (val, i);
8275 if (NILP (tmp))
8276 tmp = XCAR (tail);
8277 else if (NUMBERP (tmp))
8278 {
8279 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8280 if (dim < dim2)
8281 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8282 else
8283 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8284 }
8285 else
8286 {
8287 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8288 {
8289 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8290 if (dim < dim2)
8291 break;
8292 }
8293 if (NILP (tmp2))
8294 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8295 else
8296 {
8297 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8298 XSETCAR (tmp2, XCAR (tail));
8299 }
8300 }
8301 ASET (val, i, tmp);
8302 }
8303 }
8304 ASET (attrs, coding_attr_charset_valids, val);
8305 category = coding_category_charset;
8306 }
8307 else if (EQ (coding_type, Qccl))
8308 {
8309 Lisp_Object valids;
8310
8311 if (nargs < coding_arg_ccl_max)
8312 goto short_args;
8313
8314 val = args[coding_arg_ccl_decoder];
8315 CHECK_CCL_PROGRAM (val);
8316 if (VECTORP (val))
8317 val = Fcopy_sequence (val);
8318 ASET (attrs, coding_attr_ccl_decoder, val);
8319
8320 val = args[coding_arg_ccl_encoder];
8321 CHECK_CCL_PROGRAM (val);
8322 if (VECTORP (val))
8323 val = Fcopy_sequence (val);
8324 ASET (attrs, coding_attr_ccl_encoder, val);
8325
8326 val = args[coding_arg_ccl_valids];
8327 valids = Fmake_string (make_number (256), make_number (0));
8328 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8329 {
8330 int from, to;
8331
8332 val = Fcar (tail);
8333 if (INTEGERP (val))
8334 {
8335 from = to = XINT (val);
8336 if (from < 0 || from > 255)
8337 args_out_of_range_3 (val, make_number (0), make_number (255));
8338 }
8339 else
8340 {
8341 CHECK_CONS (val);
8342 CHECK_NATNUM_CAR (val);
8343 CHECK_NATNUM_CDR (val);
8344 from = XINT (XCAR (val));
8345 if (from > 255)
8346 args_out_of_range_3 (XCAR (val),
8347 make_number (0), make_number (255));
8348 to = XINT (XCDR (val));
8349 if (to < from || to > 255)
8350 args_out_of_range_3 (XCDR (val),
8351 XCAR (val), make_number (255));
8352 }
8353 for (i = from; i <= to; i++)
8354 SSET (valids, i, 1);
8355 }
8356 ASET (attrs, coding_attr_ccl_valids, valids);
8357
8358 category = coding_category_ccl;
8359 }
8360 else if (EQ (coding_type, Qutf_16))
8361 {
8362 Lisp_Object bom, endian;
8363
8364 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8365
8366 if (nargs < coding_arg_utf16_max)
8367 goto short_args;
8368
8369 bom = args[coding_arg_utf16_bom];
8370 if (! NILP (bom) && ! EQ (bom, Qt))
8371 {
8372 CHECK_CONS (bom);
8373 val = XCAR (bom);
8374 CHECK_CODING_SYSTEM (val);
8375 val = XCDR (bom);
8376 CHECK_CODING_SYSTEM (val);
8377 }
8378 ASET (attrs, coding_attr_utf_16_bom, bom);
8379
8380 endian = args[coding_arg_utf16_endian];
8381 CHECK_SYMBOL (endian);
8382 if (NILP (endian))
8383 endian = Qbig;
8384 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8385 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8386 ASET (attrs, coding_attr_utf_16_endian, endian);
8387
8388 category = (CONSP (bom)
8389 ? coding_category_utf_16_auto
8390 : NILP (bom)
8391 ? (EQ (endian, Qbig)
8392 ? coding_category_utf_16_be_nosig
8393 : coding_category_utf_16_le_nosig)
8394 : (EQ (endian, Qbig)
8395 ? coding_category_utf_16_be
8396 : coding_category_utf_16_le));
8397 }
8398 else if (EQ (coding_type, Qiso_2022))
8399 {
8400 Lisp_Object initial, reg_usage, request, flags;
8401 int i;
8402
8403 if (nargs < coding_arg_iso2022_max)
8404 goto short_args;
8405
8406 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8407 CHECK_VECTOR (initial);
8408 for (i = 0; i < 4; i++)
8409 {
8410 val = Faref (initial, make_number (i));
8411 if (! NILP (val))
8412 {
8413 struct charset *charset;
8414
8415 CHECK_CHARSET_GET_CHARSET (val, charset);
8416 ASET (initial, i, make_number (CHARSET_ID (charset)));
8417 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8418 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8419 }
8420 else
8421 ASET (initial, i, make_number (-1));
8422 }
8423
8424 reg_usage = args[coding_arg_iso2022_reg_usage];
8425 CHECK_CONS (reg_usage);
8426 CHECK_NUMBER_CAR (reg_usage);
8427 CHECK_NUMBER_CDR (reg_usage);
8428
8429 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8430 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8431 {
8432 int id;
8433 Lisp_Object tmp;
8434
8435 val = Fcar (tail);
8436 CHECK_CONS (val);
8437 tmp = XCAR (val);
8438 CHECK_CHARSET_GET_ID (tmp, id);
8439 CHECK_NATNUM_CDR (val);
8440 if (XINT (XCDR (val)) >= 4)
8441 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8442 XSETCAR (val, make_number (id));
8443 }
8444
8445 flags = args[coding_arg_iso2022_flags];
8446 CHECK_NATNUM (flags);
8447 i = XINT (flags);
8448 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8449 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8450
8451 ASET (attrs, coding_attr_iso_initial, initial);
8452 ASET (attrs, coding_attr_iso_usage, reg_usage);
8453 ASET (attrs, coding_attr_iso_request, request);
8454 ASET (attrs, coding_attr_iso_flags, flags);
8455 setup_iso_safe_charsets (attrs);
8456
8457 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8458 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8459 | CODING_ISO_FLAG_SINGLE_SHIFT))
8460 ? coding_category_iso_7_else
8461 : EQ (args[coding_arg_charset_list], Qiso_2022)
8462 ? coding_category_iso_7
8463 : coding_category_iso_7_tight);
8464 else
8465 {
8466 int id = XINT (AREF (initial, 1));
8467
8468 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8469 || EQ (args[coding_arg_charset_list], Qiso_2022)
8470 || id < 0)
8471 ? coding_category_iso_8_else
8472 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8473 ? coding_category_iso_8_1
8474 : coding_category_iso_8_2);
8475 }
8476 if (category != coding_category_iso_8_1
8477 && category != coding_category_iso_8_2)
8478 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8479 }
8480 else if (EQ (coding_type, Qemacs_mule))
8481 {
8482 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8483 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8484 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8485 category = coding_category_emacs_mule;
8486 }
8487 else if (EQ (coding_type, Qshift_jis))
8488 {
8489
8490 struct charset *charset;
8491
8492 if (XINT (Flength (charset_list)) != 3
8493 && XINT (Flength (charset_list)) != 4)
8494 error ("There should be three or four charsets");
8495
8496 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8497 if (CHARSET_DIMENSION (charset) != 1)
8498 error ("Dimension of charset %s is not one",
8499 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8500 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8501 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8502
8503 charset_list = XCDR (charset_list);
8504 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8505 if (CHARSET_DIMENSION (charset) != 1)
8506 error ("Dimension of charset %s is not one",
8507 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8508
8509 charset_list = XCDR (charset_list);
8510 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8511 if (CHARSET_DIMENSION (charset) != 2)
8512 error ("Dimension of charset %s is not two",
8513 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8514
8515 charset_list = XCDR (charset_list);
8516 if (! NILP (charset_list))
8517 {
8518 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8519 if (CHARSET_DIMENSION (charset) != 2)
8520 error ("Dimension of charset %s is not two",
8521 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8522 }
8523
8524 category = coding_category_sjis;
8525 Vsjis_coding_system = name;
8526 }
8527 else if (EQ (coding_type, Qbig5))
8528 {
8529 struct charset *charset;
8530
8531 if (XINT (Flength (charset_list)) != 2)
8532 error ("There should be just two charsets");
8533
8534 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8535 if (CHARSET_DIMENSION (charset) != 1)
8536 error ("Dimension of charset %s is not one",
8537 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8538 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8539 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8540
8541 charset_list = XCDR (charset_list);
8542 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8543 if (CHARSET_DIMENSION (charset) != 2)
8544 error ("Dimension of charset %s is not two",
8545 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8546
8547 category = coding_category_big5;
8548 Vbig5_coding_system = name;
8549 }
8550 else if (EQ (coding_type, Qraw_text))
8551 {
8552 category = coding_category_raw_text;
8553 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8554 }
8555 else if (EQ (coding_type, Qutf_8))
8556 {
8557 category = coding_category_utf_8;
8558 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8559 }
8560 else if (EQ (coding_type, Qundecided))
8561 category = coding_category_undecided;
8562 else
8563 error ("Invalid coding system type: %s",
8564 SDATA (SYMBOL_NAME (coding_type)));
8565
8566 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8567 CODING_ATTR_PLIST (attrs)
8568 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8569 CODING_ATTR_PLIST (attrs)));
8570
8571 eol_type = args[coding_arg_eol_type];
8572 if (! NILP (eol_type)
8573 && ! EQ (eol_type, Qunix)
8574 && ! EQ (eol_type, Qdos)
8575 && ! EQ (eol_type, Qmac))
8576 error ("Invalid eol-type");
8577
8578 aliases = Fcons (name, Qnil);
8579
8580 if (NILP (eol_type))
8581 {
8582 eol_type = make_subsidiaries (name);
8583 for (i = 0; i < 3; i++)
8584 {
8585 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8586
8587 this_name = AREF (eol_type, i);
8588 this_aliases = Fcons (this_name, Qnil);
8589 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8590 this_spec = Fmake_vector (make_number (3), attrs);
8591 ASET (this_spec, 1, this_aliases);
8592 ASET (this_spec, 2, this_eol_type);
8593 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8594 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8595 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8596 Vcoding_system_alist);
8597 }
8598 }
8599
8600 spec_vec = Fmake_vector (make_number (3), attrs);
8601 ASET (spec_vec, 1, aliases);
8602 ASET (spec_vec, 2, eol_type);
8603
8604 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8605 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8606 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8607 Vcoding_system_alist);
8608
8609 {
8610 int id = coding_categories[category].id;
8611
8612 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8613 setup_coding_system (name, &coding_categories[category]);
8614 }
8615
8616 return Qnil;
8617
8618 short_args:
8619 return Fsignal (Qwrong_number_of_arguments,
8620 Fcons (intern ("define-coding-system-internal"),
8621 make_number (nargs)));
8622 }
8623
8624
8625 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8626 3, 3, 0,
8627 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
8628 (coding_system, prop, val)
8629 Lisp_Object coding_system, prop, val;
8630 {
8631 Lisp_Object spec, attrs, plist;
8632
8633 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8634 attrs = AREF (spec, 0);
8635 if (EQ (prop, QCmnemonic))
8636 {
8637 if (! STRINGP (val))
8638 CHECK_CHARACTER (val);
8639 CODING_ATTR_MNEMONIC (attrs) = val;
8640 }
8641 else if (EQ (prop, QCdefalut_char))
8642 {
8643 if (NILP (val))
8644 val = make_number (' ');
8645 else
8646 CHECK_CHARACTER (val);
8647 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8648 }
8649 else if (EQ (prop, QCdecode_translation_table))
8650 {
8651 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8652 CHECK_SYMBOL (val);
8653 CODING_ATTR_DECODE_TBL (attrs) = val;
8654 }
8655 else if (EQ (prop, QCencode_translation_table))
8656 {
8657 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8658 CHECK_SYMBOL (val);
8659 CODING_ATTR_ENCODE_TBL (attrs) = val;
8660 }
8661 else if (EQ (prop, QCpost_read_conversion))
8662 {
8663 CHECK_SYMBOL (val);
8664 CODING_ATTR_POST_READ (attrs) = val;
8665 }
8666 else if (EQ (prop, QCpre_write_conversion))
8667 {
8668 CHECK_SYMBOL (val);
8669 CODING_ATTR_PRE_WRITE (attrs) = val;
8670 }
8671
8672 CODING_ATTR_PLIST (attrs)
8673 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8674 return val;
8675 }
8676
8677
8678 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8679 Sdefine_coding_system_alias, 2, 2, 0,
8680 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8681 (alias, coding_system)
8682 Lisp_Object alias, coding_system;
8683 {
8684 Lisp_Object spec, aliases, eol_type;
8685
8686 CHECK_SYMBOL (alias);
8687 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8688 aliases = AREF (spec, 1);
8689 /* ALISES should be a list of length more than zero, and the first
8690 element is a base coding system. Append ALIAS at the tail of the
8691 list. */
8692 while (!NILP (XCDR (aliases)))
8693 aliases = XCDR (aliases);
8694 XSETCDR (aliases, Fcons (alias, Qnil));
8695
8696 eol_type = AREF (spec, 2);
8697 if (VECTORP (eol_type))
8698 {
8699 Lisp_Object subsidiaries;
8700 int i;
8701
8702 subsidiaries = make_subsidiaries (alias);
8703 for (i = 0; i < 3; i++)
8704 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8705 AREF (eol_type, i));
8706 }
8707
8708 Fputhash (alias, spec, Vcoding_system_hash_table);
8709 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8710 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8711 Vcoding_system_alist);
8712
8713 return Qnil;
8714 }
8715
8716 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8717 1, 1, 0,
8718 doc: /* Return the base of CODING-SYSTEM.
8719 Any alias or subsidiary coding system is not a base coding system. */)
8720 (coding_system)
8721 Lisp_Object coding_system;
8722 {
8723 Lisp_Object spec, attrs;
8724
8725 if (NILP (coding_system))
8726 return (Qno_conversion);
8727 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8728 attrs = AREF (spec, 0);
8729 return CODING_ATTR_BASE_NAME (attrs);
8730 }
8731
8732 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8733 1, 1, 0,
8734 doc: "Return the property list of CODING-SYSTEM.")
8735 (coding_system)
8736 Lisp_Object coding_system;
8737 {
8738 Lisp_Object spec, attrs;
8739
8740 if (NILP (coding_system))
8741 coding_system = Qno_conversion;
8742 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8743 attrs = AREF (spec, 0);
8744 return CODING_ATTR_PLIST (attrs);
8745 }
8746
8747
8748 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8749 1, 1, 0,
8750 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8751 (coding_system)
8752 Lisp_Object coding_system;
8753 {
8754 Lisp_Object spec;
8755
8756 if (NILP (coding_system))
8757 coding_system = Qno_conversion;
8758 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8759 return AREF (spec, 1);
8760 }
8761
8762 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8763 Scoding_system_eol_type, 1, 1, 0,
8764 doc: /* Return eol-type of CODING-SYSTEM.
8765 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8766
8767 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8768 and CR respectively.
8769
8770 A vector value indicates that a format of end-of-line should be
8771 detected automatically. Nth element of the vector is the subsidiary
8772 coding system whose eol-type is N. */)
8773 (coding_system)
8774 Lisp_Object coding_system;
8775 {
8776 Lisp_Object spec, eol_type;
8777 int n;
8778
8779 if (NILP (coding_system))
8780 coding_system = Qno_conversion;
8781 if (! CODING_SYSTEM_P (coding_system))
8782 return Qnil;
8783 spec = CODING_SYSTEM_SPEC (coding_system);
8784 eol_type = AREF (spec, 2);
8785 if (VECTORP (eol_type))
8786 return Fcopy_sequence (eol_type);
8787 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8788 return make_number (n);
8789 }
8790
8791 #endif /* emacs */
8792
8793 \f
8794 /*** 9. Post-amble ***/
8795
8796 void
8797 init_coding_once ()
8798 {
8799 int i;
8800
8801 for (i = 0; i < coding_category_max; i++)
8802 {
8803 coding_categories[i].id = -1;
8804 coding_priorities[i] = i;
8805 }
8806
8807 /* ISO2022 specific initialize routine. */
8808 for (i = 0; i < 0x20; i++)
8809 iso_code_class[i] = ISO_control_0;
8810 for (i = 0x21; i < 0x7F; i++)
8811 iso_code_class[i] = ISO_graphic_plane_0;
8812 for (i = 0x80; i < 0xA0; i++)
8813 iso_code_class[i] = ISO_control_1;
8814 for (i = 0xA1; i < 0xFF; i++)
8815 iso_code_class[i] = ISO_graphic_plane_1;
8816 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8817 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8818 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8819 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8820 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8821 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8822 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8823 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8824 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8825
8826 for (i = 0; i < 256; i++)
8827 {
8828 emacs_mule_bytes[i] = 1;
8829 }
8830 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8831 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8832 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8833 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8834 }
8835
8836 #ifdef emacs
8837
8838 void
8839 syms_of_coding ()
8840 {
8841 staticpro (&Vcoding_system_hash_table);
8842 {
8843 Lisp_Object args[2];
8844 args[0] = QCtest;
8845 args[1] = Qeq;
8846 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8847 }
8848
8849 staticpro (&Vsjis_coding_system);
8850 Vsjis_coding_system = Qnil;
8851
8852 staticpro (&Vbig5_coding_system);
8853 Vbig5_coding_system = Qnil;
8854
8855 staticpro (&Vcode_conversion_reused_workbuf);
8856 Vcode_conversion_reused_workbuf = Qnil;
8857
8858 staticpro (&Vcode_conversion_workbuf_name);
8859 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
8860
8861 reused_workbuf_in_use = 0;
8862
8863 DEFSYM (Qcharset, "charset");
8864 DEFSYM (Qtarget_idx, "target-idx");
8865 DEFSYM (Qcoding_system_history, "coding-system-history");
8866 Fset (Qcoding_system_history, Qnil);
8867
8868 /* Target FILENAME is the first argument. */
8869 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8870 /* Target FILENAME is the third argument. */
8871 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8872
8873 DEFSYM (Qcall_process, "call-process");
8874 /* Target PROGRAM is the first argument. */
8875 Fput (Qcall_process, Qtarget_idx, make_number (0));
8876
8877 DEFSYM (Qcall_process_region, "call-process-region");
8878 /* Target PROGRAM is the third argument. */
8879 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8880
8881 DEFSYM (Qstart_process, "start-process");
8882 /* Target PROGRAM is the third argument. */
8883 Fput (Qstart_process, Qtarget_idx, make_number (2));
8884
8885 DEFSYM (Qopen_network_stream, "open-network-stream");
8886 /* Target SERVICE is the fourth argument. */
8887 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8888
8889 DEFSYM (Qcoding_system, "coding-system");
8890 DEFSYM (Qcoding_aliases, "coding-aliases");
8891
8892 DEFSYM (Qeol_type, "eol-type");
8893 DEFSYM (Qunix, "unix");
8894 DEFSYM (Qdos, "dos");
8895
8896 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8897 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8898 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8899 DEFSYM (Qdefault_char, "default-char");
8900 DEFSYM (Qundecided, "undecided");
8901 DEFSYM (Qno_conversion, "no-conversion");
8902 DEFSYM (Qraw_text, "raw-text");
8903
8904 DEFSYM (Qiso_2022, "iso-2022");
8905
8906 DEFSYM (Qutf_8, "utf-8");
8907 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8908
8909 DEFSYM (Qutf_16, "utf-16");
8910 DEFSYM (Qbig, "big");
8911 DEFSYM (Qlittle, "little");
8912
8913 DEFSYM (Qshift_jis, "shift-jis");
8914 DEFSYM (Qbig5, "big5");
8915
8916 DEFSYM (Qcoding_system_p, "coding-system-p");
8917
8918 DEFSYM (Qcoding_system_error, "coding-system-error");
8919 Fput (Qcoding_system_error, Qerror_conditions,
8920 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8921 Fput (Qcoding_system_error, Qerror_message,
8922 build_string ("Invalid coding system"));
8923
8924 /* Intern this now in case it isn't already done.
8925 Setting this variable twice is harmless.
8926 But don't staticpro it here--that is done in alloc.c. */
8927 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8928
8929 DEFSYM (Qtranslation_table, "translation-table");
8930 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8931 DEFSYM (Qtranslation_table_id, "translation-table-id");
8932 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8933 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8934
8935 DEFSYM (Qvalid_codes, "valid-codes");
8936
8937 DEFSYM (Qemacs_mule, "emacs-mule");
8938
8939 DEFSYM (QCcategory, ":category");
8940 DEFSYM (QCmnemonic, ":mnemonic");
8941 DEFSYM (QCdefalut_char, ":default-char");
8942 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
8943 DEFSYM (QCencode_translation_table, ":encode-translation-table");
8944 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
8945 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
8946
8947 Vcoding_category_table
8948 = Fmake_vector (make_number (coding_category_max), Qnil);
8949 staticpro (&Vcoding_category_table);
8950 /* Followings are target of code detection. */
8951 ASET (Vcoding_category_table, coding_category_iso_7,
8952 intern ("coding-category-iso-7"));
8953 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8954 intern ("coding-category-iso-7-tight"));
8955 ASET (Vcoding_category_table, coding_category_iso_8_1,
8956 intern ("coding-category-iso-8-1"));
8957 ASET (Vcoding_category_table, coding_category_iso_8_2,
8958 intern ("coding-category-iso-8-2"));
8959 ASET (Vcoding_category_table, coding_category_iso_7_else,
8960 intern ("coding-category-iso-7-else"));
8961 ASET (Vcoding_category_table, coding_category_iso_8_else,
8962 intern ("coding-category-iso-8-else"));
8963 ASET (Vcoding_category_table, coding_category_utf_8,
8964 intern ("coding-category-utf-8"));
8965 ASET (Vcoding_category_table, coding_category_utf_16_be,
8966 intern ("coding-category-utf-16-be"));
8967 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8968 intern ("coding-category-utf-16-auto"));
8969 ASET (Vcoding_category_table, coding_category_utf_16_le,
8970 intern ("coding-category-utf-16-le"));
8971 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8972 intern ("coding-category-utf-16-be-nosig"));
8973 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8974 intern ("coding-category-utf-16-le-nosig"));
8975 ASET (Vcoding_category_table, coding_category_charset,
8976 intern ("coding-category-charset"));
8977 ASET (Vcoding_category_table, coding_category_sjis,
8978 intern ("coding-category-sjis"));
8979 ASET (Vcoding_category_table, coding_category_big5,
8980 intern ("coding-category-big5"));
8981 ASET (Vcoding_category_table, coding_category_ccl,
8982 intern ("coding-category-ccl"));
8983 ASET (Vcoding_category_table, coding_category_emacs_mule,
8984 intern ("coding-category-emacs-mule"));
8985 /* Followings are NOT target of code detection. */
8986 ASET (Vcoding_category_table, coding_category_raw_text,
8987 intern ("coding-category-raw-text"));
8988 ASET (Vcoding_category_table, coding_category_undecided,
8989 intern ("coding-category-undecided"));
8990
8991 DEFSYM (Qinsufficient_source, "insufficient-source");
8992 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
8993 DEFSYM (Qinvalid_source, "invalid-source");
8994 DEFSYM (Qinterrupted, "interrupted");
8995 DEFSYM (Qinsufficient_memory, "insufficient-memory");
8996
8997 defsubr (&Scoding_system_p);
8998 defsubr (&Sread_coding_system);
8999 defsubr (&Sread_non_nil_coding_system);
9000 defsubr (&Scheck_coding_system);
9001 defsubr (&Sdetect_coding_region);
9002 defsubr (&Sdetect_coding_string);
9003 defsubr (&Sfind_coding_systems_region_internal);
9004 defsubr (&Sunencodable_char_position);
9005 defsubr (&Scheck_coding_systems_region);
9006 defsubr (&Sdecode_coding_region);
9007 defsubr (&Sencode_coding_region);
9008 defsubr (&Sdecode_coding_string);
9009 defsubr (&Sencode_coding_string);
9010 defsubr (&Sdecode_sjis_char);
9011 defsubr (&Sencode_sjis_char);
9012 defsubr (&Sdecode_big5_char);
9013 defsubr (&Sencode_big5_char);
9014 defsubr (&Sset_terminal_coding_system_internal);
9015 defsubr (&Sset_safe_terminal_coding_system_internal);
9016 defsubr (&Sterminal_coding_system);
9017 defsubr (&Sset_keyboard_coding_system_internal);
9018 defsubr (&Skeyboard_coding_system);
9019 defsubr (&Sfind_operation_coding_system);
9020 defsubr (&Sset_coding_system_priority);
9021 defsubr (&Sdefine_coding_system_internal);
9022 defsubr (&Sdefine_coding_system_alias);
9023 defsubr (&Scoding_system_put);
9024 defsubr (&Scoding_system_base);
9025 defsubr (&Scoding_system_plist);
9026 defsubr (&Scoding_system_aliases);
9027 defsubr (&Scoding_system_eol_type);
9028 defsubr (&Scoding_system_priority_list);
9029
9030 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9031 doc: /* List of coding systems.
9032
9033 Do not alter the value of this variable manually. This variable should be
9034 updated by the functions `define-coding-system' and
9035 `define-coding-system-alias'. */);
9036 Vcoding_system_list = Qnil;
9037
9038 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9039 doc: /* Alist of coding system names.
9040 Each element is one element list of coding system name.
9041 This variable is given to `completing-read' as TABLE argument.
9042
9043 Do not alter the value of this variable manually. This variable should be
9044 updated by the functions `make-coding-system' and
9045 `define-coding-system-alias'. */);
9046 Vcoding_system_alist = Qnil;
9047
9048 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9049 doc: /* List of coding-categories (symbols) ordered by priority.
9050
9051 On detecting a coding system, Emacs tries code detection algorithms
9052 associated with each coding-category one by one in this order. When
9053 one algorithm agrees with a byte sequence of source text, the coding
9054 system bound to the corresponding coding-category is selected. */);
9055 {
9056 int i;
9057
9058 Vcoding_category_list = Qnil;
9059 for (i = coding_category_max - 1; i >= 0; i--)
9060 Vcoding_category_list
9061 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9062 Vcoding_category_list);
9063 }
9064
9065 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9066 doc: /* Specify the coding system for read operations.
9067 It is useful to bind this variable with `let', but do not set it globally.
9068 If the value is a coding system, it is used for decoding on read operation.
9069 If not, an appropriate element is used from one of the coding system alists:
9070 There are three such tables, `file-coding-system-alist',
9071 `process-coding-system-alist', and `network-coding-system-alist'. */);
9072 Vcoding_system_for_read = Qnil;
9073
9074 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9075 doc: /* Specify the coding system for write operations.
9076 Programs bind this variable with `let', but you should not set it globally.
9077 If the value is a coding system, it is used for encoding of output,
9078 when writing it to a file and when sending it to a file or subprocess.
9079
9080 If this does not specify a coding system, an appropriate element
9081 is used from one of the coding system alists:
9082 There are three such tables, `file-coding-system-alist',
9083 `process-coding-system-alist', and `network-coding-system-alist'.
9084 For output to files, if the above procedure does not specify a coding system,
9085 the value of `buffer-file-coding-system' is used. */);
9086 Vcoding_system_for_write = Qnil;
9087
9088 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9089 doc: /*
9090 Coding system used in the latest file or process I/O. */);
9091 Vlast_coding_system_used = Qnil;
9092
9093 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9094 doc: /*
9095 Error status of the last code conversion.
9096
9097 When an error was detected in the last code conversion, this variable
9098 is set to one of the following symbols.
9099 `insufficient-source'
9100 `inconsistent-eol'
9101 `invalid-source'
9102 `interrupted'
9103 `insufficient-memory'
9104 When no error was detected, the value doesn't change. So, to check
9105 the error status of a code conversion by this variable, you must
9106 explicitly set this variable to nil before performing code
9107 conversion. */);
9108 Vlast_code_conversion_error = Qnil;
9109
9110 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9111 doc: /*
9112 *Non-nil means always inhibit code conversion of end-of-line format.
9113 See info node `Coding Systems' and info node `Text and Binary' concerning
9114 such conversion. */);
9115 inhibit_eol_conversion = 0;
9116
9117 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9118 doc: /*
9119 Non-nil means process buffer inherits coding system of process output.
9120 Bind it to t if the process output is to be treated as if it were a file
9121 read from some filesystem. */);
9122 inherit_process_coding_system = 0;
9123
9124 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9125 doc: /*
9126 Alist to decide a coding system to use for a file I/O operation.
9127 The format is ((PATTERN . VAL) ...),
9128 where PATTERN is a regular expression matching a file name,
9129 VAL is a coding system, a cons of coding systems, or a function symbol.
9130 If VAL is a coding system, it is used for both decoding and encoding
9131 the file contents.
9132 If VAL is a cons of coding systems, the car part is used for decoding,
9133 and the cdr part is used for encoding.
9134 If VAL is a function symbol, the function must return a coding system
9135 or a cons of coding systems which are used as above. The function gets
9136 the arguments with which `find-operation-coding-systems' was called.
9137
9138 See also the function `find-operation-coding-system'
9139 and the variable `auto-coding-alist'. */);
9140 Vfile_coding_system_alist = Qnil;
9141
9142 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9143 doc: /*
9144 Alist to decide a coding system to use for a process I/O operation.
9145 The format is ((PATTERN . VAL) ...),
9146 where PATTERN is a regular expression matching a program name,
9147 VAL is a coding system, a cons of coding systems, or a function symbol.
9148 If VAL is a coding system, it is used for both decoding what received
9149 from the program and encoding what sent to the program.
9150 If VAL is a cons of coding systems, the car part is used for decoding,
9151 and the cdr part is used for encoding.
9152 If VAL is a function symbol, the function must return a coding system
9153 or a cons of coding systems which are used as above.
9154
9155 See also the function `find-operation-coding-system'. */);
9156 Vprocess_coding_system_alist = Qnil;
9157
9158 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9159 doc: /*
9160 Alist to decide a coding system to use for a network I/O operation.
9161 The format is ((PATTERN . VAL) ...),
9162 where PATTERN is a regular expression matching a network service name
9163 or is a port number to connect to,
9164 VAL is a coding system, a cons of coding systems, or a function symbol.
9165 If VAL is a coding system, it is used for both decoding what received
9166 from the network stream and encoding what sent to the network stream.
9167 If VAL is a cons of coding systems, the car part is used for decoding,
9168 and the cdr part is used for encoding.
9169 If VAL is a function symbol, the function must return a coding system
9170 or a cons of coding systems which are used as above.
9171
9172 See also the function `find-operation-coding-system'. */);
9173 Vnetwork_coding_system_alist = Qnil;
9174
9175 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9176 doc: /* Coding system to use with system messages.
9177 Also used for decoding keyboard input on X Window system. */);
9178 Vlocale_coding_system = Qnil;
9179
9180 /* The eol mnemonics are reset in startup.el system-dependently. */
9181 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9182 doc: /*
9183 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9184 eol_mnemonic_unix = build_string (":");
9185
9186 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9187 doc: /*
9188 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9189 eol_mnemonic_dos = build_string ("\\");
9190
9191 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9192 doc: /*
9193 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9194 eol_mnemonic_mac = build_string ("/");
9195
9196 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9197 doc: /*
9198 *String displayed in mode line when end-of-line format is not yet determined. */);
9199 eol_mnemonic_undecided = build_string (":");
9200
9201 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9202 doc: /*
9203 *Non-nil enables character translation while encoding and decoding. */);
9204 Venable_character_translation = Qt;
9205
9206 DEFVAR_LISP ("standard-translation-table-for-decode",
9207 &Vstandard_translation_table_for_decode,
9208 doc: /* Table for translating characters while decoding. */);
9209 Vstandard_translation_table_for_decode = Qnil;
9210
9211 DEFVAR_LISP ("standard-translation-table-for-encode",
9212 &Vstandard_translation_table_for_encode,
9213 doc: /* Table for translating characters while encoding. */);
9214 Vstandard_translation_table_for_encode = Qnil;
9215
9216 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9217 doc: /* Alist of charsets vs revision numbers.
9218 While encoding, if a charset (car part of an element) is found,
9219 designate it with the escape sequence identifying revision (cdr part
9220 of the element). */);
9221 Vcharset_revision_table = Qnil;
9222
9223 DEFVAR_LISP ("default-process-coding-system",
9224 &Vdefault_process_coding_system,
9225 doc: /* Cons of coding systems used for process I/O by default.
9226 The car part is used for decoding a process output,
9227 the cdr part is used for encoding a text to be sent to a process. */);
9228 Vdefault_process_coding_system = Qnil;
9229
9230 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9231 doc: /*
9232 Table of extra Latin codes in the range 128..159 (inclusive).
9233 This is a vector of length 256.
9234 If Nth element is non-nil, the existence of code N in a file
9235 \(or output of subprocess) doesn't prevent it to be detected as
9236 a coding system of ISO 2022 variant which has a flag
9237 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9238 or reading output of a subprocess.
9239 Only 128th through 159th elements has a meaning. */);
9240 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9241
9242 DEFVAR_LISP ("select-safe-coding-system-function",
9243 &Vselect_safe_coding_system_function,
9244 doc: /*
9245 Function to call to select safe coding system for encoding a text.
9246
9247 If set, this function is called to force a user to select a proper
9248 coding system which can encode the text in the case that a default
9249 coding system used in each operation can't encode the text.
9250
9251 The default value is `select-safe-coding-system' (which see). */);
9252 Vselect_safe_coding_system_function = Qnil;
9253
9254 DEFVAR_BOOL ("coding-system-require-warning",
9255 &coding_system_require_warning,
9256 doc: /* Internal use only.
9257 If non-nil, on writing a file, `select-safe-coding-system-function' is
9258 called even if `coding-system-for-write' is non-nil. The command
9259 `universal-coding-system-argument' binds this variable to t temporarily. */);
9260 coding_system_require_warning = 0;
9261
9262
9263 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9264 &inhibit_iso_escape_detection,
9265 doc: /*
9266 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9267
9268 By default, on reading a file, Emacs tries to detect how the text is
9269 encoded. This code detection is sensitive to escape sequences. If
9270 the sequence is valid as ISO2022, the code is determined as one of
9271 the ISO2022 encodings, and the file is decoded by the corresponding
9272 coding system (e.g. `iso-2022-7bit').
9273
9274 However, there may be a case that you want to read escape sequences in
9275 a file as is. In such a case, you can set this variable to non-nil.
9276 Then, as the code detection ignores any escape sequences, no file is
9277 detected as encoded in some ISO2022 encoding. The result is that all
9278 escape sequences become visible in a buffer.
9279
9280 The default value is nil, and it is strongly recommended not to change
9281 it. That is because many Emacs Lisp source files that contain
9282 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9283 in Emacs's distribution, and they won't be decoded correctly on
9284 reading if you suppress escape sequence detection.
9285
9286 The other way to read escape sequences in a file without decoding is
9287 to explicitly specify some coding system that doesn't use ISO2022's
9288 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9289 inhibit_iso_escape_detection = 0;
9290
9291 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9292 doc: /* Char table for translating self-inserting characters.
9293 This is applied to the result of input methods, not their input. See also
9294 `keyboard-translate-table'. */);
9295 Vtranslation_table_for_input = Qnil;
9296
9297 {
9298 Lisp_Object args[coding_arg_max];
9299 Lisp_Object plist[16];
9300 int i;
9301
9302 for (i = 0; i < coding_arg_max; i++)
9303 args[i] = Qnil;
9304
9305 plist[0] = intern (":name");
9306 plist[1] = args[coding_arg_name] = Qno_conversion;
9307 plist[2] = intern (":mnemonic");
9308 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9309 plist[4] = intern (":coding-type");
9310 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9311 plist[6] = intern (":ascii-compatible-p");
9312 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9313 plist[8] = intern (":default-char");
9314 plist[9] = args[coding_arg_default_char] = make_number (0);
9315 plist[10] = intern (":for-unibyte");
9316 plist[11] = args[coding_arg_for_unibyte] = Qt;
9317 plist[12] = intern (":docstring");
9318 plist[13] = build_string ("Do no conversion.\n\
9319 \n\
9320 When you visit a file with this coding, the file is read into a\n\
9321 unibyte buffer as is, thus each byte of a file is treated as a\n\
9322 character.");
9323 plist[14] = intern (":eol-type");
9324 plist[15] = args[coding_arg_eol_type] = Qunix;
9325 args[coding_arg_plist] = Flist (16, plist);
9326 Fdefine_coding_system_internal (coding_arg_max, args);
9327 }
9328
9329 setup_coding_system (Qno_conversion, &keyboard_coding);
9330 setup_coding_system (Qno_conversion, &terminal_coding);
9331 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9332
9333 {
9334 int i;
9335
9336 for (i = 0; i < coding_category_max; i++)
9337 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9338 }
9339 }
9340
9341 char *
9342 emacs_strerror (error_number)
9343 int error_number;
9344 {
9345 char *str;
9346
9347 synchronize_system_messages_locale ();
9348 str = strerror (error_number);
9349
9350 if (! NILP (Vlocale_coding_system))
9351 {
9352 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9353 Vlocale_coding_system,
9354 0);
9355 str = (char *) SDATA (dec);
9356 }
9357
9358 return str;
9359 }
9360
9361 #endif /* emacs */