(detect_coding_emacs_mule): Fix counting of encoded
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
317
318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
320 Lisp_Object Qstart_process, Qopen_network_stream;
321 Lisp_Object Qtarget_idx;
322
323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
324 Lisp_Object Qinterrupted, Qinsufficient_memory;
325
326 int coding_system_require_warning;
327
328 Lisp_Object Vselect_safe_coding_system_function;
329
330 /* Mnemonic string for each format of end-of-line. */
331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
332 /* Mnemonic string to indicate format of end-of-line is not yet
333 decided. */
334 Lisp_Object eol_mnemonic_undecided;
335
336 #ifdef emacs
337
338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
339
340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
341
342 /* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344 Lisp_Object Qemacs_mule, Qraw_text;
345 Lisp_Object Qutf_8_emacs;
346
347 /* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */
349 /* Coding-system for reading files and receiving data from process. */
350 Lisp_Object Vcoding_system_for_read;
351 /* Coding-system for writing files and sending data to process. */
352 Lisp_Object Vcoding_system_for_write;
353 /* Coding-system actually used in the latest I/O. */
354 Lisp_Object Vlast_coding_system_used;
355 /* Set to non-nil when an error is detected while code conversion. */
356 Lisp_Object Vlast_code_conversion_error;
357 /* A vector of length 256 which contains information about special
358 Latin codes (especially for dealing with Microsoft codes). */
359 Lisp_Object Vlatin_extra_code_table;
360
361 /* Flag to inhibit code conversion of end-of-line format. */
362 int inhibit_eol_conversion;
363
364 /* Flag to inhibit ISO2022 escape sequence detection. */
365 int inhibit_iso_escape_detection;
366
367 /* Flag to make buffer-file-coding-system inherit from process-coding. */
368 int inherit_process_coding_system;
369
370 /* Coding system to be used to encode text for terminal display. */
371 struct coding_system terminal_coding;
372
373 /* Coding system to be used to encode text for terminal display when
374 terminal coding system is nil. */
375 struct coding_system safe_terminal_coding;
376
377 /* Coding system of what is sent from terminal keyboard. */
378 struct coding_system keyboard_coding;
379
380 Lisp_Object Vfile_coding_system_alist;
381 Lisp_Object Vprocess_coding_system_alist;
382 Lisp_Object Vnetwork_coding_system_alist;
383
384 Lisp_Object Vlocale_coding_system;
385
386 #endif /* emacs */
387
388 /* Flag to tell if we look up translation table on character code
389 conversion. */
390 Lisp_Object Venable_character_translation;
391 /* Standard translation table to look up on decoding (reading). */
392 Lisp_Object Vstandard_translation_table_for_decode;
393 /* Standard translation table to look up on encoding (writing). */
394 Lisp_Object Vstandard_translation_table_for_encode;
395
396 Lisp_Object Qtranslation_table;
397 Lisp_Object Qtranslation_table_id;
398 Lisp_Object Qtranslation_table_for_decode;
399 Lisp_Object Qtranslation_table_for_encode;
400
401 /* Alist of charsets vs revision number. */
402 static Lisp_Object Vcharset_revision_table;
403
404 /* Default coding systems used for process I/O. */
405 Lisp_Object Vdefault_process_coding_system;
406
407 /* Char table for translating Quail and self-inserting input. */
408 Lisp_Object Vtranslation_table_for_input;
409
410 /* Two special coding systems. */
411 Lisp_Object Vsjis_coding_system;
412 Lisp_Object Vbig5_coding_system;
413
414 static void record_conversion_result (struct coding_system *coding,
415 enum coding_result_code result);
416 static int detect_coding_utf_8 P_ ((struct coding_system *,
417 struct coding_detection_info *info));
418 static void decode_coding_utf_8 P_ ((struct coding_system *));
419 static int encode_coding_utf_8 P_ ((struct coding_system *));
420
421 static int detect_coding_utf_16 P_ ((struct coding_system *,
422 struct coding_detection_info *info));
423 static void decode_coding_utf_16 P_ ((struct coding_system *));
424 static int encode_coding_utf_16 P_ ((struct coding_system *));
425
426 static int detect_coding_iso_2022 P_ ((struct coding_system *,
427 struct coding_detection_info *info));
428 static void decode_coding_iso_2022 P_ ((struct coding_system *));
429 static int encode_coding_iso_2022 P_ ((struct coding_system *));
430
431 static int detect_coding_emacs_mule P_ ((struct coding_system *,
432 struct coding_detection_info *info));
433 static void decode_coding_emacs_mule P_ ((struct coding_system *));
434 static int encode_coding_emacs_mule P_ ((struct coding_system *));
435
436 static int detect_coding_sjis P_ ((struct coding_system *,
437 struct coding_detection_info *info));
438 static void decode_coding_sjis P_ ((struct coding_system *));
439 static int encode_coding_sjis P_ ((struct coding_system *));
440
441 static int detect_coding_big5 P_ ((struct coding_system *,
442 struct coding_detection_info *info));
443 static void decode_coding_big5 P_ ((struct coding_system *));
444 static int encode_coding_big5 P_ ((struct coding_system *));
445
446 static int detect_coding_ccl P_ ((struct coding_system *,
447 struct coding_detection_info *info));
448 static void decode_coding_ccl P_ ((struct coding_system *));
449 static int encode_coding_ccl P_ ((struct coding_system *));
450
451 static void decode_coding_raw_text P_ ((struct coding_system *));
452 static int encode_coding_raw_text P_ ((struct coding_system *));
453
454
455 /* ISO2022 section */
456
457 #define CODING_ISO_INITIAL(coding, reg) \
458 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
459 coding_attr_iso_initial), \
460 reg)))
461
462
463 #define CODING_ISO_REQUEST(coding, charset_id) \
464 ((charset_id <= (coding)->max_charset_id \
465 ? (coding)->safe_charsets[charset_id] \
466 : -1))
467
468
469 #define CODING_ISO_FLAGS(coding) \
470 ((coding)->spec.iso_2022.flags)
471 #define CODING_ISO_DESIGNATION(coding, reg) \
472 ((coding)->spec.iso_2022.current_designation[reg])
473 #define CODING_ISO_INVOCATION(coding, plane) \
474 ((coding)->spec.iso_2022.current_invocation[plane])
475 #define CODING_ISO_SINGLE_SHIFTING(coding) \
476 ((coding)->spec.iso_2022.single_shifting)
477 #define CODING_ISO_BOL(coding) \
478 ((coding)->spec.iso_2022.bol)
479 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
480 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
481
482 /* Control characters of ISO2022. */
483 /* code */ /* function */
484 #define ISO_CODE_LF 0x0A /* line-feed */
485 #define ISO_CODE_CR 0x0D /* carriage-return */
486 #define ISO_CODE_SO 0x0E /* shift-out */
487 #define ISO_CODE_SI 0x0F /* shift-in */
488 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
489 #define ISO_CODE_ESC 0x1B /* escape */
490 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
491 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
492 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
493
494 /* All code (1-byte) of ISO2022 is classified into one of the
495 followings. */
496 enum iso_code_class_type
497 {
498 ISO_control_0, /* Control codes in the range
499 0x00..0x1F and 0x7F, except for the
500 following 5 codes. */
501 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
502 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
503 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
504 ISO_escape, /* ISO_CODE_SO (0x1B) */
505 ISO_control_1, /* Control codes in the range
506 0x80..0x9F, except for the
507 following 3 codes. */
508 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
509 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
510 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
511 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
512 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
513 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
514 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
515 };
516
517 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
518 `iso-flags' attribute of an iso2022 coding system. */
519
520 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
521 instead of the correct short-form sequence (e.g. ESC $ A). */
522 #define CODING_ISO_FLAG_LONG_FORM 0x0001
523
524 /* If set, reset graphic planes and registers at end-of-line to the
525 initial state. */
526 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
527
528 /* If set, reset graphic planes and registers before any control
529 characters to the initial state. */
530 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
531
532 /* If set, encode by 7-bit environment. */
533 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
534
535 /* If set, use locking-shift function. */
536 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
537
538 /* If set, use single-shift function. Overwrite
539 CODING_ISO_FLAG_LOCKING_SHIFT. */
540 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
541
542 /* If set, use designation escape sequence. */
543 #define CODING_ISO_FLAG_DESIGNATION 0x0040
544
545 /* If set, produce revision number sequence. */
546 #define CODING_ISO_FLAG_REVISION 0x0080
547
548 /* If set, produce ISO6429's direction specifying sequence. */
549 #define CODING_ISO_FLAG_DIRECTION 0x0100
550
551 /* If set, assume designation states are reset at beginning of line on
552 output. */
553 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
554
555 /* If set, designation sequence should be placed at beginning of line
556 on output. */
557 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
558
559 /* If set, do not encode unsafe charactes on output. */
560 #define CODING_ISO_FLAG_SAFE 0x0800
561
562 /* If set, extra latin codes (128..159) are accepted as a valid code
563 on input. */
564 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
565
566 #define CODING_ISO_FLAG_COMPOSITION 0x2000
567
568 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
569
570 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
571
572 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
573
574 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
575
576 /* A character to be produced on output if encoding of the original
577 character is prohibited by CODING_ISO_FLAG_SAFE. */
578 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
579
580
581 /* UTF-16 section */
582 #define CODING_UTF_16_BOM(coding) \
583 ((coding)->spec.utf_16.bom)
584
585 #define CODING_UTF_16_ENDIAN(coding) \
586 ((coding)->spec.utf_16.endian)
587
588 #define CODING_UTF_16_SURROGATE(coding) \
589 ((coding)->spec.utf_16.surrogate)
590
591
592 /* CCL section */
593 #define CODING_CCL_DECODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
595 #define CODING_CCL_ENCODER(coding) \
596 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
597 #define CODING_CCL_VALIDS(coding) \
598 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
599
600 /* Index for each coding category in `coding_categories' */
601
602 enum coding_category
603 {
604 coding_category_iso_7,
605 coding_category_iso_7_tight,
606 coding_category_iso_8_1,
607 coding_category_iso_8_2,
608 coding_category_iso_7_else,
609 coding_category_iso_8_else,
610 coding_category_utf_8,
611 coding_category_utf_16_auto,
612 coding_category_utf_16_be,
613 coding_category_utf_16_le,
614 coding_category_utf_16_be_nosig,
615 coding_category_utf_16_le_nosig,
616 coding_category_charset,
617 coding_category_sjis,
618 coding_category_big5,
619 coding_category_ccl,
620 coding_category_emacs_mule,
621 /* All above are targets of code detection. */
622 coding_category_raw_text,
623 coding_category_undecided,
624 coding_category_max
625 };
626
627 /* Definitions of flag bits used in detect_coding_XXXX. */
628 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
629 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
630 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
631 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
632 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
633 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
634 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
635 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
636 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
637 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
638 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
639 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
640 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
641 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
642 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
643 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
644 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
645 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
646
647 /* This value is returned if detect_coding_mask () find nothing other
648 than ASCII characters. */
649 #define CATEGORY_MASK_ANY \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_8_1 \
653 | CATEGORY_MASK_ISO_8_2 \
654 | CATEGORY_MASK_ISO_7_ELSE \
655 | CATEGORY_MASK_ISO_8_ELSE \
656 | CATEGORY_MASK_UTF_8 \
657 | CATEGORY_MASK_UTF_16_BE \
658 | CATEGORY_MASK_UTF_16_LE \
659 | CATEGORY_MASK_UTF_16_BE_NOSIG \
660 | CATEGORY_MASK_UTF_16_LE_NOSIG \
661 | CATEGORY_MASK_CHARSET \
662 | CATEGORY_MASK_SJIS \
663 | CATEGORY_MASK_BIG5 \
664 | CATEGORY_MASK_CCL \
665 | CATEGORY_MASK_EMACS_MULE)
666
667
668 #define CATEGORY_MASK_ISO_7BIT \
669 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
670
671 #define CATEGORY_MASK_ISO_8BIT \
672 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
673
674 #define CATEGORY_MASK_ISO_ELSE \
675 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
676
677 #define CATEGORY_MASK_ISO_ESCAPE \
678 (CATEGORY_MASK_ISO_7 \
679 | CATEGORY_MASK_ISO_7_TIGHT \
680 | CATEGORY_MASK_ISO_7_ELSE \
681 | CATEGORY_MASK_ISO_8_ELSE)
682
683 #define CATEGORY_MASK_ISO \
684 ( CATEGORY_MASK_ISO_7BIT \
685 | CATEGORY_MASK_ISO_8BIT \
686 | CATEGORY_MASK_ISO_ELSE)
687
688 #define CATEGORY_MASK_UTF_16 \
689 (CATEGORY_MASK_UTF_16_BE \
690 | CATEGORY_MASK_UTF_16_LE \
691 | CATEGORY_MASK_UTF_16_BE_NOSIG \
692 | CATEGORY_MASK_UTF_16_LE_NOSIG)
693
694
695 /* List of symbols `coding-category-xxx' ordered by priority. This
696 variable is exposed to Emacs Lisp. */
697 static Lisp_Object Vcoding_category_list;
698
699 /* Table of coding categories (Lisp symbols). This variable is for
700 internal use oly. */
701 static Lisp_Object Vcoding_category_table;
702
703 /* Table of coding-categories ordered by priority. */
704 static enum coding_category coding_priorities[coding_category_max];
705
706 /* Nth element is a coding context for the coding system bound to the
707 Nth coding category. */
708 static struct coding_system coding_categories[coding_category_max];
709
710 /*** Commonly used macros and functions ***/
711
712 #ifndef min
713 #define min(a, b) ((a) < (b) ? (a) : (b))
714 #endif
715 #ifndef max
716 #define max(a, b) ((a) > (b) ? (a) : (b))
717 #endif
718
719 #define CODING_GET_INFO(coding, attrs, charset_list) \
720 do { \
721 (attrs) = CODING_ID_ATTRS ((coding)->id); \
722 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
723 } while (0)
724
725
726 /* Safely get one byte from the source text pointed by SRC which ends
727 at SRC_END, and set C to that byte. If there are not enough bytes
728 in the source, it jumps to `no_more_source'. If multibytep is
729 nonzero, and a multibyte character is found at SRC, set C to the
730 negative value of the character code. The caller should declare
731 and set these variables appropriately in advance:
732 src, src_end, multibytep */
733
734 #define ONE_MORE_BYTE(c) \
735 do { \
736 if (src == src_end) \
737 { \
738 if (src_base < src) \
739 record_conversion_result \
740 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
741 goto no_more_source; \
742 } \
743 c = *src++; \
744 if (multibytep && (c & 0x80)) \
745 { \
746 if ((c & 0xFE) == 0xC0) \
747 c = ((c & 1) << 6) | *src++; \
748 else \
749 { \
750 c = - string_char (--src, &src, NULL); \
751 record_conversion_result \
752 (coding, CODING_RESULT_INVALID_SRC); \
753 } \
754 } \
755 consumed_chars++; \
756 } while (0)
757
758
759 #define ONE_MORE_BYTE_NO_CHECK(c) \
760 do { \
761 c = *src++; \
762 if (multibytep && (c & 0x80)) \
763 { \
764 if ((c & 0xFE) == 0xC0) \
765 c = ((c & 1) << 6) | *src++; \
766 else \
767 { \
768 c = - string_char (--src, &src, NULL); \
769 record_conversion_result \
770 (coding, CODING_RESULT_INVALID_SRC); \
771 } \
772 } \
773 consumed_chars++; \
774 } while (0)
775
776
777 /* Store a byte C in the place pointed by DST and increment DST to the
778 next free point, and increment PRODUCED_CHARS. The caller should
779 assure that C is 0..127, and declare and set the variable `dst'
780 appropriately in advance.
781 */
782
783
784 #define EMIT_ONE_ASCII_BYTE(c) \
785 do { \
786 produced_chars++; \
787 *dst++ = (c); \
788 } while (0)
789
790
791 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
792
793 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
794 do { \
795 produced_chars += 2; \
796 *dst++ = (c1), *dst++ = (c2); \
797 } while (0)
798
799
800 /* Store a byte C in the place pointed by DST and increment DST to the
801 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
802 nonzero, store in an appropriate multibyte from. The caller should
803 declare and set the variables `dst' and `multibytep' appropriately
804 in advance. */
805
806 #define EMIT_ONE_BYTE(c) \
807 do { \
808 produced_chars++; \
809 if (multibytep) \
810 { \
811 int ch = (c); \
812 if (ch >= 0x80) \
813 ch = BYTE8_TO_CHAR (ch); \
814 CHAR_STRING_ADVANCE (ch, dst); \
815 } \
816 else \
817 *dst++ = (c); \
818 } while (0)
819
820
821 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
822
823 #define EMIT_TWO_BYTES(c1, c2) \
824 do { \
825 produced_chars += 2; \
826 if (multibytep) \
827 { \
828 int ch; \
829 \
830 ch = (c1); \
831 if (ch >= 0x80) \
832 ch = BYTE8_TO_CHAR (ch); \
833 CHAR_STRING_ADVANCE (ch, dst); \
834 ch = (c2); \
835 if (ch >= 0x80) \
836 ch = BYTE8_TO_CHAR (ch); \
837 CHAR_STRING_ADVANCE (ch, dst); \
838 } \
839 else \
840 { \
841 *dst++ = (c1); \
842 *dst++ = (c2); \
843 } \
844 } while (0)
845
846
847 #define EMIT_THREE_BYTES(c1, c2, c3) \
848 do { \
849 EMIT_ONE_BYTE (c1); \
850 EMIT_TWO_BYTES (c2, c3); \
851 } while (0)
852
853
854 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
855 do { \
856 EMIT_TWO_BYTES (c1, c2); \
857 EMIT_TWO_BYTES (c3, c4); \
858 } while (0)
859
860
861 static void
862 record_conversion_result (struct coding_system *coding,
863 enum coding_result_code result)
864 {
865 coding->result = result;
866 switch (result)
867 {
868 case CODING_RESULT_INSUFFICIENT_SRC:
869 Vlast_code_conversion_error = Qinsufficient_source;
870 break;
871 case CODING_RESULT_INCONSISTENT_EOL:
872 Vlast_code_conversion_error = Qinconsistent_eol;
873 break;
874 case CODING_RESULT_INVALID_SRC:
875 Vlast_code_conversion_error = Qinvalid_source;
876 break;
877 case CODING_RESULT_INTERRUPT:
878 Vlast_code_conversion_error = Qinterrupted;
879 break;
880 case CODING_RESULT_INSUFFICIENT_MEM:
881 Vlast_code_conversion_error = Qinsufficient_memory;
882 break;
883 }
884 }
885
886 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
887 do { \
888 charset_map_loaded = 0; \
889 c = DECODE_CHAR (charset, code); \
890 if (charset_map_loaded) \
891 { \
892 const unsigned char *orig = coding->source; \
893 EMACS_INT offset; \
894 \
895 coding_set_source (coding); \
896 offset = coding->source - orig; \
897 src += offset; \
898 src_base += offset; \
899 src_end += offset; \
900 } \
901 } while (0)
902
903
904 #define ASSURE_DESTINATION(bytes) \
905 do { \
906 if (dst + (bytes) >= dst_end) \
907 { \
908 int more_bytes = charbuf_end - charbuf + (bytes); \
909 \
910 dst = alloc_destination (coding, more_bytes, dst); \
911 dst_end = coding->destination + coding->dst_bytes; \
912 } \
913 } while (0)
914
915
916
917 static void
918 coding_set_source (coding)
919 struct coding_system *coding;
920 {
921 if (BUFFERP (coding->src_object))
922 {
923 struct buffer *buf = XBUFFER (coding->src_object);
924
925 if (coding->src_pos < 0)
926 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
927 else
928 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
929 }
930 else if (STRINGP (coding->src_object))
931 {
932 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
933 }
934 else
935 /* Otherwise, the source is C string and is never relocated
936 automatically. Thus we don't have to update anything. */
937 ;
938 }
939
940 static void
941 coding_set_destination (coding)
942 struct coding_system *coding;
943 {
944 if (BUFFERP (coding->dst_object))
945 {
946 if (coding->src_pos < 0)
947 {
948 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
949 coding->dst_bytes = (GAP_END_ADDR
950 - (coding->src_bytes - coding->consumed)
951 - coding->destination);
952 }
953 else
954 {
955 /* We are sure that coding->dst_pos_byte is before the gap
956 of the buffer. */
957 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
958 + coding->dst_pos_byte - 1);
959 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
960 - coding->destination);
961 }
962 }
963 else
964 /* Otherwise, the destination is C string and is never relocated
965 automatically. Thus we don't have to update anything. */
966 ;
967 }
968
969
970 static void
971 coding_alloc_by_realloc (coding, bytes)
972 struct coding_system *coding;
973 EMACS_INT bytes;
974 {
975 coding->destination = (unsigned char *) xrealloc (coding->destination,
976 coding->dst_bytes + bytes);
977 coding->dst_bytes += bytes;
978 }
979
980 static void
981 coding_alloc_by_making_gap (coding, bytes)
982 struct coding_system *coding;
983 EMACS_INT bytes;
984 {
985 if (BUFFERP (coding->dst_object)
986 && EQ (coding->src_object, coding->dst_object))
987 {
988 EMACS_INT add = coding->src_bytes - coding->consumed;
989
990 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
991 make_gap (bytes);
992 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
993 }
994 else
995 {
996 Lisp_Object this_buffer;
997
998 this_buffer = Fcurrent_buffer ();
999 set_buffer_internal (XBUFFER (coding->dst_object));
1000 make_gap (bytes);
1001 set_buffer_internal (XBUFFER (this_buffer));
1002 }
1003 }
1004
1005
1006 static unsigned char *
1007 alloc_destination (coding, nbytes, dst)
1008 struct coding_system *coding;
1009 EMACS_INT nbytes;
1010 unsigned char *dst;
1011 {
1012 EMACS_INT offset = dst - coding->destination;
1013
1014 if (BUFFERP (coding->dst_object))
1015 coding_alloc_by_making_gap (coding, nbytes);
1016 else
1017 coding_alloc_by_realloc (coding, nbytes);
1018 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1019 coding_set_destination (coding);
1020 dst = coding->destination + offset;
1021 return dst;
1022 }
1023
1024 /** Macros for annotations. */
1025
1026 /* Maximum length of annotation data (sum of annotations for
1027 composition and charset). */
1028 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1029
1030 /* An annotation data is stored in the array coding->charbuf in this
1031 format:
1032 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1033 LENGTH is the number of elements in the annotation.
1034 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1035 FROM and TO specify the range of text annotated. They are relative
1036 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1037
1038 The format of the following elements depend on ANNOTATION_MASK.
1039
1040 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1041 follows:
1042 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1043 METHOD is one of enum composition_method.
1044 Optionnal COMPOSITION-COMPONENTS are characters and composition
1045 rules.
1046
1047 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1048 follows. */
1049
1050 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1051 do { \
1052 *(buf)++ = -(len); \
1053 *(buf)++ = (mask); \
1054 *(buf)++ = (from); \
1055 *(buf)++ = (to); \
1056 coding->annotated = 1; \
1057 } while (0);
1058
1059 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1060 do { \
1061 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1062 *buf++ = method; \
1063 } while (0)
1064
1065
1066 #define ADD_CHARSET_DATA(buf, from, to, id) \
1067 do { \
1068 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1069 *buf++ = id; \
1070 } while (0)
1071
1072 \f
1073 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1074
1075
1076
1077 \f
1078 /*** 3. UTF-8 ***/
1079
1080 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1081 Check if a text is encoded in UTF-8. If it is, return 1, else
1082 return 0. */
1083
1084 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1085 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1086 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1087 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1088 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1089 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1090
1091 static int
1092 detect_coding_utf_8 (coding, detect_info)
1093 struct coding_system *coding;
1094 struct coding_detection_info *detect_info;
1095 {
1096 const unsigned char *src = coding->source, *src_base;
1097 const unsigned char *src_end = coding->source + coding->src_bytes;
1098 int multibytep = coding->src_multibyte;
1099 int consumed_chars = 0;
1100 int found = 0;
1101
1102 detect_info->checked |= CATEGORY_MASK_UTF_8;
1103 /* A coding system of this category is always ASCII compatible. */
1104 src += coding->head_ascii;
1105
1106 while (1)
1107 {
1108 int c, c1, c2, c3, c4;
1109
1110 src_base = src;
1111 ONE_MORE_BYTE (c);
1112 if (c < 0 || UTF_8_1_OCTET_P (c))
1113 continue;
1114 ONE_MORE_BYTE (c1);
1115 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1116 break;
1117 if (UTF_8_2_OCTET_LEADING_P (c))
1118 {
1119 found = CATEGORY_MASK_UTF_8;
1120 continue;
1121 }
1122 ONE_MORE_BYTE (c2);
1123 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1124 break;
1125 if (UTF_8_3_OCTET_LEADING_P (c))
1126 {
1127 found = CATEGORY_MASK_UTF_8;
1128 continue;
1129 }
1130 ONE_MORE_BYTE (c3);
1131 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1132 break;
1133 if (UTF_8_4_OCTET_LEADING_P (c))
1134 {
1135 found = CATEGORY_MASK_UTF_8;
1136 continue;
1137 }
1138 ONE_MORE_BYTE (c4);
1139 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1140 break;
1141 if (UTF_8_5_OCTET_LEADING_P (c))
1142 {
1143 found = CATEGORY_MASK_UTF_8;
1144 continue;
1145 }
1146 break;
1147 }
1148 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1149 return 0;
1150
1151 no_more_source:
1152 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1153 {
1154 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1155 return 0;
1156 }
1157 detect_info->found |= found;
1158 return 1;
1159 }
1160
1161
1162 static void
1163 decode_coding_utf_8 (coding)
1164 struct coding_system *coding;
1165 {
1166 const unsigned char *src = coding->source + coding->consumed;
1167 const unsigned char *src_end = coding->source + coding->src_bytes;
1168 const unsigned char *src_base;
1169 int *charbuf = coding->charbuf;
1170 int *charbuf_end = charbuf + coding->charbuf_size;
1171 int consumed_chars = 0, consumed_chars_base;
1172 int multibytep = coding->src_multibyte;
1173 Lisp_Object attr, charset_list;
1174
1175 CODING_GET_INFO (coding, attr, charset_list);
1176
1177 while (1)
1178 {
1179 int c, c1, c2, c3, c4, c5;
1180
1181 src_base = src;
1182 consumed_chars_base = consumed_chars;
1183
1184 if (charbuf >= charbuf_end)
1185 break;
1186
1187 ONE_MORE_BYTE (c1);
1188 if (c1 < 0)
1189 {
1190 c = - c1;
1191 }
1192 else if (UTF_8_1_OCTET_P(c1))
1193 {
1194 c = c1;
1195 }
1196 else
1197 {
1198 ONE_MORE_BYTE (c2);
1199 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1200 goto invalid_code;
1201 if (UTF_8_2_OCTET_LEADING_P (c1))
1202 {
1203 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1204 /* Reject overlong sequences here and below. Encoders
1205 producing them are incorrect, they can be misleading,
1206 and they mess up read/write invariance. */
1207 if (c < 128)
1208 goto invalid_code;
1209 }
1210 else
1211 {
1212 ONE_MORE_BYTE (c3);
1213 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1214 goto invalid_code;
1215 if (UTF_8_3_OCTET_LEADING_P (c1))
1216 {
1217 c = (((c1 & 0xF) << 12)
1218 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1219 if (c < 0x800
1220 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1221 goto invalid_code;
1222 }
1223 else
1224 {
1225 ONE_MORE_BYTE (c4);
1226 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227 goto invalid_code;
1228 if (UTF_8_4_OCTET_LEADING_P (c1))
1229 {
1230 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1231 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1232 if (c < 0x10000)
1233 goto invalid_code;
1234 }
1235 else
1236 {
1237 ONE_MORE_BYTE (c5);
1238 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1239 goto invalid_code;
1240 if (UTF_8_5_OCTET_LEADING_P (c1))
1241 {
1242 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1243 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1244 | (c5 & 0x3F));
1245 if ((c > MAX_CHAR) || (c < 0x200000))
1246 goto invalid_code;
1247 }
1248 else
1249 goto invalid_code;
1250 }
1251 }
1252 }
1253 }
1254
1255 *charbuf++ = c;
1256 continue;
1257
1258 invalid_code:
1259 src = src_base;
1260 consumed_chars = consumed_chars_base;
1261 ONE_MORE_BYTE (c);
1262 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1263 coding->errors++;
1264 }
1265
1266 no_more_source:
1267 coding->consumed_char += consumed_chars_base;
1268 coding->consumed = src_base - coding->source;
1269 coding->charbuf_used = charbuf - coding->charbuf;
1270 }
1271
1272
1273 static int
1274 encode_coding_utf_8 (coding)
1275 struct coding_system *coding;
1276 {
1277 int multibytep = coding->dst_multibyte;
1278 int *charbuf = coding->charbuf;
1279 int *charbuf_end = charbuf + coding->charbuf_used;
1280 unsigned char *dst = coding->destination + coding->produced;
1281 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1282 int produced_chars = 0;
1283 int c;
1284
1285 if (multibytep)
1286 {
1287 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1288
1289 while (charbuf < charbuf_end)
1290 {
1291 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1292
1293 ASSURE_DESTINATION (safe_room);
1294 c = *charbuf++;
1295 if (CHAR_BYTE8_P (c))
1296 {
1297 c = CHAR_TO_BYTE8 (c);
1298 EMIT_ONE_BYTE (c);
1299 }
1300 else
1301 {
1302 CHAR_STRING_ADVANCE (c, pend);
1303 for (p = str; p < pend; p++)
1304 EMIT_ONE_BYTE (*p);
1305 }
1306 }
1307 }
1308 else
1309 {
1310 int safe_room = MAX_MULTIBYTE_LENGTH;
1311
1312 while (charbuf < charbuf_end)
1313 {
1314 ASSURE_DESTINATION (safe_room);
1315 c = *charbuf++;
1316 dst += CHAR_STRING (c, dst);
1317 produced_chars++;
1318 }
1319 }
1320 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1321 coding->produced_char += produced_chars;
1322 coding->produced = dst - coding->destination;
1323 return 0;
1324 }
1325
1326
1327 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1328 Check if a text is encoded in one of UTF-16 based coding systems.
1329 If it is, return 1, else return 0. */
1330
1331 #define UTF_16_HIGH_SURROGATE_P(val) \
1332 (((val) & 0xFC00) == 0xD800)
1333
1334 #define UTF_16_LOW_SURROGATE_P(val) \
1335 (((val) & 0xFC00) == 0xDC00)
1336
1337 #define UTF_16_INVALID_P(val) \
1338 (((val) == 0xFFFE) \
1339 || ((val) == 0xFFFF) \
1340 || UTF_16_LOW_SURROGATE_P (val))
1341
1342
1343 static int
1344 detect_coding_utf_16 (coding, detect_info)
1345 struct coding_system *coding;
1346 struct coding_detection_info *detect_info;
1347 {
1348 const unsigned char *src = coding->source, *src_base = src;
1349 const unsigned char *src_end = coding->source + coding->src_bytes;
1350 int multibytep = coding->src_multibyte;
1351 int consumed_chars = 0;
1352 int c1, c2;
1353
1354 detect_info->checked |= CATEGORY_MASK_UTF_16;
1355 if (coding->mode & CODING_MODE_LAST_BLOCK
1356 && (coding->src_chars & 1))
1357 {
1358 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1359 return 0;
1360 }
1361
1362 ONE_MORE_BYTE (c1);
1363 ONE_MORE_BYTE (c2);
1364 if ((c1 == 0xFF) && (c2 == 0xFE))
1365 {
1366 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1367 | CATEGORY_MASK_UTF_16_AUTO);
1368 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1369 | CATEGORY_MASK_UTF_16_BE_NOSIG
1370 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1371 }
1372 else if ((c1 == 0xFE) && (c2 == 0xFF))
1373 {
1374 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1375 | CATEGORY_MASK_UTF_16_AUTO);
1376 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1377 | CATEGORY_MASK_UTF_16_BE_NOSIG
1378 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1379 }
1380 else if (c1 >= 0 && c2 >= 0)
1381 {
1382 unsigned char b1[256], b2[256];
1383 int b1_variants = 1, b2_variants = 1;
1384 int n;
1385
1386 bzero (b1, 256), bzero (b2, 256);
1387 b1[c1]++, b2[c2]++;
1388 for (n = 0; n < 256 && src < src_end; n++)
1389 {
1390 src_base = src;
1391 ONE_MORE_BYTE (c1);
1392 ONE_MORE_BYTE (c2);
1393 if (c1 < 0 || c2 < 0)
1394 break;
1395 if (! b1[c1++]) b1_variants++;
1396 if (! b2[c2++]) b2_variants++;
1397 }
1398 if (b1_variants < b2_variants)
1399 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1400 else
1401 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1402 detect_info->rejected
1403 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1404 }
1405 no_more_source:
1406 return 1;
1407 }
1408
1409 static void
1410 decode_coding_utf_16 (coding)
1411 struct coding_system *coding;
1412 {
1413 const unsigned char *src = coding->source + coding->consumed;
1414 const unsigned char *src_end = coding->source + coding->src_bytes;
1415 const unsigned char *src_base;
1416 int *charbuf = coding->charbuf;
1417 int *charbuf_end = charbuf + coding->charbuf_size;
1418 int consumed_chars = 0, consumed_chars_base;
1419 int multibytep = coding->src_multibyte;
1420 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1421 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1422 int surrogate = CODING_UTF_16_SURROGATE (coding);
1423 Lisp_Object attr, charset_list;
1424
1425 CODING_GET_INFO (coding, attr, charset_list);
1426
1427 if (bom == utf_16_with_bom)
1428 {
1429 int c, c1, c2;
1430
1431 src_base = src;
1432 ONE_MORE_BYTE (c1);
1433 ONE_MORE_BYTE (c2);
1434 c = (c1 << 8) | c2;
1435
1436 if (endian == utf_16_big_endian
1437 ? c != 0xFEFF : c != 0xFFFE)
1438 {
1439 /* The first two bytes are not BOM. Treat them as bytes
1440 for a normal character. */
1441 src = src_base;
1442 coding->errors++;
1443 }
1444 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1445 }
1446 else if (bom == utf_16_detect_bom)
1447 {
1448 /* We have already tried to detect BOM and failed in
1449 detect_coding. */
1450 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1451 }
1452
1453 while (1)
1454 {
1455 int c, c1, c2;
1456
1457 src_base = src;
1458 consumed_chars_base = consumed_chars;
1459
1460 if (charbuf + 2 >= charbuf_end)
1461 break;
1462
1463 ONE_MORE_BYTE (c1);
1464 if (c1 < 0)
1465 {
1466 *charbuf++ = -c1;
1467 continue;
1468 }
1469 ONE_MORE_BYTE (c2);
1470 if (c2 < 0)
1471 {
1472 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1473 *charbuf++ = -c2;
1474 continue;
1475 }
1476 c = (endian == utf_16_big_endian
1477 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1478 if (surrogate)
1479 {
1480 if (! UTF_16_LOW_SURROGATE_P (c))
1481 {
1482 if (endian == utf_16_big_endian)
1483 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1484 else
1485 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1486 *charbuf++ = c1;
1487 *charbuf++ = c2;
1488 coding->errors++;
1489 if (UTF_16_HIGH_SURROGATE_P (c))
1490 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1491 else
1492 *charbuf++ = c;
1493 }
1494 else
1495 {
1496 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1497 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1498 *charbuf++ = 0x10000 + c;
1499 }
1500 }
1501 else
1502 {
1503 if (UTF_16_HIGH_SURROGATE_P (c))
1504 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1505 else
1506 *charbuf++ = c;
1507 }
1508 }
1509
1510 no_more_source:
1511 coding->consumed_char += consumed_chars_base;
1512 coding->consumed = src_base - coding->source;
1513 coding->charbuf_used = charbuf - coding->charbuf;
1514 }
1515
1516 static int
1517 encode_coding_utf_16 (coding)
1518 struct coding_system *coding;
1519 {
1520 int multibytep = coding->dst_multibyte;
1521 int *charbuf = coding->charbuf;
1522 int *charbuf_end = charbuf + coding->charbuf_used;
1523 unsigned char *dst = coding->destination + coding->produced;
1524 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1525 int safe_room = 8;
1526 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1527 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1528 int produced_chars = 0;
1529 Lisp_Object attrs, charset_list;
1530 int c;
1531
1532 CODING_GET_INFO (coding, attrs, charset_list);
1533
1534 if (bom != utf_16_without_bom)
1535 {
1536 ASSURE_DESTINATION (safe_room);
1537 if (big_endian)
1538 EMIT_TWO_BYTES (0xFE, 0xFF);
1539 else
1540 EMIT_TWO_BYTES (0xFF, 0xFE);
1541 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1542 }
1543
1544 while (charbuf < charbuf_end)
1545 {
1546 ASSURE_DESTINATION (safe_room);
1547 c = *charbuf++;
1548 if (c >= MAX_UNICODE_CHAR)
1549 c = coding->default_char;
1550
1551 if (c < 0x10000)
1552 {
1553 if (big_endian)
1554 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1555 else
1556 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1557 }
1558 else
1559 {
1560 int c1, c2;
1561
1562 c -= 0x10000;
1563 c1 = (c >> 10) + 0xD800;
1564 c2 = (c & 0x3FF) + 0xDC00;
1565 if (big_endian)
1566 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1567 else
1568 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1569 }
1570 }
1571 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1572 coding->produced = dst - coding->destination;
1573 coding->produced_char += produced_chars;
1574 return 0;
1575 }
1576
1577 \f
1578 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1579
1580 /* Emacs' internal format for representation of multiple character
1581 sets is a kind of multi-byte encoding, i.e. characters are
1582 represented by variable-length sequences of one-byte codes.
1583
1584 ASCII characters and control characters (e.g. `tab', `newline') are
1585 represented by one-byte sequences which are their ASCII codes, in
1586 the range 0x00 through 0x7F.
1587
1588 8-bit characters of the range 0x80..0x9F are represented by
1589 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1590 code + 0x20).
1591
1592 8-bit characters of the range 0xA0..0xFF are represented by
1593 one-byte sequences which are their 8-bit code.
1594
1595 The other characters are represented by a sequence of `base
1596 leading-code', optional `extended leading-code', and one or two
1597 `position-code's. The length of the sequence is determined by the
1598 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1599 whereas extended leading-code and position-code take the range 0xA0
1600 through 0xFF. See `charset.h' for more details about leading-code
1601 and position-code.
1602
1603 --- CODE RANGE of Emacs' internal format ---
1604 character set range
1605 ------------- -----
1606 ascii 0x00..0x7F
1607 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1608 eight-bit-graphic 0xA0..0xBF
1609 ELSE 0x81..0x9D + [0xA0..0xFF]+
1610 ---------------------------------------------
1611
1612 As this is the internal character representation, the format is
1613 usually not used externally (i.e. in a file or in a data sent to a
1614 process). But, it is possible to have a text externally in this
1615 format (i.e. by encoding by the coding system `emacs-mule').
1616
1617 In that case, a sequence of one-byte codes has a slightly different
1618 form.
1619
1620 At first, all characters in eight-bit-control are represented by
1621 one-byte sequences which are their 8-bit code.
1622
1623 Next, character composition data are represented by the byte
1624 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1625 where,
1626 METHOD is 0xF0 plus one of composition method (enum
1627 composition_method),
1628
1629 BYTES is 0xA0 plus a byte length of this composition data,
1630
1631 CHARS is 0x20 plus a number of characters composed by this
1632 data,
1633
1634 COMPONENTs are characters of multibye form or composition
1635 rules encoded by two-byte of ASCII codes.
1636
1637 In addition, for backward compatibility, the following formats are
1638 also recognized as composition data on decoding.
1639
1640 0x80 MSEQ ...
1641 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1642
1643 Here,
1644 MSEQ is a multibyte form but in these special format:
1645 ASCII: 0xA0 ASCII_CODE+0x80,
1646 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1647 RULE is a one byte code of the range 0xA0..0xF0 that
1648 represents a composition rule.
1649 */
1650
1651 char emacs_mule_bytes[256];
1652
1653 int
1654 emacs_mule_char (coding, src, nbytes, nchars, id)
1655 struct coding_system *coding;
1656 const unsigned char *src;
1657 int *nbytes, *nchars, *id;
1658 {
1659 const unsigned char *src_end = coding->source + coding->src_bytes;
1660 const unsigned char *src_base = src;
1661 int multibytep = coding->src_multibyte;
1662 struct charset *charset;
1663 unsigned code;
1664 int c;
1665 int consumed_chars = 0;
1666
1667 ONE_MORE_BYTE (c);
1668 if (c < 0)
1669 {
1670 c = -c;
1671 charset = emacs_mule_charset[0];
1672 }
1673 else
1674 {
1675 switch (emacs_mule_bytes[c])
1676 {
1677 case 2:
1678 if (! (charset = emacs_mule_charset[c]))
1679 goto invalid_code;
1680 ONE_MORE_BYTE (c);
1681 if (c < 0)
1682 goto invalid_code;
1683 code = c & 0x7F;
1684 break;
1685
1686 case 3:
1687 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1688 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1689 {
1690 ONE_MORE_BYTE (c);
1691 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1692 goto invalid_code;
1693 ONE_MORE_BYTE (c);
1694 if (c < 0)
1695 goto invalid_code;
1696 code = c & 0x7F;
1697 }
1698 else
1699 {
1700 if (! (charset = emacs_mule_charset[c]))
1701 goto invalid_code;
1702 ONE_MORE_BYTE (c);
1703 if (c < 0)
1704 goto invalid_code;
1705 code = (c & 0x7F) << 8;
1706 ONE_MORE_BYTE (c);
1707 if (c < 0)
1708 goto invalid_code;
1709 code |= c & 0x7F;
1710 }
1711 break;
1712
1713 case 4:
1714 ONE_MORE_BYTE (c);
1715 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1716 goto invalid_code;
1717 ONE_MORE_BYTE (c);
1718 if (c < 0)
1719 goto invalid_code;
1720 code = (c & 0x7F) << 8;
1721 ONE_MORE_BYTE (c);
1722 if (c < 0)
1723 goto invalid_code;
1724 code |= c & 0x7F;
1725 break;
1726
1727 case 1:
1728 code = c;
1729 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1730 ? charset_ascii : charset_eight_bit);
1731 break;
1732
1733 default:
1734 abort ();
1735 }
1736 c = DECODE_CHAR (charset, code);
1737 if (c < 0)
1738 goto invalid_code;
1739 }
1740 *nbytes = src - src_base;
1741 *nchars = consumed_chars;
1742 if (id)
1743 *id = charset->id;
1744 return c;
1745
1746 no_more_source:
1747 return -2;
1748
1749 invalid_code:
1750 return -1;
1751 }
1752
1753
1754 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1755 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1756 else return 0. */
1757
1758 static int
1759 detect_coding_emacs_mule (coding, detect_info)
1760 struct coding_system *coding;
1761 struct coding_detection_info *detect_info;
1762 {
1763 const unsigned char *src = coding->source, *src_base;
1764 const unsigned char *src_end = coding->source + coding->src_bytes;
1765 int multibytep = coding->src_multibyte;
1766 int consumed_chars = 0;
1767 int c;
1768 int found = 0;
1769
1770 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1771 /* A coding system of this category is always ASCII compatible. */
1772 src += coding->head_ascii;
1773
1774 while (1)
1775 {
1776 src_base = src;
1777 ONE_MORE_BYTE (c);
1778 if (c < 0)
1779 continue;
1780 if (c == 0x80)
1781 {
1782 /* Perhaps the start of composite character. We simple skip
1783 it because analyzing it is too heavy for detecting. But,
1784 at least, we check that the composite character
1785 constitues of more than 4 bytes. */
1786 const unsigned char *src_base;
1787
1788 repeat:
1789 src_base = src;
1790 do
1791 {
1792 ONE_MORE_BYTE (c);
1793 }
1794 while (c >= 0xA0);
1795
1796 if (src - src_base <= 4)
1797 break;
1798 found = CATEGORY_MASK_EMACS_MULE;
1799 if (c == 0x80)
1800 goto repeat;
1801 }
1802
1803 if (c < 0x80)
1804 {
1805 if (c < 0x20
1806 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1807 break;
1808 }
1809 else
1810 {
1811 int more_bytes = emacs_mule_bytes[*src_base] - 1;
1812
1813 while (more_bytes > 0)
1814 {
1815 ONE_MORE_BYTE (c);
1816 if (c < 0xA0)
1817 {
1818 src--; /* Unread the last byte. */
1819 break;
1820 }
1821 more_bytes--;
1822 }
1823 if (more_bytes != 0)
1824 break;
1825 found = CATEGORY_MASK_EMACS_MULE;
1826 }
1827 }
1828 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1829 return 0;
1830
1831 no_more_source:
1832 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1833 {
1834 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1835 return 0;
1836 }
1837 detect_info->found |= found;
1838 return 1;
1839 }
1840
1841
1842 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1843
1844 /* Decode a character represented as a component of composition
1845 sequence of Emacs 20/21 style at SRC. Set C to that character and
1846 update SRC to the head of next character (or an encoded composition
1847 rule). If SRC doesn't points a composition component, set C to -1.
1848 If SRC points an invalid byte sequence, global exit by a return
1849 value 0. */
1850
1851 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1852 if (1) \
1853 { \
1854 int c; \
1855 int nbytes, nchars; \
1856 \
1857 if (src == src_end) \
1858 break; \
1859 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1860 if (c < 0) \
1861 { \
1862 if (c == -2) \
1863 break; \
1864 goto invalid_code; \
1865 } \
1866 *buf++ = c; \
1867 src += nbytes; \
1868 consumed_chars += nchars; \
1869 } \
1870 else
1871
1872
1873 /* Decode a composition rule represented as a component of composition
1874 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1875 and increment BUF. If SRC points an invalid byte sequence, set C
1876 to -1. */
1877
1878 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1879 do { \
1880 int c, gref, nref; \
1881 \
1882 if (src >= src_end) \
1883 goto invalid_code; \
1884 ONE_MORE_BYTE_NO_CHECK (c); \
1885 c -= 0x20; \
1886 if (c < 0 || c >= 81) \
1887 goto invalid_code; \
1888 \
1889 gref = c / 9, nref = c % 9; \
1890 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1891 } while (0)
1892
1893
1894 /* Decode a composition rule represented as a component of composition
1895 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1896 and increment BUF. If SRC points an invalid byte sequence, set C
1897 to -1. */
1898
1899 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1900 do { \
1901 int gref, nref; \
1902 \
1903 if (src + 1>= src_end) \
1904 goto invalid_code; \
1905 ONE_MORE_BYTE_NO_CHECK (gref); \
1906 gref -= 0x20; \
1907 ONE_MORE_BYTE_NO_CHECK (nref); \
1908 nref -= 0x20; \
1909 if (gref < 0 || gref >= 81 \
1910 || nref < 0 || nref >= 81) \
1911 goto invalid_code; \
1912 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1913 } while (0)
1914
1915
1916 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1917 do { \
1918 /* Emacs 21 style format. The first three bytes at SRC are \
1919 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1920 the byte length of this composition information, CHARS is the \
1921 number of characters composed by this composition. */ \
1922 enum composition_method method = c - 0xF2; \
1923 int *charbuf_base = charbuf; \
1924 int from, to; \
1925 int consumed_chars_limit; \
1926 int nbytes, nchars; \
1927 \
1928 ONE_MORE_BYTE (c); \
1929 if (c < 0) \
1930 goto invalid_code; \
1931 nbytes = c - 0xA0; \
1932 if (nbytes < 3) \
1933 goto invalid_code; \
1934 ONE_MORE_BYTE (c); \
1935 if (c < 0) \
1936 goto invalid_code; \
1937 nchars = c - 0xA0; \
1938 from = coding->produced + char_offset; \
1939 to = from + nchars; \
1940 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1941 consumed_chars_limit = consumed_chars_base + nbytes; \
1942 if (method != COMPOSITION_RELATIVE) \
1943 { \
1944 int i = 0; \
1945 while (consumed_chars < consumed_chars_limit) \
1946 { \
1947 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1948 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1949 else \
1950 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1951 i++; \
1952 } \
1953 if (consumed_chars < consumed_chars_limit) \
1954 goto invalid_code; \
1955 charbuf_base[0] -= i; \
1956 } \
1957 } while (0)
1958
1959
1960 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1961 do { \
1962 /* Emacs 20 style format for relative composition. */ \
1963 /* Store multibyte form of characters to be composed. */ \
1964 enum composition_method method = COMPOSITION_RELATIVE; \
1965 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1966 int *buf = components; \
1967 int i, j; \
1968 int from, to; \
1969 \
1970 src = src_base; \
1971 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1972 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1973 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1974 if (i < 2) \
1975 goto invalid_code; \
1976 from = coding->produced_char + char_offset; \
1977 to = from + i; \
1978 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1979 for (j = 0; j < i; j++) \
1980 *charbuf++ = components[j]; \
1981 } while (0)
1982
1983
1984 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1985 do { \
1986 /* Emacs 20 style format for rule-base composition. */ \
1987 /* Store multibyte form of characters to be composed. */ \
1988 enum composition_method method = COMPOSITION_WITH_RULE; \
1989 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1990 int *buf = components; \
1991 int i, j; \
1992 int from, to; \
1993 \
1994 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1995 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1996 { \
1997 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1998 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1999 } \
2000 if (i < 1 || (buf - components) % 2 == 0) \
2001 goto invalid_code; \
2002 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
2003 goto no_more_source; \
2004 from = coding->produced_char + char_offset; \
2005 to = from + i; \
2006 ADD_COMPOSITION_DATA (buf, from, to, method); \
2007 for (j = 0; j < i; j++) \
2008 *charbuf++ = components[j]; \
2009 for (j = 0; j < i; j += 2) \
2010 *charbuf++ = components[j]; \
2011 } while (0)
2012
2013
2014 static void
2015 decode_coding_emacs_mule (coding)
2016 struct coding_system *coding;
2017 {
2018 const unsigned char *src = coding->source + coding->consumed;
2019 const unsigned char *src_end = coding->source + coding->src_bytes;
2020 const unsigned char *src_base;
2021 int *charbuf = coding->charbuf;
2022 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2023 int consumed_chars = 0, consumed_chars_base;
2024 int multibytep = coding->src_multibyte;
2025 Lisp_Object attrs, charset_list;
2026 int char_offset = coding->produced_char;
2027 int last_offset = char_offset;
2028 int last_id = charset_ascii;
2029
2030 CODING_GET_INFO (coding, attrs, charset_list);
2031
2032 while (1)
2033 {
2034 int c;
2035
2036 src_base = src;
2037 consumed_chars_base = consumed_chars;
2038
2039 if (charbuf >= charbuf_end)
2040 break;
2041
2042 ONE_MORE_BYTE (c);
2043 if (c < 0)
2044 {
2045 *charbuf++ = -c;
2046 char_offset++;
2047 }
2048 else if (c < 0x80)
2049 {
2050 *charbuf++ = c;
2051 char_offset++;
2052 }
2053 else if (c == 0x80)
2054 {
2055 ONE_MORE_BYTE (c);
2056 if (c < 0)
2057 goto invalid_code;
2058 if (c - 0xF2 >= COMPOSITION_RELATIVE
2059 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2060 DECODE_EMACS_MULE_21_COMPOSITION (c);
2061 else if (c < 0xC0)
2062 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2063 else if (c == 0xFF)
2064 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2065 else
2066 goto invalid_code;
2067 }
2068 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2069 {
2070 int nbytes, nchars;
2071 int id;
2072
2073 src = src_base;
2074 consumed_chars = consumed_chars_base;
2075 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2076 if (c < 0)
2077 {
2078 if (c == -2)
2079 break;
2080 goto invalid_code;
2081 }
2082 if (last_id != id)
2083 {
2084 if (last_id != charset_ascii)
2085 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2086 last_id = id;
2087 last_offset = char_offset;
2088 }
2089 *charbuf++ = c;
2090 src += nbytes;
2091 consumed_chars += nchars;
2092 char_offset++;
2093 }
2094 continue;
2095
2096 invalid_code:
2097 src = src_base;
2098 consumed_chars = consumed_chars_base;
2099 ONE_MORE_BYTE (c);
2100 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2101 char_offset++;
2102 coding->errors++;
2103 }
2104
2105 no_more_source:
2106 if (last_id != charset_ascii)
2107 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2108 coding->consumed_char += consumed_chars_base;
2109 coding->consumed = src_base - coding->source;
2110 coding->charbuf_used = charbuf - coding->charbuf;
2111 }
2112
2113
2114 #define EMACS_MULE_LEADING_CODES(id, codes) \
2115 do { \
2116 if (id < 0xA0) \
2117 codes[0] = id, codes[1] = 0; \
2118 else if (id < 0xE0) \
2119 codes[0] = 0x9A, codes[1] = id; \
2120 else if (id < 0xF0) \
2121 codes[0] = 0x9B, codes[1] = id; \
2122 else if (id < 0xF5) \
2123 codes[0] = 0x9C, codes[1] = id; \
2124 else \
2125 codes[0] = 0x9D, codes[1] = id; \
2126 } while (0);
2127
2128
2129 static int
2130 encode_coding_emacs_mule (coding)
2131 struct coding_system *coding;
2132 {
2133 int multibytep = coding->dst_multibyte;
2134 int *charbuf = coding->charbuf;
2135 int *charbuf_end = charbuf + coding->charbuf_used;
2136 unsigned char *dst = coding->destination + coding->produced;
2137 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2138 int safe_room = 8;
2139 int produced_chars = 0;
2140 Lisp_Object attrs, charset_list;
2141 int c;
2142 int preferred_charset_id = -1;
2143
2144 CODING_GET_INFO (coding, attrs, charset_list);
2145 if (! EQ (charset_list, Vemacs_mule_charset_list))
2146 {
2147 CODING_ATTR_CHARSET_LIST (attrs)
2148 = charset_list = Vemacs_mule_charset_list;
2149 }
2150
2151 while (charbuf < charbuf_end)
2152 {
2153 ASSURE_DESTINATION (safe_room);
2154 c = *charbuf++;
2155
2156 if (c < 0)
2157 {
2158 /* Handle an annotation. */
2159 switch (*charbuf)
2160 {
2161 case CODING_ANNOTATE_COMPOSITION_MASK:
2162 /* Not yet implemented. */
2163 break;
2164 case CODING_ANNOTATE_CHARSET_MASK:
2165 preferred_charset_id = charbuf[3];
2166 if (preferred_charset_id >= 0
2167 && NILP (Fmemq (make_number (preferred_charset_id),
2168 charset_list)))
2169 preferred_charset_id = -1;
2170 break;
2171 default:
2172 abort ();
2173 }
2174 charbuf += -c - 1;
2175 continue;
2176 }
2177
2178 if (ASCII_CHAR_P (c))
2179 EMIT_ONE_ASCII_BYTE (c);
2180 else if (CHAR_BYTE8_P (c))
2181 {
2182 c = CHAR_TO_BYTE8 (c);
2183 EMIT_ONE_BYTE (c);
2184 }
2185 else
2186 {
2187 struct charset *charset;
2188 unsigned code;
2189 int dimension;
2190 int emacs_mule_id;
2191 unsigned char leading_codes[2];
2192
2193 if (preferred_charset_id >= 0)
2194 {
2195 charset = CHARSET_FROM_ID (preferred_charset_id);
2196 if (! CHAR_CHARSET_P (c, charset))
2197 charset = char_charset (c, charset_list, NULL);
2198 }
2199 else
2200 charset = char_charset (c, charset_list, &code);
2201 if (! charset)
2202 {
2203 c = coding->default_char;
2204 if (ASCII_CHAR_P (c))
2205 {
2206 EMIT_ONE_ASCII_BYTE (c);
2207 continue;
2208 }
2209 charset = char_charset (c, charset_list, &code);
2210 }
2211 dimension = CHARSET_DIMENSION (charset);
2212 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2213 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2214 EMIT_ONE_BYTE (leading_codes[0]);
2215 if (leading_codes[1])
2216 EMIT_ONE_BYTE (leading_codes[1]);
2217 if (dimension == 1)
2218 EMIT_ONE_BYTE (code | 0x80);
2219 else
2220 {
2221 code |= 0x8080;
2222 EMIT_ONE_BYTE (code >> 8);
2223 EMIT_ONE_BYTE (code & 0xFF);
2224 }
2225 }
2226 }
2227 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2228 coding->produced_char += produced_chars;
2229 coding->produced = dst - coding->destination;
2230 return 0;
2231 }
2232
2233 \f
2234 /*** 7. ISO2022 handlers ***/
2235
2236 /* The following note describes the coding system ISO2022 briefly.
2237 Since the intention of this note is to help understand the
2238 functions in this file, some parts are NOT ACCURATE or are OVERLY
2239 SIMPLIFIED. For thorough understanding, please refer to the
2240 original document of ISO2022. This is equivalent to the standard
2241 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2242
2243 ISO2022 provides many mechanisms to encode several character sets
2244 in 7-bit and 8-bit environments. For 7-bit environments, all text
2245 is encoded using bytes less than 128. This may make the encoded
2246 text a little bit longer, but the text passes more easily through
2247 several types of gateway, some of which strip off the MSB (Most
2248 Significant Bit).
2249
2250 There are two kinds of character sets: control character sets and
2251 graphic character sets. The former contain control characters such
2252 as `newline' and `escape' to provide control functions (control
2253 functions are also provided by escape sequences). The latter
2254 contain graphic characters such as 'A' and '-'. Emacs recognizes
2255 two control character sets and many graphic character sets.
2256
2257 Graphic character sets are classified into one of the following
2258 four classes, according to the number of bytes (DIMENSION) and
2259 number of characters in one dimension (CHARS) of the set:
2260 - DIMENSION1_CHARS94
2261 - DIMENSION1_CHARS96
2262 - DIMENSION2_CHARS94
2263 - DIMENSION2_CHARS96
2264
2265 In addition, each character set is assigned an identification tag,
2266 unique for each set, called the "final character" (denoted as <F>
2267 hereafter). The <F> of each character set is decided by ECMA(*)
2268 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2269 (0x30..0x3F are for private use only).
2270
2271 Note (*): ECMA = European Computer Manufacturers Association
2272
2273 Here are examples of graphic character sets [NAME(<F>)]:
2274 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2275 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2276 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2277 o DIMENSION2_CHARS96 -- none for the moment
2278
2279 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2280 C0 [0x00..0x1F] -- control character plane 0
2281 GL [0x20..0x7F] -- graphic character plane 0
2282 C1 [0x80..0x9F] -- control character plane 1
2283 GR [0xA0..0xFF] -- graphic character plane 1
2284
2285 A control character set is directly designated and invoked to C0 or
2286 C1 by an escape sequence. The most common case is that:
2287 - ISO646's control character set is designated/invoked to C0, and
2288 - ISO6429's control character set is designated/invoked to C1,
2289 and usually these designations/invocations are omitted in encoded
2290 text. In a 7-bit environment, only C0 can be used, and a control
2291 character for C1 is encoded by an appropriate escape sequence to
2292 fit into the environment. All control characters for C1 are
2293 defined to have corresponding escape sequences.
2294
2295 A graphic character set is at first designated to one of four
2296 graphic registers (G0 through G3), then these graphic registers are
2297 invoked to GL or GR. These designations and invocations can be
2298 done independently. The most common case is that G0 is invoked to
2299 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2300 these invocations and designations are omitted in encoded text.
2301 In a 7-bit environment, only GL can be used.
2302
2303 When a graphic character set of CHARS94 is invoked to GL, codes
2304 0x20 and 0x7F of the GL area work as control characters SPACE and
2305 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2306 be used.
2307
2308 There are two ways of invocation: locking-shift and single-shift.
2309 With locking-shift, the invocation lasts until the next different
2310 invocation, whereas with single-shift, the invocation affects the
2311 following character only and doesn't affect the locking-shift
2312 state. Invocations are done by the following control characters or
2313 escape sequences:
2314
2315 ----------------------------------------------------------------------
2316 abbrev function cntrl escape seq description
2317 ----------------------------------------------------------------------
2318 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2319 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2320 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2321 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2322 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2323 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2324 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2325 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2326 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2327 ----------------------------------------------------------------------
2328 (*) These are not used by any known coding system.
2329
2330 Control characters for these functions are defined by macros
2331 ISO_CODE_XXX in `coding.h'.
2332
2333 Designations are done by the following escape sequences:
2334 ----------------------------------------------------------------------
2335 escape sequence description
2336 ----------------------------------------------------------------------
2337 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2338 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2339 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2340 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2341 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2342 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2343 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2344 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2345 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2346 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2347 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2348 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2349 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2350 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2351 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2352 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2353 ----------------------------------------------------------------------
2354
2355 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2356 of dimension 1, chars 94, and final character <F>, etc...
2357
2358 Note (*): Although these designations are not allowed in ISO2022,
2359 Emacs accepts them on decoding, and produces them on encoding
2360 CHARS96 character sets in a coding system which is characterized as
2361 7-bit environment, non-locking-shift, and non-single-shift.
2362
2363 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2364 '(' must be omitted. We refer to this as "short-form" hereafter.
2365
2366 Now you may notice that there are a lot of ways of encoding the
2367 same multilingual text in ISO2022. Actually, there exist many
2368 coding systems such as Compound Text (used in X11's inter client
2369 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2370 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2371 localized platforms), and all of these are variants of ISO2022.
2372
2373 In addition to the above, Emacs handles two more kinds of escape
2374 sequences: ISO6429's direction specification and Emacs' private
2375 sequence for specifying character composition.
2376
2377 ISO6429's direction specification takes the following form:
2378 o CSI ']' -- end of the current direction
2379 o CSI '0' ']' -- end of the current direction
2380 o CSI '1' ']' -- start of left-to-right text
2381 o CSI '2' ']' -- start of right-to-left text
2382 The control character CSI (0x9B: control sequence introducer) is
2383 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2384
2385 Character composition specification takes the following form:
2386 o ESC '0' -- start relative composition
2387 o ESC '1' -- end composition
2388 o ESC '2' -- start rule-base composition (*)
2389 o ESC '3' -- start relative composition with alternate chars (**)
2390 o ESC '4' -- start rule-base composition with alternate chars (**)
2391 Since these are not standard escape sequences of any ISO standard,
2392 the use of them with these meanings is restricted to Emacs only.
2393
2394 (*) This form is used only in Emacs 20.7 and older versions,
2395 but newer versions can safely decode it.
2396 (**) This form is used only in Emacs 21.1 and newer versions,
2397 and older versions can't decode it.
2398
2399 Here's a list of example usages of these composition escape
2400 sequences (categorized by `enum composition_method').
2401
2402 COMPOSITION_RELATIVE:
2403 ESC 0 CHAR [ CHAR ] ESC 1
2404 COMPOSITION_WITH_RULE:
2405 ESC 2 CHAR [ RULE CHAR ] ESC 1
2406 COMPOSITION_WITH_ALTCHARS:
2407 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2408 COMPOSITION_WITH_RULE_ALTCHARS:
2409 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2410
2411 enum iso_code_class_type iso_code_class[256];
2412
2413 #define SAFE_CHARSET_P(coding, id) \
2414 ((id) <= (coding)->max_charset_id \
2415 && (coding)->safe_charsets[id] >= 0)
2416
2417
2418 #define SHIFT_OUT_OK(category) \
2419 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2420
2421 static void
2422 setup_iso_safe_charsets (attrs)
2423 Lisp_Object attrs;
2424 {
2425 Lisp_Object charset_list, safe_charsets;
2426 Lisp_Object request;
2427 Lisp_Object reg_usage;
2428 Lisp_Object tail;
2429 int reg94, reg96;
2430 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2431 int max_charset_id;
2432
2433 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2434 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2435 && ! EQ (charset_list, Viso_2022_charset_list))
2436 {
2437 CODING_ATTR_CHARSET_LIST (attrs)
2438 = charset_list = Viso_2022_charset_list;
2439 ASET (attrs, coding_attr_safe_charsets, Qnil);
2440 }
2441
2442 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2443 return;
2444
2445 max_charset_id = 0;
2446 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2447 {
2448 int id = XINT (XCAR (tail));
2449 if (max_charset_id < id)
2450 max_charset_id = id;
2451 }
2452
2453 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2454 make_number (255));
2455 request = AREF (attrs, coding_attr_iso_request);
2456 reg_usage = AREF (attrs, coding_attr_iso_usage);
2457 reg94 = XINT (XCAR (reg_usage));
2458 reg96 = XINT (XCDR (reg_usage));
2459
2460 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2461 {
2462 Lisp_Object id;
2463 Lisp_Object reg;
2464 struct charset *charset;
2465
2466 id = XCAR (tail);
2467 charset = CHARSET_FROM_ID (XINT (id));
2468 reg = Fcdr (Fassq (id, request));
2469 if (! NILP (reg))
2470 SSET (safe_charsets, XINT (id), XINT (reg));
2471 else if (charset->iso_chars_96)
2472 {
2473 if (reg96 < 4)
2474 SSET (safe_charsets, XINT (id), reg96);
2475 }
2476 else
2477 {
2478 if (reg94 < 4)
2479 SSET (safe_charsets, XINT (id), reg94);
2480 }
2481 }
2482 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2483 }
2484
2485
2486 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2487 Check if a text is encoded in one of ISO-2022 based codig systems.
2488 If it is, return 1, else return 0. */
2489
2490 static int
2491 detect_coding_iso_2022 (coding, detect_info)
2492 struct coding_system *coding;
2493 struct coding_detection_info *detect_info;
2494 {
2495 const unsigned char *src = coding->source, *src_base = src;
2496 const unsigned char *src_end = coding->source + coding->src_bytes;
2497 int multibytep = coding->src_multibyte;
2498 int single_shifting = 0;
2499 int id;
2500 int c, c1;
2501 int consumed_chars = 0;
2502 int i;
2503 int rejected = 0;
2504 int found = 0;
2505
2506 detect_info->checked |= CATEGORY_MASK_ISO;
2507
2508 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2509 {
2510 struct coding_system *this = &(coding_categories[i]);
2511 Lisp_Object attrs, val;
2512
2513 attrs = CODING_ID_ATTRS (this->id);
2514 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2515 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2516 setup_iso_safe_charsets (attrs);
2517 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2518 this->max_charset_id = SCHARS (val) - 1;
2519 this->safe_charsets = (char *) SDATA (val);
2520 }
2521
2522 /* A coding system of this category is always ASCII compatible. */
2523 src += coding->head_ascii;
2524
2525 while (rejected != CATEGORY_MASK_ISO)
2526 {
2527 src_base = src;
2528 ONE_MORE_BYTE (c);
2529 switch (c)
2530 {
2531 case ISO_CODE_ESC:
2532 if (inhibit_iso_escape_detection)
2533 break;
2534 single_shifting = 0;
2535 ONE_MORE_BYTE (c);
2536 if (c >= '(' && c <= '/')
2537 {
2538 /* Designation sequence for a charset of dimension 1. */
2539 ONE_MORE_BYTE (c1);
2540 if (c1 < ' ' || c1 >= 0x80
2541 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2542 /* Invalid designation sequence. Just ignore. */
2543 break;
2544 }
2545 else if (c == '$')
2546 {
2547 /* Designation sequence for a charset of dimension 2. */
2548 ONE_MORE_BYTE (c);
2549 if (c >= '@' && c <= 'B')
2550 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2551 id = iso_charset_table[1][0][c];
2552 else if (c >= '(' && c <= '/')
2553 {
2554 ONE_MORE_BYTE (c1);
2555 if (c1 < ' ' || c1 >= 0x80
2556 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2557 /* Invalid designation sequence. Just ignore. */
2558 break;
2559 }
2560 else
2561 /* Invalid designation sequence. Just ignore it. */
2562 break;
2563 }
2564 else if (c == 'N' || c == 'O')
2565 {
2566 /* ESC <Fe> for SS2 or SS3. */
2567 single_shifting = 1;
2568 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2569 break;
2570 }
2571 else if (c >= '0' && c <= '4')
2572 {
2573 /* ESC <Fp> for start/end composition. */
2574 found |= CATEGORY_MASK_ISO;
2575 break;
2576 }
2577 else
2578 {
2579 /* Invalid escape sequence. Just ignore it. */
2580 break;
2581 }
2582
2583 /* We found a valid designation sequence for CHARSET. */
2584 rejected |= CATEGORY_MASK_ISO_8BIT;
2585 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2586 id))
2587 found |= CATEGORY_MASK_ISO_7;
2588 else
2589 rejected |= CATEGORY_MASK_ISO_7;
2590 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2591 id))
2592 found |= CATEGORY_MASK_ISO_7_TIGHT;
2593 else
2594 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2595 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2596 id))
2597 found |= CATEGORY_MASK_ISO_7_ELSE;
2598 else
2599 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2600 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2601 id))
2602 found |= CATEGORY_MASK_ISO_8_ELSE;
2603 else
2604 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2605 break;
2606
2607 case ISO_CODE_SO:
2608 case ISO_CODE_SI:
2609 /* Locking shift out/in. */
2610 if (inhibit_iso_escape_detection)
2611 break;
2612 single_shifting = 0;
2613 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2614 found |= CATEGORY_MASK_ISO_ELSE;
2615 break;
2616
2617 case ISO_CODE_CSI:
2618 /* Control sequence introducer. */
2619 single_shifting = 0;
2620 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2621 found |= CATEGORY_MASK_ISO_8_ELSE;
2622 goto check_extra_latin;
2623
2624 case ISO_CODE_SS2:
2625 case ISO_CODE_SS3:
2626 /* Single shift. */
2627 if (inhibit_iso_escape_detection)
2628 break;
2629 single_shifting = 0;
2630 rejected |= CATEGORY_MASK_ISO_7BIT;
2631 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2632 & CODING_ISO_FLAG_SINGLE_SHIFT)
2633 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2634 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2635 & CODING_ISO_FLAG_SINGLE_SHIFT)
2636 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2637 if (single_shifting)
2638 break;
2639 goto check_extra_latin;
2640
2641 default:
2642 if (c < 0)
2643 continue;
2644 if (c < 0x80)
2645 {
2646 single_shifting = 0;
2647 break;
2648 }
2649 if (c >= 0xA0)
2650 {
2651 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2652 found |= CATEGORY_MASK_ISO_8_1;
2653 /* Check the length of succeeding codes of the range
2654 0xA0..0FF. If the byte length is even, we include
2655 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2656 only when we are not single shifting. */
2657 if (! single_shifting
2658 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2659 {
2660 int i = 1;
2661 while (src < src_end)
2662 {
2663 ONE_MORE_BYTE (c);
2664 if (c < 0xA0)
2665 break;
2666 i++;
2667 }
2668
2669 if (i & 1 && src < src_end)
2670 rejected |= CATEGORY_MASK_ISO_8_2;
2671 else
2672 found |= CATEGORY_MASK_ISO_8_2;
2673 }
2674 break;
2675 }
2676 check_extra_latin:
2677 single_shifting = 0;
2678 if (! VECTORP (Vlatin_extra_code_table)
2679 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2680 {
2681 rejected = CATEGORY_MASK_ISO;
2682 break;
2683 }
2684 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2685 & CODING_ISO_FLAG_LATIN_EXTRA)
2686 found |= CATEGORY_MASK_ISO_8_1;
2687 else
2688 rejected |= CATEGORY_MASK_ISO_8_1;
2689 rejected |= CATEGORY_MASK_ISO_8_2;
2690 }
2691 }
2692 detect_info->rejected |= CATEGORY_MASK_ISO;
2693 return 0;
2694
2695 no_more_source:
2696 detect_info->rejected |= rejected;
2697 detect_info->found |= (found & ~rejected);
2698 return 1;
2699 }
2700
2701
2702 /* Set designation state into CODING. */
2703 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2704 do { \
2705 int id, prev; \
2706 \
2707 if (final < '0' || final >= 128 \
2708 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2709 || !SAFE_CHARSET_P (coding, id)) \
2710 { \
2711 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2712 goto invalid_code; \
2713 } \
2714 prev = CODING_ISO_DESIGNATION (coding, reg); \
2715 if (id == charset_jisx0201_roman) \
2716 { \
2717 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2718 id = charset_ascii; \
2719 } \
2720 else if (id == charset_jisx0208_1978) \
2721 { \
2722 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2723 id = charset_jisx0208; \
2724 } \
2725 CODING_ISO_DESIGNATION (coding, reg) = id; \
2726 /* If there was an invalid designation to REG previously, and this \
2727 designation is ASCII to REG, we should keep this designation \
2728 sequence. */ \
2729 if (prev == -2 && id == charset_ascii) \
2730 goto invalid_code; \
2731 } while (0)
2732
2733
2734 #define MAYBE_FINISH_COMPOSITION() \
2735 do { \
2736 int i; \
2737 if (composition_state == COMPOSING_NO) \
2738 break; \
2739 /* It is assured that we have enough room for producing \
2740 characters stored in the table `components'. */ \
2741 if (charbuf + component_idx > charbuf_end) \
2742 goto no_more_source; \
2743 composition_state = COMPOSING_NO; \
2744 if (method == COMPOSITION_RELATIVE \
2745 || method == COMPOSITION_WITH_ALTCHARS) \
2746 { \
2747 for (i = 0; i < component_idx; i++) \
2748 *charbuf++ = components[i]; \
2749 char_offset += component_idx; \
2750 } \
2751 else \
2752 { \
2753 for (i = 0; i < component_idx; i += 2) \
2754 *charbuf++ = components[i]; \
2755 char_offset += (component_idx / 2) + 1; \
2756 } \
2757 } while (0)
2758
2759
2760 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2761 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2762 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2763 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2764 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2765 */
2766
2767 #define DECODE_COMPOSITION_START(c1) \
2768 do { \
2769 if (c1 == '0' \
2770 && composition_state == COMPOSING_COMPONENT_RULE) \
2771 { \
2772 component_len = component_idx; \
2773 composition_state = COMPOSING_CHAR; \
2774 } \
2775 else \
2776 { \
2777 const unsigned char *p; \
2778 \
2779 MAYBE_FINISH_COMPOSITION (); \
2780 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2781 goto no_more_source; \
2782 for (p = src; p < src_end - 1; p++) \
2783 if (*p == ISO_CODE_ESC && p[1] == '1') \
2784 break; \
2785 if (p == src_end - 1) \
2786 { \
2787 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2788 goto invalid_code; \
2789 goto no_more_source; \
2790 } \
2791 \
2792 /* This is surely the start of a composition. */ \
2793 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2794 : c1 == '2' ? COMPOSITION_WITH_RULE \
2795 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2796 : COMPOSITION_WITH_RULE_ALTCHARS); \
2797 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2798 : COMPOSING_COMPONENT_CHAR); \
2799 component_idx = component_len = 0; \
2800 } \
2801 } while (0)
2802
2803
2804 /* Handle compositoin end sequence ESC 1. */
2805
2806 #define DECODE_COMPOSITION_END() \
2807 do { \
2808 int nchars = (component_len > 0 ? component_idx - component_len \
2809 : method == COMPOSITION_RELATIVE ? component_idx \
2810 : (component_idx + 1) / 2); \
2811 int i; \
2812 int *saved_charbuf = charbuf; \
2813 int from = char_offset; \
2814 int to = from + nchars; \
2815 \
2816 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2817 if (method != COMPOSITION_RELATIVE) \
2818 { \
2819 if (component_len == 0) \
2820 for (i = 0; i < component_idx; i++) \
2821 *charbuf++ = components[i]; \
2822 else \
2823 for (i = 0; i < component_len; i++) \
2824 *charbuf++ = components[i]; \
2825 *saved_charbuf = saved_charbuf - charbuf; \
2826 } \
2827 if (method == COMPOSITION_WITH_RULE) \
2828 for (i = 0; i < component_idx; i += 2, char_offset++) \
2829 *charbuf++ = components[i]; \
2830 else \
2831 for (i = component_len; i < component_idx; i++, char_offset++) \
2832 *charbuf++ = components[i]; \
2833 coding->annotated = 1; \
2834 composition_state = COMPOSING_NO; \
2835 } while (0)
2836
2837
2838 /* Decode a composition rule from the byte C1 (and maybe one more byte
2839 from SRC) and store one encoded composition rule in
2840 coding->cmp_data. */
2841
2842 #define DECODE_COMPOSITION_RULE(c1) \
2843 do { \
2844 (c1) -= 32; \
2845 if (c1 < 81) /* old format (before ver.21) */ \
2846 { \
2847 int gref = (c1) / 9; \
2848 int nref = (c1) % 9; \
2849 if (gref == 4) gref = 10; \
2850 if (nref == 4) nref = 10; \
2851 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2852 } \
2853 else if (c1 < 93) /* new format (after ver.21) */ \
2854 { \
2855 ONE_MORE_BYTE (c2); \
2856 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2857 } \
2858 else \
2859 c1 = 0; \
2860 } while (0)
2861
2862
2863 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2864
2865 static void
2866 decode_coding_iso_2022 (coding)
2867 struct coding_system *coding;
2868 {
2869 const unsigned char *src = coding->source + coding->consumed;
2870 const unsigned char *src_end = coding->source + coding->src_bytes;
2871 const unsigned char *src_base;
2872 int *charbuf = coding->charbuf;
2873 int *charbuf_end
2874 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2875 int consumed_chars = 0, consumed_chars_base;
2876 int multibytep = coding->src_multibyte;
2877 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2878 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2879 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2880 struct charset *charset;
2881 int c;
2882 /* For handling composition sequence. */
2883 #define COMPOSING_NO 0
2884 #define COMPOSING_CHAR 1
2885 #define COMPOSING_RULE 2
2886 #define COMPOSING_COMPONENT_CHAR 3
2887 #define COMPOSING_COMPONENT_RULE 4
2888
2889 int composition_state = COMPOSING_NO;
2890 enum composition_method method;
2891 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2892 int component_idx;
2893 int component_len;
2894 Lisp_Object attrs, charset_list;
2895 int char_offset = coding->produced_char;
2896 int last_offset = char_offset;
2897 int last_id = charset_ascii;
2898
2899 CODING_GET_INFO (coding, attrs, charset_list);
2900 setup_iso_safe_charsets (attrs);
2901
2902 while (1)
2903 {
2904 int c1, c2;
2905
2906 src_base = src;
2907 consumed_chars_base = consumed_chars;
2908
2909 if (charbuf >= charbuf_end)
2910 break;
2911
2912 ONE_MORE_BYTE (c1);
2913 if (c1 < 0)
2914 goto invalid_code;
2915
2916 /* We produce at most one character. */
2917 switch (iso_code_class [c1])
2918 {
2919 case ISO_0x20_or_0x7F:
2920 if (composition_state != COMPOSING_NO)
2921 {
2922 if (composition_state == COMPOSING_RULE
2923 || composition_state == COMPOSING_COMPONENT_RULE)
2924 {
2925 DECODE_COMPOSITION_RULE (c1);
2926 components[component_idx++] = c1;
2927 composition_state--;
2928 continue;
2929 }
2930 }
2931 if (charset_id_0 < 0
2932 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2933 /* This is SPACE or DEL. */
2934 charset = CHARSET_FROM_ID (charset_ascii);
2935 else
2936 charset = CHARSET_FROM_ID (charset_id_0);
2937 break;
2938
2939 case ISO_graphic_plane_0:
2940 if (composition_state != COMPOSING_NO)
2941 {
2942 if (composition_state == COMPOSING_RULE
2943 || composition_state == COMPOSING_COMPONENT_RULE)
2944 {
2945 DECODE_COMPOSITION_RULE (c1);
2946 components[component_idx++] = c1;
2947 composition_state--;
2948 continue;
2949 }
2950 }
2951 charset = CHARSET_FROM_ID (charset_id_0);
2952 break;
2953
2954 case ISO_0xA0_or_0xFF:
2955 if (charset_id_1 < 0
2956 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2957 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2958 goto invalid_code;
2959 /* This is a graphic character, we fall down ... */
2960
2961 case ISO_graphic_plane_1:
2962 if (charset_id_1 < 0)
2963 goto invalid_code;
2964 charset = CHARSET_FROM_ID (charset_id_1);
2965 break;
2966
2967 case ISO_control_0:
2968 MAYBE_FINISH_COMPOSITION ();
2969 charset = CHARSET_FROM_ID (charset_ascii);
2970 break;
2971
2972 case ISO_control_1:
2973 MAYBE_FINISH_COMPOSITION ();
2974 goto invalid_code;
2975
2976 case ISO_shift_out:
2977 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2978 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2979 goto invalid_code;
2980 CODING_ISO_INVOCATION (coding, 0) = 1;
2981 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2982 continue;
2983
2984 case ISO_shift_in:
2985 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2986 goto invalid_code;
2987 CODING_ISO_INVOCATION (coding, 0) = 0;
2988 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2989 continue;
2990
2991 case ISO_single_shift_2_7:
2992 case ISO_single_shift_2:
2993 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2994 goto invalid_code;
2995 /* SS2 is handled as an escape sequence of ESC 'N' */
2996 c1 = 'N';
2997 goto label_escape_sequence;
2998
2999 case ISO_single_shift_3:
3000 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3001 goto invalid_code;
3002 /* SS2 is handled as an escape sequence of ESC 'O' */
3003 c1 = 'O';
3004 goto label_escape_sequence;
3005
3006 case ISO_control_sequence_introducer:
3007 /* CSI is handled as an escape sequence of ESC '[' ... */
3008 c1 = '[';
3009 goto label_escape_sequence;
3010
3011 case ISO_escape:
3012 ONE_MORE_BYTE (c1);
3013 label_escape_sequence:
3014 /* Escape sequences handled here are invocation,
3015 designation, direction specification, and character
3016 composition specification. */
3017 switch (c1)
3018 {
3019 case '&': /* revision of following character set */
3020 ONE_MORE_BYTE (c1);
3021 if (!(c1 >= '@' && c1 <= '~'))
3022 goto invalid_code;
3023 ONE_MORE_BYTE (c1);
3024 if (c1 != ISO_CODE_ESC)
3025 goto invalid_code;
3026 ONE_MORE_BYTE (c1);
3027 goto label_escape_sequence;
3028
3029 case '$': /* designation of 2-byte character set */
3030 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3031 goto invalid_code;
3032 ONE_MORE_BYTE (c1);
3033 if (c1 >= '@' && c1 <= 'B')
3034 { /* designation of JISX0208.1978, GB2312.1980,
3035 or JISX0208.1980 */
3036 DECODE_DESIGNATION (0, 2, 0, c1);
3037 }
3038 else if (c1 >= 0x28 && c1 <= 0x2B)
3039 { /* designation of DIMENSION2_CHARS94 character set */
3040 ONE_MORE_BYTE (c2);
3041 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3042 }
3043 else if (c1 >= 0x2C && c1 <= 0x2F)
3044 { /* designation of DIMENSION2_CHARS96 character set */
3045 ONE_MORE_BYTE (c2);
3046 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3047 }
3048 else
3049 goto invalid_code;
3050 /* We must update these variables now. */
3051 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3052 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3053 continue;
3054
3055 case 'n': /* invocation of locking-shift-2 */
3056 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3057 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3058 goto invalid_code;
3059 CODING_ISO_INVOCATION (coding, 0) = 2;
3060 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3061 continue;
3062
3063 case 'o': /* invocation of locking-shift-3 */
3064 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3065 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3066 goto invalid_code;
3067 CODING_ISO_INVOCATION (coding, 0) = 3;
3068 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3069 continue;
3070
3071 case 'N': /* invocation of single-shift-2 */
3072 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3073 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3074 goto invalid_code;
3075 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3076 ONE_MORE_BYTE (c1);
3077 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3078 goto invalid_code;
3079 break;
3080
3081 case 'O': /* invocation of single-shift-3 */
3082 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3083 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3084 goto invalid_code;
3085 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3086 ONE_MORE_BYTE (c1);
3087 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3088 goto invalid_code;
3089 break;
3090
3091 case '0': case '2': case '3': case '4': /* start composition */
3092 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3093 goto invalid_code;
3094 DECODE_COMPOSITION_START (c1);
3095 continue;
3096
3097 case '1': /* end composition */
3098 if (composition_state == COMPOSING_NO)
3099 goto invalid_code;
3100 DECODE_COMPOSITION_END ();
3101 continue;
3102
3103 case '[': /* specification of direction */
3104 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3105 goto invalid_code;
3106 /* For the moment, nested direction is not supported.
3107 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3108 left-to-right, and nozero means right-to-left. */
3109 ONE_MORE_BYTE (c1);
3110 switch (c1)
3111 {
3112 case ']': /* end of the current direction */
3113 coding->mode &= ~CODING_MODE_DIRECTION;
3114
3115 case '0': /* end of the current direction */
3116 case '1': /* start of left-to-right direction */
3117 ONE_MORE_BYTE (c1);
3118 if (c1 == ']')
3119 coding->mode &= ~CODING_MODE_DIRECTION;
3120 else
3121 goto invalid_code;
3122 break;
3123
3124 case '2': /* start of right-to-left direction */
3125 ONE_MORE_BYTE (c1);
3126 if (c1 == ']')
3127 coding->mode |= CODING_MODE_DIRECTION;
3128 else
3129 goto invalid_code;
3130 break;
3131
3132 default:
3133 goto invalid_code;
3134 }
3135 continue;
3136
3137 case '%':
3138 ONE_MORE_BYTE (c1);
3139 if (c1 == '/')
3140 {
3141 /* CTEXT extended segment:
3142 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3143 We keep these bytes as is for the moment.
3144 They may be decoded by post-read-conversion. */
3145 int dim, M, L;
3146 int size;
3147
3148 ONE_MORE_BYTE (dim);
3149 ONE_MORE_BYTE (M);
3150 ONE_MORE_BYTE (L);
3151 size = ((M - 128) * 128) + (L - 128);
3152 if (charbuf + 8 + size > charbuf_end)
3153 goto break_loop;
3154 *charbuf++ = ISO_CODE_ESC;
3155 *charbuf++ = '%';
3156 *charbuf++ = '/';
3157 *charbuf++ = dim;
3158 *charbuf++ = BYTE8_TO_CHAR (M);
3159 *charbuf++ = BYTE8_TO_CHAR (L);
3160 while (size-- > 0)
3161 {
3162 ONE_MORE_BYTE (c1);
3163 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3164 }
3165 }
3166 else if (c1 == 'G')
3167 {
3168 /* XFree86 extension for embedding UTF-8 in CTEXT:
3169 ESC % G --UTF-8-BYTES-- ESC % @
3170 We keep these bytes as is for the moment.
3171 They may be decoded by post-read-conversion. */
3172 int *p = charbuf;
3173
3174 if (p + 6 > charbuf_end)
3175 goto break_loop;
3176 *p++ = ISO_CODE_ESC;
3177 *p++ = '%';
3178 *p++ = 'G';
3179 while (p < charbuf_end)
3180 {
3181 ONE_MORE_BYTE (c1);
3182 if (c1 == ISO_CODE_ESC
3183 && src + 1 < src_end
3184 && src[0] == '%'
3185 && src[1] == '@')
3186 break;
3187 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3188 }
3189 if (p + 3 > charbuf_end)
3190 goto break_loop;
3191 *p++ = ISO_CODE_ESC;
3192 *p++ = '%';
3193 *p++ = '@';
3194 charbuf = p;
3195 }
3196 else
3197 goto invalid_code;
3198 continue;
3199 break;
3200
3201 default:
3202 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3203 goto invalid_code;
3204 if (c1 >= 0x28 && c1 <= 0x2B)
3205 { /* designation of DIMENSION1_CHARS94 character set */
3206 ONE_MORE_BYTE (c2);
3207 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3208 }
3209 else if (c1 >= 0x2C && c1 <= 0x2F)
3210 { /* designation of DIMENSION1_CHARS96 character set */
3211 ONE_MORE_BYTE (c2);
3212 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3213 }
3214 else
3215 goto invalid_code;
3216 /* We must update these variables now. */
3217 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3218 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3219 continue;
3220 }
3221 }
3222
3223 if (charset->id != charset_ascii
3224 && last_id != charset->id)
3225 {
3226 if (last_id != charset_ascii)
3227 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3228 last_id = charset->id;
3229 last_offset = char_offset;
3230 }
3231
3232 /* Now we know CHARSET and 1st position code C1 of a character.
3233 Produce a decoded character while getting 2nd position code
3234 C2 if necessary. */
3235 c1 &= 0x7F;
3236 if (CHARSET_DIMENSION (charset) > 1)
3237 {
3238 ONE_MORE_BYTE (c2);
3239 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3240 /* C2 is not in a valid range. */
3241 goto invalid_code;
3242 c1 = (c1 << 8) | (c2 & 0x7F);
3243 if (CHARSET_DIMENSION (charset) > 2)
3244 {
3245 ONE_MORE_BYTE (c2);
3246 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3247 /* C2 is not in a valid range. */
3248 goto invalid_code;
3249 c1 = (c1 << 8) | (c2 & 0x7F);
3250 }
3251 }
3252
3253 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3254 if (c < 0)
3255 {
3256 MAYBE_FINISH_COMPOSITION ();
3257 for (; src_base < src; src_base++, char_offset++)
3258 {
3259 if (ASCII_BYTE_P (*src_base))
3260 *charbuf++ = *src_base;
3261 else
3262 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3263 }
3264 }
3265 else if (composition_state == COMPOSING_NO)
3266 {
3267 *charbuf++ = c;
3268 char_offset++;
3269 }
3270 else
3271 {
3272 components[component_idx++] = c;
3273 if (method == COMPOSITION_WITH_RULE
3274 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3275 && composition_state == COMPOSING_COMPONENT_CHAR))
3276 composition_state++;
3277 }
3278 continue;
3279
3280 invalid_code:
3281 MAYBE_FINISH_COMPOSITION ();
3282 src = src_base;
3283 consumed_chars = consumed_chars_base;
3284 ONE_MORE_BYTE (c);
3285 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3286 char_offset++;
3287 coding->errors++;
3288 continue;
3289
3290 break_loop:
3291 break;
3292 }
3293
3294 no_more_source:
3295 if (last_id != charset_ascii)
3296 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3297 coding->consumed_char += consumed_chars_base;
3298 coding->consumed = src_base - coding->source;
3299 coding->charbuf_used = charbuf - coding->charbuf;
3300 }
3301
3302
3303 /* ISO2022 encoding stuff. */
3304
3305 /*
3306 It is not enough to say just "ISO2022" on encoding, we have to
3307 specify more details. In Emacs, each coding system of ISO2022
3308 variant has the following specifications:
3309 1. Initial designation to G0 thru G3.
3310 2. Allows short-form designation?
3311 3. ASCII should be designated to G0 before control characters?
3312 4. ASCII should be designated to G0 at end of line?
3313 5. 7-bit environment or 8-bit environment?
3314 6. Use locking-shift?
3315 7. Use Single-shift?
3316 And the following two are only for Japanese:
3317 8. Use ASCII in place of JIS0201-1976-Roman?
3318 9. Use JISX0208-1983 in place of JISX0208-1978?
3319 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3320 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3321 details.
3322 */
3323
3324 /* Produce codes (escape sequence) for designating CHARSET to graphic
3325 register REG at DST, and increment DST. If <final-char> of CHARSET is
3326 '@', 'A', or 'B' and the coding system CODING allows, produce
3327 designation sequence of short-form. */
3328
3329 #define ENCODE_DESIGNATION(charset, reg, coding) \
3330 do { \
3331 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3332 char *intermediate_char_94 = "()*+"; \
3333 char *intermediate_char_96 = ",-./"; \
3334 int revision = -1; \
3335 int c; \
3336 \
3337 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3338 revision = CHARSET_ISO_REVISION (charset); \
3339 \
3340 if (revision >= 0) \
3341 { \
3342 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3343 EMIT_ONE_BYTE ('@' + revision); \
3344 } \
3345 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3346 if (CHARSET_DIMENSION (charset) == 1) \
3347 { \
3348 if (! CHARSET_ISO_CHARS_96 (charset)) \
3349 c = intermediate_char_94[reg]; \
3350 else \
3351 c = intermediate_char_96[reg]; \
3352 EMIT_ONE_ASCII_BYTE (c); \
3353 } \
3354 else \
3355 { \
3356 EMIT_ONE_ASCII_BYTE ('$'); \
3357 if (! CHARSET_ISO_CHARS_96 (charset)) \
3358 { \
3359 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3360 || reg != 0 \
3361 || final_char < '@' || final_char > 'B') \
3362 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3363 } \
3364 else \
3365 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3366 } \
3367 EMIT_ONE_ASCII_BYTE (final_char); \
3368 \
3369 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3370 } while (0)
3371
3372
3373 /* The following two macros produce codes (control character or escape
3374 sequence) for ISO2022 single-shift functions (single-shift-2 and
3375 single-shift-3). */
3376
3377 #define ENCODE_SINGLE_SHIFT_2 \
3378 do { \
3379 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3380 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3381 else \
3382 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3383 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3384 } while (0)
3385
3386
3387 #define ENCODE_SINGLE_SHIFT_3 \
3388 do { \
3389 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3390 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3391 else \
3392 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3393 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3394 } while (0)
3395
3396
3397 /* The following four macros produce codes (control character or
3398 escape sequence) for ISO2022 locking-shift functions (shift-in,
3399 shift-out, locking-shift-2, and locking-shift-3). */
3400
3401 #define ENCODE_SHIFT_IN \
3402 do { \
3403 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3404 CODING_ISO_INVOCATION (coding, 0) = 0; \
3405 } while (0)
3406
3407
3408 #define ENCODE_SHIFT_OUT \
3409 do { \
3410 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3411 CODING_ISO_INVOCATION (coding, 0) = 1; \
3412 } while (0)
3413
3414
3415 #define ENCODE_LOCKING_SHIFT_2 \
3416 do { \
3417 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3418 CODING_ISO_INVOCATION (coding, 0) = 2; \
3419 } while (0)
3420
3421
3422 #define ENCODE_LOCKING_SHIFT_3 \
3423 do { \
3424 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3425 CODING_ISO_INVOCATION (coding, 0) = 3; \
3426 } while (0)
3427
3428
3429 /* Produce codes for a DIMENSION1 character whose character set is
3430 CHARSET and whose position-code is C1. Designation and invocation
3431 sequences are also produced in advance if necessary. */
3432
3433 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3434 do { \
3435 int id = CHARSET_ID (charset); \
3436 \
3437 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3438 && id == charset_ascii) \
3439 { \
3440 id = charset_jisx0201_roman; \
3441 charset = CHARSET_FROM_ID (id); \
3442 } \
3443 \
3444 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3445 { \
3446 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3447 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3448 else \
3449 EMIT_ONE_BYTE (c1 | 0x80); \
3450 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3451 break; \
3452 } \
3453 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3454 { \
3455 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3456 break; \
3457 } \
3458 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3459 { \
3460 EMIT_ONE_BYTE (c1 | 0x80); \
3461 break; \
3462 } \
3463 else \
3464 /* Since CHARSET is not yet invoked to any graphic planes, we \
3465 must invoke it, or, at first, designate it to some graphic \
3466 register. Then repeat the loop to actually produce the \
3467 character. */ \
3468 dst = encode_invocation_designation (charset, coding, dst, \
3469 &produced_chars); \
3470 } while (1)
3471
3472
3473 /* Produce codes for a DIMENSION2 character whose character set is
3474 CHARSET and whose position-codes are C1 and C2. Designation and
3475 invocation codes are also produced in advance if necessary. */
3476
3477 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3478 do { \
3479 int id = CHARSET_ID (charset); \
3480 \
3481 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3482 && id == charset_jisx0208) \
3483 { \
3484 id = charset_jisx0208_1978; \
3485 charset = CHARSET_FROM_ID (id); \
3486 } \
3487 \
3488 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3489 { \
3490 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3491 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3492 else \
3493 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3494 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3495 break; \
3496 } \
3497 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3498 { \
3499 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3500 break; \
3501 } \
3502 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3503 { \
3504 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3505 break; \
3506 } \
3507 else \
3508 /* Since CHARSET is not yet invoked to any graphic planes, we \
3509 must invoke it, or, at first, designate it to some graphic \
3510 register. Then repeat the loop to actually produce the \
3511 character. */ \
3512 dst = encode_invocation_designation (charset, coding, dst, \
3513 &produced_chars); \
3514 } while (1)
3515
3516
3517 #define ENCODE_ISO_CHARACTER(charset, c) \
3518 do { \
3519 int code = ENCODE_CHAR ((charset),(c)); \
3520 \
3521 if (CHARSET_DIMENSION (charset) == 1) \
3522 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3523 else \
3524 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3525 } while (0)
3526
3527
3528 /* Produce designation and invocation codes at a place pointed by DST
3529 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3530 Return new DST. */
3531
3532 unsigned char *
3533 encode_invocation_designation (charset, coding, dst, p_nchars)
3534 struct charset *charset;
3535 struct coding_system *coding;
3536 unsigned char *dst;
3537 int *p_nchars;
3538 {
3539 int multibytep = coding->dst_multibyte;
3540 int produced_chars = *p_nchars;
3541 int reg; /* graphic register number */
3542 int id = CHARSET_ID (charset);
3543
3544 /* At first, check designations. */
3545 for (reg = 0; reg < 4; reg++)
3546 if (id == CODING_ISO_DESIGNATION (coding, reg))
3547 break;
3548
3549 if (reg >= 4)
3550 {
3551 /* CHARSET is not yet designated to any graphic registers. */
3552 /* At first check the requested designation. */
3553 reg = CODING_ISO_REQUEST (coding, id);
3554 if (reg < 0)
3555 /* Since CHARSET requests no special designation, designate it
3556 to graphic register 0. */
3557 reg = 0;
3558
3559 ENCODE_DESIGNATION (charset, reg, coding);
3560 }
3561
3562 if (CODING_ISO_INVOCATION (coding, 0) != reg
3563 && CODING_ISO_INVOCATION (coding, 1) != reg)
3564 {
3565 /* Since the graphic register REG is not invoked to any graphic
3566 planes, invoke it to graphic plane 0. */
3567 switch (reg)
3568 {
3569 case 0: /* graphic register 0 */
3570 ENCODE_SHIFT_IN;
3571 break;
3572
3573 case 1: /* graphic register 1 */
3574 ENCODE_SHIFT_OUT;
3575 break;
3576
3577 case 2: /* graphic register 2 */
3578 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3579 ENCODE_SINGLE_SHIFT_2;
3580 else
3581 ENCODE_LOCKING_SHIFT_2;
3582 break;
3583
3584 case 3: /* graphic register 3 */
3585 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3586 ENCODE_SINGLE_SHIFT_3;
3587 else
3588 ENCODE_LOCKING_SHIFT_3;
3589 break;
3590 }
3591 }
3592
3593 *p_nchars = produced_chars;
3594 return dst;
3595 }
3596
3597 /* The following three macros produce codes for indicating direction
3598 of text. */
3599 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3600 do { \
3601 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3602 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3603 else \
3604 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3605 } while (0)
3606
3607
3608 #define ENCODE_DIRECTION_R2L() \
3609 do { \
3610 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3611 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3612 } while (0)
3613
3614
3615 #define ENCODE_DIRECTION_L2R() \
3616 do { \
3617 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3618 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3619 } while (0)
3620
3621
3622 /* Produce codes for designation and invocation to reset the graphic
3623 planes and registers to initial state. */
3624 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3625 do { \
3626 int reg; \
3627 struct charset *charset; \
3628 \
3629 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3630 ENCODE_SHIFT_IN; \
3631 for (reg = 0; reg < 4; reg++) \
3632 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3633 && (CODING_ISO_DESIGNATION (coding, reg) \
3634 != CODING_ISO_INITIAL (coding, reg))) \
3635 { \
3636 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3637 ENCODE_DESIGNATION (charset, reg, coding); \
3638 } \
3639 } while (0)
3640
3641
3642 /* Produce designation sequences of charsets in the line started from
3643 SRC to a place pointed by DST, and return updated DST.
3644
3645 If the current block ends before any end-of-line, we may fail to
3646 find all the necessary designations. */
3647
3648 static unsigned char *
3649 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3650 struct coding_system *coding;
3651 int *charbuf, *charbuf_end;
3652 unsigned char *dst;
3653 {
3654 struct charset *charset;
3655 /* Table of charsets to be designated to each graphic register. */
3656 int r[4];
3657 int c, found = 0, reg;
3658 int produced_chars = 0;
3659 int multibytep = coding->dst_multibyte;
3660 Lisp_Object attrs;
3661 Lisp_Object charset_list;
3662
3663 attrs = CODING_ID_ATTRS (coding->id);
3664 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3665 if (EQ (charset_list, Qiso_2022))
3666 charset_list = Viso_2022_charset_list;
3667
3668 for (reg = 0; reg < 4; reg++)
3669 r[reg] = -1;
3670
3671 while (found < 4)
3672 {
3673 int id;
3674
3675 c = *charbuf++;
3676 if (c == '\n')
3677 break;
3678 charset = char_charset (c, charset_list, NULL);
3679 id = CHARSET_ID (charset);
3680 reg = CODING_ISO_REQUEST (coding, id);
3681 if (reg >= 0 && r[reg] < 0)
3682 {
3683 found++;
3684 r[reg] = id;
3685 }
3686 }
3687
3688 if (found)
3689 {
3690 for (reg = 0; reg < 4; reg++)
3691 if (r[reg] >= 0
3692 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3693 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3694 }
3695
3696 return dst;
3697 }
3698
3699 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3700
3701 static int
3702 encode_coding_iso_2022 (coding)
3703 struct coding_system *coding;
3704 {
3705 int multibytep = coding->dst_multibyte;
3706 int *charbuf = coding->charbuf;
3707 int *charbuf_end = charbuf + coding->charbuf_used;
3708 unsigned char *dst = coding->destination + coding->produced;
3709 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3710 int safe_room = 16;
3711 int bol_designation
3712 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3713 && CODING_ISO_BOL (coding));
3714 int produced_chars = 0;
3715 Lisp_Object attrs, eol_type, charset_list;
3716 int ascii_compatible;
3717 int c;
3718 int preferred_charset_id = -1;
3719
3720 CODING_GET_INFO (coding, attrs, charset_list);
3721 eol_type = CODING_ID_EOL_TYPE (coding->id);
3722 if (VECTORP (eol_type))
3723 eol_type = Qunix;
3724
3725 setup_iso_safe_charsets (attrs);
3726 /* Charset list may have been changed. */
3727 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3728 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3729
3730 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3731
3732 while (charbuf < charbuf_end)
3733 {
3734 ASSURE_DESTINATION (safe_room);
3735
3736 if (bol_designation)
3737 {
3738 unsigned char *dst_prev = dst;
3739
3740 /* We have to produce designation sequences if any now. */
3741 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3742 bol_designation = 0;
3743 /* We are sure that designation sequences are all ASCII bytes. */
3744 produced_chars += dst - dst_prev;
3745 }
3746
3747 c = *charbuf++;
3748
3749 if (c < 0)
3750 {
3751 /* Handle an annotation. */
3752 switch (*charbuf)
3753 {
3754 case CODING_ANNOTATE_COMPOSITION_MASK:
3755 /* Not yet implemented. */
3756 break;
3757 case CODING_ANNOTATE_CHARSET_MASK:
3758 preferred_charset_id = charbuf[3];
3759 if (preferred_charset_id >= 0
3760 && NILP (Fmemq (make_number (preferred_charset_id),
3761 charset_list)))
3762 preferred_charset_id = -1;
3763 break;
3764 default:
3765 abort ();
3766 }
3767 charbuf += -c - 1;
3768 continue;
3769 }
3770
3771 /* Now encode the character C. */
3772 if (c < 0x20 || c == 0x7F)
3773 {
3774 if (c == '\n'
3775 || (c == '\r' && EQ (eol_type, Qmac)))
3776 {
3777 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3778 ENCODE_RESET_PLANE_AND_REGISTER ();
3779 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3780 {
3781 int i;
3782
3783 for (i = 0; i < 4; i++)
3784 CODING_ISO_DESIGNATION (coding, i)
3785 = CODING_ISO_INITIAL (coding, i);
3786 }
3787 bol_designation
3788 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3789 }
3790 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3791 ENCODE_RESET_PLANE_AND_REGISTER ();
3792 EMIT_ONE_ASCII_BYTE (c);
3793 }
3794 else if (ASCII_CHAR_P (c))
3795 {
3796 if (ascii_compatible)
3797 EMIT_ONE_ASCII_BYTE (c);
3798 else
3799 {
3800 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3801 ENCODE_ISO_CHARACTER (charset, c);
3802 }
3803 }
3804 else if (CHAR_BYTE8_P (c))
3805 {
3806 c = CHAR_TO_BYTE8 (c);
3807 EMIT_ONE_BYTE (c);
3808 }
3809 else
3810 {
3811 struct charset *charset;
3812
3813 if (preferred_charset_id >= 0)
3814 {
3815 charset = CHARSET_FROM_ID (preferred_charset_id);
3816 if (! CHAR_CHARSET_P (c, charset))
3817 charset = char_charset (c, charset_list, NULL);
3818 }
3819 else
3820 charset = char_charset (c, charset_list, NULL);
3821 if (!charset)
3822 {
3823 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3824 {
3825 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3826 charset = CHARSET_FROM_ID (charset_ascii);
3827 }
3828 else
3829 {
3830 c = coding->default_char;
3831 charset = char_charset (c, charset_list, NULL);
3832 }
3833 }
3834 ENCODE_ISO_CHARACTER (charset, c);
3835 }
3836 }
3837
3838 if (coding->mode & CODING_MODE_LAST_BLOCK
3839 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3840 {
3841 ASSURE_DESTINATION (safe_room);
3842 ENCODE_RESET_PLANE_AND_REGISTER ();
3843 }
3844 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3845 CODING_ISO_BOL (coding) = bol_designation;
3846 coding->produced_char += produced_chars;
3847 coding->produced = dst - coding->destination;
3848 return 0;
3849 }
3850
3851 \f
3852 /*** 8,9. SJIS and BIG5 handlers ***/
3853
3854 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3855 quite widely. So, for the moment, Emacs supports them in the bare
3856 C code. But, in the future, they may be supported only by CCL. */
3857
3858 /* SJIS is a coding system encoding three character sets: ASCII, right
3859 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3860 as is. A character of charset katakana-jisx0201 is encoded by
3861 "position-code + 0x80". A character of charset japanese-jisx0208
3862 is encoded in 2-byte but two position-codes are divided and shifted
3863 so that it fit in the range below.
3864
3865 --- CODE RANGE of SJIS ---
3866 (character set) (range)
3867 ASCII 0x00 .. 0x7F
3868 KATAKANA-JISX0201 0xA0 .. 0xDF
3869 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3870 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3871 -------------------------------
3872
3873 */
3874
3875 /* BIG5 is a coding system encoding two character sets: ASCII and
3876 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3877 character set and is encoded in two-byte.
3878
3879 --- CODE RANGE of BIG5 ---
3880 (character set) (range)
3881 ASCII 0x00 .. 0x7F
3882 Big5 (1st byte) 0xA1 .. 0xFE
3883 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3884 --------------------------
3885
3886 */
3887
3888 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3889 Check if a text is encoded in SJIS. If it is, return
3890 CATEGORY_MASK_SJIS, else return 0. */
3891
3892 static int
3893 detect_coding_sjis (coding, detect_info)
3894 struct coding_system *coding;
3895 struct coding_detection_info *detect_info;
3896 {
3897 const unsigned char *src = coding->source, *src_base;
3898 const unsigned char *src_end = coding->source + coding->src_bytes;
3899 int multibytep = coding->src_multibyte;
3900 int consumed_chars = 0;
3901 int found = 0;
3902 int c;
3903
3904 detect_info->checked |= CATEGORY_MASK_SJIS;
3905 /* A coding system of this category is always ASCII compatible. */
3906 src += coding->head_ascii;
3907
3908 while (1)
3909 {
3910 src_base = src;
3911 ONE_MORE_BYTE (c);
3912 if (c < 0x80)
3913 continue;
3914 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3915 {
3916 ONE_MORE_BYTE (c);
3917 if (c < 0x40 || c == 0x7F || c > 0xFC)
3918 break;
3919 found = CATEGORY_MASK_SJIS;
3920 }
3921 else if (c >= 0xA0 && c < 0xE0)
3922 found = CATEGORY_MASK_SJIS;
3923 else
3924 break;
3925 }
3926 detect_info->rejected |= CATEGORY_MASK_SJIS;
3927 return 0;
3928
3929 no_more_source:
3930 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3931 {
3932 detect_info->rejected |= CATEGORY_MASK_SJIS;
3933 return 0;
3934 }
3935 detect_info->found |= found;
3936 return 1;
3937 }
3938
3939 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3940 Check if a text is encoded in BIG5. If it is, return
3941 CATEGORY_MASK_BIG5, else return 0. */
3942
3943 static int
3944 detect_coding_big5 (coding, detect_info)
3945 struct coding_system *coding;
3946 struct coding_detection_info *detect_info;
3947 {
3948 const unsigned char *src = coding->source, *src_base;
3949 const unsigned char *src_end = coding->source + coding->src_bytes;
3950 int multibytep = coding->src_multibyte;
3951 int consumed_chars = 0;
3952 int found = 0;
3953 int c;
3954
3955 detect_info->checked |= CATEGORY_MASK_BIG5;
3956 /* A coding system of this category is always ASCII compatible. */
3957 src += coding->head_ascii;
3958
3959 while (1)
3960 {
3961 src_base = src;
3962 ONE_MORE_BYTE (c);
3963 if (c < 0x80)
3964 continue;
3965 if (c >= 0xA1)
3966 {
3967 ONE_MORE_BYTE (c);
3968 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3969 return 0;
3970 found = CATEGORY_MASK_BIG5;
3971 }
3972 else
3973 break;
3974 }
3975 detect_info->rejected |= CATEGORY_MASK_BIG5;
3976 return 0;
3977
3978 no_more_source:
3979 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3980 {
3981 detect_info->rejected |= CATEGORY_MASK_BIG5;
3982 return 0;
3983 }
3984 detect_info->found |= found;
3985 return 1;
3986 }
3987
3988 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3989 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3990
3991 static void
3992 decode_coding_sjis (coding)
3993 struct coding_system *coding;
3994 {
3995 const unsigned char *src = coding->source + coding->consumed;
3996 const unsigned char *src_end = coding->source + coding->src_bytes;
3997 const unsigned char *src_base;
3998 int *charbuf = coding->charbuf;
3999 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4000 int consumed_chars = 0, consumed_chars_base;
4001 int multibytep = coding->src_multibyte;
4002 struct charset *charset_roman, *charset_kanji, *charset_kana;
4003 struct charset *charset_kanji2;
4004 Lisp_Object attrs, charset_list, val;
4005 int char_offset = coding->produced_char;
4006 int last_offset = char_offset;
4007 int last_id = charset_ascii;
4008
4009 CODING_GET_INFO (coding, attrs, charset_list);
4010
4011 val = charset_list;
4012 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4013 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4014 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4015 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4016
4017 while (1)
4018 {
4019 int c, c1;
4020 struct charset *charset;
4021
4022 src_base = src;
4023 consumed_chars_base = consumed_chars;
4024
4025 if (charbuf >= charbuf_end)
4026 break;
4027
4028 ONE_MORE_BYTE (c);
4029 if (c < 0)
4030 goto invalid_code;
4031 if (c < 0x80)
4032 charset = charset_roman;
4033 else if (c == 0x80 || c == 0xA0)
4034 goto invalid_code;
4035 else if (c >= 0xA1 && c <= 0xDF)
4036 {
4037 /* SJIS -> JISX0201-Kana */
4038 c &= 0x7F;
4039 charset = charset_kana;
4040 }
4041 else if (c <= 0xEF)
4042 {
4043 /* SJIS -> JISX0208 */
4044 ONE_MORE_BYTE (c1);
4045 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4046 goto invalid_code;
4047 c = (c << 8) | c1;
4048 SJIS_TO_JIS (c);
4049 charset = charset_kanji;
4050 }
4051 else if (c <= 0xFC && charset_kanji2)
4052 {
4053 /* SJIS -> JISX0213-2 */
4054 ONE_MORE_BYTE (c1);
4055 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4056 goto invalid_code;
4057 c = (c << 8) | c1;
4058 SJIS_TO_JIS2 (c);
4059 charset = charset_kanji2;
4060 }
4061 else
4062 goto invalid_code;
4063 if (charset->id != charset_ascii
4064 && last_id != charset->id)
4065 {
4066 if (last_id != charset_ascii)
4067 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4068 last_id = charset->id;
4069 last_offset = char_offset;
4070 }
4071 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4072 *charbuf++ = c;
4073 char_offset++;
4074 continue;
4075
4076 invalid_code:
4077 src = src_base;
4078 consumed_chars = consumed_chars_base;
4079 ONE_MORE_BYTE (c);
4080 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4081 char_offset++;
4082 coding->errors++;
4083 }
4084
4085 no_more_source:
4086 if (last_id != charset_ascii)
4087 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4088 coding->consumed_char += consumed_chars_base;
4089 coding->consumed = src_base - coding->source;
4090 coding->charbuf_used = charbuf - coding->charbuf;
4091 }
4092
4093 static void
4094 decode_coding_big5 (coding)
4095 struct coding_system *coding;
4096 {
4097 const unsigned char *src = coding->source + coding->consumed;
4098 const unsigned char *src_end = coding->source + coding->src_bytes;
4099 const unsigned char *src_base;
4100 int *charbuf = coding->charbuf;
4101 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4102 int consumed_chars = 0, consumed_chars_base;
4103 int multibytep = coding->src_multibyte;
4104 struct charset *charset_roman, *charset_big5;
4105 Lisp_Object attrs, charset_list, val;
4106 int char_offset = coding->produced_char;
4107 int last_offset = char_offset;
4108 int last_id = charset_ascii;
4109
4110 CODING_GET_INFO (coding, attrs, charset_list);
4111 val = charset_list;
4112 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4113 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4114
4115 while (1)
4116 {
4117 int c, c1;
4118 struct charset *charset;
4119
4120 src_base = src;
4121 consumed_chars_base = consumed_chars;
4122
4123 if (charbuf >= charbuf_end)
4124 break;
4125
4126 ONE_MORE_BYTE (c);
4127
4128 if (c < 0)
4129 goto invalid_code;
4130 if (c < 0x80)
4131 charset = charset_roman;
4132 else
4133 {
4134 /* BIG5 -> Big5 */
4135 if (c < 0xA1 || c > 0xFE)
4136 goto invalid_code;
4137 ONE_MORE_BYTE (c1);
4138 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4139 goto invalid_code;
4140 c = c << 8 | c1;
4141 charset = charset_big5;
4142 }
4143 if (charset->id != charset_ascii
4144 && last_id != charset->id)
4145 {
4146 if (last_id != charset_ascii)
4147 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4148 last_id = charset->id;
4149 last_offset = char_offset;
4150 }
4151 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4152 *charbuf++ = c;
4153 char_offset++;
4154 continue;
4155
4156 invalid_code:
4157 src = src_base;
4158 consumed_chars = consumed_chars_base;
4159 ONE_MORE_BYTE (c);
4160 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4161 char_offset++;
4162 coding->errors++;
4163 }
4164
4165 no_more_source:
4166 if (last_id != charset_ascii)
4167 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4168 coding->consumed_char += consumed_chars_base;
4169 coding->consumed = src_base - coding->source;
4170 coding->charbuf_used = charbuf - coding->charbuf;
4171 }
4172
4173 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4174 This function can encode charsets `ascii', `katakana-jisx0201',
4175 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4176 are sure that all these charsets are registered as official charset
4177 (i.e. do not have extended leading-codes). Characters of other
4178 charsets are produced without any encoding. If SJIS_P is 1, encode
4179 SJIS text, else encode BIG5 text. */
4180
4181 static int
4182 encode_coding_sjis (coding)
4183 struct coding_system *coding;
4184 {
4185 int multibytep = coding->dst_multibyte;
4186 int *charbuf = coding->charbuf;
4187 int *charbuf_end = charbuf + coding->charbuf_used;
4188 unsigned char *dst = coding->destination + coding->produced;
4189 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4190 int safe_room = 4;
4191 int produced_chars = 0;
4192 Lisp_Object attrs, charset_list, val;
4193 int ascii_compatible;
4194 struct charset *charset_roman, *charset_kanji, *charset_kana;
4195 struct charset *charset_kanji2;
4196 int c;
4197
4198 CODING_GET_INFO (coding, attrs, charset_list);
4199 val = charset_list;
4200 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4201 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4202 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4203 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4204
4205 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4206
4207 while (charbuf < charbuf_end)
4208 {
4209 ASSURE_DESTINATION (safe_room);
4210 c = *charbuf++;
4211 /* Now encode the character C. */
4212 if (ASCII_CHAR_P (c) && ascii_compatible)
4213 EMIT_ONE_ASCII_BYTE (c);
4214 else if (CHAR_BYTE8_P (c))
4215 {
4216 c = CHAR_TO_BYTE8 (c);
4217 EMIT_ONE_BYTE (c);
4218 }
4219 else
4220 {
4221 unsigned code;
4222 struct charset *charset = char_charset (c, charset_list, &code);
4223
4224 if (!charset)
4225 {
4226 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4227 {
4228 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4229 charset = CHARSET_FROM_ID (charset_ascii);
4230 }
4231 else
4232 {
4233 c = coding->default_char;
4234 charset = char_charset (c, charset_list, &code);
4235 }
4236 }
4237 if (code == CHARSET_INVALID_CODE (charset))
4238 abort ();
4239 if (charset == charset_kanji)
4240 {
4241 int c1, c2;
4242 JIS_TO_SJIS (code);
4243 c1 = code >> 8, c2 = code & 0xFF;
4244 EMIT_TWO_BYTES (c1, c2);
4245 }
4246 else if (charset == charset_kana)
4247 EMIT_ONE_BYTE (code | 0x80);
4248 else if (charset_kanji2 && charset == charset_kanji2)
4249 {
4250 int c1, c2;
4251
4252 c1 = code >> 8;
4253 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4254 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4255 {
4256 JIS_TO_SJIS2 (code);
4257 c1 = code >> 8, c2 = code & 0xFF;
4258 EMIT_TWO_BYTES (c1, c2);
4259 }
4260 else
4261 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4262 }
4263 else
4264 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4265 }
4266 }
4267 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4268 coding->produced_char += produced_chars;
4269 coding->produced = dst - coding->destination;
4270 return 0;
4271 }
4272
4273 static int
4274 encode_coding_big5 (coding)
4275 struct coding_system *coding;
4276 {
4277 int multibytep = coding->dst_multibyte;
4278 int *charbuf = coding->charbuf;
4279 int *charbuf_end = charbuf + coding->charbuf_used;
4280 unsigned char *dst = coding->destination + coding->produced;
4281 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4282 int safe_room = 4;
4283 int produced_chars = 0;
4284 Lisp_Object attrs, charset_list, val;
4285 int ascii_compatible;
4286 struct charset *charset_roman, *charset_big5;
4287 int c;
4288
4289 CODING_GET_INFO (coding, attrs, charset_list);
4290 val = charset_list;
4291 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4292 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4293 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4294
4295 while (charbuf < charbuf_end)
4296 {
4297 ASSURE_DESTINATION (safe_room);
4298 c = *charbuf++;
4299 /* Now encode the character C. */
4300 if (ASCII_CHAR_P (c) && ascii_compatible)
4301 EMIT_ONE_ASCII_BYTE (c);
4302 else if (CHAR_BYTE8_P (c))
4303 {
4304 c = CHAR_TO_BYTE8 (c);
4305 EMIT_ONE_BYTE (c);
4306 }
4307 else
4308 {
4309 unsigned code;
4310 struct charset *charset = char_charset (c, charset_list, &code);
4311
4312 if (! charset)
4313 {
4314 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4315 {
4316 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4317 charset = CHARSET_FROM_ID (charset_ascii);
4318 }
4319 else
4320 {
4321 c = coding->default_char;
4322 charset = char_charset (c, charset_list, &code);
4323 }
4324 }
4325 if (code == CHARSET_INVALID_CODE (charset))
4326 abort ();
4327 if (charset == charset_big5)
4328 {
4329 int c1, c2;
4330
4331 c1 = code >> 8, c2 = code & 0xFF;
4332 EMIT_TWO_BYTES (c1, c2);
4333 }
4334 else
4335 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4336 }
4337 }
4338 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4339 coding->produced_char += produced_chars;
4340 coding->produced = dst - coding->destination;
4341 return 0;
4342 }
4343
4344 \f
4345 /*** 10. CCL handlers ***/
4346
4347 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4348 Check if a text is encoded in a coding system of which
4349 encoder/decoder are written in CCL program. If it is, return
4350 CATEGORY_MASK_CCL, else return 0. */
4351
4352 static int
4353 detect_coding_ccl (coding, detect_info)
4354 struct coding_system *coding;
4355 struct coding_detection_info *detect_info;
4356 {
4357 const unsigned char *src = coding->source, *src_base;
4358 const unsigned char *src_end = coding->source + coding->src_bytes;
4359 int multibytep = coding->src_multibyte;
4360 int consumed_chars = 0;
4361 int found = 0;
4362 unsigned char *valids;
4363 int head_ascii = coding->head_ascii;
4364 Lisp_Object attrs;
4365
4366 detect_info->checked |= CATEGORY_MASK_CCL;
4367
4368 coding = &coding_categories[coding_category_ccl];
4369 valids = CODING_CCL_VALIDS (coding);
4370 attrs = CODING_ID_ATTRS (coding->id);
4371 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4372 src += head_ascii;
4373
4374 while (1)
4375 {
4376 int c;
4377
4378 src_base = src;
4379 ONE_MORE_BYTE (c);
4380 if (c < 0 || ! valids[c])
4381 break;
4382 if ((valids[c] > 1))
4383 found = CATEGORY_MASK_CCL;
4384 }
4385 detect_info->rejected |= CATEGORY_MASK_CCL;
4386 return 0;
4387
4388 no_more_source:
4389 detect_info->found |= found;
4390 return 1;
4391 }
4392
4393 static void
4394 decode_coding_ccl (coding)
4395 struct coding_system *coding;
4396 {
4397 const unsigned char *src = coding->source + coding->consumed;
4398 const unsigned char *src_end = coding->source + coding->src_bytes;
4399 int *charbuf = coding->charbuf;
4400 int *charbuf_end = charbuf + coding->charbuf_size;
4401 int consumed_chars = 0;
4402 int multibytep = coding->src_multibyte;
4403 struct ccl_program ccl;
4404 int source_charbuf[1024];
4405 int source_byteidx[1024];
4406 Lisp_Object attrs, charset_list;
4407
4408 CODING_GET_INFO (coding, attrs, charset_list);
4409 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4410
4411 while (src < src_end)
4412 {
4413 const unsigned char *p = src;
4414 int *source, *source_end;
4415 int i = 0;
4416
4417 if (multibytep)
4418 while (i < 1024 && p < src_end)
4419 {
4420 source_byteidx[i] = p - src;
4421 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4422 }
4423 else
4424 while (i < 1024 && p < src_end)
4425 source_charbuf[i++] = *p++;
4426
4427 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4428 ccl.last_block = 1;
4429
4430 source = source_charbuf;
4431 source_end = source + i;
4432 while (source < source_end)
4433 {
4434 ccl_driver (&ccl, source, charbuf,
4435 source_end - source, charbuf_end - charbuf,
4436 charset_list);
4437 source += ccl.consumed;
4438 charbuf += ccl.produced;
4439 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4440 break;
4441 }
4442 if (source < source_end)
4443 src += source_byteidx[source - source_charbuf];
4444 else
4445 src = p;
4446 consumed_chars += source - source_charbuf;
4447
4448 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4449 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4450 break;
4451 }
4452
4453 switch (ccl.status)
4454 {
4455 case CCL_STAT_SUSPEND_BY_SRC:
4456 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4457 break;
4458 case CCL_STAT_SUSPEND_BY_DST:
4459 break;
4460 case CCL_STAT_QUIT:
4461 case CCL_STAT_INVALID_CMD:
4462 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4463 break;
4464 default:
4465 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4466 break;
4467 }
4468 coding->consumed_char += consumed_chars;
4469 coding->consumed = src - coding->source;
4470 coding->charbuf_used = charbuf - coding->charbuf;
4471 }
4472
4473 static int
4474 encode_coding_ccl (coding)
4475 struct coding_system *coding;
4476 {
4477 struct ccl_program ccl;
4478 int multibytep = coding->dst_multibyte;
4479 int *charbuf = coding->charbuf;
4480 int *charbuf_end = charbuf + coding->charbuf_used;
4481 unsigned char *dst = coding->destination + coding->produced;
4482 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4483 unsigned char *adjusted_dst_end = dst_end - 1;
4484 int destination_charbuf[1024];
4485 int i, produced_chars = 0;
4486 Lisp_Object attrs, charset_list;
4487
4488 CODING_GET_INFO (coding, attrs, charset_list);
4489 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4490
4491 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4492 ccl.dst_multibyte = coding->dst_multibyte;
4493
4494 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4495 {
4496 int dst_bytes = dst_end - dst;
4497 if (dst_bytes > 1024)
4498 dst_bytes = 1024;
4499
4500 ccl_driver (&ccl, charbuf, destination_charbuf,
4501 charbuf_end - charbuf, dst_bytes, charset_list);
4502 charbuf += ccl.consumed;
4503 if (multibytep)
4504 for (i = 0; i < ccl.produced; i++)
4505 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4506 else
4507 {
4508 for (i = 0; i < ccl.produced; i++)
4509 *dst++ = destination_charbuf[i] & 0xFF;
4510 produced_chars += ccl.produced;
4511 }
4512 }
4513
4514 switch (ccl.status)
4515 {
4516 case CCL_STAT_SUSPEND_BY_SRC:
4517 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4518 break;
4519 case CCL_STAT_SUSPEND_BY_DST:
4520 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4521 break;
4522 case CCL_STAT_QUIT:
4523 case CCL_STAT_INVALID_CMD:
4524 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4525 break;
4526 default:
4527 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4528 break;
4529 }
4530
4531 coding->produced_char += produced_chars;
4532 coding->produced = dst - coding->destination;
4533 return 0;
4534 }
4535
4536
4537 \f
4538 /*** 10, 11. no-conversion handlers ***/
4539
4540 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4541
4542 static void
4543 decode_coding_raw_text (coding)
4544 struct coding_system *coding;
4545 {
4546 coding->chars_at_source = 1;
4547 coding->consumed_char = 0;
4548 coding->consumed = 0;
4549 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4550 }
4551
4552 static int
4553 encode_coding_raw_text (coding)
4554 struct coding_system *coding;
4555 {
4556 int multibytep = coding->dst_multibyte;
4557 int *charbuf = coding->charbuf;
4558 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4559 unsigned char *dst = coding->destination + coding->produced;
4560 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4561 int produced_chars = 0;
4562 int c;
4563
4564 if (multibytep)
4565 {
4566 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4567
4568 if (coding->src_multibyte)
4569 while (charbuf < charbuf_end)
4570 {
4571 ASSURE_DESTINATION (safe_room);
4572 c = *charbuf++;
4573 if (ASCII_CHAR_P (c))
4574 EMIT_ONE_ASCII_BYTE (c);
4575 else if (CHAR_BYTE8_P (c))
4576 {
4577 c = CHAR_TO_BYTE8 (c);
4578 EMIT_ONE_BYTE (c);
4579 }
4580 else
4581 {
4582 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4583
4584 CHAR_STRING_ADVANCE (c, p1);
4585 while (p0 < p1)
4586 {
4587 EMIT_ONE_BYTE (*p0);
4588 p0++;
4589 }
4590 }
4591 }
4592 else
4593 while (charbuf < charbuf_end)
4594 {
4595 ASSURE_DESTINATION (safe_room);
4596 c = *charbuf++;
4597 EMIT_ONE_BYTE (c);
4598 }
4599 }
4600 else
4601 {
4602 if (coding->src_multibyte)
4603 {
4604 int safe_room = MAX_MULTIBYTE_LENGTH;
4605
4606 while (charbuf < charbuf_end)
4607 {
4608 ASSURE_DESTINATION (safe_room);
4609 c = *charbuf++;
4610 if (ASCII_CHAR_P (c))
4611 *dst++ = c;
4612 else if (CHAR_BYTE8_P (c))
4613 *dst++ = CHAR_TO_BYTE8 (c);
4614 else
4615 CHAR_STRING_ADVANCE (c, dst);
4616 produced_chars++;
4617 }
4618 }
4619 else
4620 {
4621 ASSURE_DESTINATION (charbuf_end - charbuf);
4622 while (charbuf < charbuf_end && dst < dst_end)
4623 *dst++ = *charbuf++;
4624 produced_chars = dst - (coding->destination + coding->dst_bytes);
4625 }
4626 }
4627 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4628 coding->produced_char += produced_chars;
4629 coding->produced = dst - coding->destination;
4630 return 0;
4631 }
4632
4633 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4634 Check if a text is encoded in a charset-based coding system. If it
4635 is, return 1, else return 0. */
4636
4637 static int
4638 detect_coding_charset (coding, detect_info)
4639 struct coding_system *coding;
4640 struct coding_detection_info *detect_info;
4641 {
4642 const unsigned char *src = coding->source, *src_base;
4643 const unsigned char *src_end = coding->source + coding->src_bytes;
4644 int multibytep = coding->src_multibyte;
4645 int consumed_chars = 0;
4646 Lisp_Object attrs, valids;
4647 int found = 0;
4648
4649 detect_info->checked |= CATEGORY_MASK_CHARSET;
4650
4651 coding = &coding_categories[coding_category_charset];
4652 attrs = CODING_ID_ATTRS (coding->id);
4653 valids = AREF (attrs, coding_attr_charset_valids);
4654
4655 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4656 src += coding->head_ascii;
4657
4658 while (1)
4659 {
4660 int c;
4661
4662 src_base = src;
4663 ONE_MORE_BYTE (c);
4664 if (c < 0)
4665 continue;
4666 if (NILP (AREF (valids, c)))
4667 break;
4668 if (c >= 0x80)
4669 found = CATEGORY_MASK_CHARSET;
4670 }
4671 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4672 return 0;
4673
4674 no_more_source:
4675 detect_info->found |= found;
4676 return 1;
4677 }
4678
4679 static void
4680 decode_coding_charset (coding)
4681 struct coding_system *coding;
4682 {
4683 const unsigned char *src = coding->source + coding->consumed;
4684 const unsigned char *src_end = coding->source + coding->src_bytes;
4685 const unsigned char *src_base;
4686 int *charbuf = coding->charbuf;
4687 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4688 int consumed_chars = 0, consumed_chars_base;
4689 int multibytep = coding->src_multibyte;
4690 Lisp_Object attrs, charset_list, valids;
4691 int char_offset = coding->produced_char;
4692 int last_offset = char_offset;
4693 int last_id = charset_ascii;
4694
4695 CODING_GET_INFO (coding, attrs, charset_list);
4696 valids = AREF (attrs, coding_attr_charset_valids);
4697
4698 while (1)
4699 {
4700 int c;
4701 Lisp_Object val;
4702 struct charset *charset;
4703 int dim;
4704 int len = 1;
4705 unsigned code;
4706
4707 src_base = src;
4708 consumed_chars_base = consumed_chars;
4709
4710 if (charbuf >= charbuf_end)
4711 break;
4712
4713 ONE_MORE_BYTE (c);
4714 if (c < 0)
4715 goto invalid_code;
4716 code = c;
4717
4718 val = AREF (valids, c);
4719 if (NILP (val))
4720 goto invalid_code;
4721 if (INTEGERP (val))
4722 {
4723 charset = CHARSET_FROM_ID (XFASTINT (val));
4724 dim = CHARSET_DIMENSION (charset);
4725 while (len < dim)
4726 {
4727 ONE_MORE_BYTE (c);
4728 code = (code << 8) | c;
4729 len++;
4730 }
4731 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4732 charset, code, c);
4733 }
4734 else
4735 {
4736 /* VAL is a list of charset IDs. It is assured that the
4737 list is sorted by charset dimensions (smaller one
4738 comes first). */
4739 while (CONSP (val))
4740 {
4741 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4742 dim = CHARSET_DIMENSION (charset);
4743 while (len < dim)
4744 {
4745 ONE_MORE_BYTE (c);
4746 code = (code << 8) | c;
4747 len++;
4748 }
4749 CODING_DECODE_CHAR (coding, src, src_base,
4750 src_end, charset, code, c);
4751 if (c >= 0)
4752 break;
4753 val = XCDR (val);
4754 }
4755 }
4756 if (c < 0)
4757 goto invalid_code;
4758 if (charset->id != charset_ascii
4759 && last_id != charset->id)
4760 {
4761 if (last_id != charset_ascii)
4762 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4763 last_id = charset->id;
4764 last_offset = char_offset;
4765 }
4766
4767 *charbuf++ = c;
4768 char_offset++;
4769 continue;
4770
4771 invalid_code:
4772 src = src_base;
4773 consumed_chars = consumed_chars_base;
4774 ONE_MORE_BYTE (c);
4775 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4776 char_offset++;
4777 coding->errors++;
4778 }
4779
4780 no_more_source:
4781 if (last_id != charset_ascii)
4782 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4783 coding->consumed_char += consumed_chars_base;
4784 coding->consumed = src_base - coding->source;
4785 coding->charbuf_used = charbuf - coding->charbuf;
4786 }
4787
4788 static int
4789 encode_coding_charset (coding)
4790 struct coding_system *coding;
4791 {
4792 int multibytep = coding->dst_multibyte;
4793 int *charbuf = coding->charbuf;
4794 int *charbuf_end = charbuf + coding->charbuf_used;
4795 unsigned char *dst = coding->destination + coding->produced;
4796 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4797 int safe_room = MAX_MULTIBYTE_LENGTH;
4798 int produced_chars = 0;
4799 Lisp_Object attrs, charset_list;
4800 int ascii_compatible;
4801 int c;
4802
4803 CODING_GET_INFO (coding, attrs, charset_list);
4804 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4805
4806 while (charbuf < charbuf_end)
4807 {
4808 struct charset *charset;
4809 unsigned code;
4810
4811 ASSURE_DESTINATION (safe_room);
4812 c = *charbuf++;
4813 if (ascii_compatible && ASCII_CHAR_P (c))
4814 EMIT_ONE_ASCII_BYTE (c);
4815 else if (CHAR_BYTE8_P (c))
4816 {
4817 c = CHAR_TO_BYTE8 (c);
4818 EMIT_ONE_BYTE (c);
4819 }
4820 else
4821 {
4822 charset = char_charset (c, charset_list, &code);
4823 if (charset)
4824 {
4825 if (CHARSET_DIMENSION (charset) == 1)
4826 EMIT_ONE_BYTE (code);
4827 else if (CHARSET_DIMENSION (charset) == 2)
4828 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4829 else if (CHARSET_DIMENSION (charset) == 3)
4830 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4831 else
4832 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4833 (code >> 8) & 0xFF, code & 0xFF);
4834 }
4835 else
4836 {
4837 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4838 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4839 else
4840 c = coding->default_char;
4841 EMIT_ONE_BYTE (c);
4842 }
4843 }
4844 }
4845
4846 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4847 coding->produced_char += produced_chars;
4848 coding->produced = dst - coding->destination;
4849 return 0;
4850 }
4851
4852 \f
4853 /*** 7. C library functions ***/
4854
4855 /* Setup coding context CODING from information about CODING_SYSTEM.
4856 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4857 CODING_SYSTEM is invalid, signal an error. */
4858
4859 void
4860 setup_coding_system (coding_system, coding)
4861 Lisp_Object coding_system;
4862 struct coding_system *coding;
4863 {
4864 Lisp_Object attrs;
4865 Lisp_Object eol_type;
4866 Lisp_Object coding_type;
4867 Lisp_Object val;
4868
4869 if (NILP (coding_system))
4870 coding_system = Qno_conversion;
4871
4872 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4873
4874 attrs = CODING_ID_ATTRS (coding->id);
4875 eol_type = CODING_ID_EOL_TYPE (coding->id);
4876
4877 coding->mode = 0;
4878 coding->head_ascii = -1;
4879 coding->common_flags
4880 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4881 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4882 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4883 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4884 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4885 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4886 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4887
4888 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4889 coding->max_charset_id = SCHARS (val) - 1;
4890 coding->safe_charsets = (char *) SDATA (val);
4891 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4892
4893 coding_type = CODING_ATTR_TYPE (attrs);
4894 if (EQ (coding_type, Qundecided))
4895 {
4896 coding->detector = NULL;
4897 coding->decoder = decode_coding_raw_text;
4898 coding->encoder = encode_coding_raw_text;
4899 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4900 }
4901 else if (EQ (coding_type, Qiso_2022))
4902 {
4903 int i;
4904 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4905
4906 /* Invoke graphic register 0 to plane 0. */
4907 CODING_ISO_INVOCATION (coding, 0) = 0;
4908 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4909 CODING_ISO_INVOCATION (coding, 1)
4910 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4911 /* Setup the initial status of designation. */
4912 for (i = 0; i < 4; i++)
4913 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4914 /* Not single shifting initially. */
4915 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4916 /* Beginning of buffer should also be regarded as bol. */
4917 CODING_ISO_BOL (coding) = 1;
4918 coding->detector = detect_coding_iso_2022;
4919 coding->decoder = decode_coding_iso_2022;
4920 coding->encoder = encode_coding_iso_2022;
4921 if (flags & CODING_ISO_FLAG_SAFE)
4922 coding->mode |= CODING_MODE_SAFE_ENCODING;
4923 coding->common_flags
4924 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4925 | CODING_REQUIRE_FLUSHING_MASK);
4926 if (flags & CODING_ISO_FLAG_COMPOSITION)
4927 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4928 if (flags & CODING_ISO_FLAG_DESIGNATION)
4929 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4930 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4931 {
4932 setup_iso_safe_charsets (attrs);
4933 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4934 coding->max_charset_id = SCHARS (val) - 1;
4935 coding->safe_charsets = (char *) SDATA (val);
4936 }
4937 CODING_ISO_FLAGS (coding) = flags;
4938 }
4939 else if (EQ (coding_type, Qcharset))
4940 {
4941 coding->detector = detect_coding_charset;
4942 coding->decoder = decode_coding_charset;
4943 coding->encoder = encode_coding_charset;
4944 coding->common_flags
4945 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4946 }
4947 else if (EQ (coding_type, Qutf_8))
4948 {
4949 coding->detector = detect_coding_utf_8;
4950 coding->decoder = decode_coding_utf_8;
4951 coding->encoder = encode_coding_utf_8;
4952 coding->common_flags
4953 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4954 }
4955 else if (EQ (coding_type, Qutf_16))
4956 {
4957 val = AREF (attrs, coding_attr_utf_16_bom);
4958 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4959 : EQ (val, Qt) ? utf_16_with_bom
4960 : utf_16_without_bom);
4961 val = AREF (attrs, coding_attr_utf_16_endian);
4962 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4963 : utf_16_little_endian);
4964 CODING_UTF_16_SURROGATE (coding) = 0;
4965 coding->detector = detect_coding_utf_16;
4966 coding->decoder = decode_coding_utf_16;
4967 coding->encoder = encode_coding_utf_16;
4968 coding->common_flags
4969 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4970 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4971 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4972 }
4973 else if (EQ (coding_type, Qccl))
4974 {
4975 coding->detector = detect_coding_ccl;
4976 coding->decoder = decode_coding_ccl;
4977 coding->encoder = encode_coding_ccl;
4978 coding->common_flags
4979 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4980 | CODING_REQUIRE_FLUSHING_MASK);
4981 }
4982 else if (EQ (coding_type, Qemacs_mule))
4983 {
4984 coding->detector = detect_coding_emacs_mule;
4985 coding->decoder = decode_coding_emacs_mule;
4986 coding->encoder = encode_coding_emacs_mule;
4987 coding->common_flags
4988 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4989 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4990 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4991 {
4992 Lisp_Object tail, safe_charsets;
4993 int max_charset_id = 0;
4994
4995 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4996 tail = XCDR (tail))
4997 if (max_charset_id < XFASTINT (XCAR (tail)))
4998 max_charset_id = XFASTINT (XCAR (tail));
4999 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5000 make_number (255));
5001 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5002 tail = XCDR (tail))
5003 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5004 coding->max_charset_id = max_charset_id;
5005 coding->safe_charsets = (char *) SDATA (safe_charsets);
5006 }
5007 }
5008 else if (EQ (coding_type, Qshift_jis))
5009 {
5010 coding->detector = detect_coding_sjis;
5011 coding->decoder = decode_coding_sjis;
5012 coding->encoder = encode_coding_sjis;
5013 coding->common_flags
5014 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5015 }
5016 else if (EQ (coding_type, Qbig5))
5017 {
5018 coding->detector = detect_coding_big5;
5019 coding->decoder = decode_coding_big5;
5020 coding->encoder = encode_coding_big5;
5021 coding->common_flags
5022 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5023 }
5024 else /* EQ (coding_type, Qraw_text) */
5025 {
5026 coding->detector = NULL;
5027 coding->decoder = decode_coding_raw_text;
5028 coding->encoder = encode_coding_raw_text;
5029 }
5030
5031 return;
5032 }
5033
5034 /* Return raw-text or one of its subsidiaries that has the same
5035 eol_type as CODING-SYSTEM. */
5036
5037 Lisp_Object
5038 raw_text_coding_system (coding_system)
5039 Lisp_Object coding_system;
5040 {
5041 Lisp_Object spec, attrs;
5042 Lisp_Object eol_type, raw_text_eol_type;
5043
5044 if (NILP (coding_system))
5045 return Qraw_text;
5046 spec = CODING_SYSTEM_SPEC (coding_system);
5047 attrs = AREF (spec, 0);
5048
5049 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5050 return coding_system;
5051
5052 eol_type = AREF (spec, 2);
5053 if (VECTORP (eol_type))
5054 return Qraw_text;
5055 spec = CODING_SYSTEM_SPEC (Qraw_text);
5056 raw_text_eol_type = AREF (spec, 2);
5057 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5058 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5059 : AREF (raw_text_eol_type, 2));
5060 }
5061
5062
5063 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5064 does, return one of the subsidiary that has the same eol-spec as
5065 PARENT. Otherwise, return CODING_SYSTEM. */
5066
5067 Lisp_Object
5068 coding_inherit_eol_type (coding_system, parent)
5069 Lisp_Object coding_system, parent;
5070 {
5071 Lisp_Object spec, eol_type;
5072
5073 if (NILP (coding_system))
5074 coding_system = Qraw_text;
5075 spec = CODING_SYSTEM_SPEC (coding_system);
5076 eol_type = AREF (spec, 2);
5077 if (VECTORP (eol_type)
5078 && ! NILP (parent))
5079 {
5080 Lisp_Object parent_spec;
5081 Lisp_Object parent_eol_type;
5082
5083 parent_spec
5084 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5085 parent_eol_type = AREF (parent_spec, 2);
5086 if (EQ (parent_eol_type, Qunix))
5087 coding_system = AREF (eol_type, 0);
5088 else if (EQ (parent_eol_type, Qdos))
5089 coding_system = AREF (eol_type, 1);
5090 else if (EQ (parent_eol_type, Qmac))
5091 coding_system = AREF (eol_type, 2);
5092 }
5093 return coding_system;
5094 }
5095
5096 /* Emacs has a mechanism to automatically detect a coding system if it
5097 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5098 it's impossible to distinguish some coding systems accurately
5099 because they use the same range of codes. So, at first, coding
5100 systems are categorized into 7, those are:
5101
5102 o coding-category-emacs-mule
5103
5104 The category for a coding system which has the same code range
5105 as Emacs' internal format. Assigned the coding-system (Lisp
5106 symbol) `emacs-mule' by default.
5107
5108 o coding-category-sjis
5109
5110 The category for a coding system which has the same code range
5111 as SJIS. Assigned the coding-system (Lisp
5112 symbol) `japanese-shift-jis' by default.
5113
5114 o coding-category-iso-7
5115
5116 The category for a coding system which has the same code range
5117 as ISO2022 of 7-bit environment. This doesn't use any locking
5118 shift and single shift functions. This can encode/decode all
5119 charsets. Assigned the coding-system (Lisp symbol)
5120 `iso-2022-7bit' by default.
5121
5122 o coding-category-iso-7-tight
5123
5124 Same as coding-category-iso-7 except that this can
5125 encode/decode only the specified charsets.
5126
5127 o coding-category-iso-8-1
5128
5129 The category for a coding system which has the same code range
5130 as ISO2022 of 8-bit environment and graphic plane 1 used only
5131 for DIMENSION1 charset. This doesn't use any locking shift
5132 and single shift functions. Assigned the coding-system (Lisp
5133 symbol) `iso-latin-1' by default.
5134
5135 o coding-category-iso-8-2
5136
5137 The category for a coding system which has the same code range
5138 as ISO2022 of 8-bit environment and graphic plane 1 used only
5139 for DIMENSION2 charset. This doesn't use any locking shift
5140 and single shift functions. Assigned the coding-system (Lisp
5141 symbol) `japanese-iso-8bit' by default.
5142
5143 o coding-category-iso-7-else
5144
5145 The category for a coding system which has the same code range
5146 as ISO2022 of 7-bit environemnt but uses locking shift or
5147 single shift functions. Assigned the coding-system (Lisp
5148 symbol) `iso-2022-7bit-lock' by default.
5149
5150 o coding-category-iso-8-else
5151
5152 The category for a coding system which has the same code range
5153 as ISO2022 of 8-bit environemnt but uses locking shift or
5154 single shift functions. Assigned the coding-system (Lisp
5155 symbol) `iso-2022-8bit-ss2' by default.
5156
5157 o coding-category-big5
5158
5159 The category for a coding system which has the same code range
5160 as BIG5. Assigned the coding-system (Lisp symbol)
5161 `cn-big5' by default.
5162
5163 o coding-category-utf-8
5164
5165 The category for a coding system which has the same code range
5166 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5167 symbol) `utf-8' by default.
5168
5169 o coding-category-utf-16-be
5170
5171 The category for a coding system in which a text has an
5172 Unicode signature (cf. Unicode Standard) in the order of BIG
5173 endian at the head. Assigned the coding-system (Lisp symbol)
5174 `utf-16-be' by default.
5175
5176 o coding-category-utf-16-le
5177
5178 The category for a coding system in which a text has an
5179 Unicode signature (cf. Unicode Standard) in the order of
5180 LITTLE endian at the head. Assigned the coding-system (Lisp
5181 symbol) `utf-16-le' by default.
5182
5183 o coding-category-ccl
5184
5185 The category for a coding system of which encoder/decoder is
5186 written in CCL programs. The default value is nil, i.e., no
5187 coding system is assigned.
5188
5189 o coding-category-binary
5190
5191 The category for a coding system not categorized in any of the
5192 above. Assigned the coding-system (Lisp symbol)
5193 `no-conversion' by default.
5194
5195 Each of them is a Lisp symbol and the value is an actual
5196 `coding-system's (this is also a Lisp symbol) assigned by a user.
5197 What Emacs does actually is to detect a category of coding system.
5198 Then, it uses a `coding-system' assigned to it. If Emacs can't
5199 decide only one possible category, it selects a category of the
5200 highest priority. Priorities of categories are also specified by a
5201 user in a Lisp variable `coding-category-list'.
5202
5203 */
5204
5205 #define EOL_SEEN_NONE 0
5206 #define EOL_SEEN_LF 1
5207 #define EOL_SEEN_CR 2
5208 #define EOL_SEEN_CRLF 4
5209
5210 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5211 SOURCE is encoded. If CATEGORY is one of
5212 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5213 two-byte, else they are encoded by one-byte.
5214
5215 Return one of EOL_SEEN_XXX. */
5216
5217 #define MAX_EOL_CHECK_COUNT 3
5218
5219 static int
5220 detect_eol (source, src_bytes, category)
5221 unsigned char *source;
5222 EMACS_INT src_bytes;
5223 enum coding_category category;
5224 {
5225 unsigned char *src = source, *src_end = src + src_bytes;
5226 unsigned char c;
5227 int total = 0;
5228 int eol_seen = EOL_SEEN_NONE;
5229
5230 if ((1 << category) & CATEGORY_MASK_UTF_16)
5231 {
5232 int msb, lsb;
5233
5234 msb = category == (coding_category_utf_16_le
5235 | coding_category_utf_16_le_nosig);
5236 lsb = 1 - msb;
5237
5238 while (src + 1 < src_end)
5239 {
5240 c = src[lsb];
5241 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5242 {
5243 int this_eol;
5244
5245 if (c == '\n')
5246 this_eol = EOL_SEEN_LF;
5247 else if (src + 3 >= src_end
5248 || src[msb + 2] != 0
5249 || src[lsb + 2] != '\n')
5250 this_eol = EOL_SEEN_CR;
5251 else
5252 this_eol = EOL_SEEN_CRLF;
5253
5254 if (eol_seen == EOL_SEEN_NONE)
5255 /* This is the first end-of-line. */
5256 eol_seen = this_eol;
5257 else if (eol_seen != this_eol)
5258 {
5259 /* The found type is different from what found before. */
5260 eol_seen = EOL_SEEN_LF;
5261 break;
5262 }
5263 if (++total == MAX_EOL_CHECK_COUNT)
5264 break;
5265 }
5266 src += 2;
5267 }
5268 }
5269 else
5270 {
5271 while (src < src_end)
5272 {
5273 c = *src++;
5274 if (c == '\n' || c == '\r')
5275 {
5276 int this_eol;
5277
5278 if (c == '\n')
5279 this_eol = EOL_SEEN_LF;
5280 else if (src >= src_end || *src != '\n')
5281 this_eol = EOL_SEEN_CR;
5282 else
5283 this_eol = EOL_SEEN_CRLF, src++;
5284
5285 if (eol_seen == EOL_SEEN_NONE)
5286 /* This is the first end-of-line. */
5287 eol_seen = this_eol;
5288 else if (eol_seen != this_eol)
5289 {
5290 /* The found type is different from what found before. */
5291 eol_seen = EOL_SEEN_LF;
5292 break;
5293 }
5294 if (++total == MAX_EOL_CHECK_COUNT)
5295 break;
5296 }
5297 }
5298 }
5299 return eol_seen;
5300 }
5301
5302
5303 static Lisp_Object
5304 adjust_coding_eol_type (coding, eol_seen)
5305 struct coding_system *coding;
5306 int eol_seen;
5307 {
5308 Lisp_Object eol_type;
5309
5310 eol_type = CODING_ID_EOL_TYPE (coding->id);
5311 if (eol_seen & EOL_SEEN_LF)
5312 {
5313 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5314 eol_type = Qunix;
5315 }
5316 else if (eol_seen & EOL_SEEN_CRLF)
5317 {
5318 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5319 eol_type = Qdos;
5320 }
5321 else if (eol_seen & EOL_SEEN_CR)
5322 {
5323 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5324 eol_type = Qmac;
5325 }
5326 return eol_type;
5327 }
5328
5329 /* Detect how a text specified in CODING is encoded. If a coding
5330 system is detected, update fields of CODING by the detected coding
5331 system. */
5332
5333 void
5334 detect_coding (coding)
5335 struct coding_system *coding;
5336 {
5337 const unsigned char *src, *src_end;
5338 Lisp_Object attrs, coding_type;
5339
5340 coding->consumed = coding->consumed_char = 0;
5341 coding->produced = coding->produced_char = 0;
5342 coding_set_source (coding);
5343
5344 src_end = coding->source + coding->src_bytes;
5345
5346 /* If we have not yet decided the text encoding type, detect it
5347 now. */
5348 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5349 {
5350 int c, i;
5351
5352 for (i = 0, src = coding->source; src < src_end; i++, src++)
5353 {
5354 c = *src;
5355 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5356 || c == ISO_CODE_SI
5357 || c == ISO_CODE_SO)))
5358 break;
5359 }
5360 coding->head_ascii = src - (coding->source + coding->consumed);
5361
5362 if (coding->head_ascii < coding->src_bytes)
5363 {
5364 struct coding_detection_info detect_info;
5365 enum coding_category category;
5366 struct coding_system *this;
5367
5368 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5369 for (i = 0; i < coding_category_raw_text; i++)
5370 {
5371 category = coding_priorities[i];
5372 this = coding_categories + category;
5373 if (this->id < 0)
5374 {
5375 /* No coding system of this category is defined. */
5376 detect_info.rejected |= (1 << category);
5377 }
5378 else if (category >= coding_category_raw_text)
5379 continue;
5380 else if (detect_info.checked & (1 << category))
5381 {
5382 if (detect_info.found & (1 << category))
5383 break;
5384 }
5385 else if ((*(this->detector)) (coding, &detect_info)
5386 && detect_info.found & (1 << category))
5387 {
5388 if (category == coding_category_utf_16_auto)
5389 {
5390 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5391 category = coding_category_utf_16_le;
5392 else
5393 category = coding_category_utf_16_be;
5394 }
5395 break;
5396 }
5397 }
5398 if (i < coding_category_raw_text)
5399 setup_coding_system (CODING_ID_NAME (this->id), coding);
5400 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5401 setup_coding_system (Qraw_text, coding);
5402 else if (detect_info.rejected)
5403 for (i = 0; i < coding_category_raw_text; i++)
5404 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5405 {
5406 this = coding_categories + coding_priorities[i];
5407 setup_coding_system (CODING_ID_NAME (this->id), coding);
5408 break;
5409 }
5410 }
5411 }
5412 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5413 == coding_category_utf_16_auto)
5414 {
5415 Lisp_Object coding_systems;
5416 struct coding_detection_info detect_info;
5417
5418 coding_systems
5419 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5420 detect_info.found = detect_info.rejected = 0;
5421 if (CONSP (coding_systems)
5422 && detect_coding_utf_16 (coding, &detect_info))
5423 {
5424 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5425 setup_coding_system (XCAR (coding_systems), coding);
5426 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5427 setup_coding_system (XCDR (coding_systems), coding);
5428 }
5429 }
5430 }
5431
5432
5433 static void
5434 decode_eol (coding)
5435 struct coding_system *coding;
5436 {
5437 Lisp_Object eol_type;
5438 unsigned char *p, *pbeg, *pend;
5439
5440 eol_type = CODING_ID_EOL_TYPE (coding->id);
5441 if (EQ (eol_type, Qunix))
5442 return;
5443
5444 if (NILP (coding->dst_object))
5445 pbeg = coding->destination;
5446 else
5447 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5448 pend = pbeg + coding->produced;
5449
5450 if (VECTORP (eol_type))
5451 {
5452 int eol_seen = EOL_SEEN_NONE;
5453
5454 for (p = pbeg; p < pend; p++)
5455 {
5456 if (*p == '\n')
5457 eol_seen |= EOL_SEEN_LF;
5458 else if (*p == '\r')
5459 {
5460 if (p + 1 < pend && *(p + 1) == '\n')
5461 {
5462 eol_seen |= EOL_SEEN_CRLF;
5463 p++;
5464 }
5465 else
5466 eol_seen |= EOL_SEEN_CR;
5467 }
5468 }
5469 if (eol_seen != EOL_SEEN_NONE
5470 && eol_seen != EOL_SEEN_LF
5471 && eol_seen != EOL_SEEN_CRLF
5472 && eol_seen != EOL_SEEN_CR)
5473 eol_seen = EOL_SEEN_LF;
5474 if (eol_seen != EOL_SEEN_NONE)
5475 eol_type = adjust_coding_eol_type (coding, eol_seen);
5476 }
5477
5478 if (EQ (eol_type, Qmac))
5479 {
5480 for (p = pbeg; p < pend; p++)
5481 if (*p == '\r')
5482 *p = '\n';
5483 }
5484 else if (EQ (eol_type, Qdos))
5485 {
5486 int n = 0;
5487
5488 if (NILP (coding->dst_object))
5489 {
5490 for (p = pend - 2; p >= pbeg; p--)
5491 if (*p == '\r')
5492 {
5493 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5494 n++;
5495 }
5496 }
5497 else
5498 {
5499 for (p = pend - 2; p >= pbeg; p--)
5500 if (*p == '\r')
5501 {
5502 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5503 int pos = BYTE_TO_CHAR (pos_byte);
5504
5505 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5506 n++;
5507 }
5508 }
5509 coding->produced -= n;
5510 coding->produced_char -= n;
5511 }
5512 }
5513
5514
5515 /* Return a translation table (or list of them) from coding system
5516 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5517 decoding (ENCODEP is zero). */
5518
5519 static INLINE
5520 get_translation_table (attrs, encodep)
5521 {
5522 Lisp_Object standard, translation_table;
5523
5524 if (encodep)
5525 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5526 standard = Vstandard_translation_table_for_encode;
5527 else
5528 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5529 standard = Vstandard_translation_table_for_decode;
5530 if (NILP (translation_table))
5531 return standard;
5532 if (SYMBOLP (translation_table))
5533 translation_table = Fget (translation_table, Qtranslation_table);
5534 else if (CONSP (translation_table))
5535 {
5536 Lisp_Object val;
5537
5538 translation_table = Fcopy_sequence (translation_table);
5539 for (val = translation_table; CONSP (val); val = XCDR (val))
5540 if (SYMBOLP (XCAR (val)))
5541 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5542 }
5543 if (! NILP (standard))
5544 {
5545 if (CONSP (translation_table))
5546 translation_table = nconc2 (translation_table, Fcons (standard, Qnil));
5547 else
5548 translation_table = Fcons (translation_table, Fcons (standard, Qnil));
5549 }
5550 return translation_table;
5551 }
5552
5553
5554 static void
5555 translate_chars (coding, table)
5556 struct coding_system *coding;
5557 Lisp_Object table;
5558 {
5559 int *charbuf = coding->charbuf;
5560 int *charbuf_end = charbuf + coding->charbuf_used;
5561 int c;
5562
5563 if (coding->chars_at_source)
5564 return;
5565
5566 while (charbuf < charbuf_end)
5567 {
5568 c = *charbuf;
5569 if (c < 0)
5570 charbuf += -c;
5571 else
5572 *charbuf++ = translate_char (table, c);
5573 }
5574 }
5575
5576 static int
5577 produce_chars (coding)
5578 struct coding_system *coding;
5579 {
5580 unsigned char *dst = coding->destination + coding->produced;
5581 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5582 int produced;
5583 int produced_chars = 0;
5584
5585 if (! coding->chars_at_source)
5586 {
5587 /* Characters are in coding->charbuf. */
5588 int *buf = coding->charbuf;
5589 int *buf_end = buf + coding->charbuf_used;
5590 unsigned char *adjusted_dst_end;
5591
5592 if (BUFFERP (coding->src_object)
5593 && EQ (coding->src_object, coding->dst_object))
5594 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5595 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5596
5597 while (buf < buf_end)
5598 {
5599 int c = *buf++;
5600
5601 if (dst >= adjusted_dst_end)
5602 {
5603 dst = alloc_destination (coding,
5604 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5605 dst);
5606 dst_end = coding->destination + coding->dst_bytes;
5607 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5608 }
5609 if (c >= 0)
5610 {
5611 if (coding->dst_multibyte
5612 || ! CHAR_BYTE8_P (c))
5613 CHAR_STRING_ADVANCE (c, dst);
5614 else
5615 *dst++ = CHAR_TO_BYTE8 (c);
5616 produced_chars++;
5617 }
5618 else
5619 /* This is an annotation datum. (-C) is the length of
5620 it. */
5621 buf += -c - 1;
5622 }
5623 }
5624 else
5625 {
5626 const unsigned char *src = coding->source;
5627 const unsigned char *src_end = src + coding->src_bytes;
5628 Lisp_Object eol_type;
5629
5630 eol_type = CODING_ID_EOL_TYPE (coding->id);
5631
5632 if (coding->src_multibyte != coding->dst_multibyte)
5633 {
5634 if (coding->src_multibyte)
5635 {
5636 int multibytep = 1;
5637 int consumed_chars;
5638
5639 while (1)
5640 {
5641 const unsigned char *src_base = src;
5642 int c;
5643
5644 ONE_MORE_BYTE (c);
5645 if (c == '\r')
5646 {
5647 if (EQ (eol_type, Qdos))
5648 {
5649 if (src == src_end)
5650 {
5651 record_conversion_result
5652 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5653 goto no_more_source;
5654 }
5655 if (*src == '\n')
5656 c = *src++;
5657 }
5658 else if (EQ (eol_type, Qmac))
5659 c = '\n';
5660 }
5661 if (dst == dst_end)
5662 {
5663 coding->consumed = src - coding->source;
5664
5665 if (EQ (coding->src_object, coding->dst_object))
5666 dst_end = (unsigned char *) src;
5667 if (dst == dst_end)
5668 {
5669 dst = alloc_destination (coding, src_end - src + 1,
5670 dst);
5671 dst_end = coding->destination + coding->dst_bytes;
5672 coding_set_source (coding);
5673 src = coding->source + coding->consumed;
5674 src_end = coding->source + coding->src_bytes;
5675 }
5676 }
5677 *dst++ = c;
5678 produced_chars++;
5679 }
5680 no_more_source:
5681 ;
5682 }
5683 else
5684 while (src < src_end)
5685 {
5686 int multibytep = 1;
5687 int c = *src++;
5688
5689 if (c == '\r')
5690 {
5691 if (EQ (eol_type, Qdos))
5692 {
5693 if (src < src_end
5694 && *src == '\n')
5695 c = *src++;
5696 }
5697 else if (EQ (eol_type, Qmac))
5698 c = '\n';
5699 }
5700 if (dst >= dst_end - 1)
5701 {
5702 coding->consumed = src - coding->source;
5703
5704 if (EQ (coding->src_object, coding->dst_object))
5705 dst_end = (unsigned char *) src;
5706 if (dst >= dst_end - 1)
5707 {
5708 dst = alloc_destination (coding, src_end - src + 2,
5709 dst);
5710 dst_end = coding->destination + coding->dst_bytes;
5711 coding_set_source (coding);
5712 src = coding->source + coding->consumed;
5713 src_end = coding->source + coding->src_bytes;
5714 }
5715 }
5716 EMIT_ONE_BYTE (c);
5717 }
5718 }
5719 else
5720 {
5721 if (!EQ (coding->src_object, coding->dst_object))
5722 {
5723 int require = coding->src_bytes - coding->dst_bytes;
5724
5725 if (require > 0)
5726 {
5727 EMACS_INT offset = src - coding->source;
5728
5729 dst = alloc_destination (coding, require, dst);
5730 coding_set_source (coding);
5731 src = coding->source + offset;
5732 src_end = coding->source + coding->src_bytes;
5733 }
5734 }
5735 produced_chars = coding->src_chars;
5736 while (src < src_end)
5737 {
5738 int c = *src++;
5739
5740 if (c == '\r')
5741 {
5742 if (EQ (eol_type, Qdos))
5743 {
5744 if (src < src_end
5745 && *src == '\n')
5746 c = *src++;
5747 produced_chars--;
5748 }
5749 else if (EQ (eol_type, Qmac))
5750 c = '\n';
5751 }
5752 *dst++ = c;
5753 }
5754 }
5755 coding->consumed = coding->src_bytes;
5756 coding->consumed_char = coding->src_chars;
5757 }
5758
5759 produced = dst - (coding->destination + coding->produced);
5760 if (BUFFERP (coding->dst_object))
5761 insert_from_gap (produced_chars, produced);
5762 coding->produced += produced;
5763 coding->produced_char += produced_chars;
5764 return produced_chars;
5765 }
5766
5767 /* Compose text in CODING->object according to the annotation data at
5768 CHARBUF. CHARBUF is an array:
5769 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5770 */
5771
5772 static INLINE void
5773 produce_composition (coding, charbuf)
5774 struct coding_system *coding;
5775 int *charbuf;
5776 {
5777 int len;
5778 EMACS_INT from, to;
5779 enum composition_method method;
5780 Lisp_Object components;
5781
5782 len = -charbuf[0];
5783 from = coding->dst_pos + charbuf[2];
5784 to = coding->dst_pos + charbuf[3];
5785 method = (enum composition_method) (charbuf[4]);
5786
5787 if (method == COMPOSITION_RELATIVE)
5788 components = Qnil;
5789 else
5790 {
5791 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5792 int i;
5793
5794 len -= 5;
5795 charbuf += 5;
5796 for (i = 0; i < len; i++)
5797 args[i] = make_number (charbuf[i]);
5798 components = (method == COMPOSITION_WITH_ALTCHARS
5799 ? Fstring (len, args) : Fvector (len, args));
5800 }
5801 compose_text (from, to, components, Qnil, coding->dst_object);
5802 }
5803
5804
5805 /* Put `charset' property on text in CODING->object according to
5806 the annotation data at CHARBUF. CHARBUF is an array:
5807 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5808 */
5809
5810 static INLINE void
5811 produce_charset (coding, charbuf)
5812 struct coding_system *coding;
5813 int *charbuf;
5814 {
5815 EMACS_INT from = coding->dst_pos + charbuf[2];
5816 EMACS_INT to = coding->dst_pos + charbuf[3];
5817 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5818
5819 Fput_text_property (make_number (from), make_number (to),
5820 Qcharset, CHARSET_NAME (charset),
5821 coding->dst_object);
5822 }
5823
5824
5825 #define CHARBUF_SIZE 0x4000
5826
5827 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5828 do { \
5829 int size = CHARBUF_SIZE;; \
5830 \
5831 coding->charbuf = NULL; \
5832 while (size > 1024) \
5833 { \
5834 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5835 if (coding->charbuf) \
5836 break; \
5837 size >>= 1; \
5838 } \
5839 if (! coding->charbuf) \
5840 { \
5841 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5842 return coding->result; \
5843 } \
5844 coding->charbuf_size = size; \
5845 } while (0)
5846
5847
5848 static void
5849 produce_annotation (coding)
5850 struct coding_system *coding;
5851 {
5852 int *charbuf = coding->charbuf;
5853 int *charbuf_end = charbuf + coding->charbuf_used;
5854
5855 if (NILP (coding->dst_object))
5856 return;
5857
5858 while (charbuf < charbuf_end)
5859 {
5860 if (*charbuf >= 0)
5861 charbuf++;
5862 else
5863 {
5864 int len = -*charbuf;
5865 switch (charbuf[1])
5866 {
5867 case CODING_ANNOTATE_COMPOSITION_MASK:
5868 produce_composition (coding, charbuf);
5869 break;
5870 case CODING_ANNOTATE_CHARSET_MASK:
5871 produce_charset (coding, charbuf);
5872 break;
5873 default:
5874 abort ();
5875 }
5876 charbuf += len;
5877 }
5878 }
5879 }
5880
5881 /* Decode the data at CODING->src_object into CODING->dst_object.
5882 CODING->src_object is a buffer, a string, or nil.
5883 CODING->dst_object is a buffer.
5884
5885 If CODING->src_object is a buffer, it must be the current buffer.
5886 In this case, if CODING->src_pos is positive, it is a position of
5887 the source text in the buffer, otherwise, the source text is in the
5888 gap area of the buffer, and CODING->src_pos specifies the offset of
5889 the text from GPT (which must be the same as PT). If this is the
5890 same buffer as CODING->dst_object, CODING->src_pos must be
5891 negative.
5892
5893 If CODING->src_object is a string, CODING->src_pos in an index to
5894 that string.
5895
5896 If CODING->src_object is nil, CODING->source must already point to
5897 the non-relocatable memory area. In this case, CODING->src_pos is
5898 an offset from CODING->source.
5899
5900 The decoded data is inserted at the current point of the buffer
5901 CODING->dst_object.
5902 */
5903
5904 static int
5905 decode_coding (coding)
5906 struct coding_system *coding;
5907 {
5908 Lisp_Object attrs;
5909 Lisp_Object undo_list;
5910 Lisp_Object translation_table;
5911
5912 if (BUFFERP (coding->src_object)
5913 && coding->src_pos > 0
5914 && coding->src_pos < GPT
5915 && coding->src_pos + coding->src_chars > GPT)
5916 move_gap_both (coding->src_pos, coding->src_pos_byte);
5917
5918 undo_list = Qt;
5919 if (BUFFERP (coding->dst_object))
5920 {
5921 if (current_buffer != XBUFFER (coding->dst_object))
5922 set_buffer_internal (XBUFFER (coding->dst_object));
5923 if (GPT != PT)
5924 move_gap_both (PT, PT_BYTE);
5925 undo_list = current_buffer->undo_list;
5926 current_buffer->undo_list = Qt;
5927 }
5928
5929 coding->consumed = coding->consumed_char = 0;
5930 coding->produced = coding->produced_char = 0;
5931 coding->chars_at_source = 0;
5932 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5933 coding->errors = 0;
5934
5935 ALLOC_CONVERSION_WORK_AREA (coding);
5936
5937 attrs = CODING_ID_ATTRS (coding->id);
5938 translation_table = get_translation_table (attrs, 0);
5939
5940 do
5941 {
5942 coding_set_source (coding);
5943 coding->annotated = 0;
5944 (*(coding->decoder)) (coding);
5945 if (!NILP (translation_table))
5946 translate_chars (coding, translation_table);
5947 coding_set_destination (coding);
5948 produce_chars (coding);
5949 if (coding->annotated)
5950 produce_annotation (coding);
5951 }
5952 while (coding->consumed < coding->src_bytes
5953 && ! coding->result);
5954
5955 coding->carryover_bytes = 0;
5956 if (coding->consumed < coding->src_bytes)
5957 {
5958 int nbytes = coding->src_bytes - coding->consumed;
5959 const unsigned char *src;
5960
5961 coding_set_source (coding);
5962 coding_set_destination (coding);
5963 src = coding->source + coding->consumed;
5964
5965 if (coding->mode & CODING_MODE_LAST_BLOCK)
5966 {
5967 /* Flush out unprocessed data as binary chars. We are sure
5968 that the number of data is less than the size of
5969 coding->charbuf. */
5970 coding->charbuf_used = 0;
5971 while (nbytes-- > 0)
5972 {
5973 int c = *src++;
5974
5975 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5976 }
5977 produce_chars (coding);
5978 }
5979 else
5980 {
5981 /* Record unprocessed bytes in coding->carryover. We are
5982 sure that the number of data is less than the size of
5983 coding->carryover. */
5984 unsigned char *p = coding->carryover;
5985
5986 coding->carryover_bytes = nbytes;
5987 while (nbytes-- > 0)
5988 *p++ = *src++;
5989 }
5990 coding->consumed = coding->src_bytes;
5991 }
5992
5993 if (BUFFERP (coding->dst_object))
5994 {
5995 current_buffer->undo_list = undo_list;
5996 record_insert (coding->dst_pos, coding->produced_char);
5997 }
5998 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5999 decode_eol (coding);
6000 return coding->result;
6001 }
6002
6003
6004 /* Extract an annotation datum from a composition starting at POS and
6005 ending before LIMIT of CODING->src_object (buffer or string), store
6006 the data in BUF, set *STOP to a starting position of the next
6007 composition (if any) or to LIMIT, and return the address of the
6008 next element of BUF.
6009
6010 If such an annotation is not found, set *STOP to a starting
6011 position of a composition after POS (if any) or to LIMIT, and
6012 return BUF. */
6013
6014 static INLINE int *
6015 handle_composition_annotation (pos, limit, coding, buf, stop)
6016 EMACS_INT pos, limit;
6017 struct coding_system *coding;
6018 int *buf;
6019 EMACS_INT *stop;
6020 {
6021 EMACS_INT start, end;
6022 Lisp_Object prop;
6023
6024 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6025 || end > limit)
6026 *stop = limit;
6027 else if (start > pos)
6028 *stop = start;
6029 else
6030 {
6031 if (start == pos)
6032 {
6033 /* We found a composition. Store the corresponding
6034 annotation data in BUF. */
6035 int *head = buf;
6036 enum composition_method method = COMPOSITION_METHOD (prop);
6037 int nchars = COMPOSITION_LENGTH (prop);
6038
6039 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
6040 if (method != COMPOSITION_RELATIVE)
6041 {
6042 Lisp_Object components;
6043 int len, i, i_byte;
6044
6045 components = COMPOSITION_COMPONENTS (prop);
6046 if (VECTORP (components))
6047 {
6048 len = XVECTOR (components)->size;
6049 for (i = 0; i < len; i++)
6050 *buf++ = XINT (AREF (components, i));
6051 }
6052 else if (STRINGP (components))
6053 {
6054 len = SCHARS (components);
6055 i = i_byte = 0;
6056 while (i < len)
6057 {
6058 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6059 buf++;
6060 }
6061 }
6062 else if (INTEGERP (components))
6063 {
6064 len = 1;
6065 *buf++ = XINT (components);
6066 }
6067 else if (CONSP (components))
6068 {
6069 for (len = 0; CONSP (components);
6070 len++, components = XCDR (components))
6071 *buf++ = XINT (XCAR (components));
6072 }
6073 else
6074 abort ();
6075 *head -= len;
6076 }
6077 }
6078
6079 if (find_composition (end, limit, &start, &end, &prop,
6080 coding->src_object)
6081 && end <= limit)
6082 *stop = start;
6083 else
6084 *stop = limit;
6085 }
6086 return buf;
6087 }
6088
6089
6090 /* Extract an annotation datum from a text property `charset' at POS of
6091 CODING->src_object (buffer of string), store the data in BUF, set
6092 *STOP to the position where the value of `charset' property changes
6093 (limiting by LIMIT), and return the address of the next element of
6094 BUF.
6095
6096 If the property value is nil, set *STOP to the position where the
6097 property value is non-nil (limiting by LIMIT), and return BUF. */
6098
6099 static INLINE int *
6100 handle_charset_annotation (pos, limit, coding, buf, stop)
6101 EMACS_INT pos, limit;
6102 struct coding_system *coding;
6103 int *buf;
6104 EMACS_INT *stop;
6105 {
6106 Lisp_Object val, next;
6107 int id;
6108
6109 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6110 if (! NILP (val) && CHARSETP (val))
6111 id = XINT (CHARSET_SYMBOL_ID (val));
6112 else
6113 id = -1;
6114 ADD_CHARSET_DATA (buf, 0, 0, id);
6115 next = Fnext_single_property_change (make_number (pos), Qcharset,
6116 coding->src_object,
6117 make_number (limit));
6118 *stop = XINT (next);
6119 return buf;
6120 }
6121
6122
6123 static void
6124 consume_chars (coding)
6125 struct coding_system *coding;
6126 {
6127 int *buf = coding->charbuf;
6128 int *buf_end = coding->charbuf + coding->charbuf_size;
6129 const unsigned char *src = coding->source + coding->consumed;
6130 const unsigned char *src_end = coding->source + coding->src_bytes;
6131 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6132 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6133 int multibytep = coding->src_multibyte;
6134 Lisp_Object eol_type;
6135 int c;
6136 EMACS_INT stop, stop_composition, stop_charset;
6137
6138 eol_type = CODING_ID_EOL_TYPE (coding->id);
6139 if (VECTORP (eol_type))
6140 eol_type = Qunix;
6141
6142 /* Note: composition handling is not yet implemented. */
6143 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6144
6145 if (NILP (coding->src_object))
6146 stop = stop_composition = stop_charset = end_pos;
6147 else
6148 {
6149 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6150 stop = stop_composition = pos;
6151 else
6152 stop = stop_composition = end_pos;
6153 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6154 stop = stop_charset = pos;
6155 else
6156 stop_charset = end_pos;
6157 }
6158
6159 /* Compensate for CRLF and conversion. */
6160 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6161 while (buf < buf_end)
6162 {
6163 if (pos == stop)
6164 {
6165 if (pos == end_pos)
6166 break;
6167 if (pos == stop_composition)
6168 buf = handle_composition_annotation (pos, end_pos, coding,
6169 buf, &stop_composition);
6170 if (pos == stop_charset)
6171 buf = handle_charset_annotation (pos, end_pos, coding,
6172 buf, &stop_charset);
6173 stop = (stop_composition < stop_charset
6174 ? stop_composition : stop_charset);
6175 }
6176
6177 if (! multibytep)
6178 {
6179 EMACS_INT bytes;
6180
6181 if (! CODING_FOR_UNIBYTE (coding)
6182 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6183 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6184 else
6185 c = *src++, pos++;
6186 }
6187 else
6188 c = STRING_CHAR_ADVANCE (src), pos++;
6189 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6190 c = '\n';
6191 if (! EQ (eol_type, Qunix))
6192 {
6193 if (c == '\n')
6194 {
6195 if (EQ (eol_type, Qdos))
6196 *buf++ = '\r';
6197 else
6198 c = '\r';
6199 }
6200 }
6201 *buf++ = c;
6202 }
6203
6204 coding->consumed = src - coding->source;
6205 coding->consumed_char = pos - coding->src_pos;
6206 coding->charbuf_used = buf - coding->charbuf;
6207 coding->chars_at_source = 0;
6208 }
6209
6210
6211 /* Encode the text at CODING->src_object into CODING->dst_object.
6212 CODING->src_object is a buffer or a string.
6213 CODING->dst_object is a buffer or nil.
6214
6215 If CODING->src_object is a buffer, it must be the current buffer.
6216 In this case, if CODING->src_pos is positive, it is a position of
6217 the source text in the buffer, otherwise. the source text is in the
6218 gap area of the buffer, and coding->src_pos specifies the offset of
6219 the text from GPT (which must be the same as PT). If this is the
6220 same buffer as CODING->dst_object, CODING->src_pos must be
6221 negative and CODING should not have `pre-write-conversion'.
6222
6223 If CODING->src_object is a string, CODING should not have
6224 `pre-write-conversion'.
6225
6226 If CODING->dst_object is a buffer, the encoded data is inserted at
6227 the current point of that buffer.
6228
6229 If CODING->dst_object is nil, the encoded data is placed at the
6230 memory area specified by CODING->destination. */
6231
6232 static int
6233 encode_coding (coding)
6234 struct coding_system *coding;
6235 {
6236 Lisp_Object attrs;
6237 Lisp_Object translation_table;
6238
6239 attrs = CODING_ID_ATTRS (coding->id);
6240 translation_table = get_translation_table (attrs, 1);
6241
6242 if (BUFFERP (coding->dst_object))
6243 {
6244 set_buffer_internal (XBUFFER (coding->dst_object));
6245 coding->dst_multibyte
6246 = ! NILP (current_buffer->enable_multibyte_characters);
6247 }
6248
6249 coding->consumed = coding->consumed_char = 0;
6250 coding->produced = coding->produced_char = 0;
6251 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6252 coding->errors = 0;
6253
6254 ALLOC_CONVERSION_WORK_AREA (coding);
6255
6256 do {
6257 coding_set_source (coding);
6258 consume_chars (coding);
6259
6260 if (!NILP (translation_table))
6261 translate_chars (coding, translation_table);
6262
6263 coding_set_destination (coding);
6264 (*(coding->encoder)) (coding);
6265 } while (coding->consumed_char < coding->src_chars);
6266
6267 if (BUFFERP (coding->dst_object))
6268 insert_from_gap (coding->produced_char, coding->produced);
6269
6270 return (coding->result);
6271 }
6272
6273
6274 /* Name (or base name) of work buffer for code conversion. */
6275 static Lisp_Object Vcode_conversion_workbuf_name;
6276
6277 /* A working buffer used by the top level conversion. Once it is
6278 created, it is never destroyed. It has the name
6279 Vcode_conversion_workbuf_name. The other working buffers are
6280 destroyed after the use is finished, and their names are modified
6281 versions of Vcode_conversion_workbuf_name. */
6282 static Lisp_Object Vcode_conversion_reused_workbuf;
6283
6284 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6285 static int reused_workbuf_in_use;
6286
6287
6288 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6289 multibyteness of returning buffer. */
6290
6291 Lisp_Object
6292 make_conversion_work_buffer (multibyte)
6293 {
6294 Lisp_Object name, workbuf;
6295 struct buffer *current;
6296
6297 if (reused_workbuf_in_use++)
6298 {
6299 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6300 workbuf = Fget_buffer_create (name);
6301 }
6302 else
6303 {
6304 name = Vcode_conversion_workbuf_name;
6305 workbuf = Fget_buffer_create (name);
6306 if (NILP (Vcode_conversion_reused_workbuf))
6307 Vcode_conversion_reused_workbuf = workbuf;
6308 }
6309 current = current_buffer;
6310 set_buffer_internal (XBUFFER (workbuf));
6311 Ferase_buffer ();
6312 current_buffer->undo_list = Qt;
6313 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6314 set_buffer_internal (current);
6315 return workbuf;
6316 }
6317
6318
6319 static Lisp_Object
6320 code_conversion_restore (arg)
6321 Lisp_Object arg;
6322 {
6323 Lisp_Object current, workbuf;
6324
6325 current = XCAR (arg);
6326 workbuf = XCDR (arg);
6327 if (! NILP (workbuf))
6328 {
6329 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6330 reused_workbuf_in_use = 0;
6331 else if (! NILP (Fbuffer_live_p (workbuf)))
6332 Fkill_buffer (workbuf);
6333 }
6334 set_buffer_internal (XBUFFER (current));
6335 return Qnil;
6336 }
6337
6338 Lisp_Object
6339 code_conversion_save (with_work_buf, multibyte)
6340 int with_work_buf, multibyte;
6341 {
6342 Lisp_Object workbuf = Qnil;
6343
6344 if (with_work_buf)
6345 workbuf = make_conversion_work_buffer (multibyte);
6346 record_unwind_protect (code_conversion_restore,
6347 Fcons (Fcurrent_buffer (), workbuf));
6348 return workbuf;
6349 }
6350
6351 int
6352 decode_coding_gap (coding, chars, bytes)
6353 struct coding_system *coding;
6354 EMACS_INT chars, bytes;
6355 {
6356 int count = specpdl_ptr - specpdl;
6357 Lisp_Object attrs;
6358
6359 code_conversion_save (0, 0);
6360
6361 coding->src_object = Fcurrent_buffer ();
6362 coding->src_chars = chars;
6363 coding->src_bytes = bytes;
6364 coding->src_pos = -chars;
6365 coding->src_pos_byte = -bytes;
6366 coding->src_multibyte = chars < bytes;
6367 coding->dst_object = coding->src_object;
6368 coding->dst_pos = PT;
6369 coding->dst_pos_byte = PT_BYTE;
6370 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6371 coding->mode |= CODING_MODE_LAST_BLOCK;
6372
6373 if (CODING_REQUIRE_DETECTION (coding))
6374 detect_coding (coding);
6375
6376 decode_coding (coding);
6377
6378 attrs = CODING_ID_ATTRS (coding->id);
6379 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6380 {
6381 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6382 Lisp_Object val;
6383
6384 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6385 val = call1 (CODING_ATTR_POST_READ (attrs),
6386 make_number (coding->produced_char));
6387 CHECK_NATNUM (val);
6388 coding->produced_char += Z - prev_Z;
6389 coding->produced += Z_BYTE - prev_Z_BYTE;
6390 }
6391
6392 unbind_to (count, Qnil);
6393 return coding->result;
6394 }
6395
6396 int
6397 encode_coding_gap (coding, chars, bytes)
6398 struct coding_system *coding;
6399 EMACS_INT chars, bytes;
6400 {
6401 int count = specpdl_ptr - specpdl;
6402
6403 code_conversion_save (0, 0);
6404
6405 coding->src_object = Fcurrent_buffer ();
6406 coding->src_chars = chars;
6407 coding->src_bytes = bytes;
6408 coding->src_pos = -chars;
6409 coding->src_pos_byte = -bytes;
6410 coding->src_multibyte = chars < bytes;
6411 coding->dst_object = coding->src_object;
6412 coding->dst_pos = PT;
6413 coding->dst_pos_byte = PT_BYTE;
6414
6415 encode_coding (coding);
6416
6417 unbind_to (count, Qnil);
6418 return coding->result;
6419 }
6420
6421
6422 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6423 SRC_OBJECT into DST_OBJECT by coding context CODING.
6424
6425 SRC_OBJECT is a buffer, a string, or Qnil.
6426
6427 If it is a buffer, the text is at point of the buffer. FROM and TO
6428 are positions in the buffer.
6429
6430 If it is a string, the text is at the beginning of the string.
6431 FROM and TO are indices to the string.
6432
6433 If it is nil, the text is at coding->source. FROM and TO are
6434 indices to coding->source.
6435
6436 DST_OBJECT is a buffer, Qt, or Qnil.
6437
6438 If it is a buffer, the decoded text is inserted at point of the
6439 buffer. If the buffer is the same as SRC_OBJECT, the source text
6440 is deleted.
6441
6442 If it is Qt, a string is made from the decoded text, and
6443 set in CODING->dst_object.
6444
6445 If it is Qnil, the decoded text is stored at CODING->destination.
6446 The caller must allocate CODING->dst_bytes bytes at
6447 CODING->destination by xmalloc. If the decoded text is longer than
6448 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6449 */
6450
6451 void
6452 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6453 dst_object)
6454 struct coding_system *coding;
6455 Lisp_Object src_object;
6456 EMACS_INT from, from_byte, to, to_byte;
6457 Lisp_Object dst_object;
6458 {
6459 int count = specpdl_ptr - specpdl;
6460 unsigned char *destination;
6461 EMACS_INT dst_bytes;
6462 EMACS_INT chars = to - from;
6463 EMACS_INT bytes = to_byte - from_byte;
6464 Lisp_Object attrs;
6465 Lisp_Object buffer;
6466 int saved_pt = -1, saved_pt_byte;
6467
6468 buffer = Fcurrent_buffer ();
6469
6470 if (NILP (dst_object))
6471 {
6472 destination = coding->destination;
6473 dst_bytes = coding->dst_bytes;
6474 }
6475
6476 coding->src_object = src_object;
6477 coding->src_chars = chars;
6478 coding->src_bytes = bytes;
6479 coding->src_multibyte = chars < bytes;
6480
6481 if (STRINGP (src_object))
6482 {
6483 coding->src_pos = from;
6484 coding->src_pos_byte = from_byte;
6485 }
6486 else if (BUFFERP (src_object))
6487 {
6488 set_buffer_internal (XBUFFER (src_object));
6489 if (from != GPT)
6490 move_gap_both (from, from_byte);
6491 if (EQ (src_object, dst_object))
6492 {
6493 saved_pt = PT, saved_pt_byte = PT_BYTE;
6494 TEMP_SET_PT_BOTH (from, from_byte);
6495 del_range_both (from, from_byte, to, to_byte, 1);
6496 coding->src_pos = -chars;
6497 coding->src_pos_byte = -bytes;
6498 }
6499 else
6500 {
6501 coding->src_pos = from;
6502 coding->src_pos_byte = from_byte;
6503 }
6504 }
6505
6506 if (CODING_REQUIRE_DETECTION (coding))
6507 detect_coding (coding);
6508 attrs = CODING_ID_ATTRS (coding->id);
6509
6510 if (EQ (dst_object, Qt)
6511 || (! NILP (CODING_ATTR_POST_READ (attrs))
6512 && NILP (dst_object)))
6513 {
6514 coding->dst_object = code_conversion_save (1, 1);
6515 coding->dst_pos = BEG;
6516 coding->dst_pos_byte = BEG_BYTE;
6517 coding->dst_multibyte = 1;
6518 }
6519 else if (BUFFERP (dst_object))
6520 {
6521 code_conversion_save (0, 0);
6522 coding->dst_object = dst_object;
6523 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6524 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6525 coding->dst_multibyte
6526 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6527 }
6528 else
6529 {
6530 code_conversion_save (0, 0);
6531 coding->dst_object = Qnil;
6532 coding->dst_multibyte = 1;
6533 }
6534
6535 decode_coding (coding);
6536
6537 if (BUFFERP (coding->dst_object))
6538 set_buffer_internal (XBUFFER (coding->dst_object));
6539
6540 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6541 {
6542 struct gcpro gcpro1, gcpro2;
6543 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6544 Lisp_Object val;
6545
6546 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6547 GCPRO2 (coding->src_object, coding->dst_object);
6548 val = call1 (CODING_ATTR_POST_READ (attrs),
6549 make_number (coding->produced_char));
6550 UNGCPRO;
6551 CHECK_NATNUM (val);
6552 coding->produced_char += Z - prev_Z;
6553 coding->produced += Z_BYTE - prev_Z_BYTE;
6554 }
6555
6556 if (EQ (dst_object, Qt))
6557 {
6558 coding->dst_object = Fbuffer_string ();
6559 }
6560 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6561 {
6562 set_buffer_internal (XBUFFER (coding->dst_object));
6563 if (dst_bytes < coding->produced)
6564 {
6565 destination
6566 = (unsigned char *) xrealloc (destination, coding->produced);
6567 if (! destination)
6568 {
6569 record_conversion_result (coding,
6570 CODING_RESULT_INSUFFICIENT_DST);
6571 unbind_to (count, Qnil);
6572 return;
6573 }
6574 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6575 move_gap_both (BEGV, BEGV_BYTE);
6576 bcopy (BEGV_ADDR, destination, coding->produced);
6577 coding->destination = destination;
6578 }
6579 }
6580
6581 if (saved_pt >= 0)
6582 {
6583 /* This is the case of:
6584 (BUFFERP (src_object) && EQ (src_object, dst_object))
6585 As we have moved PT while replacing the original buffer
6586 contents, we must recover it now. */
6587 set_buffer_internal (XBUFFER (src_object));
6588 if (saved_pt < from)
6589 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6590 else if (saved_pt < from + chars)
6591 TEMP_SET_PT_BOTH (from, from_byte);
6592 else if (! NILP (current_buffer->enable_multibyte_characters))
6593 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6594 saved_pt_byte + (coding->produced - bytes));
6595 else
6596 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6597 saved_pt_byte + (coding->produced - bytes));
6598 }
6599
6600 unbind_to (count, coding->dst_object);
6601 }
6602
6603
6604 void
6605 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6606 dst_object)
6607 struct coding_system *coding;
6608 Lisp_Object src_object;
6609 EMACS_INT from, from_byte, to, to_byte;
6610 Lisp_Object dst_object;
6611 {
6612 int count = specpdl_ptr - specpdl;
6613 EMACS_INT chars = to - from;
6614 EMACS_INT bytes = to_byte - from_byte;
6615 Lisp_Object attrs;
6616 Lisp_Object buffer;
6617 int saved_pt = -1, saved_pt_byte;
6618
6619 buffer = Fcurrent_buffer ();
6620
6621 coding->src_object = src_object;
6622 coding->src_chars = chars;
6623 coding->src_bytes = bytes;
6624 coding->src_multibyte = chars < bytes;
6625
6626 attrs = CODING_ID_ATTRS (coding->id);
6627
6628 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6629 {
6630 coding->src_object = code_conversion_save (1, coding->src_multibyte);
6631 set_buffer_internal (XBUFFER (coding->src_object));
6632 if (STRINGP (src_object))
6633 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6634 else if (BUFFERP (src_object))
6635 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6636 else
6637 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6638
6639 if (EQ (src_object, dst_object))
6640 {
6641 set_buffer_internal (XBUFFER (src_object));
6642 saved_pt = PT, saved_pt_byte = PT_BYTE;
6643 del_range_both (from, from_byte, to, to_byte, 1);
6644 set_buffer_internal (XBUFFER (coding->src_object));
6645 }
6646
6647 call2 (CODING_ATTR_PRE_WRITE (attrs),
6648 make_number (BEG), make_number (Z));
6649 coding->src_object = Fcurrent_buffer ();
6650 if (BEG != GPT)
6651 move_gap_both (BEG, BEG_BYTE);
6652 coding->src_chars = Z - BEG;
6653 coding->src_bytes = Z_BYTE - BEG_BYTE;
6654 coding->src_pos = BEG;
6655 coding->src_pos_byte = BEG_BYTE;
6656 coding->src_multibyte = Z < Z_BYTE;
6657 }
6658 else if (STRINGP (src_object))
6659 {
6660 code_conversion_save (0, 0);
6661 coding->src_pos = from;
6662 coding->src_pos_byte = from_byte;
6663 }
6664 else if (BUFFERP (src_object))
6665 {
6666 code_conversion_save (0, 0);
6667 set_buffer_internal (XBUFFER (src_object));
6668 if (EQ (src_object, dst_object))
6669 {
6670 saved_pt = PT, saved_pt_byte = PT_BYTE;
6671 coding->src_object = del_range_1 (from, to, 1, 1);
6672 coding->src_pos = 0;
6673 coding->src_pos_byte = 0;
6674 }
6675 else
6676 {
6677 if (from < GPT && to >= GPT)
6678 move_gap_both (from, from_byte);
6679 coding->src_pos = from;
6680 coding->src_pos_byte = from_byte;
6681 }
6682 }
6683 else
6684 code_conversion_save (0, 0);
6685
6686 if (BUFFERP (dst_object))
6687 {
6688 coding->dst_object = dst_object;
6689 if (EQ (src_object, dst_object))
6690 {
6691 coding->dst_pos = from;
6692 coding->dst_pos_byte = from_byte;
6693 }
6694 else
6695 {
6696 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6697 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6698 }
6699 coding->dst_multibyte
6700 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6701 }
6702 else if (EQ (dst_object, Qt))
6703 {
6704 coding->dst_object = Qnil;
6705 coding->dst_bytes = coding->src_chars;
6706 if (coding->dst_bytes == 0)
6707 coding->dst_bytes = 1;
6708 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6709 coding->dst_multibyte = 0;
6710 }
6711 else
6712 {
6713 coding->dst_object = Qnil;
6714 coding->dst_multibyte = 0;
6715 }
6716
6717 encode_coding (coding);
6718
6719 if (EQ (dst_object, Qt))
6720 {
6721 if (BUFFERP (coding->dst_object))
6722 coding->dst_object = Fbuffer_string ();
6723 else
6724 {
6725 coding->dst_object
6726 = make_unibyte_string ((char *) coding->destination,
6727 coding->produced);
6728 xfree (coding->destination);
6729 }
6730 }
6731
6732 if (saved_pt >= 0)
6733 {
6734 /* This is the case of:
6735 (BUFFERP (src_object) && EQ (src_object, dst_object))
6736 As we have moved PT while replacing the original buffer
6737 contents, we must recover it now. */
6738 set_buffer_internal (XBUFFER (src_object));
6739 if (saved_pt < from)
6740 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6741 else if (saved_pt < from + chars)
6742 TEMP_SET_PT_BOTH (from, from_byte);
6743 else if (! NILP (current_buffer->enable_multibyte_characters))
6744 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6745 saved_pt_byte + (coding->produced - bytes));
6746 else
6747 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6748 saved_pt_byte + (coding->produced - bytes));
6749 }
6750
6751 unbind_to (count, Qnil);
6752 }
6753
6754
6755 Lisp_Object
6756 preferred_coding_system ()
6757 {
6758 int id = coding_categories[coding_priorities[0]].id;
6759
6760 return CODING_ID_NAME (id);
6761 }
6762
6763 \f
6764 #ifdef emacs
6765 /*** 8. Emacs Lisp library functions ***/
6766
6767 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6768 doc: /* Return t if OBJECT is nil or a coding-system.
6769 See the documentation of `define-coding-system' for information
6770 about coding-system objects. */)
6771 (obj)
6772 Lisp_Object obj;
6773 {
6774 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6775 }
6776
6777 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6778 Sread_non_nil_coding_system, 1, 1, 0,
6779 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6780 (prompt)
6781 Lisp_Object prompt;
6782 {
6783 Lisp_Object val;
6784 do
6785 {
6786 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6787 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6788 }
6789 while (SCHARS (val) == 0);
6790 return (Fintern (val, Qnil));
6791 }
6792
6793 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6794 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6795 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6796 (prompt, default_coding_system)
6797 Lisp_Object prompt, default_coding_system;
6798 {
6799 Lisp_Object val;
6800 if (SYMBOLP (default_coding_system))
6801 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6802 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6803 Qt, Qnil, Qcoding_system_history,
6804 default_coding_system, Qnil);
6805 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6806 }
6807
6808 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6809 1, 1, 0,
6810 doc: /* Check validity of CODING-SYSTEM.
6811 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6812 (coding_system)
6813 Lisp_Object coding_system;
6814 {
6815 CHECK_SYMBOL (coding_system);
6816 if (!NILP (Fcoding_system_p (coding_system)))
6817 return coding_system;
6818 while (1)
6819 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6820 }
6821
6822 \f
6823 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6824 HIGHEST is nonzero, return the coding system of the highest
6825 priority among the detected coding systems. Otherwize return a
6826 list of detected coding systems sorted by their priorities. If
6827 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6828 multibyte form but contains only ASCII and eight-bit chars.
6829 Otherwise, the bytes are raw bytes.
6830
6831 CODING-SYSTEM controls the detection as below:
6832
6833 If it is nil, detect both text-format and eol-format. If the
6834 text-format part of CODING-SYSTEM is already specified
6835 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6836 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6837 detect only text-format. */
6838
6839 Lisp_Object
6840 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6841 coding_system)
6842 const unsigned char *src;
6843 int src_chars, src_bytes, highest;
6844 int multibytep;
6845 Lisp_Object coding_system;
6846 {
6847 const unsigned char *src_end = src + src_bytes;
6848 Lisp_Object attrs, eol_type;
6849 Lisp_Object val;
6850 struct coding_system coding;
6851 int id;
6852 struct coding_detection_info detect_info;
6853 enum coding_category base_category;
6854
6855 if (NILP (coding_system))
6856 coding_system = Qundecided;
6857 setup_coding_system (coding_system, &coding);
6858 attrs = CODING_ID_ATTRS (coding.id);
6859 eol_type = CODING_ID_EOL_TYPE (coding.id);
6860 coding_system = CODING_ATTR_BASE_NAME (attrs);
6861
6862 coding.source = src;
6863 coding.src_chars = src_chars;
6864 coding.src_bytes = src_bytes;
6865 coding.src_multibyte = multibytep;
6866 coding.consumed = 0;
6867 coding.mode |= CODING_MODE_LAST_BLOCK;
6868
6869 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6870
6871 /* At first, detect text-format if necessary. */
6872 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6873 if (base_category == coding_category_undecided)
6874 {
6875 enum coding_category category;
6876 struct coding_system *this;
6877 int c, i;
6878
6879 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6880 for (i = 0; src < src_end; i++, src++)
6881 {
6882 c = *src;
6883 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6884 || c == ISO_CODE_SI
6885 || c == ISO_CODE_SO)))
6886 break;
6887 }
6888 coding.head_ascii = src - coding.source;
6889
6890 if (src < src_end)
6891 for (i = 0; i < coding_category_raw_text; i++)
6892 {
6893 category = coding_priorities[i];
6894 this = coding_categories + category;
6895
6896 if (this->id < 0)
6897 {
6898 /* No coding system of this category is defined. */
6899 detect_info.rejected |= (1 << category);
6900 }
6901 else if (category >= coding_category_raw_text)
6902 continue;
6903 else if (detect_info.checked & (1 << category))
6904 {
6905 if (highest
6906 && (detect_info.found & (1 << category)))
6907 break;
6908 }
6909 else
6910 {
6911 if ((*(this->detector)) (&coding, &detect_info)
6912 && highest
6913 && (detect_info.found & (1 << category)))
6914 {
6915 if (category == coding_category_utf_16_auto)
6916 {
6917 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6918 category = coding_category_utf_16_le;
6919 else
6920 category = coding_category_utf_16_be;
6921 }
6922 break;
6923 }
6924 }
6925 }
6926
6927 if (detect_info.rejected == CATEGORY_MASK_ANY)
6928 {
6929 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6930 id = coding_categories[coding_category_raw_text].id;
6931 val = Fcons (make_number (id), Qnil);
6932 }
6933 else if (! detect_info.rejected && ! detect_info.found)
6934 {
6935 detect_info.found = CATEGORY_MASK_ANY;
6936 id = coding_categories[coding_category_undecided].id;
6937 val = Fcons (make_number (id), Qnil);
6938 }
6939 else if (highest)
6940 {
6941 if (detect_info.found)
6942 {
6943 detect_info.found = 1 << category;
6944 val = Fcons (make_number (this->id), Qnil);
6945 }
6946 else
6947 for (i = 0; i < coding_category_raw_text; i++)
6948 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6949 {
6950 detect_info.found = 1 << coding_priorities[i];
6951 id = coding_categories[coding_priorities[i]].id;
6952 val = Fcons (make_number (id), Qnil);
6953 break;
6954 }
6955 }
6956 else
6957 {
6958 int mask = detect_info.rejected | detect_info.found;
6959 int found = 0;
6960 val = Qnil;
6961
6962 for (i = coding_category_raw_text - 1; i >= 0; i--)
6963 {
6964 category = coding_priorities[i];
6965 if (! (mask & (1 << category)))
6966 {
6967 found |= 1 << category;
6968 id = coding_categories[category].id;
6969 val = Fcons (make_number (id), val);
6970 }
6971 }
6972 for (i = coding_category_raw_text - 1; i >= 0; i--)
6973 {
6974 category = coding_priorities[i];
6975 if (detect_info.found & (1 << category))
6976 {
6977 id = coding_categories[category].id;
6978 val = Fcons (make_number (id), val);
6979 }
6980 }
6981 detect_info.found |= found;
6982 }
6983 }
6984 else if (base_category == coding_category_utf_16_auto)
6985 {
6986 if (detect_coding_utf_16 (&coding, &detect_info))
6987 {
6988 enum coding_category category;
6989 struct coding_system *this;
6990
6991 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6992 this = coding_categories + coding_category_utf_16_le;
6993 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6994 this = coding_categories + coding_category_utf_16_be;
6995 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6996 this = coding_categories + coding_category_utf_16_be_nosig;
6997 else
6998 this = coding_categories + coding_category_utf_16_le_nosig;
6999 val = Fcons (make_number (this->id), Qnil);
7000 }
7001 }
7002 else
7003 {
7004 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7005 val = Fcons (make_number (coding.id), Qnil);
7006 }
7007
7008 /* Then, detect eol-format if necessary. */
7009 {
7010 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7011 Lisp_Object tail;
7012
7013 if (VECTORP (eol_type))
7014 {
7015 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7016 normal_eol = detect_eol (coding.source, src_bytes,
7017 coding_category_raw_text);
7018 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7019 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7020 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7021 coding_category_utf_16_be);
7022 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7023 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7024 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7025 coding_category_utf_16_le);
7026 }
7027 else
7028 {
7029 if (EQ (eol_type, Qunix))
7030 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7031 else if (EQ (eol_type, Qdos))
7032 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7033 else
7034 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7035 }
7036
7037 for (tail = val; CONSP (tail); tail = XCDR (tail))
7038 {
7039 enum coding_category category;
7040 int this_eol;
7041
7042 id = XINT (XCAR (tail));
7043 attrs = CODING_ID_ATTRS (id);
7044 category = XINT (CODING_ATTR_CATEGORY (attrs));
7045 eol_type = CODING_ID_EOL_TYPE (id);
7046 if (VECTORP (eol_type))
7047 {
7048 if (category == coding_category_utf_16_be
7049 || category == coding_category_utf_16_be_nosig)
7050 this_eol = utf_16_be_eol;
7051 else if (category == coding_category_utf_16_le
7052 || category == coding_category_utf_16_le_nosig)
7053 this_eol = utf_16_le_eol;
7054 else
7055 this_eol = normal_eol;
7056
7057 if (this_eol == EOL_SEEN_LF)
7058 XSETCAR (tail, AREF (eol_type, 0));
7059 else if (this_eol == EOL_SEEN_CRLF)
7060 XSETCAR (tail, AREF (eol_type, 1));
7061 else if (this_eol == EOL_SEEN_CR)
7062 XSETCAR (tail, AREF (eol_type, 2));
7063 else
7064 XSETCAR (tail, CODING_ID_NAME (id));
7065 }
7066 else
7067 XSETCAR (tail, CODING_ID_NAME (id));
7068 }
7069 }
7070
7071 return (highest ? XCAR (val) : val);
7072 }
7073
7074
7075 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7076 2, 3, 0,
7077 doc: /* Detect coding system of the text in the region between START and END.
7078 Return a list of possible coding systems ordered by priority.
7079
7080 If only ASCII characters are found, it returns a list of single element
7081 `undecided' or its subsidiary coding system according to a detected
7082 end-of-line format.
7083
7084 If optional argument HIGHEST is non-nil, return the coding system of
7085 highest priority. */)
7086 (start, end, highest)
7087 Lisp_Object start, end, highest;
7088 {
7089 int from, to;
7090 int from_byte, to_byte;
7091
7092 CHECK_NUMBER_COERCE_MARKER (start);
7093 CHECK_NUMBER_COERCE_MARKER (end);
7094
7095 validate_region (&start, &end);
7096 from = XINT (start), to = XINT (end);
7097 from_byte = CHAR_TO_BYTE (from);
7098 to_byte = CHAR_TO_BYTE (to);
7099
7100 if (from < GPT && to >= GPT)
7101 move_gap_both (to, to_byte);
7102
7103 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7104 to - from, to_byte - from_byte,
7105 !NILP (highest),
7106 !NILP (current_buffer
7107 ->enable_multibyte_characters),
7108 Qnil);
7109 }
7110
7111 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7112 1, 2, 0,
7113 doc: /* Detect coding system of the text in STRING.
7114 Return a list of possible coding systems ordered by priority.
7115
7116 If only ASCII characters are found, it returns a list of single element
7117 `undecided' or its subsidiary coding system according to a detected
7118 end-of-line format.
7119
7120 If optional argument HIGHEST is non-nil, return the coding system of
7121 highest priority. */)
7122 (string, highest)
7123 Lisp_Object string, highest;
7124 {
7125 CHECK_STRING (string);
7126
7127 return detect_coding_system (SDATA (string),
7128 SCHARS (string), SBYTES (string),
7129 !NILP (highest), STRING_MULTIBYTE (string),
7130 Qnil);
7131 }
7132
7133
7134 static INLINE int
7135 char_encodable_p (c, attrs)
7136 int c;
7137 Lisp_Object attrs;
7138 {
7139 Lisp_Object tail;
7140 struct charset *charset;
7141 Lisp_Object translation_table;
7142
7143 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7144 if (! NILP (translation_table))
7145 c = translate_char (translation_table, c);
7146 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7147 CONSP (tail); tail = XCDR (tail))
7148 {
7149 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7150 if (CHAR_CHARSET_P (c, charset))
7151 break;
7152 }
7153 return (! NILP (tail));
7154 }
7155
7156
7157 /* Return a list of coding systems that safely encode the text between
7158 START and END. If EXCLUDE is non-nil, it is a list of coding
7159 systems not to check. The returned list doesn't contain any such
7160 coding systems. In any case, if the text contains only ASCII or is
7161 unibyte, return t. */
7162
7163 DEFUN ("find-coding-systems-region-internal",
7164 Ffind_coding_systems_region_internal,
7165 Sfind_coding_systems_region_internal, 2, 3, 0,
7166 doc: /* Internal use only. */)
7167 (start, end, exclude)
7168 Lisp_Object start, end, exclude;
7169 {
7170 Lisp_Object coding_attrs_list, safe_codings;
7171 EMACS_INT start_byte, end_byte;
7172 const unsigned char *p, *pbeg, *pend;
7173 int c;
7174 Lisp_Object tail, elt;
7175
7176 if (STRINGP (start))
7177 {
7178 if (!STRING_MULTIBYTE (start)
7179 || SCHARS (start) == SBYTES (start))
7180 return Qt;
7181 start_byte = 0;
7182 end_byte = SBYTES (start);
7183 }
7184 else
7185 {
7186 CHECK_NUMBER_COERCE_MARKER (start);
7187 CHECK_NUMBER_COERCE_MARKER (end);
7188 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7189 args_out_of_range (start, end);
7190 if (NILP (current_buffer->enable_multibyte_characters))
7191 return Qt;
7192 start_byte = CHAR_TO_BYTE (XINT (start));
7193 end_byte = CHAR_TO_BYTE (XINT (end));
7194 if (XINT (end) - XINT (start) == end_byte - start_byte)
7195 return Qt;
7196
7197 if (XINT (start) < GPT && XINT (end) > GPT)
7198 {
7199 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7200 move_gap_both (XINT (start), start_byte);
7201 else
7202 move_gap_both (XINT (end), end_byte);
7203 }
7204 }
7205
7206 coding_attrs_list = Qnil;
7207 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7208 if (NILP (exclude)
7209 || NILP (Fmemq (XCAR (tail), exclude)))
7210 {
7211 Lisp_Object attrs;
7212
7213 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7214 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7215 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7216 {
7217 ASET (attrs, coding_attr_trans_tbl,
7218 get_translation_table (attrs, 1));
7219 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7220 }
7221 }
7222
7223 if (STRINGP (start))
7224 p = pbeg = SDATA (start);
7225 else
7226 p = pbeg = BYTE_POS_ADDR (start_byte);
7227 pend = p + (end_byte - start_byte);
7228
7229 while (p < pend && ASCII_BYTE_P (*p)) p++;
7230 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7231
7232 while (p < pend)
7233 {
7234 if (ASCII_BYTE_P (*p))
7235 p++;
7236 else
7237 {
7238 c = STRING_CHAR_ADVANCE (p);
7239
7240 charset_map_loaded = 0;
7241 for (tail = coding_attrs_list; CONSP (tail);)
7242 {
7243 elt = XCAR (tail);
7244 if (NILP (elt))
7245 tail = XCDR (tail);
7246 else if (char_encodable_p (c, elt))
7247 tail = XCDR (tail);
7248 else if (CONSP (XCDR (tail)))
7249 {
7250 XSETCAR (tail, XCAR (XCDR (tail)));
7251 XSETCDR (tail, XCDR (XCDR (tail)));
7252 }
7253 else
7254 {
7255 XSETCAR (tail, Qnil);
7256 tail = XCDR (tail);
7257 }
7258 }
7259 if (charset_map_loaded)
7260 {
7261 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7262
7263 if (STRINGP (start))
7264 pbeg = SDATA (start);
7265 else
7266 pbeg = BYTE_POS_ADDR (start_byte);
7267 p = pbeg + p_offset;
7268 pend = pbeg + pend_offset;
7269 }
7270 }
7271 }
7272
7273 safe_codings = Qnil;
7274 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7275 if (! NILP (XCAR (tail)))
7276 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7277
7278 return safe_codings;
7279 }
7280
7281
7282 DEFUN ("unencodable-char-position", Funencodable_char_position,
7283 Sunencodable_char_position, 3, 5, 0,
7284 doc: /*
7285 Return position of first un-encodable character in a region.
7286 START and END specfiy the region and CODING-SYSTEM specifies the
7287 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7288
7289 If optional 4th argument COUNT is non-nil, it specifies at most how
7290 many un-encodable characters to search. In this case, the value is a
7291 list of positions.
7292
7293 If optional 5th argument STRING is non-nil, it is a string to search
7294 for un-encodable characters. In that case, START and END are indexes
7295 to the string. */)
7296 (start, end, coding_system, count, string)
7297 Lisp_Object start, end, coding_system, count, string;
7298 {
7299 int n;
7300 struct coding_system coding;
7301 Lisp_Object attrs, charset_list, translation_table;
7302 Lisp_Object positions;
7303 int from, to;
7304 const unsigned char *p, *stop, *pend;
7305 int ascii_compatible;
7306
7307 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7308 attrs = CODING_ID_ATTRS (coding.id);
7309 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7310 return Qnil;
7311 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7312 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7313 translation_table = get_translation_table (attrs, 1);
7314
7315 if (NILP (string))
7316 {
7317 validate_region (&start, &end);
7318 from = XINT (start);
7319 to = XINT (end);
7320 if (NILP (current_buffer->enable_multibyte_characters)
7321 || (ascii_compatible
7322 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7323 return Qnil;
7324 p = CHAR_POS_ADDR (from);
7325 pend = CHAR_POS_ADDR (to);
7326 if (from < GPT && to >= GPT)
7327 stop = GPT_ADDR;
7328 else
7329 stop = pend;
7330 }
7331 else
7332 {
7333 CHECK_STRING (string);
7334 CHECK_NATNUM (start);
7335 CHECK_NATNUM (end);
7336 from = XINT (start);
7337 to = XINT (end);
7338 if (from > to
7339 || to > SCHARS (string))
7340 args_out_of_range_3 (string, start, end);
7341 if (! STRING_MULTIBYTE (string))
7342 return Qnil;
7343 p = SDATA (string) + string_char_to_byte (string, from);
7344 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7345 if (ascii_compatible && (to - from) == (pend - p))
7346 return Qnil;
7347 }
7348
7349 if (NILP (count))
7350 n = 1;
7351 else
7352 {
7353 CHECK_NATNUM (count);
7354 n = XINT (count);
7355 }
7356
7357 positions = Qnil;
7358 while (1)
7359 {
7360 int c;
7361
7362 if (ascii_compatible)
7363 while (p < stop && ASCII_BYTE_P (*p))
7364 p++, from++;
7365 if (p >= stop)
7366 {
7367 if (p >= pend)
7368 break;
7369 stop = pend;
7370 p = GAP_END_ADDR;
7371 }
7372
7373 c = STRING_CHAR_ADVANCE (p);
7374 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7375 && ! char_charset (translate_char (translation_table, c),
7376 charset_list, NULL))
7377 {
7378 positions = Fcons (make_number (from), positions);
7379 n--;
7380 if (n == 0)
7381 break;
7382 }
7383
7384 from++;
7385 }
7386
7387 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7388 }
7389
7390
7391 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7392 Scheck_coding_systems_region, 3, 3, 0,
7393 doc: /* Check if the region is encodable by coding systems.
7394
7395 START and END are buffer positions specifying the region.
7396 CODING-SYSTEM-LIST is a list of coding systems to check.
7397
7398 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7399 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7400 whole region, POS0, POS1, ... are buffer positions where non-encodable
7401 characters are found.
7402
7403 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7404 value is nil.
7405
7406 START may be a string. In that case, check if the string is
7407 encodable, and the value contains indices to the string instead of
7408 buffer positions. END is ignored. */)
7409 (start, end, coding_system_list)
7410 Lisp_Object start, end, coding_system_list;
7411 {
7412 Lisp_Object list;
7413 EMACS_INT start_byte, end_byte;
7414 int pos;
7415 const unsigned char *p, *pbeg, *pend;
7416 int c;
7417 Lisp_Object tail, elt, attrs;
7418
7419 if (STRINGP (start))
7420 {
7421 if (!STRING_MULTIBYTE (start)
7422 && SCHARS (start) != SBYTES (start))
7423 return Qnil;
7424 start_byte = 0;
7425 end_byte = SBYTES (start);
7426 pos = 0;
7427 }
7428 else
7429 {
7430 CHECK_NUMBER_COERCE_MARKER (start);
7431 CHECK_NUMBER_COERCE_MARKER (end);
7432 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7433 args_out_of_range (start, end);
7434 if (NILP (current_buffer->enable_multibyte_characters))
7435 return Qnil;
7436 start_byte = CHAR_TO_BYTE (XINT (start));
7437 end_byte = CHAR_TO_BYTE (XINT (end));
7438 if (XINT (end) - XINT (start) == end_byte - start_byte)
7439 return Qt;
7440
7441 if (XINT (start) < GPT && XINT (end) > GPT)
7442 {
7443 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7444 move_gap_both (XINT (start), start_byte);
7445 else
7446 move_gap_both (XINT (end), end_byte);
7447 }
7448 pos = XINT (start);
7449 }
7450
7451 list = Qnil;
7452 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7453 {
7454 elt = XCAR (tail);
7455 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7456 ASET (attrs, coding_attr_trans_tbl, get_translation_table (attrs, 1));
7457 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7458 }
7459
7460 if (STRINGP (start))
7461 p = pbeg = SDATA (start);
7462 else
7463 p = pbeg = BYTE_POS_ADDR (start_byte);
7464 pend = p + (end_byte - start_byte);
7465
7466 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7467 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7468
7469 while (p < pend)
7470 {
7471 if (ASCII_BYTE_P (*p))
7472 p++;
7473 else
7474 {
7475 c = STRING_CHAR_ADVANCE (p);
7476
7477 charset_map_loaded = 0;
7478 for (tail = list; CONSP (tail); tail = XCDR (tail))
7479 {
7480 elt = XCDR (XCAR (tail));
7481 if (! char_encodable_p (c, XCAR (elt)))
7482 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7483 }
7484 if (charset_map_loaded)
7485 {
7486 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7487
7488 if (STRINGP (start))
7489 pbeg = SDATA (start);
7490 else
7491 pbeg = BYTE_POS_ADDR (start_byte);
7492 p = pbeg + p_offset;
7493 pend = pbeg + pend_offset;
7494 }
7495 }
7496 pos++;
7497 }
7498
7499 tail = list;
7500 list = Qnil;
7501 for (; CONSP (tail); tail = XCDR (tail))
7502 {
7503 elt = XCAR (tail);
7504 if (CONSP (XCDR (XCDR (elt))))
7505 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7506 list);
7507 }
7508
7509 return list;
7510 }
7511
7512
7513 Lisp_Object
7514 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7515 Lisp_Object start, end, coding_system, dst_object;
7516 int encodep, norecord;
7517 {
7518 struct coding_system coding;
7519 EMACS_INT from, from_byte, to, to_byte;
7520 Lisp_Object src_object;
7521
7522 CHECK_NUMBER_COERCE_MARKER (start);
7523 CHECK_NUMBER_COERCE_MARKER (end);
7524 if (NILP (coding_system))
7525 coding_system = Qno_conversion;
7526 else
7527 CHECK_CODING_SYSTEM (coding_system);
7528 src_object = Fcurrent_buffer ();
7529 if (NILP (dst_object))
7530 dst_object = src_object;
7531 else if (! EQ (dst_object, Qt))
7532 CHECK_BUFFER (dst_object);
7533
7534 validate_region (&start, &end);
7535 from = XFASTINT (start);
7536 from_byte = CHAR_TO_BYTE (from);
7537 to = XFASTINT (end);
7538 to_byte = CHAR_TO_BYTE (to);
7539
7540 setup_coding_system (coding_system, &coding);
7541 coding.mode |= CODING_MODE_LAST_BLOCK;
7542
7543 if (encodep)
7544 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7545 dst_object);
7546 else
7547 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7548 dst_object);
7549 if (! norecord)
7550 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7551
7552 return (BUFFERP (dst_object)
7553 ? make_number (coding.produced_char)
7554 : coding.dst_object);
7555 }
7556
7557
7558 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7559 3, 4, "r\nzCoding system: ",
7560 doc: /* Decode the current region from the specified coding system.
7561 When called from a program, takes four arguments:
7562 START, END, CODING-SYSTEM, and DESTINATION.
7563 START and END are buffer positions.
7564
7565 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7566 If nil, the region between START and END is replace by the decoded text.
7567 If buffer, the decoded text is inserted in the buffer.
7568 If t, the decoded text is returned.
7569
7570 This function sets `last-coding-system-used' to the precise coding system
7571 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7572 not fully specified.)
7573 It returns the length of the decoded text. */)
7574 (start, end, coding_system, destination)
7575 Lisp_Object start, end, coding_system, destination;
7576 {
7577 return code_convert_region (start, end, coding_system, destination, 0, 0);
7578 }
7579
7580 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7581 3, 4, "r\nzCoding system: ",
7582 doc: /* Encode the current region by specified coding system.
7583 When called from a program, takes three arguments:
7584 START, END, and CODING-SYSTEM. START and END are buffer positions.
7585
7586 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7587 If nil, the region between START and END is replace by the encoded text.
7588 If buffer, the encoded text is inserted in the buffer.
7589 If t, the encoded text is returned.
7590
7591 This function sets `last-coding-system-used' to the precise coding system
7592 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7593 not fully specified.)
7594 It returns the length of the encoded text. */)
7595 (start, end, coding_system, destination)
7596 Lisp_Object start, end, coding_system, destination;
7597 {
7598 return code_convert_region (start, end, coding_system, destination, 1, 0);
7599 }
7600
7601 Lisp_Object
7602 code_convert_string (string, coding_system, dst_object,
7603 encodep, nocopy, norecord)
7604 Lisp_Object string, coding_system, dst_object;
7605 int encodep, nocopy, norecord;
7606 {
7607 struct coding_system coding;
7608 EMACS_INT chars, bytes;
7609
7610 CHECK_STRING (string);
7611 if (NILP (coding_system))
7612 {
7613 if (! norecord)
7614 Vlast_coding_system_used = Qno_conversion;
7615 if (NILP (dst_object))
7616 return (nocopy ? Fcopy_sequence (string) : string);
7617 }
7618
7619 if (NILP (coding_system))
7620 coding_system = Qno_conversion;
7621 else
7622 CHECK_CODING_SYSTEM (coding_system);
7623 if (NILP (dst_object))
7624 dst_object = Qt;
7625 else if (! EQ (dst_object, Qt))
7626 CHECK_BUFFER (dst_object);
7627
7628 setup_coding_system (coding_system, &coding);
7629 coding.mode |= CODING_MODE_LAST_BLOCK;
7630 chars = SCHARS (string);
7631 bytes = SBYTES (string);
7632 if (encodep)
7633 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7634 else
7635 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7636 if (! norecord)
7637 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7638
7639 return (BUFFERP (dst_object)
7640 ? make_number (coding.produced_char)
7641 : coding.dst_object);
7642 }
7643
7644
7645 /* Encode or decode STRING according to CODING_SYSTEM.
7646 Do not set Vlast_coding_system_used.
7647
7648 This function is called only from macros DECODE_FILE and
7649 ENCODE_FILE, thus we ignore character composition. */
7650
7651 Lisp_Object
7652 code_convert_string_norecord (string, coding_system, encodep)
7653 Lisp_Object string, coding_system;
7654 int encodep;
7655 {
7656 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7657 }
7658
7659
7660 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7661 2, 4, 0,
7662 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7663
7664 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7665 if the decoding operation is trivial.
7666
7667 Optional fourth arg BUFFER non-nil meant that the decoded text is
7668 inserted in BUFFER instead of returned as a string. In this case,
7669 the return value is BUFFER.
7670
7671 This function sets `last-coding-system-used' to the precise coding system
7672 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7673 not fully specified. */)
7674 (string, coding_system, nocopy, buffer)
7675 Lisp_Object string, coding_system, nocopy, buffer;
7676 {
7677 return code_convert_string (string, coding_system, buffer,
7678 0, ! NILP (nocopy), 0);
7679 }
7680
7681 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7682 2, 4, 0,
7683 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7684
7685 Optional third arg NOCOPY non-nil means it is OK to return STRING
7686 itself if the encoding operation is trivial.
7687
7688 Optional fourth arg BUFFER non-nil meant that the encoded text is
7689 inserted in BUFFER instead of returned as a string. In this case,
7690 the return value is BUFFER.
7691
7692 This function sets `last-coding-system-used' to the precise coding system
7693 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7694 not fully specified.) */)
7695 (string, coding_system, nocopy, buffer)
7696 Lisp_Object string, coding_system, nocopy, buffer;
7697 {
7698 return code_convert_string (string, coding_system, buffer,
7699 1, ! NILP (nocopy), 1);
7700 }
7701
7702 \f
7703 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7704 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7705 Return the corresponding character. */)
7706 (code)
7707 Lisp_Object code;
7708 {
7709 Lisp_Object spec, attrs, val;
7710 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7711 int c;
7712
7713 CHECK_NATNUM (code);
7714 c = XFASTINT (code);
7715 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7716 attrs = AREF (spec, 0);
7717
7718 if (ASCII_BYTE_P (c)
7719 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7720 return code;
7721
7722 val = CODING_ATTR_CHARSET_LIST (attrs);
7723 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7724 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7725 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7726
7727 if (c <= 0x7F)
7728 charset = charset_roman;
7729 else if (c >= 0xA0 && c < 0xDF)
7730 {
7731 charset = charset_kana;
7732 c -= 0x80;
7733 }
7734 else
7735 {
7736 int s1 = c >> 8, s2 = c & 0xFF;
7737
7738 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7739 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7740 error ("Invalid code: %d", code);
7741 SJIS_TO_JIS (c);
7742 charset = charset_kanji;
7743 }
7744 c = DECODE_CHAR (charset, c);
7745 if (c < 0)
7746 error ("Invalid code: %d", code);
7747 return make_number (c);
7748 }
7749
7750
7751 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7752 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7753 Return the corresponding code in SJIS. */)
7754 (ch)
7755 Lisp_Object ch;
7756 {
7757 Lisp_Object spec, attrs, charset_list;
7758 int c;
7759 struct charset *charset;
7760 unsigned code;
7761
7762 CHECK_CHARACTER (ch);
7763 c = XFASTINT (ch);
7764 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7765 attrs = AREF (spec, 0);
7766
7767 if (ASCII_CHAR_P (c)
7768 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7769 return ch;
7770
7771 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7772 charset = char_charset (c, charset_list, &code);
7773 if (code == CHARSET_INVALID_CODE (charset))
7774 error ("Can't encode by shift_jis encoding: %d", c);
7775 JIS_TO_SJIS (code);
7776
7777 return make_number (code);
7778 }
7779
7780 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7781 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7782 Return the corresponding character. */)
7783 (code)
7784 Lisp_Object code;
7785 {
7786 Lisp_Object spec, attrs, val;
7787 struct charset *charset_roman, *charset_big5, *charset;
7788 int c;
7789
7790 CHECK_NATNUM (code);
7791 c = XFASTINT (code);
7792 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7793 attrs = AREF (spec, 0);
7794
7795 if (ASCII_BYTE_P (c)
7796 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7797 return code;
7798
7799 val = CODING_ATTR_CHARSET_LIST (attrs);
7800 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7801 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7802
7803 if (c <= 0x7F)
7804 charset = charset_roman;
7805 else
7806 {
7807 int b1 = c >> 8, b2 = c & 0x7F;
7808 if (b1 < 0xA1 || b1 > 0xFE
7809 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7810 error ("Invalid code: %d", code);
7811 charset = charset_big5;
7812 }
7813 c = DECODE_CHAR (charset, (unsigned )c);
7814 if (c < 0)
7815 error ("Invalid code: %d", code);
7816 return make_number (c);
7817 }
7818
7819 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7820 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7821 Return the corresponding character code in Big5. */)
7822 (ch)
7823 Lisp_Object ch;
7824 {
7825 Lisp_Object spec, attrs, charset_list;
7826 struct charset *charset;
7827 int c;
7828 unsigned code;
7829
7830 CHECK_CHARACTER (ch);
7831 c = XFASTINT (ch);
7832 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7833 attrs = AREF (spec, 0);
7834 if (ASCII_CHAR_P (c)
7835 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7836 return ch;
7837
7838 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7839 charset = char_charset (c, charset_list, &code);
7840 if (code == CHARSET_INVALID_CODE (charset))
7841 error ("Can't encode by Big5 encoding: %d", c);
7842
7843 return make_number (code);
7844 }
7845
7846 \f
7847 DEFUN ("set-terminal-coding-system-internal",
7848 Fset_terminal_coding_system_internal,
7849 Sset_terminal_coding_system_internal, 1, 1, 0,
7850 doc: /* Internal use only. */)
7851 (coding_system)
7852 Lisp_Object coding_system;
7853 {
7854 CHECK_SYMBOL (coding_system);
7855 setup_coding_system (Fcheck_coding_system (coding_system),
7856 &terminal_coding);
7857
7858 /* We had better not send unsafe characters to terminal. */
7859 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7860 /* Characer composition should be disabled. */
7861 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7862 terminal_coding.src_multibyte = 1;
7863 terminal_coding.dst_multibyte = 0;
7864 return Qnil;
7865 }
7866
7867 DEFUN ("set-safe-terminal-coding-system-internal",
7868 Fset_safe_terminal_coding_system_internal,
7869 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7870 doc: /* Internal use only. */)
7871 (coding_system)
7872 Lisp_Object coding_system;
7873 {
7874 CHECK_SYMBOL (coding_system);
7875 setup_coding_system (Fcheck_coding_system (coding_system),
7876 &safe_terminal_coding);
7877 /* Characer composition should be disabled. */
7878 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7879 safe_terminal_coding.src_multibyte = 1;
7880 safe_terminal_coding.dst_multibyte = 0;
7881 return Qnil;
7882 }
7883
7884 DEFUN ("terminal-coding-system",
7885 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7886 doc: /* Return coding system specified for terminal output. */)
7887 ()
7888 {
7889 return CODING_ID_NAME (terminal_coding.id);
7890 }
7891
7892 DEFUN ("set-keyboard-coding-system-internal",
7893 Fset_keyboard_coding_system_internal,
7894 Sset_keyboard_coding_system_internal, 1, 1, 0,
7895 doc: /* Internal use only. */)
7896 (coding_system)
7897 Lisp_Object coding_system;
7898 {
7899 CHECK_SYMBOL (coding_system);
7900 setup_coding_system (Fcheck_coding_system (coding_system),
7901 &keyboard_coding);
7902 /* Characer composition should be disabled. */
7903 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7904 return Qnil;
7905 }
7906
7907 DEFUN ("keyboard-coding-system",
7908 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7909 doc: /* Return coding system specified for decoding keyboard input. */)
7910 ()
7911 {
7912 return CODING_ID_NAME (keyboard_coding.id);
7913 }
7914
7915 \f
7916 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7917 Sfind_operation_coding_system, 1, MANY, 0,
7918 doc: /* Choose a coding system for an operation based on the target name.
7919 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7920 DECODING-SYSTEM is the coding system to use for decoding
7921 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7922 for encoding (in case OPERATION does encoding).
7923
7924 The first argument OPERATION specifies an I/O primitive:
7925 For file I/O, `insert-file-contents' or `write-region'.
7926 For process I/O, `call-process', `call-process-region', or `start-process'.
7927 For network I/O, `open-network-stream'.
7928
7929 The remaining arguments should be the same arguments that were passed
7930 to the primitive. Depending on which primitive, one of those arguments
7931 is selected as the TARGET. For example, if OPERATION does file I/O,
7932 whichever argument specifies the file name is TARGET.
7933
7934 TARGET has a meaning which depends on OPERATION:
7935 For file I/O, TARGET is a file name.
7936 For process I/O, TARGET is a process name.
7937 For network I/O, TARGET is a service name or a port number
7938
7939 This function looks up what specified for TARGET in,
7940 `file-coding-system-alist', `process-coding-system-alist',
7941 or `network-coding-system-alist' depending on OPERATION.
7942 They may specify a coding system, a cons of coding systems,
7943 or a function symbol to call.
7944 In the last case, we call the function with one argument,
7945 which is a list of all the arguments given to this function.
7946
7947 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7948 (nargs, args)
7949 int nargs;
7950 Lisp_Object *args;
7951 {
7952 Lisp_Object operation, target_idx, target, val;
7953 register Lisp_Object chain;
7954
7955 if (nargs < 2)
7956 error ("Too few arguments");
7957 operation = args[0];
7958 if (!SYMBOLP (operation)
7959 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7960 error ("Invalid first arguement");
7961 if (nargs < 1 + XINT (target_idx))
7962 error ("Too few arguments for operation: %s",
7963 SDATA (SYMBOL_NAME (operation)));
7964 target = args[XINT (target_idx) + 1];
7965 if (!(STRINGP (target)
7966 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7967 error ("Invalid %dth argument", XINT (target_idx) + 1);
7968
7969 chain = ((EQ (operation, Qinsert_file_contents)
7970 || EQ (operation, Qwrite_region))
7971 ? Vfile_coding_system_alist
7972 : (EQ (operation, Qopen_network_stream)
7973 ? Vnetwork_coding_system_alist
7974 : Vprocess_coding_system_alist));
7975 if (NILP (chain))
7976 return Qnil;
7977
7978 for (; CONSP (chain); chain = XCDR (chain))
7979 {
7980 Lisp_Object elt;
7981
7982 elt = XCAR (chain);
7983 if (CONSP (elt)
7984 && ((STRINGP (target)
7985 && STRINGP (XCAR (elt))
7986 && fast_string_match (XCAR (elt), target) >= 0)
7987 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7988 {
7989 val = XCDR (elt);
7990 /* Here, if VAL is both a valid coding system and a valid
7991 function symbol, we return VAL as a coding system. */
7992 if (CONSP (val))
7993 return val;
7994 if (! SYMBOLP (val))
7995 return Qnil;
7996 if (! NILP (Fcoding_system_p (val)))
7997 return Fcons (val, val);
7998 if (! NILP (Ffboundp (val)))
7999 {
8000 val = call1 (val, Flist (nargs, args));
8001 if (CONSP (val))
8002 return val;
8003 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8004 return Fcons (val, val);
8005 }
8006 return Qnil;
8007 }
8008 }
8009 return Qnil;
8010 }
8011
8012 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8013 Sset_coding_system_priority, 0, MANY, 0,
8014 doc: /* Assign higher priority to the coding systems given as arguments.
8015 If multiple coding systems belongs to the same category,
8016 all but the first one are ignored.
8017
8018 usage: (set-coding-system-priority ...) */)
8019 (nargs, args)
8020 int nargs;
8021 Lisp_Object *args;
8022 {
8023 int i, j;
8024 int changed[coding_category_max];
8025 enum coding_category priorities[coding_category_max];
8026
8027 bzero (changed, sizeof changed);
8028
8029 for (i = j = 0; i < nargs; i++)
8030 {
8031 enum coding_category category;
8032 Lisp_Object spec, attrs;
8033
8034 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8035 attrs = AREF (spec, 0);
8036 category = XINT (CODING_ATTR_CATEGORY (attrs));
8037 if (changed[category])
8038 /* Ignore this coding system because a coding system of the
8039 same category already had a higher priority. */
8040 continue;
8041 changed[category] = 1;
8042 priorities[j++] = category;
8043 if (coding_categories[category].id >= 0
8044 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8045 setup_coding_system (args[i], &coding_categories[category]);
8046 Fset (AREF (Vcoding_category_table, category), args[i]);
8047 }
8048
8049 /* Now we have decided top J priorities. Reflect the order of the
8050 original priorities to the remaining priorities. */
8051
8052 for (i = j, j = 0; i < coding_category_max; i++, j++)
8053 {
8054 while (j < coding_category_max
8055 && changed[coding_priorities[j]])
8056 j++;
8057 if (j == coding_category_max)
8058 abort ();
8059 priorities[i] = coding_priorities[j];
8060 }
8061
8062 bcopy (priorities, coding_priorities, sizeof priorities);
8063
8064 /* Update `coding-category-list'. */
8065 Vcoding_category_list = Qnil;
8066 for (i = coding_category_max - 1; i >= 0; i--)
8067 Vcoding_category_list
8068 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8069 Vcoding_category_list);
8070
8071 return Qnil;
8072 }
8073
8074 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8075 Scoding_system_priority_list, 0, 1, 0,
8076 doc: /* Return a list of coding systems ordered by their priorities.
8077 HIGHESTP non-nil means just return the highest priority one. */)
8078 (highestp)
8079 Lisp_Object highestp;
8080 {
8081 int i;
8082 Lisp_Object val;
8083
8084 for (i = 0, val = Qnil; i < coding_category_max; i++)
8085 {
8086 enum coding_category category = coding_priorities[i];
8087 int id = coding_categories[category].id;
8088 Lisp_Object attrs;
8089
8090 if (id < 0)
8091 continue;
8092 attrs = CODING_ID_ATTRS (id);
8093 if (! NILP (highestp))
8094 return CODING_ATTR_BASE_NAME (attrs);
8095 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8096 }
8097 return Fnreverse (val);
8098 }
8099
8100 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8101
8102 static Lisp_Object
8103 make_subsidiaries (base)
8104 Lisp_Object base;
8105 {
8106 Lisp_Object subsidiaries;
8107 int base_name_len = SBYTES (SYMBOL_NAME (base));
8108 char *buf = (char *) alloca (base_name_len + 6);
8109 int i;
8110
8111 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8112 subsidiaries = Fmake_vector (make_number (3), Qnil);
8113 for (i = 0; i < 3; i++)
8114 {
8115 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8116 ASET (subsidiaries, i, intern (buf));
8117 }
8118 return subsidiaries;
8119 }
8120
8121
8122 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8123 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8124 doc: /* For internal use only.
8125 usage: (define-coding-system-internal ...) */)
8126 (nargs, args)
8127 int nargs;
8128 Lisp_Object *args;
8129 {
8130 Lisp_Object name;
8131 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8132 Lisp_Object attrs; /* Vector of attributes. */
8133 Lisp_Object eol_type;
8134 Lisp_Object aliases;
8135 Lisp_Object coding_type, charset_list, safe_charsets;
8136 enum coding_category category;
8137 Lisp_Object tail, val;
8138 int max_charset_id = 0;
8139 int i;
8140
8141 if (nargs < coding_arg_max)
8142 goto short_args;
8143
8144 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8145
8146 name = args[coding_arg_name];
8147 CHECK_SYMBOL (name);
8148 CODING_ATTR_BASE_NAME (attrs) = name;
8149
8150 val = args[coding_arg_mnemonic];
8151 if (! STRINGP (val))
8152 CHECK_CHARACTER (val);
8153 CODING_ATTR_MNEMONIC (attrs) = val;
8154
8155 coding_type = args[coding_arg_coding_type];
8156 CHECK_SYMBOL (coding_type);
8157 CODING_ATTR_TYPE (attrs) = coding_type;
8158
8159 charset_list = args[coding_arg_charset_list];
8160 if (SYMBOLP (charset_list))
8161 {
8162 if (EQ (charset_list, Qiso_2022))
8163 {
8164 if (! EQ (coding_type, Qiso_2022))
8165 error ("Invalid charset-list");
8166 charset_list = Viso_2022_charset_list;
8167 }
8168 else if (EQ (charset_list, Qemacs_mule))
8169 {
8170 if (! EQ (coding_type, Qemacs_mule))
8171 error ("Invalid charset-list");
8172 charset_list = Vemacs_mule_charset_list;
8173 }
8174 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8175 if (max_charset_id < XFASTINT (XCAR (tail)))
8176 max_charset_id = XFASTINT (XCAR (tail));
8177 }
8178 else
8179 {
8180 charset_list = Fcopy_sequence (charset_list);
8181 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8182 {
8183 struct charset *charset;
8184
8185 val = Fcar (tail);
8186 CHECK_CHARSET_GET_CHARSET (val, charset);
8187 if (EQ (coding_type, Qiso_2022)
8188 ? CHARSET_ISO_FINAL (charset) < 0
8189 : EQ (coding_type, Qemacs_mule)
8190 ? CHARSET_EMACS_MULE_ID (charset) < 0
8191 : 0)
8192 error ("Can't handle charset `%s'",
8193 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8194
8195 XSETCAR (tail, make_number (charset->id));
8196 if (max_charset_id < charset->id)
8197 max_charset_id = charset->id;
8198 }
8199 }
8200 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8201
8202 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8203 make_number (255));
8204 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8205 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8206 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8207
8208 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8209
8210 val = args[coding_arg_decode_translation_table];
8211 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8212 CHECK_SYMBOL (val);
8213 CODING_ATTR_DECODE_TBL (attrs) = val;
8214
8215 val = args[coding_arg_encode_translation_table];
8216 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8217 CHECK_SYMBOL (val);
8218 CODING_ATTR_ENCODE_TBL (attrs) = val;
8219
8220 val = args[coding_arg_post_read_conversion];
8221 CHECK_SYMBOL (val);
8222 CODING_ATTR_POST_READ (attrs) = val;
8223
8224 val = args[coding_arg_pre_write_conversion];
8225 CHECK_SYMBOL (val);
8226 CODING_ATTR_PRE_WRITE (attrs) = val;
8227
8228 val = args[coding_arg_default_char];
8229 if (NILP (val))
8230 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8231 else
8232 {
8233 CHECK_CHARACTER (val);
8234 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8235 }
8236
8237 val = args[coding_arg_for_unibyte];
8238 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8239
8240 val = args[coding_arg_plist];
8241 CHECK_LIST (val);
8242 CODING_ATTR_PLIST (attrs) = val;
8243
8244 if (EQ (coding_type, Qcharset))
8245 {
8246 /* Generate a lisp vector of 256 elements. Each element is nil,
8247 integer, or a list of charset IDs.
8248
8249 If Nth element is nil, the byte code N is invalid in this
8250 coding system.
8251
8252 If Nth element is a number NUM, N is the first byte of a
8253 charset whose ID is NUM.
8254
8255 If Nth element is a list of charset IDs, N is the first byte
8256 of one of them. The list is sorted by dimensions of the
8257 charsets. A charset of smaller dimension comes firtst. */
8258 val = Fmake_vector (make_number (256), Qnil);
8259
8260 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8261 {
8262 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8263 int dim = CHARSET_DIMENSION (charset);
8264 int idx = (dim - 1) * 4;
8265
8266 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8267 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8268
8269 for (i = charset->code_space[idx];
8270 i <= charset->code_space[idx + 1]; i++)
8271 {
8272 Lisp_Object tmp, tmp2;
8273 int dim2;
8274
8275 tmp = AREF (val, i);
8276 if (NILP (tmp))
8277 tmp = XCAR (tail);
8278 else if (NUMBERP (tmp))
8279 {
8280 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8281 if (dim < dim2)
8282 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8283 else
8284 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8285 }
8286 else
8287 {
8288 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8289 {
8290 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8291 if (dim < dim2)
8292 break;
8293 }
8294 if (NILP (tmp2))
8295 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8296 else
8297 {
8298 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8299 XSETCAR (tmp2, XCAR (tail));
8300 }
8301 }
8302 ASET (val, i, tmp);
8303 }
8304 }
8305 ASET (attrs, coding_attr_charset_valids, val);
8306 category = coding_category_charset;
8307 }
8308 else if (EQ (coding_type, Qccl))
8309 {
8310 Lisp_Object valids;
8311
8312 if (nargs < coding_arg_ccl_max)
8313 goto short_args;
8314
8315 val = args[coding_arg_ccl_decoder];
8316 CHECK_CCL_PROGRAM (val);
8317 if (VECTORP (val))
8318 val = Fcopy_sequence (val);
8319 ASET (attrs, coding_attr_ccl_decoder, val);
8320
8321 val = args[coding_arg_ccl_encoder];
8322 CHECK_CCL_PROGRAM (val);
8323 if (VECTORP (val))
8324 val = Fcopy_sequence (val);
8325 ASET (attrs, coding_attr_ccl_encoder, val);
8326
8327 val = args[coding_arg_ccl_valids];
8328 valids = Fmake_string (make_number (256), make_number (0));
8329 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8330 {
8331 int from, to;
8332
8333 val = Fcar (tail);
8334 if (INTEGERP (val))
8335 {
8336 from = to = XINT (val);
8337 if (from < 0 || from > 255)
8338 args_out_of_range_3 (val, make_number (0), make_number (255));
8339 }
8340 else
8341 {
8342 CHECK_CONS (val);
8343 CHECK_NATNUM_CAR (val);
8344 CHECK_NATNUM_CDR (val);
8345 from = XINT (XCAR (val));
8346 if (from > 255)
8347 args_out_of_range_3 (XCAR (val),
8348 make_number (0), make_number (255));
8349 to = XINT (XCDR (val));
8350 if (to < from || to > 255)
8351 args_out_of_range_3 (XCDR (val),
8352 XCAR (val), make_number (255));
8353 }
8354 for (i = from; i <= to; i++)
8355 SSET (valids, i, 1);
8356 }
8357 ASET (attrs, coding_attr_ccl_valids, valids);
8358
8359 category = coding_category_ccl;
8360 }
8361 else if (EQ (coding_type, Qutf_16))
8362 {
8363 Lisp_Object bom, endian;
8364
8365 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8366
8367 if (nargs < coding_arg_utf16_max)
8368 goto short_args;
8369
8370 bom = args[coding_arg_utf16_bom];
8371 if (! NILP (bom) && ! EQ (bom, Qt))
8372 {
8373 CHECK_CONS (bom);
8374 val = XCAR (bom);
8375 CHECK_CODING_SYSTEM (val);
8376 val = XCDR (bom);
8377 CHECK_CODING_SYSTEM (val);
8378 }
8379 ASET (attrs, coding_attr_utf_16_bom, bom);
8380
8381 endian = args[coding_arg_utf16_endian];
8382 CHECK_SYMBOL (endian);
8383 if (NILP (endian))
8384 endian = Qbig;
8385 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8386 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8387 ASET (attrs, coding_attr_utf_16_endian, endian);
8388
8389 category = (CONSP (bom)
8390 ? coding_category_utf_16_auto
8391 : NILP (bom)
8392 ? (EQ (endian, Qbig)
8393 ? coding_category_utf_16_be_nosig
8394 : coding_category_utf_16_le_nosig)
8395 : (EQ (endian, Qbig)
8396 ? coding_category_utf_16_be
8397 : coding_category_utf_16_le));
8398 }
8399 else if (EQ (coding_type, Qiso_2022))
8400 {
8401 Lisp_Object initial, reg_usage, request, flags;
8402 int i;
8403
8404 if (nargs < coding_arg_iso2022_max)
8405 goto short_args;
8406
8407 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8408 CHECK_VECTOR (initial);
8409 for (i = 0; i < 4; i++)
8410 {
8411 val = Faref (initial, make_number (i));
8412 if (! NILP (val))
8413 {
8414 struct charset *charset;
8415
8416 CHECK_CHARSET_GET_CHARSET (val, charset);
8417 ASET (initial, i, make_number (CHARSET_ID (charset)));
8418 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8419 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8420 }
8421 else
8422 ASET (initial, i, make_number (-1));
8423 }
8424
8425 reg_usage = args[coding_arg_iso2022_reg_usage];
8426 CHECK_CONS (reg_usage);
8427 CHECK_NUMBER_CAR (reg_usage);
8428 CHECK_NUMBER_CDR (reg_usage);
8429
8430 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8431 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8432 {
8433 int id;
8434 Lisp_Object tmp;
8435
8436 val = Fcar (tail);
8437 CHECK_CONS (val);
8438 tmp = XCAR (val);
8439 CHECK_CHARSET_GET_ID (tmp, id);
8440 CHECK_NATNUM_CDR (val);
8441 if (XINT (XCDR (val)) >= 4)
8442 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8443 XSETCAR (val, make_number (id));
8444 }
8445
8446 flags = args[coding_arg_iso2022_flags];
8447 CHECK_NATNUM (flags);
8448 i = XINT (flags);
8449 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8450 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8451
8452 ASET (attrs, coding_attr_iso_initial, initial);
8453 ASET (attrs, coding_attr_iso_usage, reg_usage);
8454 ASET (attrs, coding_attr_iso_request, request);
8455 ASET (attrs, coding_attr_iso_flags, flags);
8456 setup_iso_safe_charsets (attrs);
8457
8458 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8459 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8460 | CODING_ISO_FLAG_SINGLE_SHIFT))
8461 ? coding_category_iso_7_else
8462 : EQ (args[coding_arg_charset_list], Qiso_2022)
8463 ? coding_category_iso_7
8464 : coding_category_iso_7_tight);
8465 else
8466 {
8467 int id = XINT (AREF (initial, 1));
8468
8469 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8470 || EQ (args[coding_arg_charset_list], Qiso_2022)
8471 || id < 0)
8472 ? coding_category_iso_8_else
8473 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8474 ? coding_category_iso_8_1
8475 : coding_category_iso_8_2);
8476 }
8477 if (category != coding_category_iso_8_1
8478 && category != coding_category_iso_8_2)
8479 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8480 }
8481 else if (EQ (coding_type, Qemacs_mule))
8482 {
8483 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8484 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8485 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8486 category = coding_category_emacs_mule;
8487 }
8488 else if (EQ (coding_type, Qshift_jis))
8489 {
8490
8491 struct charset *charset;
8492
8493 if (XINT (Flength (charset_list)) != 3
8494 && XINT (Flength (charset_list)) != 4)
8495 error ("There should be three or four charsets");
8496
8497 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8498 if (CHARSET_DIMENSION (charset) != 1)
8499 error ("Dimension of charset %s is not one",
8500 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8501 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8502 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8503
8504 charset_list = XCDR (charset_list);
8505 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8506 if (CHARSET_DIMENSION (charset) != 1)
8507 error ("Dimension of charset %s is not one",
8508 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8509
8510 charset_list = XCDR (charset_list);
8511 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8512 if (CHARSET_DIMENSION (charset) != 2)
8513 error ("Dimension of charset %s is not two",
8514 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8515
8516 charset_list = XCDR (charset_list);
8517 if (! NILP (charset_list))
8518 {
8519 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8520 if (CHARSET_DIMENSION (charset) != 2)
8521 error ("Dimension of charset %s is not two",
8522 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8523 }
8524
8525 category = coding_category_sjis;
8526 Vsjis_coding_system = name;
8527 }
8528 else if (EQ (coding_type, Qbig5))
8529 {
8530 struct charset *charset;
8531
8532 if (XINT (Flength (charset_list)) != 2)
8533 error ("There should be just two charsets");
8534
8535 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8536 if (CHARSET_DIMENSION (charset) != 1)
8537 error ("Dimension of charset %s is not one",
8538 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8539 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8540 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8541
8542 charset_list = XCDR (charset_list);
8543 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8544 if (CHARSET_DIMENSION (charset) != 2)
8545 error ("Dimension of charset %s is not two",
8546 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8547
8548 category = coding_category_big5;
8549 Vbig5_coding_system = name;
8550 }
8551 else if (EQ (coding_type, Qraw_text))
8552 {
8553 category = coding_category_raw_text;
8554 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8555 }
8556 else if (EQ (coding_type, Qutf_8))
8557 {
8558 category = coding_category_utf_8;
8559 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8560 }
8561 else if (EQ (coding_type, Qundecided))
8562 category = coding_category_undecided;
8563 else
8564 error ("Invalid coding system type: %s",
8565 SDATA (SYMBOL_NAME (coding_type)));
8566
8567 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8568 CODING_ATTR_PLIST (attrs)
8569 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8570 CODING_ATTR_PLIST (attrs)));
8571
8572 eol_type = args[coding_arg_eol_type];
8573 if (! NILP (eol_type)
8574 && ! EQ (eol_type, Qunix)
8575 && ! EQ (eol_type, Qdos)
8576 && ! EQ (eol_type, Qmac))
8577 error ("Invalid eol-type");
8578
8579 aliases = Fcons (name, Qnil);
8580
8581 if (NILP (eol_type))
8582 {
8583 eol_type = make_subsidiaries (name);
8584 for (i = 0; i < 3; i++)
8585 {
8586 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8587
8588 this_name = AREF (eol_type, i);
8589 this_aliases = Fcons (this_name, Qnil);
8590 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8591 this_spec = Fmake_vector (make_number (3), attrs);
8592 ASET (this_spec, 1, this_aliases);
8593 ASET (this_spec, 2, this_eol_type);
8594 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8595 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8596 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8597 Vcoding_system_alist);
8598 }
8599 }
8600
8601 spec_vec = Fmake_vector (make_number (3), attrs);
8602 ASET (spec_vec, 1, aliases);
8603 ASET (spec_vec, 2, eol_type);
8604
8605 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8606 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8607 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8608 Vcoding_system_alist);
8609
8610 {
8611 int id = coding_categories[category].id;
8612
8613 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8614 setup_coding_system (name, &coding_categories[category]);
8615 }
8616
8617 return Qnil;
8618
8619 short_args:
8620 return Fsignal (Qwrong_number_of_arguments,
8621 Fcons (intern ("define-coding-system-internal"),
8622 make_number (nargs)));
8623 }
8624
8625
8626 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8627 3, 3, 0,
8628 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
8629 (coding_system, prop, val)
8630 Lisp_Object coding_system, prop, val;
8631 {
8632 Lisp_Object spec, attrs, plist;
8633
8634 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8635 attrs = AREF (spec, 0);
8636 if (EQ (prop, QCmnemonic))
8637 {
8638 if (! STRINGP (val))
8639 CHECK_CHARACTER (val);
8640 CODING_ATTR_MNEMONIC (attrs) = val;
8641 }
8642 else if (EQ (prop, QCdefalut_char))
8643 {
8644 if (NILP (val))
8645 val = make_number (' ');
8646 else
8647 CHECK_CHARACTER (val);
8648 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8649 }
8650 else if (EQ (prop, QCdecode_translation_table))
8651 {
8652 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8653 CHECK_SYMBOL (val);
8654 CODING_ATTR_DECODE_TBL (attrs) = val;
8655 }
8656 else if (EQ (prop, QCencode_translation_table))
8657 {
8658 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8659 CHECK_SYMBOL (val);
8660 CODING_ATTR_ENCODE_TBL (attrs) = val;
8661 }
8662 else if (EQ (prop, QCpost_read_conversion))
8663 {
8664 CHECK_SYMBOL (val);
8665 CODING_ATTR_POST_READ (attrs) = val;
8666 }
8667 else if (EQ (prop, QCpre_write_conversion))
8668 {
8669 CHECK_SYMBOL (val);
8670 CODING_ATTR_PRE_WRITE (attrs) = val;
8671 }
8672
8673 CODING_ATTR_PLIST (attrs)
8674 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8675 return val;
8676 }
8677
8678
8679 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8680 Sdefine_coding_system_alias, 2, 2, 0,
8681 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8682 (alias, coding_system)
8683 Lisp_Object alias, coding_system;
8684 {
8685 Lisp_Object spec, aliases, eol_type;
8686
8687 CHECK_SYMBOL (alias);
8688 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8689 aliases = AREF (spec, 1);
8690 /* ALISES should be a list of length more than zero, and the first
8691 element is a base coding system. Append ALIAS at the tail of the
8692 list. */
8693 while (!NILP (XCDR (aliases)))
8694 aliases = XCDR (aliases);
8695 XSETCDR (aliases, Fcons (alias, Qnil));
8696
8697 eol_type = AREF (spec, 2);
8698 if (VECTORP (eol_type))
8699 {
8700 Lisp_Object subsidiaries;
8701 int i;
8702
8703 subsidiaries = make_subsidiaries (alias);
8704 for (i = 0; i < 3; i++)
8705 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8706 AREF (eol_type, i));
8707 }
8708
8709 Fputhash (alias, spec, Vcoding_system_hash_table);
8710 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8711 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8712 Vcoding_system_alist);
8713
8714 return Qnil;
8715 }
8716
8717 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8718 1, 1, 0,
8719 doc: /* Return the base of CODING-SYSTEM.
8720 Any alias or subsidiary coding system is not a base coding system. */)
8721 (coding_system)
8722 Lisp_Object coding_system;
8723 {
8724 Lisp_Object spec, attrs;
8725
8726 if (NILP (coding_system))
8727 return (Qno_conversion);
8728 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8729 attrs = AREF (spec, 0);
8730 return CODING_ATTR_BASE_NAME (attrs);
8731 }
8732
8733 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8734 1, 1, 0,
8735 doc: "Return the property list of CODING-SYSTEM.")
8736 (coding_system)
8737 Lisp_Object coding_system;
8738 {
8739 Lisp_Object spec, attrs;
8740
8741 if (NILP (coding_system))
8742 coding_system = Qno_conversion;
8743 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8744 attrs = AREF (spec, 0);
8745 return CODING_ATTR_PLIST (attrs);
8746 }
8747
8748
8749 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8750 1, 1, 0,
8751 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8752 (coding_system)
8753 Lisp_Object coding_system;
8754 {
8755 Lisp_Object spec;
8756
8757 if (NILP (coding_system))
8758 coding_system = Qno_conversion;
8759 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8760 return AREF (spec, 1);
8761 }
8762
8763 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8764 Scoding_system_eol_type, 1, 1, 0,
8765 doc: /* Return eol-type of CODING-SYSTEM.
8766 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8767
8768 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8769 and CR respectively.
8770
8771 A vector value indicates that a format of end-of-line should be
8772 detected automatically. Nth element of the vector is the subsidiary
8773 coding system whose eol-type is N. */)
8774 (coding_system)
8775 Lisp_Object coding_system;
8776 {
8777 Lisp_Object spec, eol_type;
8778 int n;
8779
8780 if (NILP (coding_system))
8781 coding_system = Qno_conversion;
8782 if (! CODING_SYSTEM_P (coding_system))
8783 return Qnil;
8784 spec = CODING_SYSTEM_SPEC (coding_system);
8785 eol_type = AREF (spec, 2);
8786 if (VECTORP (eol_type))
8787 return Fcopy_sequence (eol_type);
8788 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8789 return make_number (n);
8790 }
8791
8792 #endif /* emacs */
8793
8794 \f
8795 /*** 9. Post-amble ***/
8796
8797 void
8798 init_coding_once ()
8799 {
8800 int i;
8801
8802 for (i = 0; i < coding_category_max; i++)
8803 {
8804 coding_categories[i].id = -1;
8805 coding_priorities[i] = i;
8806 }
8807
8808 /* ISO2022 specific initialize routine. */
8809 for (i = 0; i < 0x20; i++)
8810 iso_code_class[i] = ISO_control_0;
8811 for (i = 0x21; i < 0x7F; i++)
8812 iso_code_class[i] = ISO_graphic_plane_0;
8813 for (i = 0x80; i < 0xA0; i++)
8814 iso_code_class[i] = ISO_control_1;
8815 for (i = 0xA1; i < 0xFF; i++)
8816 iso_code_class[i] = ISO_graphic_plane_1;
8817 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8818 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8819 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8820 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8821 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8822 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8823 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8824 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8825 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8826
8827 for (i = 0; i < 256; i++)
8828 {
8829 emacs_mule_bytes[i] = 1;
8830 }
8831 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8832 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8833 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8834 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8835 }
8836
8837 #ifdef emacs
8838
8839 void
8840 syms_of_coding ()
8841 {
8842 staticpro (&Vcoding_system_hash_table);
8843 {
8844 Lisp_Object args[2];
8845 args[0] = QCtest;
8846 args[1] = Qeq;
8847 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8848 }
8849
8850 staticpro (&Vsjis_coding_system);
8851 Vsjis_coding_system = Qnil;
8852
8853 staticpro (&Vbig5_coding_system);
8854 Vbig5_coding_system = Qnil;
8855
8856 staticpro (&Vcode_conversion_reused_workbuf);
8857 Vcode_conversion_reused_workbuf = Qnil;
8858
8859 staticpro (&Vcode_conversion_workbuf_name);
8860 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
8861
8862 reused_workbuf_in_use = 0;
8863
8864 DEFSYM (Qcharset, "charset");
8865 DEFSYM (Qtarget_idx, "target-idx");
8866 DEFSYM (Qcoding_system_history, "coding-system-history");
8867 Fset (Qcoding_system_history, Qnil);
8868
8869 /* Target FILENAME is the first argument. */
8870 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8871 /* Target FILENAME is the third argument. */
8872 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8873
8874 DEFSYM (Qcall_process, "call-process");
8875 /* Target PROGRAM is the first argument. */
8876 Fput (Qcall_process, Qtarget_idx, make_number (0));
8877
8878 DEFSYM (Qcall_process_region, "call-process-region");
8879 /* Target PROGRAM is the third argument. */
8880 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8881
8882 DEFSYM (Qstart_process, "start-process");
8883 /* Target PROGRAM is the third argument. */
8884 Fput (Qstart_process, Qtarget_idx, make_number (2));
8885
8886 DEFSYM (Qopen_network_stream, "open-network-stream");
8887 /* Target SERVICE is the fourth argument. */
8888 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8889
8890 DEFSYM (Qcoding_system, "coding-system");
8891 DEFSYM (Qcoding_aliases, "coding-aliases");
8892
8893 DEFSYM (Qeol_type, "eol-type");
8894 DEFSYM (Qunix, "unix");
8895 DEFSYM (Qdos, "dos");
8896
8897 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8898 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8899 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8900 DEFSYM (Qdefault_char, "default-char");
8901 DEFSYM (Qundecided, "undecided");
8902 DEFSYM (Qno_conversion, "no-conversion");
8903 DEFSYM (Qraw_text, "raw-text");
8904
8905 DEFSYM (Qiso_2022, "iso-2022");
8906
8907 DEFSYM (Qutf_8, "utf-8");
8908 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8909
8910 DEFSYM (Qutf_16, "utf-16");
8911 DEFSYM (Qbig, "big");
8912 DEFSYM (Qlittle, "little");
8913
8914 DEFSYM (Qshift_jis, "shift-jis");
8915 DEFSYM (Qbig5, "big5");
8916
8917 DEFSYM (Qcoding_system_p, "coding-system-p");
8918
8919 DEFSYM (Qcoding_system_error, "coding-system-error");
8920 Fput (Qcoding_system_error, Qerror_conditions,
8921 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8922 Fput (Qcoding_system_error, Qerror_message,
8923 build_string ("Invalid coding system"));
8924
8925 /* Intern this now in case it isn't already done.
8926 Setting this variable twice is harmless.
8927 But don't staticpro it here--that is done in alloc.c. */
8928 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8929
8930 DEFSYM (Qtranslation_table, "translation-table");
8931 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8932 DEFSYM (Qtranslation_table_id, "translation-table-id");
8933 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8934 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8935
8936 DEFSYM (Qvalid_codes, "valid-codes");
8937
8938 DEFSYM (Qemacs_mule, "emacs-mule");
8939
8940 DEFSYM (QCcategory, ":category");
8941 DEFSYM (QCmnemonic, ":mnemonic");
8942 DEFSYM (QCdefalut_char, ":default-char");
8943 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
8944 DEFSYM (QCencode_translation_table, ":encode-translation-table");
8945 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
8946 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
8947
8948 Vcoding_category_table
8949 = Fmake_vector (make_number (coding_category_max), Qnil);
8950 staticpro (&Vcoding_category_table);
8951 /* Followings are target of code detection. */
8952 ASET (Vcoding_category_table, coding_category_iso_7,
8953 intern ("coding-category-iso-7"));
8954 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8955 intern ("coding-category-iso-7-tight"));
8956 ASET (Vcoding_category_table, coding_category_iso_8_1,
8957 intern ("coding-category-iso-8-1"));
8958 ASET (Vcoding_category_table, coding_category_iso_8_2,
8959 intern ("coding-category-iso-8-2"));
8960 ASET (Vcoding_category_table, coding_category_iso_7_else,
8961 intern ("coding-category-iso-7-else"));
8962 ASET (Vcoding_category_table, coding_category_iso_8_else,
8963 intern ("coding-category-iso-8-else"));
8964 ASET (Vcoding_category_table, coding_category_utf_8,
8965 intern ("coding-category-utf-8"));
8966 ASET (Vcoding_category_table, coding_category_utf_16_be,
8967 intern ("coding-category-utf-16-be"));
8968 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8969 intern ("coding-category-utf-16-auto"));
8970 ASET (Vcoding_category_table, coding_category_utf_16_le,
8971 intern ("coding-category-utf-16-le"));
8972 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8973 intern ("coding-category-utf-16-be-nosig"));
8974 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8975 intern ("coding-category-utf-16-le-nosig"));
8976 ASET (Vcoding_category_table, coding_category_charset,
8977 intern ("coding-category-charset"));
8978 ASET (Vcoding_category_table, coding_category_sjis,
8979 intern ("coding-category-sjis"));
8980 ASET (Vcoding_category_table, coding_category_big5,
8981 intern ("coding-category-big5"));
8982 ASET (Vcoding_category_table, coding_category_ccl,
8983 intern ("coding-category-ccl"));
8984 ASET (Vcoding_category_table, coding_category_emacs_mule,
8985 intern ("coding-category-emacs-mule"));
8986 /* Followings are NOT target of code detection. */
8987 ASET (Vcoding_category_table, coding_category_raw_text,
8988 intern ("coding-category-raw-text"));
8989 ASET (Vcoding_category_table, coding_category_undecided,
8990 intern ("coding-category-undecided"));
8991
8992 DEFSYM (Qinsufficient_source, "insufficient-source");
8993 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
8994 DEFSYM (Qinvalid_source, "invalid-source");
8995 DEFSYM (Qinterrupted, "interrupted");
8996 DEFSYM (Qinsufficient_memory, "insufficient-memory");
8997
8998 defsubr (&Scoding_system_p);
8999 defsubr (&Sread_coding_system);
9000 defsubr (&Sread_non_nil_coding_system);
9001 defsubr (&Scheck_coding_system);
9002 defsubr (&Sdetect_coding_region);
9003 defsubr (&Sdetect_coding_string);
9004 defsubr (&Sfind_coding_systems_region_internal);
9005 defsubr (&Sunencodable_char_position);
9006 defsubr (&Scheck_coding_systems_region);
9007 defsubr (&Sdecode_coding_region);
9008 defsubr (&Sencode_coding_region);
9009 defsubr (&Sdecode_coding_string);
9010 defsubr (&Sencode_coding_string);
9011 defsubr (&Sdecode_sjis_char);
9012 defsubr (&Sencode_sjis_char);
9013 defsubr (&Sdecode_big5_char);
9014 defsubr (&Sencode_big5_char);
9015 defsubr (&Sset_terminal_coding_system_internal);
9016 defsubr (&Sset_safe_terminal_coding_system_internal);
9017 defsubr (&Sterminal_coding_system);
9018 defsubr (&Sset_keyboard_coding_system_internal);
9019 defsubr (&Skeyboard_coding_system);
9020 defsubr (&Sfind_operation_coding_system);
9021 defsubr (&Sset_coding_system_priority);
9022 defsubr (&Sdefine_coding_system_internal);
9023 defsubr (&Sdefine_coding_system_alias);
9024 defsubr (&Scoding_system_put);
9025 defsubr (&Scoding_system_base);
9026 defsubr (&Scoding_system_plist);
9027 defsubr (&Scoding_system_aliases);
9028 defsubr (&Scoding_system_eol_type);
9029 defsubr (&Scoding_system_priority_list);
9030
9031 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9032 doc: /* List of coding systems.
9033
9034 Do not alter the value of this variable manually. This variable should be
9035 updated by the functions `define-coding-system' and
9036 `define-coding-system-alias'. */);
9037 Vcoding_system_list = Qnil;
9038
9039 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9040 doc: /* Alist of coding system names.
9041 Each element is one element list of coding system name.
9042 This variable is given to `completing-read' as TABLE argument.
9043
9044 Do not alter the value of this variable manually. This variable should be
9045 updated by the functions `make-coding-system' and
9046 `define-coding-system-alias'. */);
9047 Vcoding_system_alist = Qnil;
9048
9049 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9050 doc: /* List of coding-categories (symbols) ordered by priority.
9051
9052 On detecting a coding system, Emacs tries code detection algorithms
9053 associated with each coding-category one by one in this order. When
9054 one algorithm agrees with a byte sequence of source text, the coding
9055 system bound to the corresponding coding-category is selected. */);
9056 {
9057 int i;
9058
9059 Vcoding_category_list = Qnil;
9060 for (i = coding_category_max - 1; i >= 0; i--)
9061 Vcoding_category_list
9062 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9063 Vcoding_category_list);
9064 }
9065
9066 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9067 doc: /* Specify the coding system for read operations.
9068 It is useful to bind this variable with `let', but do not set it globally.
9069 If the value is a coding system, it is used for decoding on read operation.
9070 If not, an appropriate element is used from one of the coding system alists:
9071 There are three such tables, `file-coding-system-alist',
9072 `process-coding-system-alist', and `network-coding-system-alist'. */);
9073 Vcoding_system_for_read = Qnil;
9074
9075 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9076 doc: /* Specify the coding system for write operations.
9077 Programs bind this variable with `let', but you should not set it globally.
9078 If the value is a coding system, it is used for encoding of output,
9079 when writing it to a file and when sending it to a file or subprocess.
9080
9081 If this does not specify a coding system, an appropriate element
9082 is used from one of the coding system alists:
9083 There are three such tables, `file-coding-system-alist',
9084 `process-coding-system-alist', and `network-coding-system-alist'.
9085 For output to files, if the above procedure does not specify a coding system,
9086 the value of `buffer-file-coding-system' is used. */);
9087 Vcoding_system_for_write = Qnil;
9088
9089 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9090 doc: /*
9091 Coding system used in the latest file or process I/O. */);
9092 Vlast_coding_system_used = Qnil;
9093
9094 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9095 doc: /*
9096 Error status of the last code conversion.
9097
9098 When an error was detected in the last code conversion, this variable
9099 is set to one of the following symbols.
9100 `insufficient-source'
9101 `inconsistent-eol'
9102 `invalid-source'
9103 `interrupted'
9104 `insufficient-memory'
9105 When no error was detected, the value doesn't change. So, to check
9106 the error status of a code conversion by this variable, you must
9107 explicitly set this variable to nil before performing code
9108 conversion. */);
9109 Vlast_code_conversion_error = Qnil;
9110
9111 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9112 doc: /*
9113 *Non-nil means always inhibit code conversion of end-of-line format.
9114 See info node `Coding Systems' and info node `Text and Binary' concerning
9115 such conversion. */);
9116 inhibit_eol_conversion = 0;
9117
9118 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9119 doc: /*
9120 Non-nil means process buffer inherits coding system of process output.
9121 Bind it to t if the process output is to be treated as if it were a file
9122 read from some filesystem. */);
9123 inherit_process_coding_system = 0;
9124
9125 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9126 doc: /*
9127 Alist to decide a coding system to use for a file I/O operation.
9128 The format is ((PATTERN . VAL) ...),
9129 where PATTERN is a regular expression matching a file name,
9130 VAL is a coding system, a cons of coding systems, or a function symbol.
9131 If VAL is a coding system, it is used for both decoding and encoding
9132 the file contents.
9133 If VAL is a cons of coding systems, the car part is used for decoding,
9134 and the cdr part is used for encoding.
9135 If VAL is a function symbol, the function must return a coding system
9136 or a cons of coding systems which are used as above. The function gets
9137 the arguments with which `find-operation-coding-systems' was called.
9138
9139 See also the function `find-operation-coding-system'
9140 and the variable `auto-coding-alist'. */);
9141 Vfile_coding_system_alist = Qnil;
9142
9143 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9144 doc: /*
9145 Alist to decide a coding system to use for a process I/O operation.
9146 The format is ((PATTERN . VAL) ...),
9147 where PATTERN is a regular expression matching a program name,
9148 VAL is a coding system, a cons of coding systems, or a function symbol.
9149 If VAL is a coding system, it is used for both decoding what received
9150 from the program and encoding what sent to the program.
9151 If VAL is a cons of coding systems, the car part is used for decoding,
9152 and the cdr part is used for encoding.
9153 If VAL is a function symbol, the function must return a coding system
9154 or a cons of coding systems which are used as above.
9155
9156 See also the function `find-operation-coding-system'. */);
9157 Vprocess_coding_system_alist = Qnil;
9158
9159 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9160 doc: /*
9161 Alist to decide a coding system to use for a network I/O operation.
9162 The format is ((PATTERN . VAL) ...),
9163 where PATTERN is a regular expression matching a network service name
9164 or is a port number to connect to,
9165 VAL is a coding system, a cons of coding systems, or a function symbol.
9166 If VAL is a coding system, it is used for both decoding what received
9167 from the network stream and encoding what sent to the network stream.
9168 If VAL is a cons of coding systems, the car part is used for decoding,
9169 and the cdr part is used for encoding.
9170 If VAL is a function symbol, the function must return a coding system
9171 or a cons of coding systems which are used as above.
9172
9173 See also the function `find-operation-coding-system'. */);
9174 Vnetwork_coding_system_alist = Qnil;
9175
9176 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9177 doc: /* Coding system to use with system messages.
9178 Also used for decoding keyboard input on X Window system. */);
9179 Vlocale_coding_system = Qnil;
9180
9181 /* The eol mnemonics are reset in startup.el system-dependently. */
9182 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9183 doc: /*
9184 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9185 eol_mnemonic_unix = build_string (":");
9186
9187 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9188 doc: /*
9189 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9190 eol_mnemonic_dos = build_string ("\\");
9191
9192 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9193 doc: /*
9194 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9195 eol_mnemonic_mac = build_string ("/");
9196
9197 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9198 doc: /*
9199 *String displayed in mode line when end-of-line format is not yet determined. */);
9200 eol_mnemonic_undecided = build_string (":");
9201
9202 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9203 doc: /*
9204 *Non-nil enables character translation while encoding and decoding. */);
9205 Venable_character_translation = Qt;
9206
9207 DEFVAR_LISP ("standard-translation-table-for-decode",
9208 &Vstandard_translation_table_for_decode,
9209 doc: /* Table for translating characters while decoding. */);
9210 Vstandard_translation_table_for_decode = Qnil;
9211
9212 DEFVAR_LISP ("standard-translation-table-for-encode",
9213 &Vstandard_translation_table_for_encode,
9214 doc: /* Table for translating characters while encoding. */);
9215 Vstandard_translation_table_for_encode = Qnil;
9216
9217 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9218 doc: /* Alist of charsets vs revision numbers.
9219 While encoding, if a charset (car part of an element) is found,
9220 designate it with the escape sequence identifying revision (cdr part
9221 of the element). */);
9222 Vcharset_revision_table = Qnil;
9223
9224 DEFVAR_LISP ("default-process-coding-system",
9225 &Vdefault_process_coding_system,
9226 doc: /* Cons of coding systems used for process I/O by default.
9227 The car part is used for decoding a process output,
9228 the cdr part is used for encoding a text to be sent to a process. */);
9229 Vdefault_process_coding_system = Qnil;
9230
9231 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9232 doc: /*
9233 Table of extra Latin codes in the range 128..159 (inclusive).
9234 This is a vector of length 256.
9235 If Nth element is non-nil, the existence of code N in a file
9236 \(or output of subprocess) doesn't prevent it to be detected as
9237 a coding system of ISO 2022 variant which has a flag
9238 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9239 or reading output of a subprocess.
9240 Only 128th through 159th elements has a meaning. */);
9241 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9242
9243 DEFVAR_LISP ("select-safe-coding-system-function",
9244 &Vselect_safe_coding_system_function,
9245 doc: /*
9246 Function to call to select safe coding system for encoding a text.
9247
9248 If set, this function is called to force a user to select a proper
9249 coding system which can encode the text in the case that a default
9250 coding system used in each operation can't encode the text.
9251
9252 The default value is `select-safe-coding-system' (which see). */);
9253 Vselect_safe_coding_system_function = Qnil;
9254
9255 DEFVAR_BOOL ("coding-system-require-warning",
9256 &coding_system_require_warning,
9257 doc: /* Internal use only.
9258 If non-nil, on writing a file, `select-safe-coding-system-function' is
9259 called even if `coding-system-for-write' is non-nil. The command
9260 `universal-coding-system-argument' binds this variable to t temporarily. */);
9261 coding_system_require_warning = 0;
9262
9263
9264 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9265 &inhibit_iso_escape_detection,
9266 doc: /*
9267 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9268
9269 By default, on reading a file, Emacs tries to detect how the text is
9270 encoded. This code detection is sensitive to escape sequences. If
9271 the sequence is valid as ISO2022, the code is determined as one of
9272 the ISO2022 encodings, and the file is decoded by the corresponding
9273 coding system (e.g. `iso-2022-7bit').
9274
9275 However, there may be a case that you want to read escape sequences in
9276 a file as is. In such a case, you can set this variable to non-nil.
9277 Then, as the code detection ignores any escape sequences, no file is
9278 detected as encoded in some ISO2022 encoding. The result is that all
9279 escape sequences become visible in a buffer.
9280
9281 The default value is nil, and it is strongly recommended not to change
9282 it. That is because many Emacs Lisp source files that contain
9283 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9284 in Emacs's distribution, and they won't be decoded correctly on
9285 reading if you suppress escape sequence detection.
9286
9287 The other way to read escape sequences in a file without decoding is
9288 to explicitly specify some coding system that doesn't use ISO2022's
9289 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9290 inhibit_iso_escape_detection = 0;
9291
9292 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9293 doc: /* Char table for translating self-inserting characters.
9294 This is applied to the result of input methods, not their input. See also
9295 `keyboard-translate-table'. */);
9296 Vtranslation_table_for_input = Qnil;
9297
9298 {
9299 Lisp_Object args[coding_arg_max];
9300 Lisp_Object plist[16];
9301 int i;
9302
9303 for (i = 0; i < coding_arg_max; i++)
9304 args[i] = Qnil;
9305
9306 plist[0] = intern (":name");
9307 plist[1] = args[coding_arg_name] = Qno_conversion;
9308 plist[2] = intern (":mnemonic");
9309 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9310 plist[4] = intern (":coding-type");
9311 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9312 plist[6] = intern (":ascii-compatible-p");
9313 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9314 plist[8] = intern (":default-char");
9315 plist[9] = args[coding_arg_default_char] = make_number (0);
9316 plist[10] = intern (":for-unibyte");
9317 plist[11] = args[coding_arg_for_unibyte] = Qt;
9318 plist[12] = intern (":docstring");
9319 plist[13] = build_string ("Do no conversion.\n\
9320 \n\
9321 When you visit a file with this coding, the file is read into a\n\
9322 unibyte buffer as is, thus each byte of a file is treated as a\n\
9323 character.");
9324 plist[14] = intern (":eol-type");
9325 plist[15] = args[coding_arg_eol_type] = Qunix;
9326 args[coding_arg_plist] = Flist (16, plist);
9327 Fdefine_coding_system_internal (coding_arg_max, args);
9328 }
9329
9330 setup_coding_system (Qno_conversion, &keyboard_coding);
9331 setup_coding_system (Qno_conversion, &terminal_coding);
9332 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9333
9334 {
9335 int i;
9336
9337 for (i = 0; i < coding_category_max; i++)
9338 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9339 }
9340 }
9341
9342 char *
9343 emacs_strerror (error_number)
9344 int error_number;
9345 {
9346 char *str;
9347
9348 synchronize_system_messages_locale ();
9349 str = strerror (error_number);
9350
9351 if (! NILP (Vlocale_coding_system))
9352 {
9353 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9354 Vlocale_coding_system,
9355 0);
9356 str = (char *) SDATA (dec);
9357 }
9358
9359 return str;
9360 }
9361
9362 #endif /* emacs */