(Ffind_coding_systems_region_internal): Cache checked characters.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008, 2009
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
11
12 This file is part of GNU Emacs.
13
14 GNU Emacs is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
18
19 GNU Emacs is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26
27 /*** TABLE OF CONTENTS ***
28
29 0. General comments
30 1. Preamble
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
42
43 */
44
45 /*** 0. General comments ***
46
47
48 CODING SYSTEM
49
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
57 coding system.
58
59 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
60 C level, a coding system is represented by a vector of attributes
61 stored in the hash table Vcharset_hash_table. The conversion from
62 coding system symbol to attributes vector is done by looking up
63 Vcharset_hash_table by the symbol.
64
65 Coding systems are classified into the following types depending on
66 the encoding mechanism. Here's a brief description of the types.
67
68 o UTF-8
69
70 o UTF-16
71
72 o Charset-base coding system
73
74 A coding system defined by one or more (coded) character sets.
75 Decoding and encoding are done by a code converter defined for each
76 character set.
77
78 o Old Emacs internal format (emacs-mule)
79
80 The coding system adopted by old versions of Emacs (20 and 21).
81
82 o ISO2022-base coding system
83
84 The most famous coding system for multiple character sets. X's
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
88
89 o SJIS (or Shift-JIS or MS-Kanji-Code)
90
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
93 section 8.
94
95 o BIG5
96
97 A coding system to encode character sets: ASCII and Big5. Widely
98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
102
103 o CCL
104
105 If a user wants to decode/encode text encoded in a coding system
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
109
110 o Raw-text
111
112 A coding system for text containing raw eight-bit data. Emacs
113 treats each byte of source text as a character (except for
114 end-of-line conversion).
115
116 o No-conversion
117
118 Like raw text, but don't do end-of-line conversion.
119
120
121 END-OF-LINE FORMAT
122
123 How text end-of-line is encoded depends on operating system. For
124 instance, Unix's format is just one byte of LF (line-feed) code,
125 whereas DOS's format is two-byte sequence of `carriage-return' and
126 `line-feed' codes. MacOS's format is usually one byte of
127 `carriage-return'.
128
129 Since text character encoding and end-of-line encoding are
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
132
133 STRUCT CODING_SYSTEM
134
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we setup a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
138 conversion (e.g. the location of source and destination data).
139
140 */
141
142 /* COMMON MACROS */
143
144
145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146
147 These functions check if a byte sequence specified as a source in
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
150
151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
152
153 Below is the template of these functions. */
154
155 #if 0
156 static int
157 detect_coding_XXX (coding, detect_info)
158 struct coding_system *coding;
159 struct coding_detection_info *detect_info;
160 {
161 const unsigned char *src = coding->source;
162 const unsigned char *src_end = coding->source + coding->src_bytes;
163 int multibytep = coding->src_multibyte;
164 int consumed_chars = 0;
165 int found = 0;
166 ...;
167
168 while (1)
169 {
170 /* Get one byte from the source. If the souce is exausted, jump
171 to no_more_source:. */
172 ONE_MORE_BYTE (c);
173
174 if (! __C_conforms_to_XXX___ (c))
175 break;
176 if (! __C_strongly_suggests_XXX__ (c))
177 found = CATEGORY_MASK_XXX;
178 }
179 /* The byte sequence is invalid for XXX. */
180 detect_info->rejected |= CATEGORY_MASK_XXX;
181 return 0;
182
183 no_more_source:
184 /* The source exausted successfully. */
185 detect_info->found |= found;
186 return 1;
187 }
188 #endif
189
190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191
192 These functions decode a byte sequence specified as a source by
193 CODING. The resulting multibyte text goes to a place pointed to by
194 CODING->charbuf, the length of which should not exceed
195 CODING->charbuf_size;
196
197 These functions set the information of original and decoded texts in
198 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
199 They also set CODING->result to one of CODING_RESULT_XXX indicating
200 how the decoding is finished.
201
202 Below is the template of these functions. */
203
204 #if 0
205 static void
206 decode_coding_XXXX (coding)
207 struct coding_system *coding;
208 {
209 const unsigned char *src = coding->source + coding->consumed;
210 const unsigned char *src_end = coding->source + coding->src_bytes;
211 /* SRC_BASE remembers the start position in source in each loop.
212 The loop will be exited when there's not enough source code, or
213 when there's no room in CHARBUF for a decoded character. */
214 const unsigned char *src_base;
215 /* A buffer to produce decoded characters. */
216 int *charbuf = coding->charbuf + coding->charbuf_used;
217 int *charbuf_end = coding->charbuf + coding->charbuf_size;
218 int multibytep = coding->src_multibyte;
219
220 while (1)
221 {
222 src_base = src;
223 if (charbuf < charbuf_end)
224 /* No more room to produce a decoded character. */
225 break;
226 ONE_MORE_BYTE (c);
227 /* Decode it. */
228 }
229
230 no_more_source:
231 if (src_base < src_end
232 && coding->mode & CODING_MODE_LAST_BLOCK)
233 /* If the source ends by partial bytes to construct a character,
234 treat them as eight-bit raw data. */
235 while (src_base < src_end && charbuf < charbuf_end)
236 *charbuf++ = *src_base++;
237 /* Remember how many bytes and characters we consumed. If the
238 source is multibyte, the bytes and chars are not identical. */
239 coding->consumed = coding->consumed_char = src_base - coding->source;
240 /* Remember how many characters we produced. */
241 coding->charbuf_used = charbuf - coding->charbuf;
242 }
243 #endif
244
245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246
247 These functions encode SRC_BYTES length text at SOURCE of Emacs'
248 internal multibyte format by CODING. The resulting byte sequence
249 goes to a place pointed to by DESTINATION, the length of which
250 should not exceed DST_BYTES.
251
252 These functions set the information of original and encoded texts in
253 the members produced, produced_char, consumed, and consumed_char of
254 the structure *CODING. They also set the member result to one of
255 CODING_RESULT_XXX indicating how the encoding finished.
256
257 DST_BYTES zero means that source area and destination area are
258 overlapped, which means that we can produce a encoded text until it
259 reaches at the head of not-yet-encoded source text.
260
261 Below is a template of these functions. */
262 #if 0
263 static void
264 encode_coding_XXX (coding)
265 struct coding_system *coding;
266 {
267 int multibytep = coding->dst_multibyte;
268 int *charbuf = coding->charbuf;
269 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
270 unsigned char *dst = coding->destination + coding->produced;
271 unsigned char *dst_end = coding->destination + coding->dst_bytes;
272 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
273 int produced_chars = 0;
274
275 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
276 {
277 int c = *charbuf;
278 /* Encode C into DST, and increment DST. */
279 }
280 label_no_more_destination:
281 /* How many chars and bytes we produced. */
282 coding->produced_char += produced_chars;
283 coding->produced = dst - coding->destination;
284 }
285 #endif
286
287 \f
288 /*** 1. Preamble ***/
289
290 #include <config.h>
291 #include <stdio.h>
292
293 #include "lisp.h"
294 #include "buffer.h"
295 #include "character.h"
296 #include "charset.h"
297 #include "ccl.h"
298 #include "composite.h"
299 #include "coding.h"
300 #include "window.h"
301 #include "frame.h"
302 #include "termhooks.h"
303
304 Lisp_Object Vcoding_system_hash_table;
305
306 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
307 Lisp_Object Qunix, Qdos;
308 extern Lisp_Object Qmac; /* frame.c */
309 Lisp_Object Qbuffer_file_coding_system;
310 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
311 Lisp_Object Qdefault_char;
312 Lisp_Object Qno_conversion, Qundecided;
313 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
314 Lisp_Object Qbig, Qlittle;
315 Lisp_Object Qcoding_system_history;
316 Lisp_Object Qvalid_codes;
317 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
318 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
319 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
320 Lisp_Object QCascii_compatible_p;
321
322 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
323 Lisp_Object Qcall_process, Qcall_process_region;
324 Lisp_Object Qstart_process, Qopen_network_stream;
325 Lisp_Object Qtarget_idx;
326
327 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
328 Lisp_Object Qinterrupted, Qinsufficient_memory;
329
330 extern Lisp_Object Qcompletion_ignore_case;
331
332 /* If a symbol has this property, evaluate the value to define the
333 symbol as a coding system. */
334 static Lisp_Object Qcoding_system_define_form;
335
336 int coding_system_require_warning;
337
338 Lisp_Object Vselect_safe_coding_system_function;
339
340 /* Mnemonic string for each format of end-of-line. */
341 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
342 /* Mnemonic string to indicate format of end-of-line is not yet
343 decided. */
344 Lisp_Object eol_mnemonic_undecided;
345
346 /* Format of end-of-line decided by system. This is Qunix on
347 Unix and Mac, Qdos on DOS/Windows.
348 This has an effect only for external encoding (i.e. for output to
349 file and process), not for in-buffer or Lisp string encoding. */
350 static Lisp_Object system_eol_type;
351
352 #ifdef emacs
353
354 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
355
356 Lisp_Object Qcoding_system_p, Qcoding_system_error;
357
358 /* Coding system emacs-mule and raw-text are for converting only
359 end-of-line format. */
360 Lisp_Object Qemacs_mule, Qraw_text;
361 Lisp_Object Qutf_8_emacs;
362
363 /* Coding-systems are handed between Emacs Lisp programs and C internal
364 routines by the following three variables. */
365 /* Coding-system for reading files and receiving data from process. */
366 Lisp_Object Vcoding_system_for_read;
367 /* Coding-system for writing files and sending data to process. */
368 Lisp_Object Vcoding_system_for_write;
369 /* Coding-system actually used in the latest I/O. */
370 Lisp_Object Vlast_coding_system_used;
371 /* Set to non-nil when an error is detected while code conversion. */
372 Lisp_Object Vlast_code_conversion_error;
373 /* A vector of length 256 which contains information about special
374 Latin codes (especially for dealing with Microsoft codes). */
375 Lisp_Object Vlatin_extra_code_table;
376
377 /* Flag to inhibit code conversion of end-of-line format. */
378 int inhibit_eol_conversion;
379
380 /* Flag to inhibit ISO2022 escape sequence detection. */
381 int inhibit_iso_escape_detection;
382
383 /* Flag to inhibit detection of binary files through null bytes. */
384 int inhibit_null_byte_detection;
385
386 /* Flag to make buffer-file-coding-system inherit from process-coding. */
387 int inherit_process_coding_system;
388
389 /* Coding system to be used to encode text for terminal display when
390 terminal coding system is nil. */
391 struct coding_system safe_terminal_coding;
392
393 Lisp_Object Vfile_coding_system_alist;
394 Lisp_Object Vprocess_coding_system_alist;
395 Lisp_Object Vnetwork_coding_system_alist;
396
397 Lisp_Object Vlocale_coding_system;
398
399 #endif /* emacs */
400
401 /* Flag to tell if we look up translation table on character code
402 conversion. */
403 Lisp_Object Venable_character_translation;
404 /* Standard translation table to look up on decoding (reading). */
405 Lisp_Object Vstandard_translation_table_for_decode;
406 /* Standard translation table to look up on encoding (writing). */
407 Lisp_Object Vstandard_translation_table_for_encode;
408
409 Lisp_Object Qtranslation_table;
410 Lisp_Object Qtranslation_table_id;
411 Lisp_Object Qtranslation_table_for_decode;
412 Lisp_Object Qtranslation_table_for_encode;
413
414 /* Alist of charsets vs revision number. */
415 static Lisp_Object Vcharset_revision_table;
416
417 /* Default coding systems used for process I/O. */
418 Lisp_Object Vdefault_process_coding_system;
419
420 /* Char table for translating Quail and self-inserting input. */
421 Lisp_Object Vtranslation_table_for_input;
422
423 /* Two special coding systems. */
424 Lisp_Object Vsjis_coding_system;
425 Lisp_Object Vbig5_coding_system;
426
427 /* ISO2022 section */
428
429 #define CODING_ISO_INITIAL(coding, reg) \
430 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
431 coding_attr_iso_initial), \
432 reg)))
433
434
435 #define CODING_ISO_REQUEST(coding, charset_id) \
436 (((charset_id) <= (coding)->max_charset_id \
437 ? ((coding)->safe_charsets[charset_id] != 255 \
438 ? (coding)->safe_charsets[charset_id] \
439 : -1) \
440 : -1))
441
442
443 #define CODING_ISO_FLAGS(coding) \
444 ((coding)->spec.iso_2022.flags)
445 #define CODING_ISO_DESIGNATION(coding, reg) \
446 ((coding)->spec.iso_2022.current_designation[reg])
447 #define CODING_ISO_INVOCATION(coding, plane) \
448 ((coding)->spec.iso_2022.current_invocation[plane])
449 #define CODING_ISO_SINGLE_SHIFTING(coding) \
450 ((coding)->spec.iso_2022.single_shifting)
451 #define CODING_ISO_BOL(coding) \
452 ((coding)->spec.iso_2022.bol)
453 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
454 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
455 #define CODING_ISO_CMP_STATUS(coding) \
456 (&(coding)->spec.iso_2022.cmp_status)
457 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
458 ((coding)->spec.iso_2022.ctext_extended_segment_len)
459 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
460 ((coding)->spec.iso_2022.embedded_utf_8)
461
462 /* Control characters of ISO2022. */
463 /* code */ /* function */
464 #define ISO_CODE_LF 0x0A /* line-feed */
465 #define ISO_CODE_CR 0x0D /* carriage-return */
466 #define ISO_CODE_SO 0x0E /* shift-out */
467 #define ISO_CODE_SI 0x0F /* shift-in */
468 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
469 #define ISO_CODE_ESC 0x1B /* escape */
470 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
471 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
472 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
473
474 /* All code (1-byte) of ISO2022 is classified into one of the
475 followings. */
476 enum iso_code_class_type
477 {
478 ISO_control_0, /* Control codes in the range
479 0x00..0x1F and 0x7F, except for the
480 following 5 codes. */
481 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
482 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
483 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
484 ISO_escape, /* ISO_CODE_SO (0x1B) */
485 ISO_control_1, /* Control codes in the range
486 0x80..0x9F, except for the
487 following 3 codes. */
488 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
489 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
490 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
491 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
492 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
493 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
494 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
495 };
496
497 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
498 `iso-flags' attribute of an iso2022 coding system. */
499
500 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
501 instead of the correct short-form sequence (e.g. ESC $ A). */
502 #define CODING_ISO_FLAG_LONG_FORM 0x0001
503
504 /* If set, reset graphic planes and registers at end-of-line to the
505 initial state. */
506 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
507
508 /* If set, reset graphic planes and registers before any control
509 characters to the initial state. */
510 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
511
512 /* If set, encode by 7-bit environment. */
513 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
514
515 /* If set, use locking-shift function. */
516 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
517
518 /* If set, use single-shift function. Overwrite
519 CODING_ISO_FLAG_LOCKING_SHIFT. */
520 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
521
522 /* If set, use designation escape sequence. */
523 #define CODING_ISO_FLAG_DESIGNATION 0x0040
524
525 /* If set, produce revision number sequence. */
526 #define CODING_ISO_FLAG_REVISION 0x0080
527
528 /* If set, produce ISO6429's direction specifying sequence. */
529 #define CODING_ISO_FLAG_DIRECTION 0x0100
530
531 /* If set, assume designation states are reset at beginning of line on
532 output. */
533 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
534
535 /* If set, designation sequence should be placed at beginning of line
536 on output. */
537 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
538
539 /* If set, do not encode unsafe charactes on output. */
540 #define CODING_ISO_FLAG_SAFE 0x0800
541
542 /* If set, extra latin codes (128..159) are accepted as a valid code
543 on input. */
544 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
545
546 #define CODING_ISO_FLAG_COMPOSITION 0x2000
547
548 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
549
550 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
551
552 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
553
554 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
555
556 /* A character to be produced on output if encoding of the original
557 character is prohibited by CODING_ISO_FLAG_SAFE. */
558 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
559
560 /* UTF-8 section */
561 #define CODING_UTF_8_BOM(coding) \
562 ((coding)->spec.utf_8_bom)
563
564 /* UTF-16 section */
565 #define CODING_UTF_16_BOM(coding) \
566 ((coding)->spec.utf_16.bom)
567
568 #define CODING_UTF_16_ENDIAN(coding) \
569 ((coding)->spec.utf_16.endian)
570
571 #define CODING_UTF_16_SURROGATE(coding) \
572 ((coding)->spec.utf_16.surrogate)
573
574
575 /* CCL section */
576 #define CODING_CCL_DECODER(coding) \
577 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
578 #define CODING_CCL_ENCODER(coding) \
579 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
580 #define CODING_CCL_VALIDS(coding) \
581 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
582
583 /* Index for each coding category in `coding_categories' */
584
585 enum coding_category
586 {
587 coding_category_iso_7,
588 coding_category_iso_7_tight,
589 coding_category_iso_8_1,
590 coding_category_iso_8_2,
591 coding_category_iso_7_else,
592 coding_category_iso_8_else,
593 coding_category_utf_8_auto,
594 coding_category_utf_8_nosig,
595 coding_category_utf_8_sig,
596 coding_category_utf_16_auto,
597 coding_category_utf_16_be,
598 coding_category_utf_16_le,
599 coding_category_utf_16_be_nosig,
600 coding_category_utf_16_le_nosig,
601 coding_category_charset,
602 coding_category_sjis,
603 coding_category_big5,
604 coding_category_ccl,
605 coding_category_emacs_mule,
606 /* All above are targets of code detection. */
607 coding_category_raw_text,
608 coding_category_undecided,
609 coding_category_max
610 };
611
612 /* Definitions of flag bits used in detect_coding_XXXX. */
613 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
614 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
615 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
616 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
617 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
618 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
619 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
620 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
621 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
622 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
623 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
624 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
625 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
626 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
627 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
628 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
629 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
630 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
631 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
632 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
633
634 /* This value is returned if detect_coding_mask () find nothing other
635 than ASCII characters. */
636 #define CATEGORY_MASK_ANY \
637 (CATEGORY_MASK_ISO_7 \
638 | CATEGORY_MASK_ISO_7_TIGHT \
639 | CATEGORY_MASK_ISO_8_1 \
640 | CATEGORY_MASK_ISO_8_2 \
641 | CATEGORY_MASK_ISO_7_ELSE \
642 | CATEGORY_MASK_ISO_8_ELSE \
643 | CATEGORY_MASK_UTF_8_AUTO \
644 | CATEGORY_MASK_UTF_8_NOSIG \
645 | CATEGORY_MASK_UTF_8_SIG \
646 | CATEGORY_MASK_UTF_16_AUTO \
647 | CATEGORY_MASK_UTF_16_BE \
648 | CATEGORY_MASK_UTF_16_LE \
649 | CATEGORY_MASK_UTF_16_BE_NOSIG \
650 | CATEGORY_MASK_UTF_16_LE_NOSIG \
651 | CATEGORY_MASK_CHARSET \
652 | CATEGORY_MASK_SJIS \
653 | CATEGORY_MASK_BIG5 \
654 | CATEGORY_MASK_CCL \
655 | CATEGORY_MASK_EMACS_MULE)
656
657
658 #define CATEGORY_MASK_ISO_7BIT \
659 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
660
661 #define CATEGORY_MASK_ISO_8BIT \
662 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
663
664 #define CATEGORY_MASK_ISO_ELSE \
665 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
666
667 #define CATEGORY_MASK_ISO_ESCAPE \
668 (CATEGORY_MASK_ISO_7 \
669 | CATEGORY_MASK_ISO_7_TIGHT \
670 | CATEGORY_MASK_ISO_7_ELSE \
671 | CATEGORY_MASK_ISO_8_ELSE)
672
673 #define CATEGORY_MASK_ISO \
674 ( CATEGORY_MASK_ISO_7BIT \
675 | CATEGORY_MASK_ISO_8BIT \
676 | CATEGORY_MASK_ISO_ELSE)
677
678 #define CATEGORY_MASK_UTF_16 \
679 (CATEGORY_MASK_UTF_16_AUTO \
680 | CATEGORY_MASK_UTF_16_BE \
681 | CATEGORY_MASK_UTF_16_LE \
682 | CATEGORY_MASK_UTF_16_BE_NOSIG \
683 | CATEGORY_MASK_UTF_16_LE_NOSIG)
684
685 #define CATEGORY_MASK_UTF_8 \
686 (CATEGORY_MASK_UTF_8_AUTO \
687 | CATEGORY_MASK_UTF_8_NOSIG \
688 | CATEGORY_MASK_UTF_8_SIG)
689
690 /* List of symbols `coding-category-xxx' ordered by priority. This
691 variable is exposed to Emacs Lisp. */
692 static Lisp_Object Vcoding_category_list;
693
694 /* Table of coding categories (Lisp symbols). This variable is for
695 internal use oly. */
696 static Lisp_Object Vcoding_category_table;
697
698 /* Table of coding-categories ordered by priority. */
699 static enum coding_category coding_priorities[coding_category_max];
700
701 /* Nth element is a coding context for the coding system bound to the
702 Nth coding category. */
703 static struct coding_system coding_categories[coding_category_max];
704
705 /*** Commonly used macros and functions ***/
706
707 #ifndef min
708 #define min(a, b) ((a) < (b) ? (a) : (b))
709 #endif
710 #ifndef max
711 #define max(a, b) ((a) > (b) ? (a) : (b))
712 #endif
713
714 #define CODING_GET_INFO(coding, attrs, charset_list) \
715 do { \
716 (attrs) = CODING_ID_ATTRS ((coding)->id); \
717 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
718 } while (0)
719
720
721 /* Safely get one byte from the source text pointed by SRC which ends
722 at SRC_END, and set C to that byte. If there are not enough bytes
723 in the source, it jumps to `no_more_source'. If multibytep is
724 nonzero, and a multibyte character is found at SRC, set C to the
725 negative value of the character code. The caller should declare
726 and set these variables appropriately in advance:
727 src, src_end, multibytep */
728
729 #define ONE_MORE_BYTE(c) \
730 do { \
731 if (src == src_end) \
732 { \
733 if (src_base < src) \
734 record_conversion_result \
735 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
736 goto no_more_source; \
737 } \
738 c = *src++; \
739 if (multibytep && (c & 0x80)) \
740 { \
741 if ((c & 0xFE) == 0xC0) \
742 c = ((c & 1) << 6) | *src++; \
743 else \
744 { \
745 src--; \
746 c = - string_char (src, &src, NULL); \
747 record_conversion_result \
748 (coding, CODING_RESULT_INVALID_SRC); \
749 } \
750 } \
751 consumed_chars++; \
752 } while (0)
753
754 /* Safely get two bytes from the source text pointed by SRC which ends
755 at SRC_END, and set C1 and C2 to those bytes while skipping the
756 heading multibyte characters. If there are not enough bytes in the
757 source, it jumps to `no_more_source'. If multibytep is nonzero and
758 a multibyte character is found for C2, set C2 to the negative value
759 of the character code. The caller should declare and set these
760 variables appropriately in advance:
761 src, src_end, multibytep
762 It is intended that this macro is used in detect_coding_utf_16. */
763
764 #define TWO_MORE_BYTES(c1, c2) \
765 do { \
766 do { \
767 if (src == src_end) \
768 goto no_more_source; \
769 c1 = *src++; \
770 if (multibytep && (c1 & 0x80)) \
771 { \
772 if ((c1 & 0xFE) == 0xC0) \
773 c1 = ((c1 & 1) << 6) | *src++; \
774 else \
775 { \
776 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
777 c1 = -1; \
778 } \
779 } \
780 } while (c1 < 0); \
781 if (src == src_end) \
782 goto no_more_source; \
783 c2 = *src++; \
784 if (multibytep && (c2 & 0x80)) \
785 { \
786 if ((c2 & 0xFE) == 0xC0) \
787 c2 = ((c2 & 1) << 6) | *src++; \
788 else \
789 c2 = -1; \
790 } \
791 } while (0)
792
793
794 #define ONE_MORE_BYTE_NO_CHECK(c) \
795 do { \
796 c = *src++; \
797 if (multibytep && (c & 0x80)) \
798 { \
799 if ((c & 0xFE) == 0xC0) \
800 c = ((c & 1) << 6) | *src++; \
801 else \
802 { \
803 src--; \
804 c = - string_char (src, &src, NULL); \
805 record_conversion_result \
806 (coding, CODING_RESULT_INVALID_SRC); \
807 } \
808 } \
809 consumed_chars++; \
810 } while (0)
811
812
813 /* Store a byte C in the place pointed by DST and increment DST to the
814 next free point, and increment PRODUCED_CHARS. The caller should
815 assure that C is 0..127, and declare and set the variable `dst'
816 appropriately in advance.
817 */
818
819
820 #define EMIT_ONE_ASCII_BYTE(c) \
821 do { \
822 produced_chars++; \
823 *dst++ = (c); \
824 } while (0)
825
826
827 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
828
829 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
830 do { \
831 produced_chars += 2; \
832 *dst++ = (c1), *dst++ = (c2); \
833 } while (0)
834
835
836 /* Store a byte C in the place pointed by DST and increment DST to the
837 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
838 nonzero, store in an appropriate multibyte from. The caller should
839 declare and set the variables `dst' and `multibytep' appropriately
840 in advance. */
841
842 #define EMIT_ONE_BYTE(c) \
843 do { \
844 produced_chars++; \
845 if (multibytep) \
846 { \
847 int ch = (c); \
848 if (ch >= 0x80) \
849 ch = BYTE8_TO_CHAR (ch); \
850 CHAR_STRING_ADVANCE (ch, dst); \
851 } \
852 else \
853 *dst++ = (c); \
854 } while (0)
855
856
857 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
858
859 #define EMIT_TWO_BYTES(c1, c2) \
860 do { \
861 produced_chars += 2; \
862 if (multibytep) \
863 { \
864 int ch; \
865 \
866 ch = (c1); \
867 if (ch >= 0x80) \
868 ch = BYTE8_TO_CHAR (ch); \
869 CHAR_STRING_ADVANCE (ch, dst); \
870 ch = (c2); \
871 if (ch >= 0x80) \
872 ch = BYTE8_TO_CHAR (ch); \
873 CHAR_STRING_ADVANCE (ch, dst); \
874 } \
875 else \
876 { \
877 *dst++ = (c1); \
878 *dst++ = (c2); \
879 } \
880 } while (0)
881
882
883 #define EMIT_THREE_BYTES(c1, c2, c3) \
884 do { \
885 EMIT_ONE_BYTE (c1); \
886 EMIT_TWO_BYTES (c2, c3); \
887 } while (0)
888
889
890 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
891 do { \
892 EMIT_TWO_BYTES (c1, c2); \
893 EMIT_TWO_BYTES (c3, c4); \
894 } while (0)
895
896
897 /* Prototypes for static functions. */
898 static void record_conversion_result P_ ((struct coding_system *coding,
899 enum coding_result_code result));
900 static int detect_coding_utf_8 P_ ((struct coding_system *,
901 struct coding_detection_info *info));
902 static void decode_coding_utf_8 P_ ((struct coding_system *));
903 static int encode_coding_utf_8 P_ ((struct coding_system *));
904
905 static int detect_coding_utf_16 P_ ((struct coding_system *,
906 struct coding_detection_info *info));
907 static void decode_coding_utf_16 P_ ((struct coding_system *));
908 static int encode_coding_utf_16 P_ ((struct coding_system *));
909
910 static int detect_coding_iso_2022 P_ ((struct coding_system *,
911 struct coding_detection_info *info));
912 static void decode_coding_iso_2022 P_ ((struct coding_system *));
913 static int encode_coding_iso_2022 P_ ((struct coding_system *));
914
915 static int detect_coding_emacs_mule P_ ((struct coding_system *,
916 struct coding_detection_info *info));
917 static void decode_coding_emacs_mule P_ ((struct coding_system *));
918 static int encode_coding_emacs_mule P_ ((struct coding_system *));
919
920 static int detect_coding_sjis P_ ((struct coding_system *,
921 struct coding_detection_info *info));
922 static void decode_coding_sjis P_ ((struct coding_system *));
923 static int encode_coding_sjis P_ ((struct coding_system *));
924
925 static int detect_coding_big5 P_ ((struct coding_system *,
926 struct coding_detection_info *info));
927 static void decode_coding_big5 P_ ((struct coding_system *));
928 static int encode_coding_big5 P_ ((struct coding_system *));
929
930 static int detect_coding_ccl P_ ((struct coding_system *,
931 struct coding_detection_info *info));
932 static void decode_coding_ccl P_ ((struct coding_system *));
933 static int encode_coding_ccl P_ ((struct coding_system *));
934
935 static void decode_coding_raw_text P_ ((struct coding_system *));
936 static int encode_coding_raw_text P_ ((struct coding_system *));
937
938 static void coding_set_source P_ ((struct coding_system *));
939 static void coding_set_destination P_ ((struct coding_system *));
940 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
941 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
942 EMACS_INT, EMACS_INT));
943 static unsigned char *alloc_destination P_ ((struct coding_system *,
944 EMACS_INT, unsigned char *));
945 static void setup_iso_safe_charsets P_ ((Lisp_Object));
946 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
947 int *, int *,
948 unsigned char *));
949 static int detect_eol P_ ((const unsigned char *,
950 EMACS_INT, enum coding_category));
951 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
952 static void decode_eol P_ ((struct coding_system *));
953 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
954 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
955 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
956 static INLINE void produce_charset P_ ((struct coding_system *, int *,
957 EMACS_INT));
958 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
959 static int decode_coding P_ ((struct coding_system *));
960 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
961 struct coding_system *,
962 int *, EMACS_INT *));
963 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
964 struct coding_system *,
965 int *, EMACS_INT *));
966 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
967 static int encode_coding P_ ((struct coding_system *));
968 static Lisp_Object make_conversion_work_buffer P_ ((int));
969 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
970 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
971 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
972
973 static void
974 record_conversion_result (struct coding_system *coding,
975 enum coding_result_code result)
976 {
977 coding->result = result;
978 switch (result)
979 {
980 case CODING_RESULT_INSUFFICIENT_SRC:
981 Vlast_code_conversion_error = Qinsufficient_source;
982 break;
983 case CODING_RESULT_INCONSISTENT_EOL:
984 Vlast_code_conversion_error = Qinconsistent_eol;
985 break;
986 case CODING_RESULT_INVALID_SRC:
987 Vlast_code_conversion_error = Qinvalid_source;
988 break;
989 case CODING_RESULT_INTERRUPT:
990 Vlast_code_conversion_error = Qinterrupted;
991 break;
992 case CODING_RESULT_INSUFFICIENT_MEM:
993 Vlast_code_conversion_error = Qinsufficient_memory;
994 break;
995 case CODING_RESULT_SUCCESS:
996 break;
997 default:
998 Vlast_code_conversion_error = intern ("Unknown error");
999 }
1000 }
1001
1002 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
1003 do { \
1004 charset_map_loaded = 0; \
1005 c = DECODE_CHAR (charset, code); \
1006 if (charset_map_loaded) \
1007 { \
1008 const unsigned char *orig = coding->source; \
1009 EMACS_INT offset; \
1010 \
1011 coding_set_source (coding); \
1012 offset = coding->source - orig; \
1013 src += offset; \
1014 src_base += offset; \
1015 src_end += offset; \
1016 } \
1017 } while (0)
1018
1019
1020 /* If there are at least BYTES length of room at dst, allocate memory
1021 for coding->destination and update dst and dst_end. We don't have
1022 to take care of coding->source which will be relocated. It is
1023 handled by calling coding_set_source in encode_coding. */
1024
1025 #define ASSURE_DESTINATION(bytes) \
1026 do { \
1027 if (dst + (bytes) >= dst_end) \
1028 { \
1029 int more_bytes = charbuf_end - charbuf + (bytes); \
1030 \
1031 dst = alloc_destination (coding, more_bytes, dst); \
1032 dst_end = coding->destination + coding->dst_bytes; \
1033 } \
1034 } while (0)
1035
1036
1037 /* Store multibyte form of the character C in P, and advance P to the
1038 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
1039 never calls MAYBE_UNIFY_CHAR. */
1040
1041 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
1042 do { \
1043 if ((c) <= MAX_1_BYTE_CHAR) \
1044 *(p)++ = (c); \
1045 else if ((c) <= MAX_2_BYTE_CHAR) \
1046 *(p)++ = (0xC0 | ((c) >> 6)), \
1047 *(p)++ = (0x80 | ((c) & 0x3F)); \
1048 else if ((c) <= MAX_3_BYTE_CHAR) \
1049 *(p)++ = (0xE0 | ((c) >> 12)), \
1050 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
1051 *(p)++ = (0x80 | ((c) & 0x3F)); \
1052 else if ((c) <= MAX_4_BYTE_CHAR) \
1053 *(p)++ = (0xF0 | (c >> 18)), \
1054 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1055 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1056 *(p)++ = (0x80 | (c & 0x3F)); \
1057 else if ((c) <= MAX_5_BYTE_CHAR) \
1058 *(p)++ = 0xF8, \
1059 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
1060 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1061 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1062 *(p)++ = (0x80 | (c & 0x3F)); \
1063 else \
1064 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
1065 } while (0)
1066
1067
1068 /* Return the character code of character whose multibyte form is at
1069 P, and advance P to the end of the multibyte form. This is like
1070 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1071
1072 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1073 (!((p)[0] & 0x80) \
1074 ? *(p)++ \
1075 : ! ((p)[0] & 0x20) \
1076 ? ((p) += 2, \
1077 ((((p)[-2] & 0x1F) << 6) \
1078 | ((p)[-1] & 0x3F) \
1079 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1080 : ! ((p)[0] & 0x10) \
1081 ? ((p) += 3, \
1082 ((((p)[-3] & 0x0F) << 12) \
1083 | (((p)[-2] & 0x3F) << 6) \
1084 | ((p)[-1] & 0x3F))) \
1085 : ! ((p)[0] & 0x08) \
1086 ? ((p) += 4, \
1087 ((((p)[-4] & 0xF) << 18) \
1088 | (((p)[-3] & 0x3F) << 12) \
1089 | (((p)[-2] & 0x3F) << 6) \
1090 | ((p)[-1] & 0x3F))) \
1091 : ((p) += 5, \
1092 ((((p)[-4] & 0x3F) << 18) \
1093 | (((p)[-3] & 0x3F) << 12) \
1094 | (((p)[-2] & 0x3F) << 6) \
1095 | ((p)[-1] & 0x3F))))
1096
1097
1098 static void
1099 coding_set_source (coding)
1100 struct coding_system *coding;
1101 {
1102 if (BUFFERP (coding->src_object))
1103 {
1104 struct buffer *buf = XBUFFER (coding->src_object);
1105
1106 if (coding->src_pos < 0)
1107 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1108 else
1109 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1110 }
1111 else if (STRINGP (coding->src_object))
1112 {
1113 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1114 }
1115 else
1116 /* Otherwise, the source is C string and is never relocated
1117 automatically. Thus we don't have to update anything. */
1118 ;
1119 }
1120
1121 static void
1122 coding_set_destination (coding)
1123 struct coding_system *coding;
1124 {
1125 if (BUFFERP (coding->dst_object))
1126 {
1127 if (coding->src_pos < 0)
1128 {
1129 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1130 coding->dst_bytes = (GAP_END_ADDR
1131 - (coding->src_bytes - coding->consumed)
1132 - coding->destination);
1133 }
1134 else
1135 {
1136 /* We are sure that coding->dst_pos_byte is before the gap
1137 of the buffer. */
1138 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1139 + coding->dst_pos_byte - BEG_BYTE);
1140 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1141 - coding->destination);
1142 }
1143 }
1144 else
1145 /* Otherwise, the destination is C string and is never relocated
1146 automatically. Thus we don't have to update anything. */
1147 ;
1148 }
1149
1150
1151 static void
1152 coding_alloc_by_realloc (coding, bytes)
1153 struct coding_system *coding;
1154 EMACS_INT bytes;
1155 {
1156 coding->destination = (unsigned char *) xrealloc (coding->destination,
1157 coding->dst_bytes + bytes);
1158 coding->dst_bytes += bytes;
1159 }
1160
1161 static void
1162 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1163 struct coding_system *coding;
1164 EMACS_INT gap_head_used, bytes;
1165 {
1166 if (EQ (coding->src_object, coding->dst_object))
1167 {
1168 /* The gap may contain the produced data at the head and not-yet
1169 consumed data at the tail. To preserve those data, we at
1170 first make the gap size to zero, then increase the gap
1171 size. */
1172 EMACS_INT add = GAP_SIZE;
1173
1174 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1175 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1176 make_gap (bytes);
1177 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1178 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1179 }
1180 else
1181 {
1182 Lisp_Object this_buffer;
1183
1184 this_buffer = Fcurrent_buffer ();
1185 set_buffer_internal (XBUFFER (coding->dst_object));
1186 make_gap (bytes);
1187 set_buffer_internal (XBUFFER (this_buffer));
1188 }
1189 }
1190
1191
1192 static unsigned char *
1193 alloc_destination (coding, nbytes, dst)
1194 struct coding_system *coding;
1195 EMACS_INT nbytes;
1196 unsigned char *dst;
1197 {
1198 EMACS_INT offset = dst - coding->destination;
1199
1200 if (BUFFERP (coding->dst_object))
1201 {
1202 struct buffer *buf = XBUFFER (coding->dst_object);
1203
1204 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1205 }
1206 else
1207 coding_alloc_by_realloc (coding, nbytes);
1208 coding_set_destination (coding);
1209 dst = coding->destination + offset;
1210 return dst;
1211 }
1212
1213 /** Macros for annotations. */
1214
1215 /* An annotation data is stored in the array coding->charbuf in this
1216 format:
1217 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1218 LENGTH is the number of elements in the annotation.
1219 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1220 NCHARS is the number of characters in the text annotated.
1221
1222 The format of the following elements depend on ANNOTATION_MASK.
1223
1224 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1225 follows:
1226 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1227
1228 NBYTES is the number of bytes specified in the header part of
1229 old-style emacs-mule encoding, or 0 for the other kind of
1230 composition.
1231
1232 METHOD is one of enum composition_method.
1233
1234 Optionnal COMPOSITION-COMPONENTS are characters and composition
1235 rules.
1236
1237 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1238 follows.
1239
1240 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1241 recover from an invalid annotation, and should be skipped by
1242 produce_annotation. */
1243
1244 /* Maximum length of the header of annotation data. */
1245 #define MAX_ANNOTATION_LENGTH 5
1246
1247 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1248 do { \
1249 *(buf)++ = -(len); \
1250 *(buf)++ = (mask); \
1251 *(buf)++ = (nchars); \
1252 coding->annotated = 1; \
1253 } while (0);
1254
1255 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1256 do { \
1257 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1258 *buf++ = nbytes; \
1259 *buf++ = method; \
1260 } while (0)
1261
1262
1263 #define ADD_CHARSET_DATA(buf, nchars, id) \
1264 do { \
1265 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1266 *buf++ = id; \
1267 } while (0)
1268
1269 \f
1270 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1271
1272
1273
1274 \f
1275 /*** 3. UTF-8 ***/
1276
1277 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1278 Check if a text is encoded in UTF-8. If it is, return 1, else
1279 return 0. */
1280
1281 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1282 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1283 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1284 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1285 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1286 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1287
1288 #define UTF_BOM 0xFEFF
1289 #define UTF_8_BOM_1 0xEF
1290 #define UTF_8_BOM_2 0xBB
1291 #define UTF_8_BOM_3 0xBF
1292
1293 static int
1294 detect_coding_utf_8 (coding, detect_info)
1295 struct coding_system *coding;
1296 struct coding_detection_info *detect_info;
1297 {
1298 const unsigned char *src = coding->source, *src_base;
1299 const unsigned char *src_end = coding->source + coding->src_bytes;
1300 int multibytep = coding->src_multibyte;
1301 int consumed_chars = 0;
1302 int bom_found = 0;
1303 int found = 0;
1304
1305 detect_info->checked |= CATEGORY_MASK_UTF_8;
1306 /* A coding system of this category is always ASCII compatible. */
1307 src += coding->head_ascii;
1308
1309 while (1)
1310 {
1311 int c, c1, c2, c3, c4;
1312
1313 src_base = src;
1314 ONE_MORE_BYTE (c);
1315 if (c < 0 || UTF_8_1_OCTET_P (c))
1316 continue;
1317 ONE_MORE_BYTE (c1);
1318 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1319 break;
1320 if (UTF_8_2_OCTET_LEADING_P (c))
1321 {
1322 found = 1;
1323 continue;
1324 }
1325 ONE_MORE_BYTE (c2);
1326 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1327 break;
1328 if (UTF_8_3_OCTET_LEADING_P (c))
1329 {
1330 found = 1;
1331 if (src_base == coding->source
1332 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1333 bom_found = 1;
1334 continue;
1335 }
1336 ONE_MORE_BYTE (c3);
1337 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1338 break;
1339 if (UTF_8_4_OCTET_LEADING_P (c))
1340 {
1341 found = 1;
1342 continue;
1343 }
1344 ONE_MORE_BYTE (c4);
1345 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1346 break;
1347 if (UTF_8_5_OCTET_LEADING_P (c))
1348 {
1349 found = 1;
1350 continue;
1351 }
1352 break;
1353 }
1354 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1355 return 0;
1356
1357 no_more_source:
1358 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1359 {
1360 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1361 return 0;
1362 }
1363 if (bom_found)
1364 {
1365 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1366 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1367 }
1368 else
1369 {
1370 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1371 if (found)
1372 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1373 }
1374 return 1;
1375 }
1376
1377
1378 static void
1379 decode_coding_utf_8 (coding)
1380 struct coding_system *coding;
1381 {
1382 const unsigned char *src = coding->source + coding->consumed;
1383 const unsigned char *src_end = coding->source + coding->src_bytes;
1384 const unsigned char *src_base;
1385 int *charbuf = coding->charbuf + coding->charbuf_used;
1386 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1387 int consumed_chars = 0, consumed_chars_base = 0;
1388 int multibytep = coding->src_multibyte;
1389 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1390 Lisp_Object attr, charset_list;
1391 int eol_crlf =
1392 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1393 int byte_after_cr = -1;
1394
1395 CODING_GET_INFO (coding, attr, charset_list);
1396
1397 if (bom != utf_without_bom)
1398 {
1399 int c1, c2, c3;
1400
1401 src_base = src;
1402 ONE_MORE_BYTE (c1);
1403 if (! UTF_8_3_OCTET_LEADING_P (c1))
1404 src = src_base;
1405 else
1406 {
1407 ONE_MORE_BYTE (c2);
1408 if (! UTF_8_EXTRA_OCTET_P (c2))
1409 src = src_base;
1410 else
1411 {
1412 ONE_MORE_BYTE (c3);
1413 if (! UTF_8_EXTRA_OCTET_P (c3))
1414 src = src_base;
1415 else
1416 {
1417 if ((c1 != UTF_8_BOM_1)
1418 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1419 src = src_base;
1420 else
1421 CODING_UTF_8_BOM (coding) = utf_without_bom;
1422 }
1423 }
1424 }
1425 }
1426 CODING_UTF_8_BOM (coding) = utf_without_bom;
1427
1428
1429
1430 while (1)
1431 {
1432 int c, c1, c2, c3, c4, c5;
1433
1434 src_base = src;
1435 consumed_chars_base = consumed_chars;
1436
1437 if (charbuf >= charbuf_end)
1438 {
1439 if (byte_after_cr >= 0)
1440 src_base--;
1441 break;
1442 }
1443
1444 if (byte_after_cr >= 0)
1445 c1 = byte_after_cr, byte_after_cr = -1;
1446 else
1447 ONE_MORE_BYTE (c1);
1448 if (c1 < 0)
1449 {
1450 c = - c1;
1451 }
1452 else if (UTF_8_1_OCTET_P(c1))
1453 {
1454 if (eol_crlf && c1 == '\r')
1455 ONE_MORE_BYTE (byte_after_cr);
1456 c = c1;
1457 }
1458 else
1459 {
1460 ONE_MORE_BYTE (c2);
1461 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1462 goto invalid_code;
1463 if (UTF_8_2_OCTET_LEADING_P (c1))
1464 {
1465 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1466 /* Reject overlong sequences here and below. Encoders
1467 producing them are incorrect, they can be misleading,
1468 and they mess up read/write invariance. */
1469 if (c < 128)
1470 goto invalid_code;
1471 }
1472 else
1473 {
1474 ONE_MORE_BYTE (c3);
1475 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1476 goto invalid_code;
1477 if (UTF_8_3_OCTET_LEADING_P (c1))
1478 {
1479 c = (((c1 & 0xF) << 12)
1480 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1481 if (c < 0x800
1482 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1483 goto invalid_code;
1484 }
1485 else
1486 {
1487 ONE_MORE_BYTE (c4);
1488 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1489 goto invalid_code;
1490 if (UTF_8_4_OCTET_LEADING_P (c1))
1491 {
1492 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1493 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1494 if (c < 0x10000)
1495 goto invalid_code;
1496 }
1497 else
1498 {
1499 ONE_MORE_BYTE (c5);
1500 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1501 goto invalid_code;
1502 if (UTF_8_5_OCTET_LEADING_P (c1))
1503 {
1504 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1505 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1506 | (c5 & 0x3F));
1507 if ((c > MAX_CHAR) || (c < 0x200000))
1508 goto invalid_code;
1509 }
1510 else
1511 goto invalid_code;
1512 }
1513 }
1514 }
1515 }
1516
1517 *charbuf++ = c;
1518 continue;
1519
1520 invalid_code:
1521 src = src_base;
1522 consumed_chars = consumed_chars_base;
1523 ONE_MORE_BYTE (c);
1524 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1525 coding->errors++;
1526 }
1527
1528 no_more_source:
1529 coding->consumed_char += consumed_chars_base;
1530 coding->consumed = src_base - coding->source;
1531 coding->charbuf_used = charbuf - coding->charbuf;
1532 }
1533
1534
1535 static int
1536 encode_coding_utf_8 (coding)
1537 struct coding_system *coding;
1538 {
1539 int multibytep = coding->dst_multibyte;
1540 int *charbuf = coding->charbuf;
1541 int *charbuf_end = charbuf + coding->charbuf_used;
1542 unsigned char *dst = coding->destination + coding->produced;
1543 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1544 int produced_chars = 0;
1545 int c;
1546
1547 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1548 {
1549 ASSURE_DESTINATION (3);
1550 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1551 CODING_UTF_8_BOM (coding) = utf_without_bom;
1552 }
1553
1554 if (multibytep)
1555 {
1556 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1557
1558 while (charbuf < charbuf_end)
1559 {
1560 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1561
1562 ASSURE_DESTINATION (safe_room);
1563 c = *charbuf++;
1564 if (CHAR_BYTE8_P (c))
1565 {
1566 c = CHAR_TO_BYTE8 (c);
1567 EMIT_ONE_BYTE (c);
1568 }
1569 else
1570 {
1571 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1572 for (p = str; p < pend; p++)
1573 EMIT_ONE_BYTE (*p);
1574 }
1575 }
1576 }
1577 else
1578 {
1579 int safe_room = MAX_MULTIBYTE_LENGTH;
1580
1581 while (charbuf < charbuf_end)
1582 {
1583 ASSURE_DESTINATION (safe_room);
1584 c = *charbuf++;
1585 if (CHAR_BYTE8_P (c))
1586 *dst++ = CHAR_TO_BYTE8 (c);
1587 else
1588 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1589 produced_chars++;
1590 }
1591 }
1592 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1593 coding->produced_char += produced_chars;
1594 coding->produced = dst - coding->destination;
1595 return 0;
1596 }
1597
1598
1599 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1600 Check if a text is encoded in one of UTF-16 based coding systems.
1601 If it is, return 1, else return 0. */
1602
1603 #define UTF_16_HIGH_SURROGATE_P(val) \
1604 (((val) & 0xFC00) == 0xD800)
1605
1606 #define UTF_16_LOW_SURROGATE_P(val) \
1607 (((val) & 0xFC00) == 0xDC00)
1608
1609 #define UTF_16_INVALID_P(val) \
1610 (((val) == 0xFFFE) \
1611 || ((val) == 0xFFFF) \
1612 || UTF_16_LOW_SURROGATE_P (val))
1613
1614
1615 static int
1616 detect_coding_utf_16 (coding, detect_info)
1617 struct coding_system *coding;
1618 struct coding_detection_info *detect_info;
1619 {
1620 const unsigned char *src = coding->source, *src_base = src;
1621 const unsigned char *src_end = coding->source + coding->src_bytes;
1622 int multibytep = coding->src_multibyte;
1623 int consumed_chars = 0;
1624 int c1, c2;
1625
1626 detect_info->checked |= CATEGORY_MASK_UTF_16;
1627 if (coding->mode & CODING_MODE_LAST_BLOCK
1628 && (coding->src_chars & 1))
1629 {
1630 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1631 return 0;
1632 }
1633
1634 TWO_MORE_BYTES (c1, c2);
1635 if ((c1 == 0xFF) && (c2 == 0xFE))
1636 {
1637 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1638 | CATEGORY_MASK_UTF_16_AUTO);
1639 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1640 | CATEGORY_MASK_UTF_16_BE_NOSIG
1641 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1642 }
1643 else if ((c1 == 0xFE) && (c2 == 0xFF))
1644 {
1645 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1646 | CATEGORY_MASK_UTF_16_AUTO);
1647 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1648 | CATEGORY_MASK_UTF_16_BE_NOSIG
1649 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1650 }
1651 else if (c2 < 0)
1652 {
1653 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1654 return 0;
1655 }
1656 else
1657 {
1658 /* We check the dispersion of Eth and Oth bytes where E is even and
1659 O is odd. If both are high, we assume binary data.*/
1660 unsigned char e[256], o[256];
1661 unsigned e_num = 1, o_num = 1;
1662
1663 memset (e, 0, 256);
1664 memset (o, 0, 256);
1665 e[c1] = 1;
1666 o[c2] = 1;
1667
1668 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1669 |CATEGORY_MASK_UTF_16_BE
1670 | CATEGORY_MASK_UTF_16_LE);
1671
1672 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1673 != CATEGORY_MASK_UTF_16)
1674 {
1675 TWO_MORE_BYTES (c1, c2);
1676 if (c2 < 0)
1677 break;
1678 if (! e[c1])
1679 {
1680 e[c1] = 1;
1681 e_num++;
1682 if (e_num >= 128)
1683 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1684 }
1685 if (! o[c2])
1686 {
1687 o[c2] = 1;
1688 o_num++;
1689 if (o_num >= 128)
1690 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1691 }
1692 }
1693 return 0;
1694 }
1695
1696 no_more_source:
1697 return 1;
1698 }
1699
1700 static void
1701 decode_coding_utf_16 (coding)
1702 struct coding_system *coding;
1703 {
1704 const unsigned char *src = coding->source + coding->consumed;
1705 const unsigned char *src_end = coding->source + coding->src_bytes;
1706 const unsigned char *src_base;
1707 int *charbuf = coding->charbuf + coding->charbuf_used;
1708 /* We may produces at most 3 chars in one loop. */
1709 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1710 int consumed_chars = 0, consumed_chars_base = 0;
1711 int multibytep = coding->src_multibyte;
1712 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1713 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1714 int surrogate = CODING_UTF_16_SURROGATE (coding);
1715 Lisp_Object attr, charset_list;
1716 int eol_crlf =
1717 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1718 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1719
1720 CODING_GET_INFO (coding, attr, charset_list);
1721
1722 if (bom == utf_with_bom)
1723 {
1724 int c, c1, c2;
1725
1726 src_base = src;
1727 ONE_MORE_BYTE (c1);
1728 ONE_MORE_BYTE (c2);
1729 c = (c1 << 8) | c2;
1730
1731 if (endian == utf_16_big_endian
1732 ? c != 0xFEFF : c != 0xFFFE)
1733 {
1734 /* The first two bytes are not BOM. Treat them as bytes
1735 for a normal character. */
1736 src = src_base;
1737 coding->errors++;
1738 }
1739 CODING_UTF_16_BOM (coding) = utf_without_bom;
1740 }
1741 else if (bom == utf_detect_bom)
1742 {
1743 /* We have already tried to detect BOM and failed in
1744 detect_coding. */
1745 CODING_UTF_16_BOM (coding) = utf_without_bom;
1746 }
1747
1748 while (1)
1749 {
1750 int c, c1, c2;
1751
1752 src_base = src;
1753 consumed_chars_base = consumed_chars;
1754
1755 if (charbuf >= charbuf_end)
1756 {
1757 if (byte_after_cr1 >= 0)
1758 src_base -= 2;
1759 break;
1760 }
1761
1762 if (byte_after_cr1 >= 0)
1763 c1 = byte_after_cr1, byte_after_cr1 = -1;
1764 else
1765 ONE_MORE_BYTE (c1);
1766 if (c1 < 0)
1767 {
1768 *charbuf++ = -c1;
1769 continue;
1770 }
1771 if (byte_after_cr2 >= 0)
1772 c2 = byte_after_cr2, byte_after_cr2 = -1;
1773 else
1774 ONE_MORE_BYTE (c2);
1775 if (c2 < 0)
1776 {
1777 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1778 *charbuf++ = -c2;
1779 continue;
1780 }
1781 c = (endian == utf_16_big_endian
1782 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1783
1784 if (surrogate)
1785 {
1786 if (! UTF_16_LOW_SURROGATE_P (c))
1787 {
1788 if (endian == utf_16_big_endian)
1789 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1790 else
1791 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1792 *charbuf++ = c1;
1793 *charbuf++ = c2;
1794 coding->errors++;
1795 if (UTF_16_HIGH_SURROGATE_P (c))
1796 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1797 else
1798 *charbuf++ = c;
1799 }
1800 else
1801 {
1802 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1803 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1804 *charbuf++ = 0x10000 + c;
1805 }
1806 }
1807 else
1808 {
1809 if (UTF_16_HIGH_SURROGATE_P (c))
1810 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1811 else
1812 {
1813 if (eol_crlf && c == '\r')
1814 {
1815 ONE_MORE_BYTE (byte_after_cr1);
1816 ONE_MORE_BYTE (byte_after_cr2);
1817 }
1818 *charbuf++ = c;
1819 }
1820 }
1821 }
1822
1823 no_more_source:
1824 coding->consumed_char += consumed_chars_base;
1825 coding->consumed = src_base - coding->source;
1826 coding->charbuf_used = charbuf - coding->charbuf;
1827 }
1828
1829 static int
1830 encode_coding_utf_16 (coding)
1831 struct coding_system *coding;
1832 {
1833 int multibytep = coding->dst_multibyte;
1834 int *charbuf = coding->charbuf;
1835 int *charbuf_end = charbuf + coding->charbuf_used;
1836 unsigned char *dst = coding->destination + coding->produced;
1837 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1838 int safe_room = 8;
1839 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1840 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1841 int produced_chars = 0;
1842 Lisp_Object attrs, charset_list;
1843 int c;
1844
1845 CODING_GET_INFO (coding, attrs, charset_list);
1846
1847 if (bom != utf_without_bom)
1848 {
1849 ASSURE_DESTINATION (safe_room);
1850 if (big_endian)
1851 EMIT_TWO_BYTES (0xFE, 0xFF);
1852 else
1853 EMIT_TWO_BYTES (0xFF, 0xFE);
1854 CODING_UTF_16_BOM (coding) = utf_without_bom;
1855 }
1856
1857 while (charbuf < charbuf_end)
1858 {
1859 ASSURE_DESTINATION (safe_room);
1860 c = *charbuf++;
1861 if (c >= MAX_UNICODE_CHAR)
1862 c = coding->default_char;
1863
1864 if (c < 0x10000)
1865 {
1866 if (big_endian)
1867 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1868 else
1869 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1870 }
1871 else
1872 {
1873 int c1, c2;
1874
1875 c -= 0x10000;
1876 c1 = (c >> 10) + 0xD800;
1877 c2 = (c & 0x3FF) + 0xDC00;
1878 if (big_endian)
1879 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1880 else
1881 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1882 }
1883 }
1884 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1885 coding->produced = dst - coding->destination;
1886 coding->produced_char += produced_chars;
1887 return 0;
1888 }
1889
1890 \f
1891 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1892
1893 /* Emacs' internal format for representation of multiple character
1894 sets is a kind of multi-byte encoding, i.e. characters are
1895 represented by variable-length sequences of one-byte codes.
1896
1897 ASCII characters and control characters (e.g. `tab', `newline') are
1898 represented by one-byte sequences which are their ASCII codes, in
1899 the range 0x00 through 0x7F.
1900
1901 8-bit characters of the range 0x80..0x9F are represented by
1902 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1903 code + 0x20).
1904
1905 8-bit characters of the range 0xA0..0xFF are represented by
1906 one-byte sequences which are their 8-bit code.
1907
1908 The other characters are represented by a sequence of `base
1909 leading-code', optional `extended leading-code', and one or two
1910 `position-code's. The length of the sequence is determined by the
1911 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1912 whereas extended leading-code and position-code take the range 0xA0
1913 through 0xFF. See `charset.h' for more details about leading-code
1914 and position-code.
1915
1916 --- CODE RANGE of Emacs' internal format ---
1917 character set range
1918 ------------- -----
1919 ascii 0x00..0x7F
1920 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1921 eight-bit-graphic 0xA0..0xBF
1922 ELSE 0x81..0x9D + [0xA0..0xFF]+
1923 ---------------------------------------------
1924
1925 As this is the internal character representation, the format is
1926 usually not used externally (i.e. in a file or in a data sent to a
1927 process). But, it is possible to have a text externally in this
1928 format (i.e. by encoding by the coding system `emacs-mule').
1929
1930 In that case, a sequence of one-byte codes has a slightly different
1931 form.
1932
1933 At first, all characters in eight-bit-control are represented by
1934 one-byte sequences which are their 8-bit code.
1935
1936 Next, character composition data are represented by the byte
1937 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1938 where,
1939 METHOD is 0xF2 plus one of composition method (enum
1940 composition_method),
1941
1942 BYTES is 0xA0 plus a byte length of this composition data,
1943
1944 CHARS is 0xA0 plus a number of characters composed by this
1945 data,
1946
1947 COMPONENTs are characters of multibye form or composition
1948 rules encoded by two-byte of ASCII codes.
1949
1950 In addition, for backward compatibility, the following formats are
1951 also recognized as composition data on decoding.
1952
1953 0x80 MSEQ ...
1954 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1955
1956 Here,
1957 MSEQ is a multibyte form but in these special format:
1958 ASCII: 0xA0 ASCII_CODE+0x80,
1959 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1960 RULE is a one byte code of the range 0xA0..0xF0 that
1961 represents a composition rule.
1962 */
1963
1964 char emacs_mule_bytes[256];
1965
1966
1967 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1968 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1969 else return 0. */
1970
1971 static int
1972 detect_coding_emacs_mule (coding, detect_info)
1973 struct coding_system *coding;
1974 struct coding_detection_info *detect_info;
1975 {
1976 const unsigned char *src = coding->source, *src_base;
1977 const unsigned char *src_end = coding->source + coding->src_bytes;
1978 int multibytep = coding->src_multibyte;
1979 int consumed_chars = 0;
1980 int c;
1981 int found = 0;
1982
1983 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1984 /* A coding system of this category is always ASCII compatible. */
1985 src += coding->head_ascii;
1986
1987 while (1)
1988 {
1989 src_base = src;
1990 ONE_MORE_BYTE (c);
1991 if (c < 0)
1992 continue;
1993 if (c == 0x80)
1994 {
1995 /* Perhaps the start of composite character. We simply skip
1996 it because analyzing it is too heavy for detecting. But,
1997 at least, we check that the composite character
1998 constitutes of more than 4 bytes. */
1999 const unsigned char *src_base;
2000
2001 repeat:
2002 src_base = src;
2003 do
2004 {
2005 ONE_MORE_BYTE (c);
2006 }
2007 while (c >= 0xA0);
2008
2009 if (src - src_base <= 4)
2010 break;
2011 found = CATEGORY_MASK_EMACS_MULE;
2012 if (c == 0x80)
2013 goto repeat;
2014 }
2015
2016 if (c < 0x80)
2017 {
2018 if (c < 0x20
2019 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2020 break;
2021 }
2022 else
2023 {
2024 int more_bytes = emacs_mule_bytes[*src_base] - 1;
2025
2026 while (more_bytes > 0)
2027 {
2028 ONE_MORE_BYTE (c);
2029 if (c < 0xA0)
2030 {
2031 src--; /* Unread the last byte. */
2032 break;
2033 }
2034 more_bytes--;
2035 }
2036 if (more_bytes != 0)
2037 break;
2038 found = CATEGORY_MASK_EMACS_MULE;
2039 }
2040 }
2041 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2042 return 0;
2043
2044 no_more_source:
2045 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2046 {
2047 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2048 return 0;
2049 }
2050 detect_info->found |= found;
2051 return 1;
2052 }
2053
2054
2055 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2056 character. If CMP_STATUS indicates that we must expect MSEQ or
2057 RULE described above, decode it and return the negative value of
2058 the deocded character or rule. If an invalid byte is found, return
2059 -1. If SRC is too short, return -2. */
2060
2061 int
2062 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2063 struct coding_system *coding;
2064 const unsigned char *src;
2065 int *nbytes, *nchars, *id;
2066 struct composition_status *cmp_status;
2067 {
2068 const unsigned char *src_end = coding->source + coding->src_bytes;
2069 const unsigned char *src_base = src;
2070 int multibytep = coding->src_multibyte;
2071 struct charset *charset;
2072 unsigned code;
2073 int c;
2074 int consumed_chars = 0;
2075 int mseq_found = 0;
2076
2077 ONE_MORE_BYTE (c);
2078 if (c < 0)
2079 {
2080 c = -c;
2081 charset = emacs_mule_charset[0];
2082 }
2083 else
2084 {
2085 if (c >= 0xA0)
2086 {
2087 if (cmp_status->state != COMPOSING_NO
2088 && cmp_status->old_form)
2089 {
2090 if (cmp_status->state == COMPOSING_CHAR)
2091 {
2092 if (c == 0xA0)
2093 {
2094 ONE_MORE_BYTE (c);
2095 c -= 0x80;
2096 if (c < 0)
2097 goto invalid_code;
2098 }
2099 else
2100 c -= 0x20;
2101 mseq_found = 1;
2102 }
2103 else
2104 {
2105 *nbytes = src - src_base;
2106 *nchars = consumed_chars;
2107 return -c;
2108 }
2109 }
2110 else
2111 goto invalid_code;
2112 }
2113
2114 switch (emacs_mule_bytes[c])
2115 {
2116 case 2:
2117 if (! (charset = emacs_mule_charset[c]))
2118 goto invalid_code;
2119 ONE_MORE_BYTE (c);
2120 if (c < 0xA0)
2121 goto invalid_code;
2122 code = c & 0x7F;
2123 break;
2124
2125 case 3:
2126 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2127 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2128 {
2129 ONE_MORE_BYTE (c);
2130 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2131 goto invalid_code;
2132 ONE_MORE_BYTE (c);
2133 if (c < 0xA0)
2134 goto invalid_code;
2135 code = c & 0x7F;
2136 }
2137 else
2138 {
2139 if (! (charset = emacs_mule_charset[c]))
2140 goto invalid_code;
2141 ONE_MORE_BYTE (c);
2142 if (c < 0xA0)
2143 goto invalid_code;
2144 code = (c & 0x7F) << 8;
2145 ONE_MORE_BYTE (c);
2146 if (c < 0xA0)
2147 goto invalid_code;
2148 code |= c & 0x7F;
2149 }
2150 break;
2151
2152 case 4:
2153 ONE_MORE_BYTE (c);
2154 if (c < 0 || ! (charset = emacs_mule_charset[c]))
2155 goto invalid_code;
2156 ONE_MORE_BYTE (c);
2157 if (c < 0xA0)
2158 goto invalid_code;
2159 code = (c & 0x7F) << 8;
2160 ONE_MORE_BYTE (c);
2161 if (c < 0xA0)
2162 goto invalid_code;
2163 code |= c & 0x7F;
2164 break;
2165
2166 case 1:
2167 code = c;
2168 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2169 ? charset_ascii : charset_eight_bit);
2170 break;
2171
2172 default:
2173 abort ();
2174 }
2175 c = DECODE_CHAR (charset, code);
2176 if (c < 0)
2177 goto invalid_code;
2178 }
2179 *nbytes = src - src_base;
2180 *nchars = consumed_chars;
2181 if (id)
2182 *id = charset->id;
2183 return (mseq_found ? -c : c);
2184
2185 no_more_source:
2186 return -2;
2187
2188 invalid_code:
2189 return -1;
2190 }
2191
2192
2193 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2194
2195 /* Handle these composition sequence ('|': the end of header elements,
2196 BYTES and CHARS >= 0xA0):
2197
2198 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2199 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2200 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2201
2202 and these old form:
2203
2204 (4) relative composition: 0x80 | MSEQ ... MSEQ
2205 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2206
2207 When the starter 0x80 and the following header elements are found,
2208 this annotation header is produced.
2209
2210 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2211
2212 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2213 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2214
2215 Then, upon reading the following elements, these codes are produced
2216 until the composition end is found:
2217
2218 (1) CHAR ... CHAR
2219 (2) ALT ... ALT CHAR ... CHAR
2220 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2221 (4) CHAR ... CHAR
2222 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2223
2224 When the composition end is found, LENGTH and NCHARS in the
2225 annotation header is updated as below:
2226
2227 (1) LENGTH: unchanged, NCHARS: unchanged
2228 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2229 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2230 (4) LENGTH: unchanged, NCHARS: number of CHARs
2231 (5) LENGTH: unchanged, NCHARS: number of CHARs
2232
2233 If an error is found while composing, the annotation header is
2234 changed to the original composition header (plus filler -1s) as
2235 below:
2236
2237 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2238 (5) [ 0x80 0xFF -1 -1- -1 ]
2239
2240 and the sequence [ -2 DECODED-RULE ] is changed to the original
2241 byte sequence as below:
2242 o the original byte sequence is B: [ B -1 ]
2243 o the original byte sequence is B1 B2: [ B1 B2 ]
2244
2245 Most of the routines are implemented by macros because many
2246 variables and labels in the caller decode_coding_emacs_mule must be
2247 accessible, and they are usually called just once (thus doesn't
2248 increase the size of compiled object). */
2249
2250 /* Decode a composition rule represented by C as a component of
2251 composition sequence of Emacs 20 style. Set RULE to the decoded
2252 rule. */
2253
2254 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2255 do { \
2256 int gref, nref; \
2257 \
2258 c -= 0xA0; \
2259 if (c < 0 || c >= 81) \
2260 goto invalid_code; \
2261 gref = c / 9, nref = c % 9; \
2262 if (gref == 4) gref = 10; \
2263 if (nref == 4) nref = 10; \
2264 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2265 } while (0)
2266
2267
2268 /* Decode a composition rule represented by C and the following byte
2269 at SRC as a component of composition sequence of Emacs 21 style.
2270 Set RULE to the decoded rule. */
2271
2272 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2273 do { \
2274 int gref, nref; \
2275 \
2276 gref = c - 0x20; \
2277 if (gref < 0 || gref >= 81) \
2278 goto invalid_code; \
2279 ONE_MORE_BYTE (c); \
2280 nref = c - 0x20; \
2281 if (nref < 0 || nref >= 81) \
2282 goto invalid_code; \
2283 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2284 } while (0)
2285
2286
2287 /* Start of Emacs 21 style format. The first three bytes at SRC are
2288 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2289 byte length of this composition information, CHARS is the number of
2290 characters composed by this composition. */
2291
2292 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2293 do { \
2294 enum composition_method method = c - 0xF2; \
2295 int *charbuf_base = charbuf; \
2296 int nbytes, nchars; \
2297 \
2298 ONE_MORE_BYTE (c); \
2299 if (c < 0) \
2300 goto invalid_code; \
2301 nbytes = c - 0xA0; \
2302 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2303 goto invalid_code; \
2304 ONE_MORE_BYTE (c); \
2305 nchars = c - 0xA0; \
2306 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2307 goto invalid_code; \
2308 cmp_status->old_form = 0; \
2309 cmp_status->method = method; \
2310 if (method == COMPOSITION_RELATIVE) \
2311 cmp_status->state = COMPOSING_CHAR; \
2312 else \
2313 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2314 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2315 cmp_status->nchars = nchars; \
2316 cmp_status->ncomps = nbytes - 4; \
2317 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2318 } while (0)
2319
2320
2321 /* Start of Emacs 20 style format for relative composition. */
2322
2323 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2324 do { \
2325 cmp_status->old_form = 1; \
2326 cmp_status->method = COMPOSITION_RELATIVE; \
2327 cmp_status->state = COMPOSING_CHAR; \
2328 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2329 cmp_status->nchars = cmp_status->ncomps = 0; \
2330 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2331 } while (0)
2332
2333
2334 /* Start of Emacs 20 style format for rule-base composition. */
2335
2336 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2337 do { \
2338 cmp_status->old_form = 1; \
2339 cmp_status->method = COMPOSITION_WITH_RULE; \
2340 cmp_status->state = COMPOSING_CHAR; \
2341 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2342 cmp_status->nchars = cmp_status->ncomps = 0; \
2343 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2344 } while (0)
2345
2346
2347 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2348 do { \
2349 const unsigned char *current_src = src; \
2350 \
2351 ONE_MORE_BYTE (c); \
2352 if (c < 0) \
2353 goto invalid_code; \
2354 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2355 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2356 DECODE_EMACS_MULE_21_COMPOSITION (); \
2357 else if (c < 0xA0) \
2358 goto invalid_code; \
2359 else if (c < 0xC0) \
2360 { \
2361 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2362 /* Re-read C as a composition component. */ \
2363 src = current_src; \
2364 } \
2365 else if (c == 0xFF) \
2366 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2367 else \
2368 goto invalid_code; \
2369 } while (0)
2370
2371 #define EMACS_MULE_COMPOSITION_END() \
2372 do { \
2373 int idx = - cmp_status->length; \
2374 \
2375 if (cmp_status->old_form) \
2376 charbuf[idx + 2] = cmp_status->nchars; \
2377 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2378 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2379 cmp_status->state = COMPOSING_NO; \
2380 } while (0)
2381
2382
2383 static int
2384 emacs_mule_finish_composition (charbuf, cmp_status)
2385 int *charbuf;
2386 struct composition_status *cmp_status;
2387 {
2388 int idx = - cmp_status->length;
2389 int new_chars;
2390
2391 if (cmp_status->old_form && cmp_status->nchars > 0)
2392 {
2393 charbuf[idx + 2] = cmp_status->nchars;
2394 new_chars = 0;
2395 if (cmp_status->method == COMPOSITION_WITH_RULE
2396 && cmp_status->state == COMPOSING_CHAR)
2397 {
2398 /* The last rule was invalid. */
2399 int rule = charbuf[-1] + 0xA0;
2400
2401 charbuf[-2] = BYTE8_TO_CHAR (rule);
2402 charbuf[-1] = -1;
2403 new_chars = 1;
2404 }
2405 }
2406 else
2407 {
2408 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2409
2410 if (cmp_status->method == COMPOSITION_WITH_RULE)
2411 {
2412 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2413 charbuf[idx++] = -3;
2414 charbuf[idx++] = 0;
2415 new_chars = 1;
2416 }
2417 else
2418 {
2419 int nchars = charbuf[idx + 1] + 0xA0;
2420 int nbytes = charbuf[idx + 2] + 0xA0;
2421
2422 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2423 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2424 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2425 charbuf[idx++] = -1;
2426 new_chars = 4;
2427 }
2428 }
2429 cmp_status->state = COMPOSING_NO;
2430 return new_chars;
2431 }
2432
2433 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2434 do { \
2435 if (cmp_status->state != COMPOSING_NO) \
2436 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2437 } while (0)
2438
2439
2440 static void
2441 decode_coding_emacs_mule (coding)
2442 struct coding_system *coding;
2443 {
2444 const unsigned char *src = coding->source + coding->consumed;
2445 const unsigned char *src_end = coding->source + coding->src_bytes;
2446 const unsigned char *src_base;
2447 int *charbuf = coding->charbuf + coding->charbuf_used;
2448 /* We may produce two annocations (charset and composition) in one
2449 loop and one more charset annocation at the end. */
2450 int *charbuf_end
2451 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2452 int consumed_chars = 0, consumed_chars_base;
2453 int multibytep = coding->src_multibyte;
2454 Lisp_Object attrs, charset_list;
2455 int char_offset = coding->produced_char;
2456 int last_offset = char_offset;
2457 int last_id = charset_ascii;
2458 int eol_crlf =
2459 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2460 int byte_after_cr = -1;
2461 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2462
2463 CODING_GET_INFO (coding, attrs, charset_list);
2464
2465 if (cmp_status->state != COMPOSING_NO)
2466 {
2467 int i;
2468
2469 for (i = 0; i < cmp_status->length; i++)
2470 *charbuf++ = cmp_status->carryover[i];
2471 coding->annotated = 1;
2472 }
2473
2474 while (1)
2475 {
2476 int c, id;
2477
2478 src_base = src;
2479 consumed_chars_base = consumed_chars;
2480
2481 if (charbuf >= charbuf_end)
2482 {
2483 if (byte_after_cr >= 0)
2484 src_base--;
2485 break;
2486 }
2487
2488 if (byte_after_cr >= 0)
2489 c = byte_after_cr, byte_after_cr = -1;
2490 else
2491 ONE_MORE_BYTE (c);
2492
2493 if (c < 0 || c == 0x80)
2494 {
2495 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2496 if (c < 0)
2497 {
2498 *charbuf++ = -c;
2499 char_offset++;
2500 }
2501 else
2502 DECODE_EMACS_MULE_COMPOSITION_START ();
2503 continue;
2504 }
2505
2506 if (c < 0x80)
2507 {
2508 if (eol_crlf && c == '\r')
2509 ONE_MORE_BYTE (byte_after_cr);
2510 id = charset_ascii;
2511 if (cmp_status->state != COMPOSING_NO)
2512 {
2513 if (cmp_status->old_form)
2514 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2515 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2516 cmp_status->ncomps--;
2517 }
2518 }
2519 else
2520 {
2521 int nchars, nbytes;
2522
2523 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2524 cmp_status);
2525 if (c < 0)
2526 {
2527 if (c == -1)
2528 goto invalid_code;
2529 if (c == -2)
2530 break;
2531 }
2532 src = src_base + nbytes;
2533 consumed_chars = consumed_chars_base + nchars;
2534 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2535 cmp_status->ncomps -= nchars;
2536 }
2537
2538 /* Now if C >= 0, we found a normally encoded characer, if C <
2539 0, we found an old-style composition component character or
2540 rule. */
2541
2542 if (cmp_status->state == COMPOSING_NO)
2543 {
2544 if (last_id != id)
2545 {
2546 if (last_id != charset_ascii)
2547 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2548 last_id);
2549 last_id = id;
2550 last_offset = char_offset;
2551 }
2552 *charbuf++ = c;
2553 char_offset++;
2554 }
2555 else if (cmp_status->state == COMPOSING_CHAR)
2556 {
2557 if (cmp_status->old_form)
2558 {
2559 if (c >= 0)
2560 {
2561 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2562 *charbuf++ = c;
2563 char_offset++;
2564 }
2565 else
2566 {
2567 *charbuf++ = -c;
2568 cmp_status->nchars++;
2569 cmp_status->length++;
2570 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2571 EMACS_MULE_COMPOSITION_END ();
2572 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2573 cmp_status->state = COMPOSING_RULE;
2574 }
2575 }
2576 else
2577 {
2578 *charbuf++ = c;
2579 cmp_status->length++;
2580 cmp_status->nchars--;
2581 if (cmp_status->nchars == 0)
2582 EMACS_MULE_COMPOSITION_END ();
2583 }
2584 }
2585 else if (cmp_status->state == COMPOSING_RULE)
2586 {
2587 int rule;
2588
2589 if (c >= 0)
2590 {
2591 EMACS_MULE_COMPOSITION_END ();
2592 *charbuf++ = c;
2593 char_offset++;
2594 }
2595 else
2596 {
2597 c = -c;
2598 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2599 if (rule < 0)
2600 goto invalid_code;
2601 *charbuf++ = -2;
2602 *charbuf++ = rule;
2603 cmp_status->length += 2;
2604 cmp_status->state = COMPOSING_CHAR;
2605 }
2606 }
2607 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2608 {
2609 *charbuf++ = c;
2610 cmp_status->length++;
2611 if (cmp_status->ncomps == 0)
2612 cmp_status->state = COMPOSING_CHAR;
2613 else if (cmp_status->ncomps > 0)
2614 {
2615 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2616 cmp_status->state = COMPOSING_COMPONENT_RULE;
2617 }
2618 else
2619 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2620 }
2621 else /* COMPOSING_COMPONENT_RULE */
2622 {
2623 int rule;
2624
2625 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2626 if (rule < 0)
2627 goto invalid_code;
2628 *charbuf++ = -2;
2629 *charbuf++ = rule;
2630 cmp_status->length += 2;
2631 cmp_status->ncomps--;
2632 if (cmp_status->ncomps > 0)
2633 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2634 else
2635 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2636 }
2637 continue;
2638
2639 retry:
2640 src = src_base;
2641 consumed_chars = consumed_chars_base;
2642 continue;
2643
2644 invalid_code:
2645 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2646 src = src_base;
2647 consumed_chars = consumed_chars_base;
2648 ONE_MORE_BYTE (c);
2649 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2650 char_offset++;
2651 coding->errors++;
2652 }
2653
2654 no_more_source:
2655 if (cmp_status->state != COMPOSING_NO)
2656 {
2657 if (coding->mode & CODING_MODE_LAST_BLOCK)
2658 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2659 else
2660 {
2661 int i;
2662
2663 charbuf -= cmp_status->length;
2664 for (i = 0; i < cmp_status->length; i++)
2665 cmp_status->carryover[i] = charbuf[i];
2666 }
2667 }
2668 if (last_id != charset_ascii)
2669 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2670 coding->consumed_char += consumed_chars_base;
2671 coding->consumed = src_base - coding->source;
2672 coding->charbuf_used = charbuf - coding->charbuf;
2673 }
2674
2675
2676 #define EMACS_MULE_LEADING_CODES(id, codes) \
2677 do { \
2678 if (id < 0xA0) \
2679 codes[0] = id, codes[1] = 0; \
2680 else if (id < 0xE0) \
2681 codes[0] = 0x9A, codes[1] = id; \
2682 else if (id < 0xF0) \
2683 codes[0] = 0x9B, codes[1] = id; \
2684 else if (id < 0xF5) \
2685 codes[0] = 0x9C, codes[1] = id; \
2686 else \
2687 codes[0] = 0x9D, codes[1] = id; \
2688 } while (0);
2689
2690
2691 static int
2692 encode_coding_emacs_mule (coding)
2693 struct coding_system *coding;
2694 {
2695 int multibytep = coding->dst_multibyte;
2696 int *charbuf = coding->charbuf;
2697 int *charbuf_end = charbuf + coding->charbuf_used;
2698 unsigned char *dst = coding->destination + coding->produced;
2699 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2700 int safe_room = 8;
2701 int produced_chars = 0;
2702 Lisp_Object attrs, charset_list;
2703 int c;
2704 int preferred_charset_id = -1;
2705
2706 CODING_GET_INFO (coding, attrs, charset_list);
2707 if (! EQ (charset_list, Vemacs_mule_charset_list))
2708 {
2709 CODING_ATTR_CHARSET_LIST (attrs)
2710 = charset_list = Vemacs_mule_charset_list;
2711 }
2712
2713 while (charbuf < charbuf_end)
2714 {
2715 ASSURE_DESTINATION (safe_room);
2716 c = *charbuf++;
2717
2718 if (c < 0)
2719 {
2720 /* Handle an annotation. */
2721 switch (*charbuf)
2722 {
2723 case CODING_ANNOTATE_COMPOSITION_MASK:
2724 /* Not yet implemented. */
2725 break;
2726 case CODING_ANNOTATE_CHARSET_MASK:
2727 preferred_charset_id = charbuf[3];
2728 if (preferred_charset_id >= 0
2729 && NILP (Fmemq (make_number (preferred_charset_id),
2730 charset_list)))
2731 preferred_charset_id = -1;
2732 break;
2733 default:
2734 abort ();
2735 }
2736 charbuf += -c - 1;
2737 continue;
2738 }
2739
2740 if (ASCII_CHAR_P (c))
2741 EMIT_ONE_ASCII_BYTE (c);
2742 else if (CHAR_BYTE8_P (c))
2743 {
2744 c = CHAR_TO_BYTE8 (c);
2745 EMIT_ONE_BYTE (c);
2746 }
2747 else
2748 {
2749 struct charset *charset;
2750 unsigned code;
2751 int dimension;
2752 int emacs_mule_id;
2753 unsigned char leading_codes[2];
2754
2755 if (preferred_charset_id >= 0)
2756 {
2757 charset = CHARSET_FROM_ID (preferred_charset_id);
2758 if (CHAR_CHARSET_P (c, charset))
2759 code = ENCODE_CHAR (charset, c);
2760 else
2761 charset = char_charset (c, charset_list, &code);
2762 }
2763 else
2764 charset = char_charset (c, charset_list, &code);
2765 if (! charset)
2766 {
2767 c = coding->default_char;
2768 if (ASCII_CHAR_P (c))
2769 {
2770 EMIT_ONE_ASCII_BYTE (c);
2771 continue;
2772 }
2773 charset = char_charset (c, charset_list, &code);
2774 }
2775 dimension = CHARSET_DIMENSION (charset);
2776 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2777 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2778 EMIT_ONE_BYTE (leading_codes[0]);
2779 if (leading_codes[1])
2780 EMIT_ONE_BYTE (leading_codes[1]);
2781 if (dimension == 1)
2782 EMIT_ONE_BYTE (code | 0x80);
2783 else
2784 {
2785 code |= 0x8080;
2786 EMIT_ONE_BYTE (code >> 8);
2787 EMIT_ONE_BYTE (code & 0xFF);
2788 }
2789 }
2790 }
2791 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2792 coding->produced_char += produced_chars;
2793 coding->produced = dst - coding->destination;
2794 return 0;
2795 }
2796
2797 \f
2798 /*** 7. ISO2022 handlers ***/
2799
2800 /* The following note describes the coding system ISO2022 briefly.
2801 Since the intention of this note is to help understand the
2802 functions in this file, some parts are NOT ACCURATE or are OVERLY
2803 SIMPLIFIED. For thorough understanding, please refer to the
2804 original document of ISO2022. This is equivalent to the standard
2805 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2806
2807 ISO2022 provides many mechanisms to encode several character sets
2808 in 7-bit and 8-bit environments. For 7-bit environments, all text
2809 is encoded using bytes less than 128. This may make the encoded
2810 text a little bit longer, but the text passes more easily through
2811 several types of gateway, some of which strip off the MSB (Most
2812 Significant Bit).
2813
2814 There are two kinds of character sets: control character sets and
2815 graphic character sets. The former contain control characters such
2816 as `newline' and `escape' to provide control functions (control
2817 functions are also provided by escape sequences). The latter
2818 contain graphic characters such as 'A' and '-'. Emacs recognizes
2819 two control character sets and many graphic character sets.
2820
2821 Graphic character sets are classified into one of the following
2822 four classes, according to the number of bytes (DIMENSION) and
2823 number of characters in one dimension (CHARS) of the set:
2824 - DIMENSION1_CHARS94
2825 - DIMENSION1_CHARS96
2826 - DIMENSION2_CHARS94
2827 - DIMENSION2_CHARS96
2828
2829 In addition, each character set is assigned an identification tag,
2830 unique for each set, called the "final character" (denoted as <F>
2831 hereafter). The <F> of each character set is decided by ECMA(*)
2832 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2833 (0x30..0x3F are for private use only).
2834
2835 Note (*): ECMA = European Computer Manufacturers Association
2836
2837 Here are examples of graphic character sets [NAME(<F>)]:
2838 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2839 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2840 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2841 o DIMENSION2_CHARS96 -- none for the moment
2842
2843 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2844 C0 [0x00..0x1F] -- control character plane 0
2845 GL [0x20..0x7F] -- graphic character plane 0
2846 C1 [0x80..0x9F] -- control character plane 1
2847 GR [0xA0..0xFF] -- graphic character plane 1
2848
2849 A control character set is directly designated and invoked to C0 or
2850 C1 by an escape sequence. The most common case is that:
2851 - ISO646's control character set is designated/invoked to C0, and
2852 - ISO6429's control character set is designated/invoked to C1,
2853 and usually these designations/invocations are omitted in encoded
2854 text. In a 7-bit environment, only C0 can be used, and a control
2855 character for C1 is encoded by an appropriate escape sequence to
2856 fit into the environment. All control characters for C1 are
2857 defined to have corresponding escape sequences.
2858
2859 A graphic character set is at first designated to one of four
2860 graphic registers (G0 through G3), then these graphic registers are
2861 invoked to GL or GR. These designations and invocations can be
2862 done independently. The most common case is that G0 is invoked to
2863 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2864 these invocations and designations are omitted in encoded text.
2865 In a 7-bit environment, only GL can be used.
2866
2867 When a graphic character set of CHARS94 is invoked to GL, codes
2868 0x20 and 0x7F of the GL area work as control characters SPACE and
2869 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2870 be used.
2871
2872 There are two ways of invocation: locking-shift and single-shift.
2873 With locking-shift, the invocation lasts until the next different
2874 invocation, whereas with single-shift, the invocation affects the
2875 following character only and doesn't affect the locking-shift
2876 state. Invocations are done by the following control characters or
2877 escape sequences:
2878
2879 ----------------------------------------------------------------------
2880 abbrev function cntrl escape seq description
2881 ----------------------------------------------------------------------
2882 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2883 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2884 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2885 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2886 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2887 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2888 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2889 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2890 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2891 ----------------------------------------------------------------------
2892 (*) These are not used by any known coding system.
2893
2894 Control characters for these functions are defined by macros
2895 ISO_CODE_XXX in `coding.h'.
2896
2897 Designations are done by the following escape sequences:
2898 ----------------------------------------------------------------------
2899 escape sequence description
2900 ----------------------------------------------------------------------
2901 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2902 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2903 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2904 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2905 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2906 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2907 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2908 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2909 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2910 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2911 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2912 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2913 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2914 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2915 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2916 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2917 ----------------------------------------------------------------------
2918
2919 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2920 of dimension 1, chars 94, and final character <F>, etc...
2921
2922 Note (*): Although these designations are not allowed in ISO2022,
2923 Emacs accepts them on decoding, and produces them on encoding
2924 CHARS96 character sets in a coding system which is characterized as
2925 7-bit environment, non-locking-shift, and non-single-shift.
2926
2927 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2928 '(' must be omitted. We refer to this as "short-form" hereafter.
2929
2930 Now you may notice that there are a lot of ways of encoding the
2931 same multilingual text in ISO2022. Actually, there exist many
2932 coding systems such as Compound Text (used in X11's inter client
2933 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2934 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2935 localized platforms), and all of these are variants of ISO2022.
2936
2937 In addition to the above, Emacs handles two more kinds of escape
2938 sequences: ISO6429's direction specification and Emacs' private
2939 sequence for specifying character composition.
2940
2941 ISO6429's direction specification takes the following form:
2942 o CSI ']' -- end of the current direction
2943 o CSI '0' ']' -- end of the current direction
2944 o CSI '1' ']' -- start of left-to-right text
2945 o CSI '2' ']' -- start of right-to-left text
2946 The control character CSI (0x9B: control sequence introducer) is
2947 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2948
2949 Character composition specification takes the following form:
2950 o ESC '0' -- start relative composition
2951 o ESC '1' -- end composition
2952 o ESC '2' -- start rule-base composition (*)
2953 o ESC '3' -- start relative composition with alternate chars (**)
2954 o ESC '4' -- start rule-base composition with alternate chars (**)
2955 Since these are not standard escape sequences of any ISO standard,
2956 the use of them with these meanings is restricted to Emacs only.
2957
2958 (*) This form is used only in Emacs 20.7 and older versions,
2959 but newer versions can safely decode it.
2960 (**) This form is used only in Emacs 21.1 and newer versions,
2961 and older versions can't decode it.
2962
2963 Here's a list of example usages of these composition escape
2964 sequences (categorized by `enum composition_method').
2965
2966 COMPOSITION_RELATIVE:
2967 ESC 0 CHAR [ CHAR ] ESC 1
2968 COMPOSITION_WITH_RULE:
2969 ESC 2 CHAR [ RULE CHAR ] ESC 1
2970 COMPOSITION_WITH_ALTCHARS:
2971 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2972 COMPOSITION_WITH_RULE_ALTCHARS:
2973 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2974
2975 enum iso_code_class_type iso_code_class[256];
2976
2977 #define SAFE_CHARSET_P(coding, id) \
2978 ((id) <= (coding)->max_charset_id \
2979 && (coding)->safe_charsets[id] != 255)
2980
2981
2982 #define SHIFT_OUT_OK(category) \
2983 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2984
2985 static void
2986 setup_iso_safe_charsets (attrs)
2987 Lisp_Object attrs;
2988 {
2989 Lisp_Object charset_list, safe_charsets;
2990 Lisp_Object request;
2991 Lisp_Object reg_usage;
2992 Lisp_Object tail;
2993 int reg94, reg96;
2994 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2995 int max_charset_id;
2996
2997 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2998 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2999 && ! EQ (charset_list, Viso_2022_charset_list))
3000 {
3001 CODING_ATTR_CHARSET_LIST (attrs)
3002 = charset_list = Viso_2022_charset_list;
3003 ASET (attrs, coding_attr_safe_charsets, Qnil);
3004 }
3005
3006 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3007 return;
3008
3009 max_charset_id = 0;
3010 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3011 {
3012 int id = XINT (XCAR (tail));
3013 if (max_charset_id < id)
3014 max_charset_id = id;
3015 }
3016
3017 safe_charsets = make_uninit_string (max_charset_id + 1);
3018 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3019 request = AREF (attrs, coding_attr_iso_request);
3020 reg_usage = AREF (attrs, coding_attr_iso_usage);
3021 reg94 = XINT (XCAR (reg_usage));
3022 reg96 = XINT (XCDR (reg_usage));
3023
3024 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3025 {
3026 Lisp_Object id;
3027 Lisp_Object reg;
3028 struct charset *charset;
3029
3030 id = XCAR (tail);
3031 charset = CHARSET_FROM_ID (XINT (id));
3032 reg = Fcdr (Fassq (id, request));
3033 if (! NILP (reg))
3034 SSET (safe_charsets, XINT (id), XINT (reg));
3035 else if (charset->iso_chars_96)
3036 {
3037 if (reg96 < 4)
3038 SSET (safe_charsets, XINT (id), reg96);
3039 }
3040 else
3041 {
3042 if (reg94 < 4)
3043 SSET (safe_charsets, XINT (id), reg94);
3044 }
3045 }
3046 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3047 }
3048
3049
3050 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3051 Check if a text is encoded in one of ISO-2022 based codig systems.
3052 If it is, return 1, else return 0. */
3053
3054 static int
3055 detect_coding_iso_2022 (coding, detect_info)
3056 struct coding_system *coding;
3057 struct coding_detection_info *detect_info;
3058 {
3059 const unsigned char *src = coding->source, *src_base = src;
3060 const unsigned char *src_end = coding->source + coding->src_bytes;
3061 int multibytep = coding->src_multibyte;
3062 int single_shifting = 0;
3063 int id;
3064 int c, c1;
3065 int consumed_chars = 0;
3066 int i;
3067 int rejected = 0;
3068 int found = 0;
3069 int composition_count = -1;
3070
3071 detect_info->checked |= CATEGORY_MASK_ISO;
3072
3073 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3074 {
3075 struct coding_system *this = &(coding_categories[i]);
3076 Lisp_Object attrs, val;
3077
3078 if (this->id < 0)
3079 continue;
3080 attrs = CODING_ID_ATTRS (this->id);
3081 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3082 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3083 setup_iso_safe_charsets (attrs);
3084 val = CODING_ATTR_SAFE_CHARSETS (attrs);
3085 this->max_charset_id = SCHARS (val) - 1;
3086 this->safe_charsets = SDATA (val);
3087 }
3088
3089 /* A coding system of this category is always ASCII compatible. */
3090 src += coding->head_ascii;
3091
3092 while (rejected != CATEGORY_MASK_ISO)
3093 {
3094 src_base = src;
3095 ONE_MORE_BYTE (c);
3096 switch (c)
3097 {
3098 case ISO_CODE_ESC:
3099 if (inhibit_iso_escape_detection)
3100 break;
3101 single_shifting = 0;
3102 ONE_MORE_BYTE (c);
3103 if (c >= '(' && c <= '/')
3104 {
3105 /* Designation sequence for a charset of dimension 1. */
3106 ONE_MORE_BYTE (c1);
3107 if (c1 < ' ' || c1 >= 0x80
3108 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3109 /* Invalid designation sequence. Just ignore. */
3110 break;
3111 }
3112 else if (c == '$')
3113 {
3114 /* Designation sequence for a charset of dimension 2. */
3115 ONE_MORE_BYTE (c);
3116 if (c >= '@' && c <= 'B')
3117 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3118 id = iso_charset_table[1][0][c];
3119 else if (c >= '(' && c <= '/')
3120 {
3121 ONE_MORE_BYTE (c1);
3122 if (c1 < ' ' || c1 >= 0x80
3123 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3124 /* Invalid designation sequence. Just ignore. */
3125 break;
3126 }
3127 else
3128 /* Invalid designation sequence. Just ignore it. */
3129 break;
3130 }
3131 else if (c == 'N' || c == 'O')
3132 {
3133 /* ESC <Fe> for SS2 or SS3. */
3134 single_shifting = 1;
3135 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3136 break;
3137 }
3138 else if (c == '1')
3139 {
3140 /* End of composition. */
3141 if (composition_count < 0
3142 || composition_count > MAX_COMPOSITION_COMPONENTS)
3143 /* Invalid */
3144 break;
3145 composition_count = -1;
3146 found |= CATEGORY_MASK_ISO;
3147 }
3148 else if (c >= '0' && c <= '4')
3149 {
3150 /* ESC <Fp> for start/end composition. */
3151 composition_count = 0;
3152 break;
3153 }
3154 else
3155 {
3156 /* Invalid escape sequence. Just ignore it. */
3157 break;
3158 }
3159
3160 /* We found a valid designation sequence for CHARSET. */
3161 rejected |= CATEGORY_MASK_ISO_8BIT;
3162 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3163 id))
3164 found |= CATEGORY_MASK_ISO_7;
3165 else
3166 rejected |= CATEGORY_MASK_ISO_7;
3167 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3168 id))
3169 found |= CATEGORY_MASK_ISO_7_TIGHT;
3170 else
3171 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3172 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3173 id))
3174 found |= CATEGORY_MASK_ISO_7_ELSE;
3175 else
3176 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3177 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3178 id))
3179 found |= CATEGORY_MASK_ISO_8_ELSE;
3180 else
3181 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3182 break;
3183
3184 case ISO_CODE_SO:
3185 case ISO_CODE_SI:
3186 /* Locking shift out/in. */
3187 if (inhibit_iso_escape_detection)
3188 break;
3189 single_shifting = 0;
3190 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3191 break;
3192
3193 case ISO_CODE_CSI:
3194 /* Control sequence introducer. */
3195 single_shifting = 0;
3196 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3197 found |= CATEGORY_MASK_ISO_8_ELSE;
3198 goto check_extra_latin;
3199
3200 case ISO_CODE_SS2:
3201 case ISO_CODE_SS3:
3202 /* Single shift. */
3203 if (inhibit_iso_escape_detection)
3204 break;
3205 single_shifting = 0;
3206 rejected |= CATEGORY_MASK_ISO_7BIT;
3207 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3208 & CODING_ISO_FLAG_SINGLE_SHIFT)
3209 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3210 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3211 & CODING_ISO_FLAG_SINGLE_SHIFT)
3212 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3213 if (single_shifting)
3214 break;
3215 goto check_extra_latin;
3216
3217 default:
3218 if (c < 0)
3219 continue;
3220 if (c < 0x80)
3221 {
3222 if (composition_count >= 0)
3223 composition_count++;
3224 single_shifting = 0;
3225 break;
3226 }
3227 if (c >= 0xA0)
3228 {
3229 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3230 found |= CATEGORY_MASK_ISO_8_1;
3231 /* Check the length of succeeding codes of the range
3232 0xA0..0FF. If the byte length is even, we include
3233 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3234 only when we are not single shifting. */
3235 if (! single_shifting
3236 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3237 {
3238 int i = 1;
3239 while (src < src_end)
3240 {
3241 ONE_MORE_BYTE (c);
3242 if (c < 0xA0)
3243 break;
3244 i++;
3245 }
3246
3247 if (i & 1 && src < src_end)
3248 {
3249 rejected |= CATEGORY_MASK_ISO_8_2;
3250 if (composition_count >= 0)
3251 composition_count += i;
3252 }
3253 else
3254 {
3255 found |= CATEGORY_MASK_ISO_8_2;
3256 if (composition_count >= 0)
3257 composition_count += i / 2;
3258 }
3259 }
3260 break;
3261 }
3262 check_extra_latin:
3263 single_shifting = 0;
3264 if (! VECTORP (Vlatin_extra_code_table)
3265 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3266 {
3267 rejected = CATEGORY_MASK_ISO;
3268 break;
3269 }
3270 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3271 & CODING_ISO_FLAG_LATIN_EXTRA)
3272 found |= CATEGORY_MASK_ISO_8_1;
3273 else
3274 rejected |= CATEGORY_MASK_ISO_8_1;
3275 rejected |= CATEGORY_MASK_ISO_8_2;
3276 }
3277 }
3278 detect_info->rejected |= CATEGORY_MASK_ISO;
3279 return 0;
3280
3281 no_more_source:
3282 detect_info->rejected |= rejected;
3283 detect_info->found |= (found & ~rejected);
3284 return 1;
3285 }
3286
3287
3288 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3289 escape sequence should be kept. */
3290 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3291 do { \
3292 int id, prev; \
3293 \
3294 if (final < '0' || final >= 128 \
3295 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3296 || !SAFE_CHARSET_P (coding, id)) \
3297 { \
3298 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3299 chars_96 = -1; \
3300 break; \
3301 } \
3302 prev = CODING_ISO_DESIGNATION (coding, reg); \
3303 if (id == charset_jisx0201_roman) \
3304 { \
3305 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3306 id = charset_ascii; \
3307 } \
3308 else if (id == charset_jisx0208_1978) \
3309 { \
3310 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3311 id = charset_jisx0208; \
3312 } \
3313 CODING_ISO_DESIGNATION (coding, reg) = id; \
3314 /* If there was an invalid designation to REG previously, and this \
3315 designation is ASCII to REG, we should keep this designation \
3316 sequence. */ \
3317 if (prev == -2 && id == charset_ascii) \
3318 chars_96 = -1; \
3319 } while (0)
3320
3321
3322 /* Handle these composition sequence (ALT: alternate char):
3323
3324 (1) relative composition: ESC 0 CHAR ... ESC 1
3325 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3326 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3327 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3328
3329 When the start sequence (ESC 0/2/3/4) is found, this annotation
3330 header is produced.
3331
3332 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3333
3334 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3335 produced until the end sequence (ESC 1) is found:
3336
3337 (1) CHAR ... CHAR
3338 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3339 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3340 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3341
3342 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3343 annotation header is updated as below:
3344
3345 (1) LENGTH: unchanged, NCHARS: number of CHARs
3346 (2) LENGTH: unchanged, NCHARS: number of CHARs
3347 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3348 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3349
3350 If an error is found while composing, the annotation header is
3351 changed to:
3352
3353 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3354
3355 and the sequence [ -2 DECODED-RULE ] is changed to the original
3356 byte sequence as below:
3357 o the original byte sequence is B: [ B -1 ]
3358 o the original byte sequence is B1 B2: [ B1 B2 ]
3359 and the sequence [ -1 -1 ] is changed to the original byte
3360 sequence:
3361 [ ESC '0' ]
3362 */
3363
3364 /* Decode a composition rule C1 and maybe one more byte from the
3365 source, and set RULE to the encoded composition rule, NBYTES to the
3366 length of the composition rule. If the rule is invalid, set RULE
3367 to some negative value. */
3368
3369 #define DECODE_COMPOSITION_RULE(rule, nbytes) \
3370 do { \
3371 rule = c1 - 32; \
3372 if (rule < 0) \
3373 break; \
3374 if (rule < 81) /* old format (before ver.21) */ \
3375 { \
3376 int gref = (rule) / 9; \
3377 int nref = (rule) % 9; \
3378 if (gref == 4) gref = 10; \
3379 if (nref == 4) nref = 10; \
3380 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3381 nbytes = 1; \
3382 } \
3383 else /* new format (after ver.21) */ \
3384 { \
3385 int c; \
3386 \
3387 ONE_MORE_BYTE (c); \
3388 rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32); \
3389 if (rule >= 0) \
3390 rule += 0x100; /* to destinguish it from the old format */ \
3391 nbytes = 2; \
3392 } \
3393 } while (0)
3394
3395 #define ENCODE_COMPOSITION_RULE(rule) \
3396 do { \
3397 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3398 \
3399 if (rule < 0x100) /* old format */ \
3400 { \
3401 if (gref == 10) gref = 4; \
3402 if (nref == 10) nref = 4; \
3403 charbuf[idx] = 32 + gref * 9 + nref; \
3404 charbuf[idx + 1] = -1; \
3405 new_chars++; \
3406 } \
3407 else /* new format */ \
3408 { \
3409 charbuf[idx] = 32 + 81 + gref; \
3410 charbuf[idx + 1] = 32 + nref; \
3411 new_chars += 2; \
3412 } \
3413 } while (0)
3414
3415 /* Finish the current composition as invalid. */
3416
3417 static int finish_composition P_ ((int *, struct composition_status *));
3418
3419 static int
3420 finish_composition (charbuf, cmp_status)
3421 int *charbuf;
3422 struct composition_status *cmp_status;
3423 {
3424 int idx = - cmp_status->length;
3425 int new_chars;
3426
3427 /* Recover the original ESC sequence */
3428 charbuf[idx++] = ISO_CODE_ESC;
3429 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3430 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3431 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3432 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3433 : '4');
3434 charbuf[idx++] = -2;
3435 charbuf[idx++] = 0;
3436 charbuf[idx++] = -1;
3437 new_chars = cmp_status->nchars;
3438 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3439 for (; idx < 0; idx++)
3440 {
3441 int elt = charbuf[idx];
3442
3443 if (elt == -2)
3444 {
3445 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3446 idx++;
3447 }
3448 else if (elt == -1)
3449 {
3450 charbuf[idx++] = ISO_CODE_ESC;
3451 charbuf[idx] = '0';
3452 new_chars += 2;
3453 }
3454 }
3455 cmp_status->state = COMPOSING_NO;
3456 return new_chars;
3457 }
3458
3459 /* If characers are under composition, finish the composition. */
3460 #define MAYBE_FINISH_COMPOSITION() \
3461 do { \
3462 if (cmp_status->state != COMPOSING_NO) \
3463 char_offset += finish_composition (charbuf, cmp_status); \
3464 } while (0)
3465
3466 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3467
3468 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3469 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3470 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3471 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3472
3473 Produce this annotation sequence now:
3474
3475 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3476 */
3477
3478 #define DECODE_COMPOSITION_START(c1) \
3479 do { \
3480 if (c1 == '0' \
3481 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3482 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3483 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3484 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3485 { \
3486 *charbuf++ = -1; \
3487 *charbuf++= -1; \
3488 cmp_status->state = COMPOSING_CHAR; \
3489 cmp_status->length += 2; \
3490 } \
3491 else \
3492 { \
3493 MAYBE_FINISH_COMPOSITION (); \
3494 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3495 : c1 == '2' ? COMPOSITION_WITH_RULE \
3496 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3497 : COMPOSITION_WITH_RULE_ALTCHARS); \
3498 cmp_status->state \
3499 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3500 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3501 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3502 cmp_status->nchars = cmp_status->ncomps = 0; \
3503 coding->annotated = 1; \
3504 } \
3505 } while (0)
3506
3507
3508 /* Handle composition end sequence ESC 1. */
3509
3510 #define DECODE_COMPOSITION_END() \
3511 do { \
3512 if (cmp_status->nchars == 0 \
3513 || ((cmp_status->state == COMPOSING_CHAR) \
3514 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3515 { \
3516 MAYBE_FINISH_COMPOSITION (); \
3517 goto invalid_code; \
3518 } \
3519 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3520 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3521 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3522 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3523 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3524 char_offset += cmp_status->nchars; \
3525 cmp_status->state = COMPOSING_NO; \
3526 } while (0)
3527
3528 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3529
3530 #define STORE_COMPOSITION_RULE(rule) \
3531 do { \
3532 *charbuf++ = -2; \
3533 *charbuf++ = rule; \
3534 cmp_status->length += 2; \
3535 cmp_status->state--; \
3536 } while (0)
3537
3538 /* Store a composed char or a component char C in charbuf, and update
3539 cmp_status. */
3540
3541 #define STORE_COMPOSITION_CHAR(c) \
3542 do { \
3543 *charbuf++ = (c); \
3544 cmp_status->length++; \
3545 if (cmp_status->state == COMPOSING_CHAR) \
3546 cmp_status->nchars++; \
3547 else \
3548 cmp_status->ncomps++; \
3549 if (cmp_status->method == COMPOSITION_WITH_RULE \
3550 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3551 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3552 cmp_status->state++; \
3553 } while (0)
3554
3555
3556 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3557
3558 static void
3559 decode_coding_iso_2022 (coding)
3560 struct coding_system *coding;
3561 {
3562 const unsigned char *src = coding->source + coding->consumed;
3563 const unsigned char *src_end = coding->source + coding->src_bytes;
3564 const unsigned char *src_base;
3565 int *charbuf = coding->charbuf + coding->charbuf_used;
3566 /* We may produce two annocations (charset and composition) in one
3567 loop and one more charset annocation at the end. */
3568 int *charbuf_end
3569 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3570 int consumed_chars = 0, consumed_chars_base;
3571 int multibytep = coding->src_multibyte;
3572 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3573 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3574 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3575 int charset_id_2, charset_id_3;
3576 struct charset *charset;
3577 int c;
3578 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3579 Lisp_Object attrs, charset_list;
3580 int char_offset = coding->produced_char;
3581 int last_offset = char_offset;
3582 int last_id = charset_ascii;
3583 int eol_crlf =
3584 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3585 int byte_after_cr = -1;
3586 int i;
3587
3588 CODING_GET_INFO (coding, attrs, charset_list);
3589 setup_iso_safe_charsets (attrs);
3590 /* Charset list may have been changed. */
3591 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3592 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3593
3594 if (cmp_status->state != COMPOSING_NO)
3595 {
3596 for (i = 0; i < cmp_status->length; i++)
3597 *charbuf++ = cmp_status->carryover[i];
3598 coding->annotated = 1;
3599 }
3600
3601 while (1)
3602 {
3603 int c1, c2, c3;
3604
3605 src_base = src;
3606 consumed_chars_base = consumed_chars;
3607
3608 if (charbuf >= charbuf_end)
3609 {
3610 if (byte_after_cr >= 0)
3611 src_base--;
3612 break;
3613 }
3614
3615 if (byte_after_cr >= 0)
3616 c1 = byte_after_cr, byte_after_cr = -1;
3617 else
3618 ONE_MORE_BYTE (c1);
3619 if (c1 < 0)
3620 goto invalid_code;
3621
3622 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3623 {
3624 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3625 char_offset++;
3626 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3627 continue;
3628 }
3629
3630 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3631 {
3632 if (c1 == ISO_CODE_ESC)
3633 {
3634 if (src + 1 >= src_end)
3635 goto no_more_source;
3636 *charbuf++ = ISO_CODE_ESC;
3637 char_offset++;
3638 if (src[0] == '%' && src[1] == '@')
3639 {
3640 src += 2;
3641 consumed_chars += 2;
3642 char_offset += 2;
3643 /* We are sure charbuf can contain two more chars. */
3644 *charbuf++ = '%';
3645 *charbuf++ = '@';
3646 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3647 }
3648 }
3649 else
3650 {
3651 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3652 char_offset++;
3653 }
3654 continue;
3655 }
3656
3657 if ((cmp_status->state == COMPOSING_RULE
3658 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3659 && c1 != ISO_CODE_ESC)
3660 {
3661 int rule, nbytes;
3662
3663 DECODE_COMPOSITION_RULE (rule, nbytes);
3664 if (rule < 0)
3665 goto invalid_code;
3666 STORE_COMPOSITION_RULE (rule);
3667 continue;
3668 }
3669
3670 /* We produce at most one character. */
3671 switch (iso_code_class [c1])
3672 {
3673 case ISO_0x20_or_0x7F:
3674 if (charset_id_0 < 0
3675 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3676 /* This is SPACE or DEL. */
3677 charset = CHARSET_FROM_ID (charset_ascii);
3678 else
3679 charset = CHARSET_FROM_ID (charset_id_0);
3680 break;
3681
3682 case ISO_graphic_plane_0:
3683 if (charset_id_0 < 0)
3684 charset = CHARSET_FROM_ID (charset_ascii);
3685 else
3686 charset = CHARSET_FROM_ID (charset_id_0);
3687 break;
3688
3689 case ISO_0xA0_or_0xFF:
3690 if (charset_id_1 < 0
3691 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3692 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3693 goto invalid_code;
3694 /* This is a graphic character, we fall down ... */
3695
3696 case ISO_graphic_plane_1:
3697 if (charset_id_1 < 0)
3698 goto invalid_code;
3699 charset = CHARSET_FROM_ID (charset_id_1);
3700 break;
3701
3702 case ISO_control_0:
3703 if (eol_crlf && c1 == '\r')
3704 ONE_MORE_BYTE (byte_after_cr);
3705 MAYBE_FINISH_COMPOSITION ();
3706 charset = CHARSET_FROM_ID (charset_ascii);
3707 break;
3708
3709 case ISO_control_1:
3710 goto invalid_code;
3711
3712 case ISO_shift_out:
3713 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3714 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3715 goto invalid_code;
3716 CODING_ISO_INVOCATION (coding, 0) = 1;
3717 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3718 continue;
3719
3720 case ISO_shift_in:
3721 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3722 goto invalid_code;
3723 CODING_ISO_INVOCATION (coding, 0) = 0;
3724 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3725 continue;
3726
3727 case ISO_single_shift_2_7:
3728 case ISO_single_shift_2:
3729 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3730 goto invalid_code;
3731 /* SS2 is handled as an escape sequence of ESC 'N' */
3732 c1 = 'N';
3733 goto label_escape_sequence;
3734
3735 case ISO_single_shift_3:
3736 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3737 goto invalid_code;
3738 /* SS2 is handled as an escape sequence of ESC 'O' */
3739 c1 = 'O';
3740 goto label_escape_sequence;
3741
3742 case ISO_control_sequence_introducer:
3743 /* CSI is handled as an escape sequence of ESC '[' ... */
3744 c1 = '[';
3745 goto label_escape_sequence;
3746
3747 case ISO_escape:
3748 ONE_MORE_BYTE (c1);
3749 label_escape_sequence:
3750 /* Escape sequences handled here are invocation,
3751 designation, direction specification, and character
3752 composition specification. */
3753 switch (c1)
3754 {
3755 case '&': /* revision of following character set */
3756 ONE_MORE_BYTE (c1);
3757 if (!(c1 >= '@' && c1 <= '~'))
3758 goto invalid_code;
3759 ONE_MORE_BYTE (c1);
3760 if (c1 != ISO_CODE_ESC)
3761 goto invalid_code;
3762 ONE_MORE_BYTE (c1);
3763 goto label_escape_sequence;
3764
3765 case '$': /* designation of 2-byte character set */
3766 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3767 goto invalid_code;
3768 {
3769 int reg, chars96;
3770
3771 ONE_MORE_BYTE (c1);
3772 if (c1 >= '@' && c1 <= 'B')
3773 { /* designation of JISX0208.1978, GB2312.1980,
3774 or JISX0208.1980 */
3775 reg = 0, chars96 = 0;
3776 }
3777 else if (c1 >= 0x28 && c1 <= 0x2B)
3778 { /* designation of DIMENSION2_CHARS94 character set */
3779 reg = c1 - 0x28, chars96 = 0;
3780 ONE_MORE_BYTE (c1);
3781 }
3782 else if (c1 >= 0x2C && c1 <= 0x2F)
3783 { /* designation of DIMENSION2_CHARS96 character set */
3784 reg = c1 - 0x2C, chars96 = 1;
3785 ONE_MORE_BYTE (c1);
3786 }
3787 else
3788 goto invalid_code;
3789 DECODE_DESIGNATION (reg, 2, chars96, c1);
3790 /* We must update these variables now. */
3791 if (reg == 0)
3792 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3793 else if (reg == 1)
3794 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3795 if (chars96 < 0)
3796 goto invalid_code;
3797 }
3798 continue;
3799
3800 case 'n': /* invocation of locking-shift-2 */
3801 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3802 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3803 goto invalid_code;
3804 CODING_ISO_INVOCATION (coding, 0) = 2;
3805 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3806 continue;
3807
3808 case 'o': /* invocation of locking-shift-3 */
3809 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3810 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3811 goto invalid_code;
3812 CODING_ISO_INVOCATION (coding, 0) = 3;
3813 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3814 continue;
3815
3816 case 'N': /* invocation of single-shift-2 */
3817 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3818 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3819 goto invalid_code;
3820 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3821 if (charset_id_2 < 0)
3822 charset = CHARSET_FROM_ID (charset_ascii);
3823 else
3824 charset = CHARSET_FROM_ID (charset_id_2);
3825 ONE_MORE_BYTE (c1);
3826 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3827 goto invalid_code;
3828 break;
3829
3830 case 'O': /* invocation of single-shift-3 */
3831 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3832 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3833 goto invalid_code;
3834 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3835 if (charset_id_3 < 0)
3836 charset = CHARSET_FROM_ID (charset_ascii);
3837 else
3838 charset = CHARSET_FROM_ID (charset_id_3);
3839 ONE_MORE_BYTE (c1);
3840 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3841 goto invalid_code;
3842 break;
3843
3844 case '0': case '2': case '3': case '4': /* start composition */
3845 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3846 goto invalid_code;
3847 if (last_id != charset_ascii)
3848 {
3849 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3850 last_id = charset_ascii;
3851 last_offset = char_offset;
3852 }
3853 DECODE_COMPOSITION_START (c1);
3854 continue;
3855
3856 case '1': /* end composition */
3857 if (cmp_status->state == COMPOSING_NO)
3858 goto invalid_code;
3859 DECODE_COMPOSITION_END ();
3860 continue;
3861
3862 case '[': /* specification of direction */
3863 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3864 goto invalid_code;
3865 /* For the moment, nested direction is not supported.
3866 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3867 left-to-right, and nozero means right-to-left. */
3868 ONE_MORE_BYTE (c1);
3869 switch (c1)
3870 {
3871 case ']': /* end of the current direction */
3872 coding->mode &= ~CODING_MODE_DIRECTION;
3873
3874 case '0': /* end of the current direction */
3875 case '1': /* start of left-to-right direction */
3876 ONE_MORE_BYTE (c1);
3877 if (c1 == ']')
3878 coding->mode &= ~CODING_MODE_DIRECTION;
3879 else
3880 goto invalid_code;
3881 break;
3882
3883 case '2': /* start of right-to-left direction */
3884 ONE_MORE_BYTE (c1);
3885 if (c1 == ']')
3886 coding->mode |= CODING_MODE_DIRECTION;
3887 else
3888 goto invalid_code;
3889 break;
3890
3891 default:
3892 goto invalid_code;
3893 }
3894 continue;
3895
3896 case '%':
3897 ONE_MORE_BYTE (c1);
3898 if (c1 == '/')
3899 {
3900 /* CTEXT extended segment:
3901 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3902 We keep these bytes as is for the moment.
3903 They may be decoded by post-read-conversion. */
3904 int dim, M, L;
3905 int size;
3906
3907 ONE_MORE_BYTE (dim);
3908 if (dim < 0 || dim > 4)
3909 goto invalid_code;
3910 ONE_MORE_BYTE (M);
3911 if (M < 128)
3912 goto invalid_code;
3913 ONE_MORE_BYTE (L);
3914 if (L < 128)
3915 goto invalid_code;
3916 size = ((M - 128) * 128) + (L - 128);
3917 if (charbuf + 6 > charbuf_end)
3918 goto break_loop;
3919 *charbuf++ = ISO_CODE_ESC;
3920 *charbuf++ = '%';
3921 *charbuf++ = '/';
3922 *charbuf++ = dim;
3923 *charbuf++ = BYTE8_TO_CHAR (M);
3924 *charbuf++ = BYTE8_TO_CHAR (L);
3925 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3926 }
3927 else if (c1 == 'G')
3928 {
3929 /* XFree86 extension for embedding UTF-8 in CTEXT:
3930 ESC % G --UTF-8-BYTES-- ESC % @
3931 We keep these bytes as is for the moment.
3932 They may be decoded by post-read-conversion. */
3933 if (charbuf + 3 > charbuf_end)
3934 goto break_loop;
3935 *charbuf++ = ISO_CODE_ESC;
3936 *charbuf++ = '%';
3937 *charbuf++ = 'G';
3938 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3939 }
3940 else
3941 goto invalid_code;
3942 continue;
3943 break;
3944
3945 default:
3946 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3947 goto invalid_code;
3948 {
3949 int reg, chars96;
3950
3951 if (c1 >= 0x28 && c1 <= 0x2B)
3952 { /* designation of DIMENSION1_CHARS94 character set */
3953 reg = c1 - 0x28, chars96 = 0;
3954 ONE_MORE_BYTE (c1);
3955 }
3956 else if (c1 >= 0x2C && c1 <= 0x2F)
3957 { /* designation of DIMENSION1_CHARS96 character set */
3958 reg = c1 - 0x2C, chars96 = 1;
3959 ONE_MORE_BYTE (c1);
3960 }
3961 else
3962 goto invalid_code;
3963 DECODE_DESIGNATION (reg, 1, chars96, c1);
3964 /* We must update these variables now. */
3965 if (reg == 0)
3966 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3967 else if (reg == 1)
3968 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3969 if (chars96 < 0)
3970 goto invalid_code;
3971 }
3972 continue;
3973 }
3974 }
3975
3976 if (cmp_status->state == COMPOSING_NO
3977 && charset->id != charset_ascii
3978 && last_id != charset->id)
3979 {
3980 if (last_id != charset_ascii)
3981 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3982 last_id = charset->id;
3983 last_offset = char_offset;
3984 }
3985
3986 /* Now we know CHARSET and 1st position code C1 of a character.
3987 Produce a decoded character while getting 2nd and 3rd
3988 position codes C2, C3 if necessary. */
3989 if (CHARSET_DIMENSION (charset) > 1)
3990 {
3991 ONE_MORE_BYTE (c2);
3992 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3993 || ((c1 & 0x80) != (c2 & 0x80)))
3994 /* C2 is not in a valid range. */
3995 goto invalid_code;
3996 if (CHARSET_DIMENSION (charset) == 2)
3997 c1 = (c1 << 8) | c2;
3998 else
3999 {
4000 ONE_MORE_BYTE (c3);
4001 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4002 || ((c1 & 0x80) != (c3 & 0x80)))
4003 /* C3 is not in a valid range. */
4004 goto invalid_code;
4005 c1 = (c1 << 16) | (c2 << 8) | c2;
4006 }
4007 }
4008 c1 &= 0x7F7F7F;
4009 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4010 if (c < 0)
4011 {
4012 MAYBE_FINISH_COMPOSITION ();
4013 for (; src_base < src; src_base++, char_offset++)
4014 {
4015 if (ASCII_BYTE_P (*src_base))
4016 *charbuf++ = *src_base;
4017 else
4018 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4019 }
4020 }
4021 else if (cmp_status->state == COMPOSING_NO)
4022 {
4023 *charbuf++ = c;
4024 char_offset++;
4025 }
4026 else if ((cmp_status->state == COMPOSING_CHAR
4027 ? cmp_status->nchars
4028 : cmp_status->ncomps)
4029 >= MAX_COMPOSITION_COMPONENTS)
4030 {
4031 /* Too long composition. */
4032 MAYBE_FINISH_COMPOSITION ();
4033 *charbuf++ = c;
4034 char_offset++;
4035 }
4036 else
4037 STORE_COMPOSITION_CHAR (c);
4038 continue;
4039
4040 invalid_code:
4041 MAYBE_FINISH_COMPOSITION ();
4042 src = src_base;
4043 consumed_chars = consumed_chars_base;
4044 ONE_MORE_BYTE (c);
4045 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4046 char_offset++;
4047 coding->errors++;
4048 continue;
4049
4050 break_loop:
4051 break;
4052 }
4053
4054 no_more_source:
4055 if (cmp_status->state != COMPOSING_NO)
4056 {
4057 if (coding->mode & CODING_MODE_LAST_BLOCK)
4058 MAYBE_FINISH_COMPOSITION ();
4059 else
4060 {
4061 charbuf -= cmp_status->length;
4062 for (i = 0; i < cmp_status->length; i++)
4063 cmp_status->carryover[i] = charbuf[i];
4064 }
4065 }
4066 else if (last_id != charset_ascii)
4067 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4068 coding->consumed_char += consumed_chars_base;
4069 coding->consumed = src_base - coding->source;
4070 coding->charbuf_used = charbuf - coding->charbuf;
4071 }
4072
4073
4074 /* ISO2022 encoding stuff. */
4075
4076 /*
4077 It is not enough to say just "ISO2022" on encoding, we have to
4078 specify more details. In Emacs, each coding system of ISO2022
4079 variant has the following specifications:
4080 1. Initial designation to G0 thru G3.
4081 2. Allows short-form designation?
4082 3. ASCII should be designated to G0 before control characters?
4083 4. ASCII should be designated to G0 at end of line?
4084 5. 7-bit environment or 8-bit environment?
4085 6. Use locking-shift?
4086 7. Use Single-shift?
4087 And the following two are only for Japanese:
4088 8. Use ASCII in place of JIS0201-1976-Roman?
4089 9. Use JISX0208-1983 in place of JISX0208-1978?
4090 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4091 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4092 details.
4093 */
4094
4095 /* Produce codes (escape sequence) for designating CHARSET to graphic
4096 register REG at DST, and increment DST. If <final-char> of CHARSET is
4097 '@', 'A', or 'B' and the coding system CODING allows, produce
4098 designation sequence of short-form. */
4099
4100 #define ENCODE_DESIGNATION(charset, reg, coding) \
4101 do { \
4102 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4103 char *intermediate_char_94 = "()*+"; \
4104 char *intermediate_char_96 = ",-./"; \
4105 int revision = -1; \
4106 int c; \
4107 \
4108 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4109 revision = CHARSET_ISO_REVISION (charset); \
4110 \
4111 if (revision >= 0) \
4112 { \
4113 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4114 EMIT_ONE_BYTE ('@' + revision); \
4115 } \
4116 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4117 if (CHARSET_DIMENSION (charset) == 1) \
4118 { \
4119 if (! CHARSET_ISO_CHARS_96 (charset)) \
4120 c = intermediate_char_94[reg]; \
4121 else \
4122 c = intermediate_char_96[reg]; \
4123 EMIT_ONE_ASCII_BYTE (c); \
4124 } \
4125 else \
4126 { \
4127 EMIT_ONE_ASCII_BYTE ('$'); \
4128 if (! CHARSET_ISO_CHARS_96 (charset)) \
4129 { \
4130 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4131 || reg != 0 \
4132 || final_char < '@' || final_char > 'B') \
4133 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4134 } \
4135 else \
4136 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4137 } \
4138 EMIT_ONE_ASCII_BYTE (final_char); \
4139 \
4140 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4141 } while (0)
4142
4143
4144 /* The following two macros produce codes (control character or escape
4145 sequence) for ISO2022 single-shift functions (single-shift-2 and
4146 single-shift-3). */
4147
4148 #define ENCODE_SINGLE_SHIFT_2 \
4149 do { \
4150 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4151 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4152 else \
4153 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4154 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4155 } while (0)
4156
4157
4158 #define ENCODE_SINGLE_SHIFT_3 \
4159 do { \
4160 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4161 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4162 else \
4163 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4164 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4165 } while (0)
4166
4167
4168 /* The following four macros produce codes (control character or
4169 escape sequence) for ISO2022 locking-shift functions (shift-in,
4170 shift-out, locking-shift-2, and locking-shift-3). */
4171
4172 #define ENCODE_SHIFT_IN \
4173 do { \
4174 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4175 CODING_ISO_INVOCATION (coding, 0) = 0; \
4176 } while (0)
4177
4178
4179 #define ENCODE_SHIFT_OUT \
4180 do { \
4181 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4182 CODING_ISO_INVOCATION (coding, 0) = 1; \
4183 } while (0)
4184
4185
4186 #define ENCODE_LOCKING_SHIFT_2 \
4187 do { \
4188 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4189 CODING_ISO_INVOCATION (coding, 0) = 2; \
4190 } while (0)
4191
4192
4193 #define ENCODE_LOCKING_SHIFT_3 \
4194 do { \
4195 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4196 CODING_ISO_INVOCATION (coding, 0) = 3; \
4197 } while (0)
4198
4199
4200 /* Produce codes for a DIMENSION1 character whose character set is
4201 CHARSET and whose position-code is C1. Designation and invocation
4202 sequences are also produced in advance if necessary. */
4203
4204 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4205 do { \
4206 int id = CHARSET_ID (charset); \
4207 \
4208 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4209 && id == charset_ascii) \
4210 { \
4211 id = charset_jisx0201_roman; \
4212 charset = CHARSET_FROM_ID (id); \
4213 } \
4214 \
4215 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4216 { \
4217 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4218 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4219 else \
4220 EMIT_ONE_BYTE (c1 | 0x80); \
4221 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4222 break; \
4223 } \
4224 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4225 { \
4226 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4227 break; \
4228 } \
4229 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4230 { \
4231 EMIT_ONE_BYTE (c1 | 0x80); \
4232 break; \
4233 } \
4234 else \
4235 /* Since CHARSET is not yet invoked to any graphic planes, we \
4236 must invoke it, or, at first, designate it to some graphic \
4237 register. Then repeat the loop to actually produce the \
4238 character. */ \
4239 dst = encode_invocation_designation (charset, coding, dst, \
4240 &produced_chars); \
4241 } while (1)
4242
4243
4244 /* Produce codes for a DIMENSION2 character whose character set is
4245 CHARSET and whose position-codes are C1 and C2. Designation and
4246 invocation codes are also produced in advance if necessary. */
4247
4248 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4249 do { \
4250 int id = CHARSET_ID (charset); \
4251 \
4252 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4253 && id == charset_jisx0208) \
4254 { \
4255 id = charset_jisx0208_1978; \
4256 charset = CHARSET_FROM_ID (id); \
4257 } \
4258 \
4259 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4260 { \
4261 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4262 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4263 else \
4264 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4265 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4266 break; \
4267 } \
4268 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4269 { \
4270 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4271 break; \
4272 } \
4273 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4274 { \
4275 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4276 break; \
4277 } \
4278 else \
4279 /* Since CHARSET is not yet invoked to any graphic planes, we \
4280 must invoke it, or, at first, designate it to some graphic \
4281 register. Then repeat the loop to actually produce the \
4282 character. */ \
4283 dst = encode_invocation_designation (charset, coding, dst, \
4284 &produced_chars); \
4285 } while (1)
4286
4287
4288 #define ENCODE_ISO_CHARACTER(charset, c) \
4289 do { \
4290 int code = ENCODE_CHAR ((charset),(c)); \
4291 \
4292 if (CHARSET_DIMENSION (charset) == 1) \
4293 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4294 else \
4295 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4296 } while (0)
4297
4298
4299 /* Produce designation and invocation codes at a place pointed by DST
4300 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4301 Return new DST. */
4302
4303 unsigned char *
4304 encode_invocation_designation (charset, coding, dst, p_nchars)
4305 struct charset *charset;
4306 struct coding_system *coding;
4307 unsigned char *dst;
4308 int *p_nchars;
4309 {
4310 int multibytep = coding->dst_multibyte;
4311 int produced_chars = *p_nchars;
4312 int reg; /* graphic register number */
4313 int id = CHARSET_ID (charset);
4314
4315 /* At first, check designations. */
4316 for (reg = 0; reg < 4; reg++)
4317 if (id == CODING_ISO_DESIGNATION (coding, reg))
4318 break;
4319
4320 if (reg >= 4)
4321 {
4322 /* CHARSET is not yet designated to any graphic registers. */
4323 /* At first check the requested designation. */
4324 reg = CODING_ISO_REQUEST (coding, id);
4325 if (reg < 0)
4326 /* Since CHARSET requests no special designation, designate it
4327 to graphic register 0. */
4328 reg = 0;
4329
4330 ENCODE_DESIGNATION (charset, reg, coding);
4331 }
4332
4333 if (CODING_ISO_INVOCATION (coding, 0) != reg
4334 && CODING_ISO_INVOCATION (coding, 1) != reg)
4335 {
4336 /* Since the graphic register REG is not invoked to any graphic
4337 planes, invoke it to graphic plane 0. */
4338 switch (reg)
4339 {
4340 case 0: /* graphic register 0 */
4341 ENCODE_SHIFT_IN;
4342 break;
4343
4344 case 1: /* graphic register 1 */
4345 ENCODE_SHIFT_OUT;
4346 break;
4347
4348 case 2: /* graphic register 2 */
4349 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4350 ENCODE_SINGLE_SHIFT_2;
4351 else
4352 ENCODE_LOCKING_SHIFT_2;
4353 break;
4354
4355 case 3: /* graphic register 3 */
4356 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4357 ENCODE_SINGLE_SHIFT_3;
4358 else
4359 ENCODE_LOCKING_SHIFT_3;
4360 break;
4361 }
4362 }
4363
4364 *p_nchars = produced_chars;
4365 return dst;
4366 }
4367
4368 /* The following three macros produce codes for indicating direction
4369 of text. */
4370 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
4371 do { \
4372 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
4373 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
4374 else \
4375 EMIT_ONE_BYTE (ISO_CODE_CSI); \
4376 } while (0)
4377
4378
4379 #define ENCODE_DIRECTION_R2L() \
4380 do { \
4381 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4382 EMIT_TWO_ASCII_BYTES ('2', ']'); \
4383 } while (0)
4384
4385
4386 #define ENCODE_DIRECTION_L2R() \
4387 do { \
4388 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4389 EMIT_TWO_ASCII_BYTES ('0', ']'); \
4390 } while (0)
4391
4392
4393 /* Produce codes for designation and invocation to reset the graphic
4394 planes and registers to initial state. */
4395 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4396 do { \
4397 int reg; \
4398 struct charset *charset; \
4399 \
4400 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4401 ENCODE_SHIFT_IN; \
4402 for (reg = 0; reg < 4; reg++) \
4403 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4404 && (CODING_ISO_DESIGNATION (coding, reg) \
4405 != CODING_ISO_INITIAL (coding, reg))) \
4406 { \
4407 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4408 ENCODE_DESIGNATION (charset, reg, coding); \
4409 } \
4410 } while (0)
4411
4412
4413 /* Produce designation sequences of charsets in the line started from
4414 SRC to a place pointed by DST, and return updated DST.
4415
4416 If the current block ends before any end-of-line, we may fail to
4417 find all the necessary designations. */
4418
4419 static unsigned char *
4420 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4421 struct coding_system *coding;
4422 int *charbuf, *charbuf_end;
4423 unsigned char *dst;
4424 {
4425 struct charset *charset;
4426 /* Table of charsets to be designated to each graphic register. */
4427 int r[4];
4428 int c, found = 0, reg;
4429 int produced_chars = 0;
4430 int multibytep = coding->dst_multibyte;
4431 Lisp_Object attrs;
4432 Lisp_Object charset_list;
4433
4434 attrs = CODING_ID_ATTRS (coding->id);
4435 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4436 if (EQ (charset_list, Qiso_2022))
4437 charset_list = Viso_2022_charset_list;
4438
4439 for (reg = 0; reg < 4; reg++)
4440 r[reg] = -1;
4441
4442 while (found < 4)
4443 {
4444 int id;
4445
4446 c = *charbuf++;
4447 if (c == '\n')
4448 break;
4449 charset = char_charset (c, charset_list, NULL);
4450 id = CHARSET_ID (charset);
4451 reg = CODING_ISO_REQUEST (coding, id);
4452 if (reg >= 0 && r[reg] < 0)
4453 {
4454 found++;
4455 r[reg] = id;
4456 }
4457 }
4458
4459 if (found)
4460 {
4461 for (reg = 0; reg < 4; reg++)
4462 if (r[reg] >= 0
4463 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4464 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4465 }
4466
4467 return dst;
4468 }
4469
4470 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4471
4472 static int
4473 encode_coding_iso_2022 (coding)
4474 struct coding_system *coding;
4475 {
4476 int multibytep = coding->dst_multibyte;
4477 int *charbuf = coding->charbuf;
4478 int *charbuf_end = charbuf + coding->charbuf_used;
4479 unsigned char *dst = coding->destination + coding->produced;
4480 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4481 int safe_room = 16;
4482 int bol_designation
4483 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4484 && CODING_ISO_BOL (coding));
4485 int produced_chars = 0;
4486 Lisp_Object attrs, eol_type, charset_list;
4487 int ascii_compatible;
4488 int c;
4489 int preferred_charset_id = -1;
4490
4491 CODING_GET_INFO (coding, attrs, charset_list);
4492 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4493 if (VECTORP (eol_type))
4494 eol_type = Qunix;
4495
4496 setup_iso_safe_charsets (attrs);
4497 /* Charset list may have been changed. */
4498 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4499 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4500
4501 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4502
4503 while (charbuf < charbuf_end)
4504 {
4505 ASSURE_DESTINATION (safe_room);
4506
4507 if (bol_designation)
4508 {
4509 unsigned char *dst_prev = dst;
4510
4511 /* We have to produce designation sequences if any now. */
4512 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4513 bol_designation = 0;
4514 /* We are sure that designation sequences are all ASCII bytes. */
4515 produced_chars += dst - dst_prev;
4516 }
4517
4518 c = *charbuf++;
4519
4520 if (c < 0)
4521 {
4522 /* Handle an annotation. */
4523 switch (*charbuf)
4524 {
4525 case CODING_ANNOTATE_COMPOSITION_MASK:
4526 /* Not yet implemented. */
4527 break;
4528 case CODING_ANNOTATE_CHARSET_MASK:
4529 preferred_charset_id = charbuf[2];
4530 if (preferred_charset_id >= 0
4531 && NILP (Fmemq (make_number (preferred_charset_id),
4532 charset_list)))
4533 preferred_charset_id = -1;
4534 break;
4535 default:
4536 abort ();
4537 }
4538 charbuf += -c - 1;
4539 continue;
4540 }
4541
4542 /* Now encode the character C. */
4543 if (c < 0x20 || c == 0x7F)
4544 {
4545 if (c == '\n'
4546 || (c == '\r' && EQ (eol_type, Qmac)))
4547 {
4548 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4549 ENCODE_RESET_PLANE_AND_REGISTER ();
4550 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4551 {
4552 int i;
4553
4554 for (i = 0; i < 4; i++)
4555 CODING_ISO_DESIGNATION (coding, i)
4556 = CODING_ISO_INITIAL (coding, i);
4557 }
4558 bol_designation
4559 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4560 }
4561 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4562 ENCODE_RESET_PLANE_AND_REGISTER ();
4563 EMIT_ONE_ASCII_BYTE (c);
4564 }
4565 else if (ASCII_CHAR_P (c))
4566 {
4567 if (ascii_compatible)
4568 EMIT_ONE_ASCII_BYTE (c);
4569 else
4570 {
4571 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4572 ENCODE_ISO_CHARACTER (charset, c);
4573 }
4574 }
4575 else if (CHAR_BYTE8_P (c))
4576 {
4577 c = CHAR_TO_BYTE8 (c);
4578 EMIT_ONE_BYTE (c);
4579 }
4580 else
4581 {
4582 struct charset *charset;
4583
4584 if (preferred_charset_id >= 0)
4585 {
4586 charset = CHARSET_FROM_ID (preferred_charset_id);
4587 if (! CHAR_CHARSET_P (c, charset))
4588 charset = char_charset (c, charset_list, NULL);
4589 }
4590 else
4591 charset = char_charset (c, charset_list, NULL);
4592 if (!charset)
4593 {
4594 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4595 {
4596 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4597 charset = CHARSET_FROM_ID (charset_ascii);
4598 }
4599 else
4600 {
4601 c = coding->default_char;
4602 charset = char_charset (c, charset_list, NULL);
4603 }
4604 }
4605 ENCODE_ISO_CHARACTER (charset, c);
4606 }
4607 }
4608
4609 if (coding->mode & CODING_MODE_LAST_BLOCK
4610 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4611 {
4612 ASSURE_DESTINATION (safe_room);
4613 ENCODE_RESET_PLANE_AND_REGISTER ();
4614 }
4615 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4616 CODING_ISO_BOL (coding) = bol_designation;
4617 coding->produced_char += produced_chars;
4618 coding->produced = dst - coding->destination;
4619 return 0;
4620 }
4621
4622 \f
4623 /*** 8,9. SJIS and BIG5 handlers ***/
4624
4625 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4626 quite widely. So, for the moment, Emacs supports them in the bare
4627 C code. But, in the future, they may be supported only by CCL. */
4628
4629 /* SJIS is a coding system encoding three character sets: ASCII, right
4630 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4631 as is. A character of charset katakana-jisx0201 is encoded by
4632 "position-code + 0x80". A character of charset japanese-jisx0208
4633 is encoded in 2-byte but two position-codes are divided and shifted
4634 so that it fit in the range below.
4635
4636 --- CODE RANGE of SJIS ---
4637 (character set) (range)
4638 ASCII 0x00 .. 0x7F
4639 KATAKANA-JISX0201 0xA0 .. 0xDF
4640 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4641 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4642 -------------------------------
4643
4644 */
4645
4646 /* BIG5 is a coding system encoding two character sets: ASCII and
4647 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4648 character set and is encoded in two-byte.
4649
4650 --- CODE RANGE of BIG5 ---
4651 (character set) (range)
4652 ASCII 0x00 .. 0x7F
4653 Big5 (1st byte) 0xA1 .. 0xFE
4654 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4655 --------------------------
4656
4657 */
4658
4659 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4660 Check if a text is encoded in SJIS. If it is, return
4661 CATEGORY_MASK_SJIS, else return 0. */
4662
4663 static int
4664 detect_coding_sjis (coding, detect_info)
4665 struct coding_system *coding;
4666 struct coding_detection_info *detect_info;
4667 {
4668 const unsigned char *src = coding->source, *src_base;
4669 const unsigned char *src_end = coding->source + coding->src_bytes;
4670 int multibytep = coding->src_multibyte;
4671 int consumed_chars = 0;
4672 int found = 0;
4673 int c;
4674
4675 detect_info->checked |= CATEGORY_MASK_SJIS;
4676 /* A coding system of this category is always ASCII compatible. */
4677 src += coding->head_ascii;
4678
4679 while (1)
4680 {
4681 src_base = src;
4682 ONE_MORE_BYTE (c);
4683 if (c < 0x80)
4684 continue;
4685 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4686 {
4687 ONE_MORE_BYTE (c);
4688 if (c < 0x40 || c == 0x7F || c > 0xFC)
4689 break;
4690 found = CATEGORY_MASK_SJIS;
4691 }
4692 else if (c >= 0xA0 && c < 0xE0)
4693 found = CATEGORY_MASK_SJIS;
4694 else
4695 break;
4696 }
4697 detect_info->rejected |= CATEGORY_MASK_SJIS;
4698 return 0;
4699
4700 no_more_source:
4701 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4702 {
4703 detect_info->rejected |= CATEGORY_MASK_SJIS;
4704 return 0;
4705 }
4706 detect_info->found |= found;
4707 return 1;
4708 }
4709
4710 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4711 Check if a text is encoded in BIG5. If it is, return
4712 CATEGORY_MASK_BIG5, else return 0. */
4713
4714 static int
4715 detect_coding_big5 (coding, detect_info)
4716 struct coding_system *coding;
4717 struct coding_detection_info *detect_info;
4718 {
4719 const unsigned char *src = coding->source, *src_base;
4720 const unsigned char *src_end = coding->source + coding->src_bytes;
4721 int multibytep = coding->src_multibyte;
4722 int consumed_chars = 0;
4723 int found = 0;
4724 int c;
4725
4726 detect_info->checked |= CATEGORY_MASK_BIG5;
4727 /* A coding system of this category is always ASCII compatible. */
4728 src += coding->head_ascii;
4729
4730 while (1)
4731 {
4732 src_base = src;
4733 ONE_MORE_BYTE (c);
4734 if (c < 0x80)
4735 continue;
4736 if (c >= 0xA1)
4737 {
4738 ONE_MORE_BYTE (c);
4739 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4740 return 0;
4741 found = CATEGORY_MASK_BIG5;
4742 }
4743 else
4744 break;
4745 }
4746 detect_info->rejected |= CATEGORY_MASK_BIG5;
4747 return 0;
4748
4749 no_more_source:
4750 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4751 {
4752 detect_info->rejected |= CATEGORY_MASK_BIG5;
4753 return 0;
4754 }
4755 detect_info->found |= found;
4756 return 1;
4757 }
4758
4759 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4760 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4761
4762 static void
4763 decode_coding_sjis (coding)
4764 struct coding_system *coding;
4765 {
4766 const unsigned char *src = coding->source + coding->consumed;
4767 const unsigned char *src_end = coding->source + coding->src_bytes;
4768 const unsigned char *src_base;
4769 int *charbuf = coding->charbuf + coding->charbuf_used;
4770 /* We may produce one charset annocation in one loop and one more at
4771 the end. */
4772 int *charbuf_end
4773 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4774 int consumed_chars = 0, consumed_chars_base;
4775 int multibytep = coding->src_multibyte;
4776 struct charset *charset_roman, *charset_kanji, *charset_kana;
4777 struct charset *charset_kanji2;
4778 Lisp_Object attrs, charset_list, val;
4779 int char_offset = coding->produced_char;
4780 int last_offset = char_offset;
4781 int last_id = charset_ascii;
4782 int eol_crlf =
4783 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4784 int byte_after_cr = -1;
4785
4786 CODING_GET_INFO (coding, attrs, charset_list);
4787
4788 val = charset_list;
4789 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4790 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4791 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4792 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4793
4794 while (1)
4795 {
4796 int c, c1;
4797 struct charset *charset;
4798
4799 src_base = src;
4800 consumed_chars_base = consumed_chars;
4801
4802 if (charbuf >= charbuf_end)
4803 {
4804 if (byte_after_cr >= 0)
4805 src_base--;
4806 break;
4807 }
4808
4809 if (byte_after_cr >= 0)
4810 c = byte_after_cr, byte_after_cr = -1;
4811 else
4812 ONE_MORE_BYTE (c);
4813 if (c < 0)
4814 goto invalid_code;
4815 if (c < 0x80)
4816 {
4817 if (eol_crlf && c == '\r')
4818 ONE_MORE_BYTE (byte_after_cr);
4819 charset = charset_roman;
4820 }
4821 else if (c == 0x80 || c == 0xA0)
4822 goto invalid_code;
4823 else if (c >= 0xA1 && c <= 0xDF)
4824 {
4825 /* SJIS -> JISX0201-Kana */
4826 c &= 0x7F;
4827 charset = charset_kana;
4828 }
4829 else if (c <= 0xEF)
4830 {
4831 /* SJIS -> JISX0208 */
4832 ONE_MORE_BYTE (c1);
4833 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4834 goto invalid_code;
4835 c = (c << 8) | c1;
4836 SJIS_TO_JIS (c);
4837 charset = charset_kanji;
4838 }
4839 else if (c <= 0xFC && charset_kanji2)
4840 {
4841 /* SJIS -> JISX0213-2 */
4842 ONE_MORE_BYTE (c1);
4843 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4844 goto invalid_code;
4845 c = (c << 8) | c1;
4846 SJIS_TO_JIS2 (c);
4847 charset = charset_kanji2;
4848 }
4849 else
4850 goto invalid_code;
4851 if (charset->id != charset_ascii
4852 && last_id != charset->id)
4853 {
4854 if (last_id != charset_ascii)
4855 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4856 last_id = charset->id;
4857 last_offset = char_offset;
4858 }
4859 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4860 *charbuf++ = c;
4861 char_offset++;
4862 continue;
4863
4864 invalid_code:
4865 src = src_base;
4866 consumed_chars = consumed_chars_base;
4867 ONE_MORE_BYTE (c);
4868 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4869 char_offset++;
4870 coding->errors++;
4871 }
4872
4873 no_more_source:
4874 if (last_id != charset_ascii)
4875 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4876 coding->consumed_char += consumed_chars_base;
4877 coding->consumed = src_base - coding->source;
4878 coding->charbuf_used = charbuf - coding->charbuf;
4879 }
4880
4881 static void
4882 decode_coding_big5 (coding)
4883 struct coding_system *coding;
4884 {
4885 const unsigned char *src = coding->source + coding->consumed;
4886 const unsigned char *src_end = coding->source + coding->src_bytes;
4887 const unsigned char *src_base;
4888 int *charbuf = coding->charbuf + coding->charbuf_used;
4889 /* We may produce one charset annocation in one loop and one more at
4890 the end. */
4891 int *charbuf_end
4892 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4893 int consumed_chars = 0, consumed_chars_base;
4894 int multibytep = coding->src_multibyte;
4895 struct charset *charset_roman, *charset_big5;
4896 Lisp_Object attrs, charset_list, val;
4897 int char_offset = coding->produced_char;
4898 int last_offset = char_offset;
4899 int last_id = charset_ascii;
4900 int eol_crlf =
4901 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4902 int byte_after_cr = -1;
4903
4904 CODING_GET_INFO (coding, attrs, charset_list);
4905 val = charset_list;
4906 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4907 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4908
4909 while (1)
4910 {
4911 int c, c1;
4912 struct charset *charset;
4913
4914 src_base = src;
4915 consumed_chars_base = consumed_chars;
4916
4917 if (charbuf >= charbuf_end)
4918 {
4919 if (byte_after_cr >= 0)
4920 src_base--;
4921 break;
4922 }
4923
4924 if (byte_after_cr >= 0)
4925 c = byte_after_cr, byte_after_cr = -1;
4926 else
4927 ONE_MORE_BYTE (c);
4928
4929 if (c < 0)
4930 goto invalid_code;
4931 if (c < 0x80)
4932 {
4933 if (eol_crlf && c == '\r')
4934 ONE_MORE_BYTE (byte_after_cr);
4935 charset = charset_roman;
4936 }
4937 else
4938 {
4939 /* BIG5 -> Big5 */
4940 if (c < 0xA1 || c > 0xFE)
4941 goto invalid_code;
4942 ONE_MORE_BYTE (c1);
4943 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4944 goto invalid_code;
4945 c = c << 8 | c1;
4946 charset = charset_big5;
4947 }
4948 if (charset->id != charset_ascii
4949 && last_id != charset->id)
4950 {
4951 if (last_id != charset_ascii)
4952 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4953 last_id = charset->id;
4954 last_offset = char_offset;
4955 }
4956 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4957 *charbuf++ = c;
4958 char_offset++;
4959 continue;
4960
4961 invalid_code:
4962 src = src_base;
4963 consumed_chars = consumed_chars_base;
4964 ONE_MORE_BYTE (c);
4965 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4966 char_offset++;
4967 coding->errors++;
4968 }
4969
4970 no_more_source:
4971 if (last_id != charset_ascii)
4972 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4973 coding->consumed_char += consumed_chars_base;
4974 coding->consumed = src_base - coding->source;
4975 coding->charbuf_used = charbuf - coding->charbuf;
4976 }
4977
4978 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4979 This function can encode charsets `ascii', `katakana-jisx0201',
4980 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4981 are sure that all these charsets are registered as official charset
4982 (i.e. do not have extended leading-codes). Characters of other
4983 charsets are produced without any encoding. If SJIS_P is 1, encode
4984 SJIS text, else encode BIG5 text. */
4985
4986 static int
4987 encode_coding_sjis (coding)
4988 struct coding_system *coding;
4989 {
4990 int multibytep = coding->dst_multibyte;
4991 int *charbuf = coding->charbuf;
4992 int *charbuf_end = charbuf + coding->charbuf_used;
4993 unsigned char *dst = coding->destination + coding->produced;
4994 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4995 int safe_room = 4;
4996 int produced_chars = 0;
4997 Lisp_Object attrs, charset_list, val;
4998 int ascii_compatible;
4999 struct charset *charset_roman, *charset_kanji, *charset_kana;
5000 struct charset *charset_kanji2;
5001 int c;
5002
5003 CODING_GET_INFO (coding, attrs, charset_list);
5004 val = charset_list;
5005 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5006 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5007 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5008 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5009
5010 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5011
5012 while (charbuf < charbuf_end)
5013 {
5014 ASSURE_DESTINATION (safe_room);
5015 c = *charbuf++;
5016 /* Now encode the character C. */
5017 if (ASCII_CHAR_P (c) && ascii_compatible)
5018 EMIT_ONE_ASCII_BYTE (c);
5019 else if (CHAR_BYTE8_P (c))
5020 {
5021 c = CHAR_TO_BYTE8 (c);
5022 EMIT_ONE_BYTE (c);
5023 }
5024 else
5025 {
5026 unsigned code;
5027 struct charset *charset = char_charset (c, charset_list, &code);
5028
5029 if (!charset)
5030 {
5031 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5032 {
5033 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5034 charset = CHARSET_FROM_ID (charset_ascii);
5035 }
5036 else
5037 {
5038 c = coding->default_char;
5039 charset = char_charset (c, charset_list, &code);
5040 }
5041 }
5042 if (code == CHARSET_INVALID_CODE (charset))
5043 abort ();
5044 if (charset == charset_kanji)
5045 {
5046 int c1, c2;
5047 JIS_TO_SJIS (code);
5048 c1 = code >> 8, c2 = code & 0xFF;
5049 EMIT_TWO_BYTES (c1, c2);
5050 }
5051 else if (charset == charset_kana)
5052 EMIT_ONE_BYTE (code | 0x80);
5053 else if (charset_kanji2 && charset == charset_kanji2)
5054 {
5055 int c1, c2;
5056
5057 c1 = code >> 8;
5058 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
5059 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5060 {
5061 JIS_TO_SJIS2 (code);
5062 c1 = code >> 8, c2 = code & 0xFF;
5063 EMIT_TWO_BYTES (c1, c2);
5064 }
5065 else
5066 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5067 }
5068 else
5069 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5070 }
5071 }
5072 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5073 coding->produced_char += produced_chars;
5074 coding->produced = dst - coding->destination;
5075 return 0;
5076 }
5077
5078 static int
5079 encode_coding_big5 (coding)
5080 struct coding_system *coding;
5081 {
5082 int multibytep = coding->dst_multibyte;
5083 int *charbuf = coding->charbuf;
5084 int *charbuf_end = charbuf + coding->charbuf_used;
5085 unsigned char *dst = coding->destination + coding->produced;
5086 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5087 int safe_room = 4;
5088 int produced_chars = 0;
5089 Lisp_Object attrs, charset_list, val;
5090 int ascii_compatible;
5091 struct charset *charset_roman, *charset_big5;
5092 int c;
5093
5094 CODING_GET_INFO (coding, attrs, charset_list);
5095 val = charset_list;
5096 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5097 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5098 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5099
5100 while (charbuf < charbuf_end)
5101 {
5102 ASSURE_DESTINATION (safe_room);
5103 c = *charbuf++;
5104 /* Now encode the character C. */
5105 if (ASCII_CHAR_P (c) && ascii_compatible)
5106 EMIT_ONE_ASCII_BYTE (c);
5107 else if (CHAR_BYTE8_P (c))
5108 {
5109 c = CHAR_TO_BYTE8 (c);
5110 EMIT_ONE_BYTE (c);
5111 }
5112 else
5113 {
5114 unsigned code;
5115 struct charset *charset = char_charset (c, charset_list, &code);
5116
5117 if (! charset)
5118 {
5119 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5120 {
5121 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5122 charset = CHARSET_FROM_ID (charset_ascii);
5123 }
5124 else
5125 {
5126 c = coding->default_char;
5127 charset = char_charset (c, charset_list, &code);
5128 }
5129 }
5130 if (code == CHARSET_INVALID_CODE (charset))
5131 abort ();
5132 if (charset == charset_big5)
5133 {
5134 int c1, c2;
5135
5136 c1 = code >> 8, c2 = code & 0xFF;
5137 EMIT_TWO_BYTES (c1, c2);
5138 }
5139 else
5140 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5141 }
5142 }
5143 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5144 coding->produced_char += produced_chars;
5145 coding->produced = dst - coding->destination;
5146 return 0;
5147 }
5148
5149 \f
5150 /*** 10. CCL handlers ***/
5151
5152 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5153 Check if a text is encoded in a coding system of which
5154 encoder/decoder are written in CCL program. If it is, return
5155 CATEGORY_MASK_CCL, else return 0. */
5156
5157 static int
5158 detect_coding_ccl (coding, detect_info)
5159 struct coding_system *coding;
5160 struct coding_detection_info *detect_info;
5161 {
5162 const unsigned char *src = coding->source, *src_base;
5163 const unsigned char *src_end = coding->source + coding->src_bytes;
5164 int multibytep = coding->src_multibyte;
5165 int consumed_chars = 0;
5166 int found = 0;
5167 unsigned char *valids;
5168 int head_ascii = coding->head_ascii;
5169 Lisp_Object attrs;
5170
5171 detect_info->checked |= CATEGORY_MASK_CCL;
5172
5173 coding = &coding_categories[coding_category_ccl];
5174 valids = CODING_CCL_VALIDS (coding);
5175 attrs = CODING_ID_ATTRS (coding->id);
5176 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5177 src += head_ascii;
5178
5179 while (1)
5180 {
5181 int c;
5182
5183 src_base = src;
5184 ONE_MORE_BYTE (c);
5185 if (c < 0 || ! valids[c])
5186 break;
5187 if ((valids[c] > 1))
5188 found = CATEGORY_MASK_CCL;
5189 }
5190 detect_info->rejected |= CATEGORY_MASK_CCL;
5191 return 0;
5192
5193 no_more_source:
5194 detect_info->found |= found;
5195 return 1;
5196 }
5197
5198 static void
5199 decode_coding_ccl (coding)
5200 struct coding_system *coding;
5201 {
5202 const unsigned char *src = coding->source + coding->consumed;
5203 const unsigned char *src_end = coding->source + coding->src_bytes;
5204 int *charbuf = coding->charbuf + coding->charbuf_used;
5205 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5206 int consumed_chars = 0;
5207 int multibytep = coding->src_multibyte;
5208 struct ccl_program ccl;
5209 int source_charbuf[1024];
5210 int source_byteidx[1024];
5211 Lisp_Object attrs, charset_list;
5212
5213 CODING_GET_INFO (coding, attrs, charset_list);
5214 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
5215
5216 while (src < src_end)
5217 {
5218 const unsigned char *p = src;
5219 int *source, *source_end;
5220 int i = 0;
5221
5222 if (multibytep)
5223 while (i < 1024 && p < src_end)
5224 {
5225 source_byteidx[i] = p - src;
5226 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5227 }
5228 else
5229 while (i < 1024 && p < src_end)
5230 source_charbuf[i++] = *p++;
5231
5232 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5233 ccl.last_block = 1;
5234
5235 source = source_charbuf;
5236 source_end = source + i;
5237 while (source < source_end)
5238 {
5239 ccl_driver (&ccl, source, charbuf,
5240 source_end - source, charbuf_end - charbuf,
5241 charset_list);
5242 source += ccl.consumed;
5243 charbuf += ccl.produced;
5244 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
5245 break;
5246 }
5247 if (source < source_end)
5248 src += source_byteidx[source - source_charbuf];
5249 else
5250 src = p;
5251 consumed_chars += source - source_charbuf;
5252
5253 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
5254 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
5255 break;
5256 }
5257
5258 switch (ccl.status)
5259 {
5260 case CCL_STAT_SUSPEND_BY_SRC:
5261 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5262 break;
5263 case CCL_STAT_SUSPEND_BY_DST:
5264 break;
5265 case CCL_STAT_QUIT:
5266 case CCL_STAT_INVALID_CMD:
5267 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5268 break;
5269 default:
5270 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5271 break;
5272 }
5273 coding->consumed_char += consumed_chars;
5274 coding->consumed = src - coding->source;
5275 coding->charbuf_used = charbuf - coding->charbuf;
5276 }
5277
5278 static int
5279 encode_coding_ccl (coding)
5280 struct coding_system *coding;
5281 {
5282 struct ccl_program ccl;
5283 int multibytep = coding->dst_multibyte;
5284 int *charbuf = coding->charbuf;
5285 int *charbuf_end = charbuf + coding->charbuf_used;
5286 unsigned char *dst = coding->destination + coding->produced;
5287 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5288 int destination_charbuf[1024];
5289 int i, produced_chars = 0;
5290 Lisp_Object attrs, charset_list;
5291
5292 CODING_GET_INFO (coding, attrs, charset_list);
5293 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
5294
5295 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
5296 ccl.dst_multibyte = coding->dst_multibyte;
5297
5298 while (charbuf < charbuf_end)
5299 {
5300 ccl_driver (&ccl, charbuf, destination_charbuf,
5301 charbuf_end - charbuf, 1024, charset_list);
5302 if (multibytep)
5303 {
5304 ASSURE_DESTINATION (ccl.produced * 2);
5305 for (i = 0; i < ccl.produced; i++)
5306 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5307 }
5308 else
5309 {
5310 ASSURE_DESTINATION (ccl.produced);
5311 for (i = 0; i < ccl.produced; i++)
5312 *dst++ = destination_charbuf[i] & 0xFF;
5313 produced_chars += ccl.produced;
5314 }
5315 charbuf += ccl.consumed;
5316 if (ccl.status == CCL_STAT_QUIT
5317 || ccl.status == CCL_STAT_INVALID_CMD)
5318 break;
5319 }
5320
5321 switch (ccl.status)
5322 {
5323 case CCL_STAT_SUSPEND_BY_SRC:
5324 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5325 break;
5326 case CCL_STAT_SUSPEND_BY_DST:
5327 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5328 break;
5329 case CCL_STAT_QUIT:
5330 case CCL_STAT_INVALID_CMD:
5331 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5332 break;
5333 default:
5334 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5335 break;
5336 }
5337
5338 coding->produced_char += produced_chars;
5339 coding->produced = dst - coding->destination;
5340 return 0;
5341 }
5342
5343
5344 \f
5345 /*** 10, 11. no-conversion handlers ***/
5346
5347 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5348
5349 static void
5350 decode_coding_raw_text (coding)
5351 struct coding_system *coding;
5352 {
5353 int eol_crlf =
5354 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5355
5356 coding->chars_at_source = 1;
5357 coding->consumed_char = coding->src_chars;
5358 coding->consumed = coding->src_bytes;
5359 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5360 {
5361 coding->consumed_char--;
5362 coding->consumed--;
5363 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5364 }
5365 else
5366 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5367 }
5368
5369 static int
5370 encode_coding_raw_text (coding)
5371 struct coding_system *coding;
5372 {
5373 int multibytep = coding->dst_multibyte;
5374 int *charbuf = coding->charbuf;
5375 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5376 unsigned char *dst = coding->destination + coding->produced;
5377 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5378 int produced_chars = 0;
5379 int c;
5380
5381 if (multibytep)
5382 {
5383 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5384
5385 if (coding->src_multibyte)
5386 while (charbuf < charbuf_end)
5387 {
5388 ASSURE_DESTINATION (safe_room);
5389 c = *charbuf++;
5390 if (ASCII_CHAR_P (c))
5391 EMIT_ONE_ASCII_BYTE (c);
5392 else if (CHAR_BYTE8_P (c))
5393 {
5394 c = CHAR_TO_BYTE8 (c);
5395 EMIT_ONE_BYTE (c);
5396 }
5397 else
5398 {
5399 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5400
5401 CHAR_STRING_ADVANCE (c, p1);
5402 while (p0 < p1)
5403 {
5404 EMIT_ONE_BYTE (*p0);
5405 p0++;
5406 }
5407 }
5408 }
5409 else
5410 while (charbuf < charbuf_end)
5411 {
5412 ASSURE_DESTINATION (safe_room);
5413 c = *charbuf++;
5414 EMIT_ONE_BYTE (c);
5415 }
5416 }
5417 else
5418 {
5419 if (coding->src_multibyte)
5420 {
5421 int safe_room = MAX_MULTIBYTE_LENGTH;
5422
5423 while (charbuf < charbuf_end)
5424 {
5425 ASSURE_DESTINATION (safe_room);
5426 c = *charbuf++;
5427 if (ASCII_CHAR_P (c))
5428 *dst++ = c;
5429 else if (CHAR_BYTE8_P (c))
5430 *dst++ = CHAR_TO_BYTE8 (c);
5431 else
5432 CHAR_STRING_ADVANCE (c, dst);
5433 }
5434 }
5435 else
5436 {
5437 ASSURE_DESTINATION (charbuf_end - charbuf);
5438 while (charbuf < charbuf_end && dst < dst_end)
5439 *dst++ = *charbuf++;
5440 }
5441 produced_chars = dst - (coding->destination + coding->produced);
5442 }
5443 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5444 coding->produced_char += produced_chars;
5445 coding->produced = dst - coding->destination;
5446 return 0;
5447 }
5448
5449 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5450 Check if a text is encoded in a charset-based coding system. If it
5451 is, return 1, else return 0. */
5452
5453 static int
5454 detect_coding_charset (coding, detect_info)
5455 struct coding_system *coding;
5456 struct coding_detection_info *detect_info;
5457 {
5458 const unsigned char *src = coding->source, *src_base;
5459 const unsigned char *src_end = coding->source + coding->src_bytes;
5460 int multibytep = coding->src_multibyte;
5461 int consumed_chars = 0;
5462 Lisp_Object attrs, valids, name;
5463 int found = 0;
5464 int head_ascii = coding->head_ascii;
5465 int check_latin_extra = 0;
5466
5467 detect_info->checked |= CATEGORY_MASK_CHARSET;
5468
5469 coding = &coding_categories[coding_category_charset];
5470 attrs = CODING_ID_ATTRS (coding->id);
5471 valids = AREF (attrs, coding_attr_charset_valids);
5472 name = CODING_ID_NAME (coding->id);
5473 if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5474 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5475 || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5476 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5477 check_latin_extra = 1;
5478
5479 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5480 src += head_ascii;
5481
5482 while (1)
5483 {
5484 int c;
5485 Lisp_Object val;
5486 struct charset *charset;
5487 int dim, idx;
5488
5489 src_base = src;
5490 ONE_MORE_BYTE (c);
5491 if (c < 0)
5492 continue;
5493 val = AREF (valids, c);
5494 if (NILP (val))
5495 break;
5496 if (c >= 0x80)
5497 {
5498 if (c < 0xA0
5499 && check_latin_extra
5500 && (!VECTORP (Vlatin_extra_code_table)
5501 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5502 break;
5503 found = CATEGORY_MASK_CHARSET;
5504 }
5505 if (INTEGERP (val))
5506 {
5507 charset = CHARSET_FROM_ID (XFASTINT (val));
5508 dim = CHARSET_DIMENSION (charset);
5509 for (idx = 1; idx < dim; idx++)
5510 {
5511 if (src == src_end)
5512 goto too_short;
5513 ONE_MORE_BYTE (c);
5514 if (c < charset->code_space[(dim - 1 - idx) * 2]
5515 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5516 break;
5517 }
5518 if (idx < dim)
5519 break;
5520 }
5521 else
5522 {
5523 idx = 1;
5524 for (; CONSP (val); val = XCDR (val))
5525 {
5526 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5527 dim = CHARSET_DIMENSION (charset);
5528 while (idx < dim)
5529 {
5530 if (src == src_end)
5531 goto too_short;
5532 ONE_MORE_BYTE (c);
5533 if (c < charset->code_space[(dim - 1 - idx) * 4]
5534 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5535 break;
5536 idx++;
5537 }
5538 if (idx == dim)
5539 {
5540 val = Qnil;
5541 break;
5542 }
5543 }
5544 if (CONSP (val))
5545 break;
5546 }
5547 }
5548 too_short:
5549 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5550 return 0;
5551
5552 no_more_source:
5553 detect_info->found |= found;
5554 return 1;
5555 }
5556
5557 static void
5558 decode_coding_charset (coding)
5559 struct coding_system *coding;
5560 {
5561 const unsigned char *src = coding->source + coding->consumed;
5562 const unsigned char *src_end = coding->source + coding->src_bytes;
5563 const unsigned char *src_base;
5564 int *charbuf = coding->charbuf + coding->charbuf_used;
5565 /* We may produce one charset annocation in one loop and one more at
5566 the end. */
5567 int *charbuf_end
5568 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5569 int consumed_chars = 0, consumed_chars_base;
5570 int multibytep = coding->src_multibyte;
5571 Lisp_Object attrs, charset_list, valids;
5572 int char_offset = coding->produced_char;
5573 int last_offset = char_offset;
5574 int last_id = charset_ascii;
5575 int eol_crlf =
5576 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5577 int byte_after_cr = -1;
5578
5579 CODING_GET_INFO (coding, attrs, charset_list);
5580 valids = AREF (attrs, coding_attr_charset_valids);
5581
5582 while (1)
5583 {
5584 int c;
5585 Lisp_Object val;
5586 struct charset *charset;
5587 int dim;
5588 int len = 1;
5589 unsigned code;
5590
5591 src_base = src;
5592 consumed_chars_base = consumed_chars;
5593
5594 if (charbuf >= charbuf_end)
5595 {
5596 if (byte_after_cr >= 0)
5597 src_base--;
5598 break;
5599 }
5600
5601 if (byte_after_cr >= 0)
5602 {
5603 c = byte_after_cr;
5604 byte_after_cr = -1;
5605 }
5606 else
5607 {
5608 ONE_MORE_BYTE (c);
5609 if (eol_crlf && c == '\r')
5610 ONE_MORE_BYTE (byte_after_cr);
5611 }
5612 if (c < 0)
5613 goto invalid_code;
5614 code = c;
5615
5616 val = AREF (valids, c);
5617 if (! INTEGERP (val) && ! CONSP (val))
5618 goto invalid_code;
5619 if (INTEGERP (val))
5620 {
5621 charset = CHARSET_FROM_ID (XFASTINT (val));
5622 dim = CHARSET_DIMENSION (charset);
5623 while (len < dim)
5624 {
5625 ONE_MORE_BYTE (c);
5626 code = (code << 8) | c;
5627 len++;
5628 }
5629 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5630 charset, code, c);
5631 }
5632 else
5633 {
5634 /* VAL is a list of charset IDs. It is assured that the
5635 list is sorted by charset dimensions (smaller one
5636 comes first). */
5637 while (CONSP (val))
5638 {
5639 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5640 dim = CHARSET_DIMENSION (charset);
5641 while (len < dim)
5642 {
5643 ONE_MORE_BYTE (c);
5644 code = (code << 8) | c;
5645 len++;
5646 }
5647 CODING_DECODE_CHAR (coding, src, src_base,
5648 src_end, charset, code, c);
5649 if (c >= 0)
5650 break;
5651 val = XCDR (val);
5652 }
5653 }
5654 if (c < 0)
5655 goto invalid_code;
5656 if (charset->id != charset_ascii
5657 && last_id != charset->id)
5658 {
5659 if (last_id != charset_ascii)
5660 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5661 last_id = charset->id;
5662 last_offset = char_offset;
5663 }
5664
5665 *charbuf++ = c;
5666 char_offset++;
5667 continue;
5668
5669 invalid_code:
5670 src = src_base;
5671 consumed_chars = consumed_chars_base;
5672 ONE_MORE_BYTE (c);
5673 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5674 char_offset++;
5675 coding->errors++;
5676 }
5677
5678 no_more_source:
5679 if (last_id != charset_ascii)
5680 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5681 coding->consumed_char += consumed_chars_base;
5682 coding->consumed = src_base - coding->source;
5683 coding->charbuf_used = charbuf - coding->charbuf;
5684 }
5685
5686 static int
5687 encode_coding_charset (coding)
5688 struct coding_system *coding;
5689 {
5690 int multibytep = coding->dst_multibyte;
5691 int *charbuf = coding->charbuf;
5692 int *charbuf_end = charbuf + coding->charbuf_used;
5693 unsigned char *dst = coding->destination + coding->produced;
5694 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5695 int safe_room = MAX_MULTIBYTE_LENGTH;
5696 int produced_chars = 0;
5697 Lisp_Object attrs, charset_list;
5698 int ascii_compatible;
5699 int c;
5700
5701 CODING_GET_INFO (coding, attrs, charset_list);
5702 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5703
5704 while (charbuf < charbuf_end)
5705 {
5706 struct charset *charset;
5707 unsigned code;
5708
5709 ASSURE_DESTINATION (safe_room);
5710 c = *charbuf++;
5711 if (ascii_compatible && ASCII_CHAR_P (c))
5712 EMIT_ONE_ASCII_BYTE (c);
5713 else if (CHAR_BYTE8_P (c))
5714 {
5715 c = CHAR_TO_BYTE8 (c);
5716 EMIT_ONE_BYTE (c);
5717 }
5718 else
5719 {
5720 charset = char_charset (c, charset_list, &code);
5721 if (charset)
5722 {
5723 if (CHARSET_DIMENSION (charset) == 1)
5724 EMIT_ONE_BYTE (code);
5725 else if (CHARSET_DIMENSION (charset) == 2)
5726 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5727 else if (CHARSET_DIMENSION (charset) == 3)
5728 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5729 else
5730 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5731 (code >> 8) & 0xFF, code & 0xFF);
5732 }
5733 else
5734 {
5735 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5736 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5737 else
5738 c = coding->default_char;
5739 EMIT_ONE_BYTE (c);
5740 }
5741 }
5742 }
5743
5744 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5745 coding->produced_char += produced_chars;
5746 coding->produced = dst - coding->destination;
5747 return 0;
5748 }
5749
5750 \f
5751 /*** 7. C library functions ***/
5752
5753 /* Setup coding context CODING from information about CODING_SYSTEM.
5754 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5755 CODING_SYSTEM is invalid, signal an error. */
5756
5757 void
5758 setup_coding_system (coding_system, coding)
5759 Lisp_Object coding_system;
5760 struct coding_system *coding;
5761 {
5762 Lisp_Object attrs;
5763 Lisp_Object eol_type;
5764 Lisp_Object coding_type;
5765 Lisp_Object val;
5766
5767 if (NILP (coding_system))
5768 coding_system = Qundecided;
5769
5770 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5771
5772 attrs = CODING_ID_ATTRS (coding->id);
5773 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5774
5775 coding->mode = 0;
5776 coding->head_ascii = -1;
5777 if (VECTORP (eol_type))
5778 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5779 | CODING_REQUIRE_DETECTION_MASK);
5780 else if (! EQ (eol_type, Qunix))
5781 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5782 | CODING_REQUIRE_ENCODING_MASK);
5783 else
5784 coding->common_flags = 0;
5785 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5786 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5787 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5788 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5789 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5790 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5791
5792 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5793 coding->max_charset_id = SCHARS (val) - 1;
5794 coding->safe_charsets = SDATA (val);
5795 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5796
5797 coding_type = CODING_ATTR_TYPE (attrs);
5798 if (EQ (coding_type, Qundecided))
5799 {
5800 coding->detector = NULL;
5801 coding->decoder = decode_coding_raw_text;
5802 coding->encoder = encode_coding_raw_text;
5803 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5804 }
5805 else if (EQ (coding_type, Qiso_2022))
5806 {
5807 int i;
5808 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5809
5810 /* Invoke graphic register 0 to plane 0. */
5811 CODING_ISO_INVOCATION (coding, 0) = 0;
5812 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5813 CODING_ISO_INVOCATION (coding, 1)
5814 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5815 /* Setup the initial status of designation. */
5816 for (i = 0; i < 4; i++)
5817 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5818 /* Not single shifting initially. */
5819 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5820 /* Beginning of buffer should also be regarded as bol. */
5821 CODING_ISO_BOL (coding) = 1;
5822 coding->detector = detect_coding_iso_2022;
5823 coding->decoder = decode_coding_iso_2022;
5824 coding->encoder = encode_coding_iso_2022;
5825 if (flags & CODING_ISO_FLAG_SAFE)
5826 coding->mode |= CODING_MODE_SAFE_ENCODING;
5827 coding->common_flags
5828 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5829 | CODING_REQUIRE_FLUSHING_MASK);
5830 if (flags & CODING_ISO_FLAG_COMPOSITION)
5831 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5832 if (flags & CODING_ISO_FLAG_DESIGNATION)
5833 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5834 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5835 {
5836 setup_iso_safe_charsets (attrs);
5837 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5838 coding->max_charset_id = SCHARS (val) - 1;
5839 coding->safe_charsets = SDATA (val);
5840 }
5841 CODING_ISO_FLAGS (coding) = flags;
5842 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5843 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5844 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5845 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5846 }
5847 else if (EQ (coding_type, Qcharset))
5848 {
5849 coding->detector = detect_coding_charset;
5850 coding->decoder = decode_coding_charset;
5851 coding->encoder = encode_coding_charset;
5852 coding->common_flags
5853 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5854 }
5855 else if (EQ (coding_type, Qutf_8))
5856 {
5857 val = AREF (attrs, coding_attr_utf_bom);
5858 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5859 : EQ (val, Qt) ? utf_with_bom
5860 : utf_without_bom);
5861 coding->detector = detect_coding_utf_8;
5862 coding->decoder = decode_coding_utf_8;
5863 coding->encoder = encode_coding_utf_8;
5864 coding->common_flags
5865 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5866 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5867 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5868 }
5869 else if (EQ (coding_type, Qutf_16))
5870 {
5871 val = AREF (attrs, coding_attr_utf_bom);
5872 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5873 : EQ (val, Qt) ? utf_with_bom
5874 : utf_without_bom);
5875 val = AREF (attrs, coding_attr_utf_16_endian);
5876 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5877 : utf_16_little_endian);
5878 CODING_UTF_16_SURROGATE (coding) = 0;
5879 coding->detector = detect_coding_utf_16;
5880 coding->decoder = decode_coding_utf_16;
5881 coding->encoder = encode_coding_utf_16;
5882 coding->common_flags
5883 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5884 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5885 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5886 }
5887 else if (EQ (coding_type, Qccl))
5888 {
5889 coding->detector = detect_coding_ccl;
5890 coding->decoder = decode_coding_ccl;
5891 coding->encoder = encode_coding_ccl;
5892 coding->common_flags
5893 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5894 | CODING_REQUIRE_FLUSHING_MASK);
5895 }
5896 else if (EQ (coding_type, Qemacs_mule))
5897 {
5898 coding->detector = detect_coding_emacs_mule;
5899 coding->decoder = decode_coding_emacs_mule;
5900 coding->encoder = encode_coding_emacs_mule;
5901 coding->common_flags
5902 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5903 coding->spec.emacs_mule.full_support = 1;
5904 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5905 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5906 {
5907 Lisp_Object tail, safe_charsets;
5908 int max_charset_id = 0;
5909
5910 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5911 tail = XCDR (tail))
5912 if (max_charset_id < XFASTINT (XCAR (tail)))
5913 max_charset_id = XFASTINT (XCAR (tail));
5914 safe_charsets = make_uninit_string (max_charset_id + 1);
5915 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5916 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5917 tail = XCDR (tail))
5918 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5919 coding->max_charset_id = max_charset_id;
5920 coding->safe_charsets = SDATA (safe_charsets);
5921 coding->spec.emacs_mule.full_support = 1;
5922 }
5923 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5924 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5925 }
5926 else if (EQ (coding_type, Qshift_jis))
5927 {
5928 coding->detector = detect_coding_sjis;
5929 coding->decoder = decode_coding_sjis;
5930 coding->encoder = encode_coding_sjis;
5931 coding->common_flags
5932 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5933 }
5934 else if (EQ (coding_type, Qbig5))
5935 {
5936 coding->detector = detect_coding_big5;
5937 coding->decoder = decode_coding_big5;
5938 coding->encoder = encode_coding_big5;
5939 coding->common_flags
5940 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5941 }
5942 else /* EQ (coding_type, Qraw_text) */
5943 {
5944 coding->detector = NULL;
5945 coding->decoder = decode_coding_raw_text;
5946 coding->encoder = encode_coding_raw_text;
5947 if (! EQ (eol_type, Qunix))
5948 {
5949 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5950 if (! VECTORP (eol_type))
5951 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5952 }
5953
5954 }
5955
5956 return;
5957 }
5958
5959 /* Return a list of charsets supported by CODING. */
5960
5961 Lisp_Object
5962 coding_charset_list (coding)
5963 struct coding_system *coding;
5964 {
5965 Lisp_Object attrs, charset_list;
5966
5967 CODING_GET_INFO (coding, attrs, charset_list);
5968 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5969 {
5970 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5971
5972 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5973 charset_list = Viso_2022_charset_list;
5974 }
5975 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5976 {
5977 charset_list = Vemacs_mule_charset_list;
5978 }
5979 return charset_list;
5980 }
5981
5982
5983 /* Return a list of charsets supported by CODING-SYSTEM. */
5984
5985 Lisp_Object
5986 coding_system_charset_list (coding_system)
5987 Lisp_Object coding_system;
5988 {
5989 int id;
5990 Lisp_Object attrs, charset_list;
5991
5992 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5993 attrs = CODING_ID_ATTRS (id);
5994
5995 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5996 {
5997 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5998
5999 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6000 charset_list = Viso_2022_charset_list;
6001 else
6002 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6003 }
6004 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6005 {
6006 charset_list = Vemacs_mule_charset_list;
6007 }
6008 else
6009 {
6010 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6011 }
6012 return charset_list;
6013 }
6014
6015
6016 /* Return raw-text or one of its subsidiaries that has the same
6017 eol_type as CODING-SYSTEM. */
6018
6019 Lisp_Object
6020 raw_text_coding_system (coding_system)
6021 Lisp_Object coding_system;
6022 {
6023 Lisp_Object spec, attrs;
6024 Lisp_Object eol_type, raw_text_eol_type;
6025
6026 if (NILP (coding_system))
6027 return Qraw_text;
6028 spec = CODING_SYSTEM_SPEC (coding_system);
6029 attrs = AREF (spec, 0);
6030
6031 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6032 return coding_system;
6033
6034 eol_type = AREF (spec, 2);
6035 if (VECTORP (eol_type))
6036 return Qraw_text;
6037 spec = CODING_SYSTEM_SPEC (Qraw_text);
6038 raw_text_eol_type = AREF (spec, 2);
6039 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6040 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6041 : AREF (raw_text_eol_type, 2));
6042 }
6043
6044
6045 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
6046 does, return one of the subsidiary that has the same eol-spec as
6047 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
6048 inherit end-of-line format from the system's setting
6049 (system_eol_type). */
6050
6051 Lisp_Object
6052 coding_inherit_eol_type (coding_system, parent)
6053 Lisp_Object coding_system, parent;
6054 {
6055 Lisp_Object spec, eol_type;
6056
6057 if (NILP (coding_system))
6058 coding_system = Qraw_text;
6059 spec = CODING_SYSTEM_SPEC (coding_system);
6060 eol_type = AREF (spec, 2);
6061 if (VECTORP (eol_type))
6062 {
6063 Lisp_Object parent_eol_type;
6064
6065 if (! NILP (parent))
6066 {
6067 Lisp_Object parent_spec;
6068
6069 parent_spec = CODING_SYSTEM_SPEC (parent);
6070 parent_eol_type = AREF (parent_spec, 2);
6071 }
6072 else
6073 parent_eol_type = system_eol_type;
6074 if (EQ (parent_eol_type, Qunix))
6075 coding_system = AREF (eol_type, 0);
6076 else if (EQ (parent_eol_type, Qdos))
6077 coding_system = AREF (eol_type, 1);
6078 else if (EQ (parent_eol_type, Qmac))
6079 coding_system = AREF (eol_type, 2);
6080 }
6081 return coding_system;
6082 }
6083
6084 /* Emacs has a mechanism to automatically detect a coding system if it
6085 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6086 it's impossible to distinguish some coding systems accurately
6087 because they use the same range of codes. So, at first, coding
6088 systems are categorized into 7, those are:
6089
6090 o coding-category-emacs-mule
6091
6092 The category for a coding system which has the same code range
6093 as Emacs' internal format. Assigned the coding-system (Lisp
6094 symbol) `emacs-mule' by default.
6095
6096 o coding-category-sjis
6097
6098 The category for a coding system which has the same code range
6099 as SJIS. Assigned the coding-system (Lisp
6100 symbol) `japanese-shift-jis' by default.
6101
6102 o coding-category-iso-7
6103
6104 The category for a coding system which has the same code range
6105 as ISO2022 of 7-bit environment. This doesn't use any locking
6106 shift and single shift functions. This can encode/decode all
6107 charsets. Assigned the coding-system (Lisp symbol)
6108 `iso-2022-7bit' by default.
6109
6110 o coding-category-iso-7-tight
6111
6112 Same as coding-category-iso-7 except that this can
6113 encode/decode only the specified charsets.
6114
6115 o coding-category-iso-8-1
6116
6117 The category for a coding system which has the same code range
6118 as ISO2022 of 8-bit environment and graphic plane 1 used only
6119 for DIMENSION1 charset. This doesn't use any locking shift
6120 and single shift functions. Assigned the coding-system (Lisp
6121 symbol) `iso-latin-1' by default.
6122
6123 o coding-category-iso-8-2
6124
6125 The category for a coding system which has the same code range
6126 as ISO2022 of 8-bit environment and graphic plane 1 used only
6127 for DIMENSION2 charset. This doesn't use any locking shift
6128 and single shift functions. Assigned the coding-system (Lisp
6129 symbol) `japanese-iso-8bit' by default.
6130
6131 o coding-category-iso-7-else
6132
6133 The category for a coding system which has the same code range
6134 as ISO2022 of 7-bit environemnt but uses locking shift or
6135 single shift functions. Assigned the coding-system (Lisp
6136 symbol) `iso-2022-7bit-lock' by default.
6137
6138 o coding-category-iso-8-else
6139
6140 The category for a coding system which has the same code range
6141 as ISO2022 of 8-bit environemnt but uses locking shift or
6142 single shift functions. Assigned the coding-system (Lisp
6143 symbol) `iso-2022-8bit-ss2' by default.
6144
6145 o coding-category-big5
6146
6147 The category for a coding system which has the same code range
6148 as BIG5. Assigned the coding-system (Lisp symbol)
6149 `cn-big5' by default.
6150
6151 o coding-category-utf-8
6152
6153 The category for a coding system which has the same code range
6154 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6155 symbol) `utf-8' by default.
6156
6157 o coding-category-utf-16-be
6158
6159 The category for a coding system in which a text has an
6160 Unicode signature (cf. Unicode Standard) in the order of BIG
6161 endian at the head. Assigned the coding-system (Lisp symbol)
6162 `utf-16-be' by default.
6163
6164 o coding-category-utf-16-le
6165
6166 The category for a coding system in which a text has an
6167 Unicode signature (cf. Unicode Standard) in the order of
6168 LITTLE endian at the head. Assigned the coding-system (Lisp
6169 symbol) `utf-16-le' by default.
6170
6171 o coding-category-ccl
6172
6173 The category for a coding system of which encoder/decoder is
6174 written in CCL programs. The default value is nil, i.e., no
6175 coding system is assigned.
6176
6177 o coding-category-binary
6178
6179 The category for a coding system not categorized in any of the
6180 above. Assigned the coding-system (Lisp symbol)
6181 `no-conversion' by default.
6182
6183 Each of them is a Lisp symbol and the value is an actual
6184 `coding-system's (this is also a Lisp symbol) assigned by a user.
6185 What Emacs does actually is to detect a category of coding system.
6186 Then, it uses a `coding-system' assigned to it. If Emacs can't
6187 decide only one possible category, it selects a category of the
6188 highest priority. Priorities of categories are also specified by a
6189 user in a Lisp variable `coding-category-list'.
6190
6191 */
6192
6193 #define EOL_SEEN_NONE 0
6194 #define EOL_SEEN_LF 1
6195 #define EOL_SEEN_CR 2
6196 #define EOL_SEEN_CRLF 4
6197
6198 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6199 SOURCE is encoded. If CATEGORY is one of
6200 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6201 two-byte, else they are encoded by one-byte.
6202
6203 Return one of EOL_SEEN_XXX. */
6204
6205 #define MAX_EOL_CHECK_COUNT 3
6206
6207 static int
6208 detect_eol (source, src_bytes, category)
6209 const unsigned char *source;
6210 EMACS_INT src_bytes;
6211 enum coding_category category;
6212 {
6213 const unsigned char *src = source, *src_end = src + src_bytes;
6214 unsigned char c;
6215 int total = 0;
6216 int eol_seen = EOL_SEEN_NONE;
6217
6218 if ((1 << category) & CATEGORY_MASK_UTF_16)
6219 {
6220 int msb, lsb;
6221
6222 msb = category == (coding_category_utf_16_le
6223 | coding_category_utf_16_le_nosig);
6224 lsb = 1 - msb;
6225
6226 while (src + 1 < src_end)
6227 {
6228 c = src[lsb];
6229 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6230 {
6231 int this_eol;
6232
6233 if (c == '\n')
6234 this_eol = EOL_SEEN_LF;
6235 else if (src + 3 >= src_end
6236 || src[msb + 2] != 0
6237 || src[lsb + 2] != '\n')
6238 this_eol = EOL_SEEN_CR;
6239 else
6240 {
6241 this_eol = EOL_SEEN_CRLF;
6242 src += 2;
6243 }
6244
6245 if (eol_seen == EOL_SEEN_NONE)
6246 /* This is the first end-of-line. */
6247 eol_seen = this_eol;
6248 else if (eol_seen != this_eol)
6249 {
6250 /* The found type is different from what found before.
6251 Allow for stray ^M characters in DOS EOL files. */
6252 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6253 || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6254 eol_seen = EOL_SEEN_CRLF;
6255 else
6256 {
6257 eol_seen = EOL_SEEN_LF;
6258 break;
6259 }
6260 }
6261 if (++total == MAX_EOL_CHECK_COUNT)
6262 break;
6263 }
6264 src += 2;
6265 }
6266 }
6267 else
6268 {
6269 while (src < src_end)
6270 {
6271 c = *src++;
6272 if (c == '\n' || c == '\r')
6273 {
6274 int this_eol;
6275
6276 if (c == '\n')
6277 this_eol = EOL_SEEN_LF;
6278 else if (src >= src_end || *src != '\n')
6279 this_eol = EOL_SEEN_CR;
6280 else
6281 this_eol = EOL_SEEN_CRLF, src++;
6282
6283 if (eol_seen == EOL_SEEN_NONE)
6284 /* This is the first end-of-line. */
6285 eol_seen = this_eol;
6286 else if (eol_seen != this_eol)
6287 {
6288 /* The found type is different from what found before.
6289 Allow for stray ^M characters in DOS EOL files. */
6290 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6291 || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6292 eol_seen = EOL_SEEN_CRLF;
6293 else
6294 {
6295 eol_seen = EOL_SEEN_LF;
6296 break;
6297 }
6298 }
6299 if (++total == MAX_EOL_CHECK_COUNT)
6300 break;
6301 }
6302 }
6303 }
6304 return eol_seen;
6305 }
6306
6307
6308 static Lisp_Object
6309 adjust_coding_eol_type (coding, eol_seen)
6310 struct coding_system *coding;
6311 int eol_seen;
6312 {
6313 Lisp_Object eol_type;
6314
6315 eol_type = CODING_ID_EOL_TYPE (coding->id);
6316 if (eol_seen & EOL_SEEN_LF)
6317 {
6318 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6319 eol_type = Qunix;
6320 }
6321 else if (eol_seen & EOL_SEEN_CRLF)
6322 {
6323 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6324 eol_type = Qdos;
6325 }
6326 else if (eol_seen & EOL_SEEN_CR)
6327 {
6328 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6329 eol_type = Qmac;
6330 }
6331 return eol_type;
6332 }
6333
6334 /* Detect how a text specified in CODING is encoded. If a coding
6335 system is detected, update fields of CODING by the detected coding
6336 system. */
6337
6338 void
6339 detect_coding (coding)
6340 struct coding_system *coding;
6341 {
6342 const unsigned char *src, *src_end;
6343 int saved_mode = coding->mode;
6344
6345 coding->consumed = coding->consumed_char = 0;
6346 coding->produced = coding->produced_char = 0;
6347 coding_set_source (coding);
6348
6349 src_end = coding->source + coding->src_bytes;
6350 coding->head_ascii = 0;
6351
6352 /* If we have not yet decided the text encoding type, detect it
6353 now. */
6354 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6355 {
6356 int c, i;
6357 struct coding_detection_info detect_info;
6358 int null_byte_found = 0, eight_bit_found = 0;
6359
6360 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6361 for (src = coding->source; src < src_end; src++)
6362 {
6363 c = *src;
6364 if (c & 0x80)
6365 {
6366 eight_bit_found = 1;
6367 if (null_byte_found)
6368 break;
6369 }
6370 else if (c < 0x20)
6371 {
6372 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6373 && ! inhibit_iso_escape_detection
6374 && ! detect_info.checked)
6375 {
6376 if (detect_coding_iso_2022 (coding, &detect_info))
6377 {
6378 /* We have scanned the whole data. */
6379 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6380 {
6381 /* We didn't find an 8-bit code. We may
6382 have found a null-byte, but it's very
6383 rare that a binary file confirm to
6384 ISO-2022. */
6385 src = src_end;
6386 coding->head_ascii = src - coding->source;
6387 }
6388 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6389 break;
6390 }
6391 }
6392 else if (! c && !inhibit_null_byte_detection)
6393 {
6394 null_byte_found = 1;
6395 if (eight_bit_found)
6396 break;
6397 }
6398 if (! eight_bit_found)
6399 coding->head_ascii++;
6400 }
6401 else if (! eight_bit_found)
6402 coding->head_ascii++;
6403 }
6404
6405 if (null_byte_found || eight_bit_found
6406 || coding->head_ascii < coding->src_bytes
6407 || detect_info.found)
6408 {
6409 enum coding_category category;
6410 struct coding_system *this;
6411
6412 if (coding->head_ascii == coding->src_bytes)
6413 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6414 for (i = 0; i < coding_category_raw_text; i++)
6415 {
6416 category = coding_priorities[i];
6417 this = coding_categories + category;
6418 if (detect_info.found & (1 << category))
6419 break;
6420 }
6421 else
6422 {
6423 if (null_byte_found)
6424 {
6425 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6426 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6427 }
6428 for (i = 0; i < coding_category_raw_text; i++)
6429 {
6430 category = coding_priorities[i];
6431 this = coding_categories + category;
6432 if (this->id < 0)
6433 {
6434 /* No coding system of this category is defined. */
6435 detect_info.rejected |= (1 << category);
6436 }
6437 else if (category >= coding_category_raw_text)
6438 continue;
6439 else if (detect_info.checked & (1 << category))
6440 {
6441 if (detect_info.found & (1 << category))
6442 break;
6443 }
6444 else if ((*(this->detector)) (coding, &detect_info)
6445 && detect_info.found & (1 << category))
6446 {
6447 if (category == coding_category_utf_16_auto)
6448 {
6449 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6450 category = coding_category_utf_16_le;
6451 else
6452 category = coding_category_utf_16_be;
6453 }
6454 break;
6455 }
6456 }
6457 }
6458
6459 if (i < coding_category_raw_text)
6460 setup_coding_system (CODING_ID_NAME (this->id), coding);
6461 else if (null_byte_found)
6462 setup_coding_system (Qno_conversion, coding);
6463 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6464 == CATEGORY_MASK_ANY)
6465 setup_coding_system (Qraw_text, coding);
6466 else if (detect_info.rejected)
6467 for (i = 0; i < coding_category_raw_text; i++)
6468 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6469 {
6470 this = coding_categories + coding_priorities[i];
6471 setup_coding_system (CODING_ID_NAME (this->id), coding);
6472 break;
6473 }
6474 }
6475 }
6476 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6477 == coding_category_utf_8_auto)
6478 {
6479 Lisp_Object coding_systems;
6480 struct coding_detection_info detect_info;
6481
6482 coding_systems
6483 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6484 detect_info.found = detect_info.rejected = 0;
6485 coding->head_ascii = 0;
6486 if (CONSP (coding_systems)
6487 && detect_coding_utf_8 (coding, &detect_info))
6488 {
6489 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6490 setup_coding_system (XCAR (coding_systems), coding);
6491 else
6492 setup_coding_system (XCDR (coding_systems), coding);
6493 }
6494 }
6495 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6496 == coding_category_utf_16_auto)
6497 {
6498 Lisp_Object coding_systems;
6499 struct coding_detection_info detect_info;
6500
6501 coding_systems
6502 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6503 detect_info.found = detect_info.rejected = 0;
6504 coding->head_ascii = 0;
6505 if (CONSP (coding_systems)
6506 && detect_coding_utf_16 (coding, &detect_info))
6507 {
6508 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6509 setup_coding_system (XCAR (coding_systems), coding);
6510 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6511 setup_coding_system (XCDR (coding_systems), coding);
6512 }
6513 }
6514 coding->mode = saved_mode;
6515 }
6516
6517
6518 static void
6519 decode_eol (coding)
6520 struct coding_system *coding;
6521 {
6522 Lisp_Object eol_type;
6523 unsigned char *p, *pbeg, *pend;
6524
6525 eol_type = CODING_ID_EOL_TYPE (coding->id);
6526 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6527 return;
6528
6529 if (NILP (coding->dst_object))
6530 pbeg = coding->destination;
6531 else
6532 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6533 pend = pbeg + coding->produced;
6534
6535 if (VECTORP (eol_type))
6536 {
6537 int eol_seen = EOL_SEEN_NONE;
6538
6539 for (p = pbeg; p < pend; p++)
6540 {
6541 if (*p == '\n')
6542 eol_seen |= EOL_SEEN_LF;
6543 else if (*p == '\r')
6544 {
6545 if (p + 1 < pend && *(p + 1) == '\n')
6546 {
6547 eol_seen |= EOL_SEEN_CRLF;
6548 p++;
6549 }
6550 else
6551 eol_seen |= EOL_SEEN_CR;
6552 }
6553 }
6554 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6555 if ((eol_seen & EOL_SEEN_CRLF) != 0
6556 && (eol_seen & EOL_SEEN_CR) != 0
6557 && (eol_seen & EOL_SEEN_LF) == 0)
6558 eol_seen = EOL_SEEN_CRLF;
6559 else if (eol_seen != EOL_SEEN_NONE
6560 && eol_seen != EOL_SEEN_LF
6561 && eol_seen != EOL_SEEN_CRLF
6562 && eol_seen != EOL_SEEN_CR)
6563 eol_seen = EOL_SEEN_LF;
6564 if (eol_seen != EOL_SEEN_NONE)
6565 eol_type = adjust_coding_eol_type (coding, eol_seen);
6566 }
6567
6568 if (EQ (eol_type, Qmac))
6569 {
6570 for (p = pbeg; p < pend; p++)
6571 if (*p == '\r')
6572 *p = '\n';
6573 }
6574 else if (EQ (eol_type, Qdos))
6575 {
6576 int n = 0;
6577
6578 if (NILP (coding->dst_object))
6579 {
6580 /* Start deleting '\r' from the tail to minimize the memory
6581 movement. */
6582 for (p = pend - 2; p >= pbeg; p--)
6583 if (*p == '\r')
6584 {
6585 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6586 n++;
6587 }
6588 }
6589 else
6590 {
6591 int pos_byte = coding->dst_pos_byte;
6592 int pos = coding->dst_pos;
6593 int pos_end = pos + coding->produced_char - 1;
6594
6595 while (pos < pos_end)
6596 {
6597 p = BYTE_POS_ADDR (pos_byte);
6598 if (*p == '\r' && p[1] == '\n')
6599 {
6600 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6601 n++;
6602 pos_end--;
6603 }
6604 pos++;
6605 if (coding->dst_multibyte)
6606 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6607 else
6608 pos_byte++;
6609 }
6610 }
6611 coding->produced -= n;
6612 coding->produced_char -= n;
6613 }
6614 }
6615
6616
6617 /* Return a translation table (or list of them) from coding system
6618 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6619 decoding (ENCODEP is zero). */
6620
6621 static Lisp_Object
6622 get_translation_table (attrs, encodep, max_lookup)
6623 Lisp_Object attrs;
6624 int encodep, *max_lookup;
6625 {
6626 Lisp_Object standard, translation_table;
6627 Lisp_Object val;
6628
6629 if (NILP (Venable_character_translation))
6630 {
6631 if (max_lookup)
6632 *max_lookup = 0;
6633 return Qnil;
6634 }
6635 if (encodep)
6636 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6637 standard = Vstandard_translation_table_for_encode;
6638 else
6639 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6640 standard = Vstandard_translation_table_for_decode;
6641 if (NILP (translation_table))
6642 translation_table = standard;
6643 else
6644 {
6645 if (SYMBOLP (translation_table))
6646 translation_table = Fget (translation_table, Qtranslation_table);
6647 else if (CONSP (translation_table))
6648 {
6649 translation_table = Fcopy_sequence (translation_table);
6650 for (val = translation_table; CONSP (val); val = XCDR (val))
6651 if (SYMBOLP (XCAR (val)))
6652 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6653 }
6654 if (CHAR_TABLE_P (standard))
6655 {
6656 if (CONSP (translation_table))
6657 translation_table = nconc2 (translation_table,
6658 Fcons (standard, Qnil));
6659 else
6660 translation_table = Fcons (translation_table,
6661 Fcons (standard, Qnil));
6662 }
6663 }
6664
6665 if (max_lookup)
6666 {
6667 *max_lookup = 1;
6668 if (CHAR_TABLE_P (translation_table)
6669 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6670 {
6671 val = XCHAR_TABLE (translation_table)->extras[1];
6672 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6673 *max_lookup = XFASTINT (val);
6674 }
6675 else if (CONSP (translation_table))
6676 {
6677 Lisp_Object tail, val;
6678
6679 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6680 if (CHAR_TABLE_P (XCAR (tail))
6681 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6682 {
6683 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6684 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6685 *max_lookup = XFASTINT (val);
6686 }
6687 }
6688 }
6689 return translation_table;
6690 }
6691
6692 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6693 do { \
6694 trans = Qnil; \
6695 if (CHAR_TABLE_P (table)) \
6696 { \
6697 trans = CHAR_TABLE_REF (table, c); \
6698 if (CHARACTERP (trans)) \
6699 c = XFASTINT (trans), trans = Qnil; \
6700 } \
6701 else if (CONSP (table)) \
6702 { \
6703 Lisp_Object tail; \
6704 \
6705 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6706 if (CHAR_TABLE_P (XCAR (tail))) \
6707 { \
6708 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6709 if (CHARACTERP (trans)) \
6710 c = XFASTINT (trans), trans = Qnil; \
6711 else if (! NILP (trans)) \
6712 break; \
6713 } \
6714 } \
6715 } while (0)
6716
6717
6718 /* Return a translation of character(s) at BUF according to TRANS.
6719 TRANS is TO-CHAR or ((FROM . TO) ...) where
6720 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6721 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6722 translation is found, and Qnil if not found..
6723 If BUF is too short to lookup characters in FROM, return Qt. */
6724
6725 static Lisp_Object
6726 get_translation (trans, buf, buf_end)
6727 Lisp_Object trans;
6728 int *buf, *buf_end;
6729 {
6730
6731 if (INTEGERP (trans))
6732 return trans;
6733 for (; CONSP (trans); trans = XCDR (trans))
6734 {
6735 Lisp_Object val = XCAR (trans);
6736 Lisp_Object from = XCAR (val);
6737 int len = ASIZE (from);
6738 int i;
6739
6740 for (i = 0; i < len; i++)
6741 {
6742 if (buf + i == buf_end)
6743 return Qt;
6744 if (XINT (AREF (from, i)) != buf[i])
6745 break;
6746 }
6747 if (i == len)
6748 return val;
6749 }
6750 return Qnil;
6751 }
6752
6753
6754 static int
6755 produce_chars (coding, translation_table, last_block)
6756 struct coding_system *coding;
6757 Lisp_Object translation_table;
6758 int last_block;
6759 {
6760 unsigned char *dst = coding->destination + coding->produced;
6761 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6762 EMACS_INT produced;
6763 EMACS_INT produced_chars = 0;
6764 int carryover = 0;
6765
6766 if (! coding->chars_at_source)
6767 {
6768 /* Source characters are in coding->charbuf. */
6769 int *buf = coding->charbuf;
6770 int *buf_end = buf + coding->charbuf_used;
6771
6772 if (EQ (coding->src_object, coding->dst_object))
6773 {
6774 coding_set_source (coding);
6775 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6776 }
6777
6778 while (buf < buf_end)
6779 {
6780 int c = *buf, i;
6781
6782 if (c >= 0)
6783 {
6784 int from_nchars = 1, to_nchars = 1;
6785 Lisp_Object trans = Qnil;
6786
6787 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6788 if (! NILP (trans))
6789 {
6790 trans = get_translation (trans, buf, buf_end);
6791 if (INTEGERP (trans))
6792 c = XINT (trans);
6793 else if (CONSP (trans))
6794 {
6795 from_nchars = ASIZE (XCAR (trans));
6796 trans = XCDR (trans);
6797 if (INTEGERP (trans))
6798 c = XINT (trans);
6799 else
6800 {
6801 to_nchars = ASIZE (trans);
6802 c = XINT (AREF (trans, 0));
6803 }
6804 }
6805 else if (EQ (trans, Qt) && ! last_block)
6806 break;
6807 }
6808
6809 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6810 {
6811 dst = alloc_destination (coding,
6812 buf_end - buf
6813 + MAX_MULTIBYTE_LENGTH * to_nchars,
6814 dst);
6815 if (EQ (coding->src_object, coding->dst_object))
6816 {
6817 coding_set_source (coding);
6818 dst_end = (((unsigned char *) coding->source)
6819 + coding->consumed);
6820 }
6821 else
6822 dst_end = coding->destination + coding->dst_bytes;
6823 }
6824
6825 for (i = 0; i < to_nchars; i++)
6826 {
6827 if (i > 0)
6828 c = XINT (AREF (trans, i));
6829 if (coding->dst_multibyte
6830 || ! CHAR_BYTE8_P (c))
6831 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6832 else
6833 *dst++ = CHAR_TO_BYTE8 (c);
6834 }
6835 produced_chars += to_nchars;
6836 buf += from_nchars;
6837 }
6838 else
6839 /* This is an annotation datum. (-C) is the length. */
6840 buf += -c;
6841 }
6842 carryover = buf_end - buf;
6843 }
6844 else
6845 {
6846 /* Source characters are at coding->source. */
6847 const unsigned char *src = coding->source;
6848 const unsigned char *src_end = src + coding->consumed;
6849
6850 if (EQ (coding->dst_object, coding->src_object))
6851 dst_end = (unsigned char *) src;
6852 if (coding->src_multibyte != coding->dst_multibyte)
6853 {
6854 if (coding->src_multibyte)
6855 {
6856 int multibytep = 1;
6857 EMACS_INT consumed_chars = 0;
6858
6859 while (1)
6860 {
6861 const unsigned char *src_base = src;
6862 int c;
6863
6864 ONE_MORE_BYTE (c);
6865 if (dst == dst_end)
6866 {
6867 if (EQ (coding->src_object, coding->dst_object))
6868 dst_end = (unsigned char *) src;
6869 if (dst == dst_end)
6870 {
6871 EMACS_INT offset = src - coding->source;
6872
6873 dst = alloc_destination (coding, src_end - src + 1,
6874 dst);
6875 dst_end = coding->destination + coding->dst_bytes;
6876 coding_set_source (coding);
6877 src = coding->source + offset;
6878 src_end = coding->source + coding->src_bytes;
6879 if (EQ (coding->src_object, coding->dst_object))
6880 dst_end = (unsigned char *) src;
6881 }
6882 }
6883 *dst++ = c;
6884 produced_chars++;
6885 }
6886 no_more_source:
6887 ;
6888 }
6889 else
6890 while (src < src_end)
6891 {
6892 int multibytep = 1;
6893 int c = *src++;
6894
6895 if (dst >= dst_end - 1)
6896 {
6897 if (EQ (coding->src_object, coding->dst_object))
6898 dst_end = (unsigned char *) src;
6899 if (dst >= dst_end - 1)
6900 {
6901 EMACS_INT offset = src - coding->source;
6902 EMACS_INT more_bytes;
6903
6904 if (EQ (coding->src_object, coding->dst_object))
6905 more_bytes = ((src_end - src) / 2) + 2;
6906 else
6907 more_bytes = src_end - src + 2;
6908 dst = alloc_destination (coding, more_bytes, dst);
6909 dst_end = coding->destination + coding->dst_bytes;
6910 coding_set_source (coding);
6911 src = coding->source + offset;
6912 src_end = coding->source + coding->src_bytes;
6913 if (EQ (coding->src_object, coding->dst_object))
6914 dst_end = (unsigned char *) src;
6915 }
6916 }
6917 EMIT_ONE_BYTE (c);
6918 }
6919 }
6920 else
6921 {
6922 if (!EQ (coding->src_object, coding->dst_object))
6923 {
6924 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6925
6926 if (require > 0)
6927 {
6928 EMACS_INT offset = src - coding->source;
6929
6930 dst = alloc_destination (coding, require, dst);
6931 coding_set_source (coding);
6932 src = coding->source + offset;
6933 src_end = coding->source + coding->src_bytes;
6934 }
6935 }
6936 produced_chars = coding->consumed_char;
6937 while (src < src_end)
6938 *dst++ = *src++;
6939 }
6940 }
6941
6942 produced = dst - (coding->destination + coding->produced);
6943 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6944 insert_from_gap (produced_chars, produced);
6945 coding->produced += produced;
6946 coding->produced_char += produced_chars;
6947 return carryover;
6948 }
6949
6950 /* Compose text in CODING->object according to the annotation data at
6951 CHARBUF. CHARBUF is an array:
6952 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6953 */
6954
6955 static INLINE void
6956 produce_composition (coding, charbuf, pos)
6957 struct coding_system *coding;
6958 int *charbuf;
6959 EMACS_INT pos;
6960 {
6961 int len;
6962 EMACS_INT to;
6963 enum composition_method method;
6964 Lisp_Object components;
6965
6966 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6967 to = pos + charbuf[2];
6968 method = (enum composition_method) (charbuf[4]);
6969
6970 if (method == COMPOSITION_RELATIVE)
6971 components = Qnil;
6972 else
6973 {
6974 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6975 int i, j;
6976
6977 if (method == COMPOSITION_WITH_RULE)
6978 len = charbuf[2] * 3 - 2;
6979 charbuf += MAX_ANNOTATION_LENGTH;
6980 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6981 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6982 {
6983 if (charbuf[i] >= 0)
6984 args[j] = make_number (charbuf[i]);
6985 else
6986 {
6987 i++;
6988 args[j] = make_number (charbuf[i] % 0x100);
6989 }
6990 }
6991 components = (i == j ? Fstring (j, args) : Fvector (j, args));
6992 }
6993 compose_text (pos, to, components, Qnil, coding->dst_object);
6994 }
6995
6996
6997 /* Put `charset' property on text in CODING->object according to
6998 the annotation data at CHARBUF. CHARBUF is an array:
6999 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7000 */
7001
7002 static INLINE void
7003 produce_charset (coding, charbuf, pos)
7004 struct coding_system *coding;
7005 int *charbuf;
7006 EMACS_INT pos;
7007 {
7008 EMACS_INT from = pos - charbuf[2];
7009 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7010
7011 Fput_text_property (make_number (from), make_number (pos),
7012 Qcharset, CHARSET_NAME (charset),
7013 coding->dst_object);
7014 }
7015
7016
7017 #define CHARBUF_SIZE 0x4000
7018
7019 #define ALLOC_CONVERSION_WORK_AREA(coding) \
7020 do { \
7021 int size = CHARBUF_SIZE; \
7022 \
7023 coding->charbuf = NULL; \
7024 while (size > 1024) \
7025 { \
7026 coding->charbuf = (int *) alloca (sizeof (int) * size); \
7027 if (coding->charbuf) \
7028 break; \
7029 size >>= 1; \
7030 } \
7031 if (! coding->charbuf) \
7032 { \
7033 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
7034 return coding->result; \
7035 } \
7036 coding->charbuf_size = size; \
7037 } while (0)
7038
7039
7040 static void
7041 produce_annotation (coding, pos)
7042 struct coding_system *coding;
7043 EMACS_INT pos;
7044 {
7045 int *charbuf = coding->charbuf;
7046 int *charbuf_end = charbuf + coding->charbuf_used;
7047
7048 if (NILP (coding->dst_object))
7049 return;
7050
7051 while (charbuf < charbuf_end)
7052 {
7053 if (*charbuf >= 0)
7054 pos++, charbuf++;
7055 else
7056 {
7057 int len = -*charbuf;
7058
7059 if (len > 2)
7060 switch (charbuf[1])
7061 {
7062 case CODING_ANNOTATE_COMPOSITION_MASK:
7063 produce_composition (coding, charbuf, pos);
7064 break;
7065 case CODING_ANNOTATE_CHARSET_MASK:
7066 produce_charset (coding, charbuf, pos);
7067 break;
7068 }
7069 charbuf += len;
7070 }
7071 }
7072 }
7073
7074 /* Decode the data at CODING->src_object into CODING->dst_object.
7075 CODING->src_object is a buffer, a string, or nil.
7076 CODING->dst_object is a buffer.
7077
7078 If CODING->src_object is a buffer, it must be the current buffer.
7079 In this case, if CODING->src_pos is positive, it is a position of
7080 the source text in the buffer, otherwise, the source text is in the
7081 gap area of the buffer, and CODING->src_pos specifies the offset of
7082 the text from GPT (which must be the same as PT). If this is the
7083 same buffer as CODING->dst_object, CODING->src_pos must be
7084 negative.
7085
7086 If CODING->src_object is a string, CODING->src_pos is an index to
7087 that string.
7088
7089 If CODING->src_object is nil, CODING->source must already point to
7090 the non-relocatable memory area. In this case, CODING->src_pos is
7091 an offset from CODING->source.
7092
7093 The decoded data is inserted at the current point of the buffer
7094 CODING->dst_object.
7095 */
7096
7097 static int
7098 decode_coding (coding)
7099 struct coding_system *coding;
7100 {
7101 Lisp_Object attrs;
7102 Lisp_Object undo_list;
7103 Lisp_Object translation_table;
7104 int carryover;
7105 int i;
7106
7107 if (BUFFERP (coding->src_object)
7108 && coding->src_pos > 0
7109 && coding->src_pos < GPT
7110 && coding->src_pos + coding->src_chars > GPT)
7111 move_gap_both (coding->src_pos, coding->src_pos_byte);
7112
7113 undo_list = Qt;
7114 if (BUFFERP (coding->dst_object))
7115 {
7116 if (current_buffer != XBUFFER (coding->dst_object))
7117 set_buffer_internal (XBUFFER (coding->dst_object));
7118 if (GPT != PT)
7119 move_gap_both (PT, PT_BYTE);
7120 undo_list = current_buffer->undo_list;
7121 current_buffer->undo_list = Qt;
7122 }
7123
7124 coding->consumed = coding->consumed_char = 0;
7125 coding->produced = coding->produced_char = 0;
7126 coding->chars_at_source = 0;
7127 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7128 coding->errors = 0;
7129
7130 ALLOC_CONVERSION_WORK_AREA (coding);
7131
7132 attrs = CODING_ID_ATTRS (coding->id);
7133 translation_table = get_translation_table (attrs, 0, NULL);
7134
7135 carryover = 0;
7136 do
7137 {
7138 EMACS_INT pos = coding->dst_pos + coding->produced_char;
7139
7140 coding_set_source (coding);
7141 coding->annotated = 0;
7142 coding->charbuf_used = carryover;
7143 (*(coding->decoder)) (coding);
7144 coding_set_destination (coding);
7145 carryover = produce_chars (coding, translation_table, 0);
7146 if (coding->annotated)
7147 produce_annotation (coding, pos);
7148 for (i = 0; i < carryover; i++)
7149 coding->charbuf[i]
7150 = coding->charbuf[coding->charbuf_used - carryover + i];
7151 }
7152 while (coding->consumed < coding->src_bytes
7153 && (coding->result == CODING_RESULT_SUCCESS
7154 || coding->result == CODING_RESULT_INVALID_SRC));
7155
7156 if (carryover > 0)
7157 {
7158 coding_set_destination (coding);
7159 coding->charbuf_used = carryover;
7160 produce_chars (coding, translation_table, 1);
7161 }
7162
7163 coding->carryover_bytes = 0;
7164 if (coding->consumed < coding->src_bytes)
7165 {
7166 int nbytes = coding->src_bytes - coding->consumed;
7167 const unsigned char *src;
7168
7169 coding_set_source (coding);
7170 coding_set_destination (coding);
7171 src = coding->source + coding->consumed;
7172
7173 if (coding->mode & CODING_MODE_LAST_BLOCK)
7174 {
7175 /* Flush out unprocessed data as binary chars. We are sure
7176 that the number of data is less than the size of
7177 coding->charbuf. */
7178 coding->charbuf_used = 0;
7179 coding->chars_at_source = 0;
7180
7181 while (nbytes-- > 0)
7182 {
7183 int c = *src++;
7184
7185 if (c & 0x80)
7186 c = BYTE8_TO_CHAR (c);
7187 coding->charbuf[coding->charbuf_used++] = c;
7188 }
7189 produce_chars (coding, Qnil, 1);
7190 }
7191 else
7192 {
7193 /* Record unprocessed bytes in coding->carryover. We are
7194 sure that the number of data is less than the size of
7195 coding->carryover. */
7196 unsigned char *p = coding->carryover;
7197
7198 if (nbytes > sizeof coding->carryover)
7199 nbytes = sizeof coding->carryover;
7200 coding->carryover_bytes = nbytes;
7201 while (nbytes-- > 0)
7202 *p++ = *src++;
7203 }
7204 coding->consumed = coding->src_bytes;
7205 }
7206
7207 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7208 && !inhibit_eol_conversion)
7209 decode_eol (coding);
7210 if (BUFFERP (coding->dst_object))
7211 {
7212 current_buffer->undo_list = undo_list;
7213 record_insert (coding->dst_pos, coding->produced_char);
7214 }
7215 return coding->result;
7216 }
7217
7218
7219 /* Extract an annotation datum from a composition starting at POS and
7220 ending before LIMIT of CODING->src_object (buffer or string), store
7221 the data in BUF, set *STOP to a starting position of the next
7222 composition (if any) or to LIMIT, and return the address of the
7223 next element of BUF.
7224
7225 If such an annotation is not found, set *STOP to a starting
7226 position of a composition after POS (if any) or to LIMIT, and
7227 return BUF. */
7228
7229 static INLINE int *
7230 handle_composition_annotation (pos, limit, coding, buf, stop)
7231 EMACS_INT pos, limit;
7232 struct coding_system *coding;
7233 int *buf;
7234 EMACS_INT *stop;
7235 {
7236 EMACS_INT start, end;
7237 Lisp_Object prop;
7238
7239 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7240 || end > limit)
7241 *stop = limit;
7242 else if (start > pos)
7243 *stop = start;
7244 else
7245 {
7246 if (start == pos)
7247 {
7248 /* We found a composition. Store the corresponding
7249 annotation data in BUF. */
7250 int *head = buf;
7251 enum composition_method method = COMPOSITION_METHOD (prop);
7252 int nchars = COMPOSITION_LENGTH (prop);
7253
7254 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7255 if (method != COMPOSITION_RELATIVE)
7256 {
7257 Lisp_Object components;
7258 int len, i, i_byte;
7259
7260 components = COMPOSITION_COMPONENTS (prop);
7261 if (VECTORP (components))
7262 {
7263 len = XVECTOR (components)->size;
7264 for (i = 0; i < len; i++)
7265 *buf++ = XINT (AREF (components, i));
7266 }
7267 else if (STRINGP (components))
7268 {
7269 len = SCHARS (components);
7270 i = i_byte = 0;
7271 while (i < len)
7272 {
7273 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7274 buf++;
7275 }
7276 }
7277 else if (INTEGERP (components))
7278 {
7279 len = 1;
7280 *buf++ = XINT (components);
7281 }
7282 else if (CONSP (components))
7283 {
7284 for (len = 0; CONSP (components);
7285 len++, components = XCDR (components))
7286 *buf++ = XINT (XCAR (components));
7287 }
7288 else
7289 abort ();
7290 *head -= len;
7291 }
7292 }
7293
7294 if (find_composition (end, limit, &start, &end, &prop,
7295 coding->src_object)
7296 && end <= limit)
7297 *stop = start;
7298 else
7299 *stop = limit;
7300 }
7301 return buf;
7302 }
7303
7304
7305 /* Extract an annotation datum from a text property `charset' at POS of
7306 CODING->src_object (buffer of string), store the data in BUF, set
7307 *STOP to the position where the value of `charset' property changes
7308 (limiting by LIMIT), and return the address of the next element of
7309 BUF.
7310
7311 If the property value is nil, set *STOP to the position where the
7312 property value is non-nil (limiting by LIMIT), and return BUF. */
7313
7314 static INLINE int *
7315 handle_charset_annotation (pos, limit, coding, buf, stop)
7316 EMACS_INT pos, limit;
7317 struct coding_system *coding;
7318 int *buf;
7319 EMACS_INT *stop;
7320 {
7321 Lisp_Object val, next;
7322 int id;
7323
7324 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7325 if (! NILP (val) && CHARSETP (val))
7326 id = XINT (CHARSET_SYMBOL_ID (val));
7327 else
7328 id = -1;
7329 ADD_CHARSET_DATA (buf, 0, id);
7330 next = Fnext_single_property_change (make_number (pos), Qcharset,
7331 coding->src_object,
7332 make_number (limit));
7333 *stop = XINT (next);
7334 return buf;
7335 }
7336
7337
7338 static void
7339 consume_chars (coding, translation_table, max_lookup)
7340 struct coding_system *coding;
7341 Lisp_Object translation_table;
7342 int max_lookup;
7343 {
7344 int *buf = coding->charbuf;
7345 int *buf_end = coding->charbuf + coding->charbuf_size;
7346 const unsigned char *src = coding->source + coding->consumed;
7347 const unsigned char *src_end = coding->source + coding->src_bytes;
7348 EMACS_INT pos = coding->src_pos + coding->consumed_char;
7349 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7350 int multibytep = coding->src_multibyte;
7351 Lisp_Object eol_type;
7352 int c;
7353 EMACS_INT stop, stop_composition, stop_charset;
7354 int *lookup_buf = NULL;
7355
7356 if (! NILP (translation_table))
7357 lookup_buf = alloca (sizeof (int) * max_lookup);
7358
7359 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7360 if (VECTORP (eol_type))
7361 eol_type = Qunix;
7362
7363 /* Note: composition handling is not yet implemented. */
7364 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7365
7366 if (NILP (coding->src_object))
7367 stop = stop_composition = stop_charset = end_pos;
7368 else
7369 {
7370 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7371 stop = stop_composition = pos;
7372 else
7373 stop = stop_composition = end_pos;
7374 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7375 stop = stop_charset = pos;
7376 else
7377 stop_charset = end_pos;
7378 }
7379
7380 /* Compensate for CRLF and conversion. */
7381 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7382 while (buf < buf_end)
7383 {
7384 Lisp_Object trans;
7385
7386 if (pos == stop)
7387 {
7388 if (pos == end_pos)
7389 break;
7390 if (pos == stop_composition)
7391 buf = handle_composition_annotation (pos, end_pos, coding,
7392 buf, &stop_composition);
7393 if (pos == stop_charset)
7394 buf = handle_charset_annotation (pos, end_pos, coding,
7395 buf, &stop_charset);
7396 stop = (stop_composition < stop_charset
7397 ? stop_composition : stop_charset);
7398 }
7399
7400 if (! multibytep)
7401 {
7402 EMACS_INT bytes;
7403
7404 if (coding->encoder == encode_coding_raw_text)
7405 c = *src++, pos++;
7406 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7407 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7408 else
7409 c = BYTE8_TO_CHAR (*src), src++, pos++;
7410 }
7411 else
7412 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7413 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7414 c = '\n';
7415 if (! EQ (eol_type, Qunix))
7416 {
7417 if (c == '\n')
7418 {
7419 if (EQ (eol_type, Qdos))
7420 *buf++ = '\r';
7421 else
7422 c = '\r';
7423 }
7424 }
7425
7426 trans = Qnil;
7427 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7428 if (NILP (trans))
7429 *buf++ = c;
7430 else
7431 {
7432 int from_nchars = 1, to_nchars = 1;
7433 int *lookup_buf_end;
7434 const unsigned char *p = src;
7435 int i;
7436
7437 lookup_buf[0] = c;
7438 for (i = 1; i < max_lookup && p < src_end; i++)
7439 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7440 lookup_buf_end = lookup_buf + i;
7441 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7442 if (INTEGERP (trans))
7443 c = XINT (trans);
7444 else if (CONSP (trans))
7445 {
7446 from_nchars = ASIZE (XCAR (trans));
7447 trans = XCDR (trans);
7448 if (INTEGERP (trans))
7449 c = XINT (trans);
7450 else
7451 {
7452 to_nchars = ASIZE (trans);
7453 if (buf + to_nchars > buf_end)
7454 break;
7455 c = XINT (AREF (trans, 0));
7456 }
7457 }
7458 else
7459 break;
7460 *buf++ = c;
7461 for (i = 1; i < to_nchars; i++)
7462 *buf++ = XINT (AREF (trans, i));
7463 for (i = 1; i < from_nchars; i++, pos++)
7464 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7465 }
7466 }
7467
7468 coding->consumed = src - coding->source;
7469 coding->consumed_char = pos - coding->src_pos;
7470 coding->charbuf_used = buf - coding->charbuf;
7471 coding->chars_at_source = 0;
7472 }
7473
7474
7475 /* Encode the text at CODING->src_object into CODING->dst_object.
7476 CODING->src_object is a buffer or a string.
7477 CODING->dst_object is a buffer or nil.
7478
7479 If CODING->src_object is a buffer, it must be the current buffer.
7480 In this case, if CODING->src_pos is positive, it is a position of
7481 the source text in the buffer, otherwise. the source text is in the
7482 gap area of the buffer, and coding->src_pos specifies the offset of
7483 the text from GPT (which must be the same as PT). If this is the
7484 same buffer as CODING->dst_object, CODING->src_pos must be
7485 negative and CODING should not have `pre-write-conversion'.
7486
7487 If CODING->src_object is a string, CODING should not have
7488 `pre-write-conversion'.
7489
7490 If CODING->dst_object is a buffer, the encoded data is inserted at
7491 the current point of that buffer.
7492
7493 If CODING->dst_object is nil, the encoded data is placed at the
7494 memory area specified by CODING->destination. */
7495
7496 static int
7497 encode_coding (coding)
7498 struct coding_system *coding;
7499 {
7500 Lisp_Object attrs;
7501 Lisp_Object translation_table;
7502 int max_lookup;
7503
7504 attrs = CODING_ID_ATTRS (coding->id);
7505 if (coding->encoder == encode_coding_raw_text)
7506 translation_table = Qnil, max_lookup = 0;
7507 else
7508 translation_table = get_translation_table (attrs, 1, &max_lookup);
7509
7510 if (BUFFERP (coding->dst_object))
7511 {
7512 set_buffer_internal (XBUFFER (coding->dst_object));
7513 coding->dst_multibyte
7514 = ! NILP (current_buffer->enable_multibyte_characters);
7515 }
7516
7517 coding->consumed = coding->consumed_char = 0;
7518 coding->produced = coding->produced_char = 0;
7519 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7520 coding->errors = 0;
7521
7522 ALLOC_CONVERSION_WORK_AREA (coding);
7523
7524 do {
7525 coding_set_source (coding);
7526 consume_chars (coding, translation_table, max_lookup);
7527 coding_set_destination (coding);
7528 (*(coding->encoder)) (coding);
7529 } while (coding->consumed_char < coding->src_chars);
7530
7531 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7532 insert_from_gap (coding->produced_char, coding->produced);
7533
7534 return (coding->result);
7535 }
7536
7537
7538 /* Name (or base name) of work buffer for code conversion. */
7539 static Lisp_Object Vcode_conversion_workbuf_name;
7540
7541 /* A working buffer used by the top level conversion. Once it is
7542 created, it is never destroyed. It has the name
7543 Vcode_conversion_workbuf_name. The other working buffers are
7544 destroyed after the use is finished, and their names are modified
7545 versions of Vcode_conversion_workbuf_name. */
7546 static Lisp_Object Vcode_conversion_reused_workbuf;
7547
7548 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7549 static int reused_workbuf_in_use;
7550
7551
7552 /* Return a working buffer of code convesion. MULTIBYTE specifies the
7553 multibyteness of returning buffer. */
7554
7555 static Lisp_Object
7556 make_conversion_work_buffer (multibyte)
7557 int multibyte;
7558 {
7559 Lisp_Object name, workbuf;
7560 struct buffer *current;
7561
7562 if (reused_workbuf_in_use++)
7563 {
7564 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7565 workbuf = Fget_buffer_create (name);
7566 }
7567 else
7568 {
7569 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7570 Vcode_conversion_reused_workbuf
7571 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7572 workbuf = Vcode_conversion_reused_workbuf;
7573 }
7574 current = current_buffer;
7575 set_buffer_internal (XBUFFER (workbuf));
7576 /* We can't allow modification hooks to run in the work buffer. For
7577 instance, directory_files_internal assumes that file decoding
7578 doesn't compile new regexps. */
7579 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7580 Ferase_buffer ();
7581 current_buffer->undo_list = Qt;
7582 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7583 set_buffer_internal (current);
7584 return workbuf;
7585 }
7586
7587
7588 static Lisp_Object
7589 code_conversion_restore (arg)
7590 Lisp_Object arg;
7591 {
7592 Lisp_Object current, workbuf;
7593 struct gcpro gcpro1;
7594
7595 GCPRO1 (arg);
7596 current = XCAR (arg);
7597 workbuf = XCDR (arg);
7598 if (! NILP (workbuf))
7599 {
7600 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7601 reused_workbuf_in_use = 0;
7602 else if (! NILP (Fbuffer_live_p (workbuf)))
7603 Fkill_buffer (workbuf);
7604 }
7605 set_buffer_internal (XBUFFER (current));
7606 UNGCPRO;
7607 return Qnil;
7608 }
7609
7610 Lisp_Object
7611 code_conversion_save (with_work_buf, multibyte)
7612 int with_work_buf, multibyte;
7613 {
7614 Lisp_Object workbuf = Qnil;
7615
7616 if (with_work_buf)
7617 workbuf = make_conversion_work_buffer (multibyte);
7618 record_unwind_protect (code_conversion_restore,
7619 Fcons (Fcurrent_buffer (), workbuf));
7620 return workbuf;
7621 }
7622
7623 int
7624 decode_coding_gap (coding, chars, bytes)
7625 struct coding_system *coding;
7626 EMACS_INT chars, bytes;
7627 {
7628 int count = specpdl_ptr - specpdl;
7629 Lisp_Object attrs;
7630
7631 code_conversion_save (0, 0);
7632
7633 coding->src_object = Fcurrent_buffer ();
7634 coding->src_chars = chars;
7635 coding->src_bytes = bytes;
7636 coding->src_pos = -chars;
7637 coding->src_pos_byte = -bytes;
7638 coding->src_multibyte = chars < bytes;
7639 coding->dst_object = coding->src_object;
7640 coding->dst_pos = PT;
7641 coding->dst_pos_byte = PT_BYTE;
7642 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7643
7644 if (CODING_REQUIRE_DETECTION (coding))
7645 detect_coding (coding);
7646
7647 coding->mode |= CODING_MODE_LAST_BLOCK;
7648 current_buffer->text->inhibit_shrinking = 1;
7649 decode_coding (coding);
7650 current_buffer->text->inhibit_shrinking = 0;
7651
7652 attrs = CODING_ID_ATTRS (coding->id);
7653 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7654 {
7655 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7656 Lisp_Object val;
7657
7658 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7659 val = call1 (CODING_ATTR_POST_READ (attrs),
7660 make_number (coding->produced_char));
7661 CHECK_NATNUM (val);
7662 coding->produced_char += Z - prev_Z;
7663 coding->produced += Z_BYTE - prev_Z_BYTE;
7664 }
7665
7666 unbind_to (count, Qnil);
7667 return coding->result;
7668 }
7669
7670 int
7671 encode_coding_gap (coding, chars, bytes)
7672 struct coding_system *coding;
7673 EMACS_INT chars, bytes;
7674 {
7675 int count = specpdl_ptr - specpdl;
7676
7677 code_conversion_save (0, 0);
7678
7679 coding->src_object = Fcurrent_buffer ();
7680 coding->src_chars = chars;
7681 coding->src_bytes = bytes;
7682 coding->src_pos = -chars;
7683 coding->src_pos_byte = -bytes;
7684 coding->src_multibyte = chars < bytes;
7685 coding->dst_object = coding->src_object;
7686 coding->dst_pos = PT;
7687 coding->dst_pos_byte = PT_BYTE;
7688
7689 encode_coding (coding);
7690
7691 unbind_to (count, Qnil);
7692 return coding->result;
7693 }
7694
7695
7696 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7697 SRC_OBJECT into DST_OBJECT by coding context CODING.
7698
7699 SRC_OBJECT is a buffer, a string, or Qnil.
7700
7701 If it is a buffer, the text is at point of the buffer. FROM and TO
7702 are positions in the buffer.
7703
7704 If it is a string, the text is at the beginning of the string.
7705 FROM and TO are indices to the string.
7706
7707 If it is nil, the text is at coding->source. FROM and TO are
7708 indices to coding->source.
7709
7710 DST_OBJECT is a buffer, Qt, or Qnil.
7711
7712 If it is a buffer, the decoded text is inserted at point of the
7713 buffer. If the buffer is the same as SRC_OBJECT, the source text
7714 is deleted.
7715
7716 If it is Qt, a string is made from the decoded text, and
7717 set in CODING->dst_object.
7718
7719 If it is Qnil, the decoded text is stored at CODING->destination.
7720 The caller must allocate CODING->dst_bytes bytes at
7721 CODING->destination by xmalloc. If the decoded text is longer than
7722 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7723 */
7724
7725 void
7726 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7727 dst_object)
7728 struct coding_system *coding;
7729 Lisp_Object src_object;
7730 EMACS_INT from, from_byte, to, to_byte;
7731 Lisp_Object dst_object;
7732 {
7733 int count = specpdl_ptr - specpdl;
7734 unsigned char *destination;
7735 EMACS_INT dst_bytes;
7736 EMACS_INT chars = to - from;
7737 EMACS_INT bytes = to_byte - from_byte;
7738 Lisp_Object attrs;
7739 int saved_pt = -1, saved_pt_byte;
7740 int need_marker_adjustment = 0;
7741 Lisp_Object old_deactivate_mark;
7742
7743 old_deactivate_mark = Vdeactivate_mark;
7744
7745 if (NILP (dst_object))
7746 {
7747 destination = coding->destination;
7748 dst_bytes = coding->dst_bytes;
7749 }
7750
7751 coding->src_object = src_object;
7752 coding->src_chars = chars;
7753 coding->src_bytes = bytes;
7754 coding->src_multibyte = chars < bytes;
7755
7756 if (STRINGP (src_object))
7757 {
7758 coding->src_pos = from;
7759 coding->src_pos_byte = from_byte;
7760 }
7761 else if (BUFFERP (src_object))
7762 {
7763 set_buffer_internal (XBUFFER (src_object));
7764 if (from != GPT)
7765 move_gap_both (from, from_byte);
7766 if (EQ (src_object, dst_object))
7767 {
7768 struct Lisp_Marker *tail;
7769
7770 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7771 {
7772 tail->need_adjustment
7773 = tail->charpos == (tail->insertion_type ? from : to);
7774 need_marker_adjustment |= tail->need_adjustment;
7775 }
7776 saved_pt = PT, saved_pt_byte = PT_BYTE;
7777 TEMP_SET_PT_BOTH (from, from_byte);
7778 current_buffer->text->inhibit_shrinking = 1;
7779 del_range_both (from, from_byte, to, to_byte, 1);
7780 coding->src_pos = -chars;
7781 coding->src_pos_byte = -bytes;
7782 }
7783 else
7784 {
7785 coding->src_pos = from;
7786 coding->src_pos_byte = from_byte;
7787 }
7788 }
7789
7790 if (CODING_REQUIRE_DETECTION (coding))
7791 detect_coding (coding);
7792 attrs = CODING_ID_ATTRS (coding->id);
7793
7794 if (EQ (dst_object, Qt)
7795 || (! NILP (CODING_ATTR_POST_READ (attrs))
7796 && NILP (dst_object)))
7797 {
7798 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7799 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7800 coding->dst_pos = BEG;
7801 coding->dst_pos_byte = BEG_BYTE;
7802 }
7803 else if (BUFFERP (dst_object))
7804 {
7805 code_conversion_save (0, 0);
7806 coding->dst_object = dst_object;
7807 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7808 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7809 coding->dst_multibyte
7810 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7811 }
7812 else
7813 {
7814 code_conversion_save (0, 0);
7815 coding->dst_object = Qnil;
7816 /* Most callers presume this will return a multibyte result, and they
7817 won't use `binary' or `raw-text' anyway, so let's not worry about
7818 CODING_FOR_UNIBYTE. */
7819 coding->dst_multibyte = 1;
7820 }
7821
7822 decode_coding (coding);
7823
7824 if (BUFFERP (coding->dst_object))
7825 set_buffer_internal (XBUFFER (coding->dst_object));
7826
7827 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7828 {
7829 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7830 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7831 Lisp_Object val;
7832
7833 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7834 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7835 old_deactivate_mark);
7836 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7837 make_number (coding->produced_char));
7838 UNGCPRO;
7839 CHECK_NATNUM (val);
7840 coding->produced_char += Z - prev_Z;
7841 coding->produced += Z_BYTE - prev_Z_BYTE;
7842 }
7843
7844 if (EQ (dst_object, Qt))
7845 {
7846 coding->dst_object = Fbuffer_string ();
7847 }
7848 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7849 {
7850 set_buffer_internal (XBUFFER (coding->dst_object));
7851 if (dst_bytes < coding->produced)
7852 {
7853 destination = xrealloc (destination, coding->produced);
7854 if (! destination)
7855 {
7856 record_conversion_result (coding,
7857 CODING_RESULT_INSUFFICIENT_DST);
7858 unbind_to (count, Qnil);
7859 return;
7860 }
7861 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7862 move_gap_both (BEGV, BEGV_BYTE);
7863 bcopy (BEGV_ADDR, destination, coding->produced);
7864 coding->destination = destination;
7865 }
7866 }
7867
7868 if (saved_pt >= 0)
7869 {
7870 /* This is the case of:
7871 (BUFFERP (src_object) && EQ (src_object, dst_object))
7872 As we have moved PT while replacing the original buffer
7873 contents, we must recover it now. */
7874 set_buffer_internal (XBUFFER (src_object));
7875 current_buffer->text->inhibit_shrinking = 0;
7876 if (saved_pt < from)
7877 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7878 else if (saved_pt < from + chars)
7879 TEMP_SET_PT_BOTH (from, from_byte);
7880 else if (! NILP (current_buffer->enable_multibyte_characters))
7881 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7882 saved_pt_byte + (coding->produced - bytes));
7883 else
7884 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7885 saved_pt_byte + (coding->produced - bytes));
7886
7887 if (need_marker_adjustment)
7888 {
7889 struct Lisp_Marker *tail;
7890
7891 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7892 if (tail->need_adjustment)
7893 {
7894 tail->need_adjustment = 0;
7895 if (tail->insertion_type)
7896 {
7897 tail->bytepos = from_byte;
7898 tail->charpos = from;
7899 }
7900 else
7901 {
7902 tail->bytepos = from_byte + coding->produced;
7903 tail->charpos
7904 = (NILP (current_buffer->enable_multibyte_characters)
7905 ? tail->bytepos : from + coding->produced_char);
7906 }
7907 }
7908 }
7909 }
7910
7911 Vdeactivate_mark = old_deactivate_mark;
7912 unbind_to (count, coding->dst_object);
7913 }
7914
7915
7916 void
7917 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7918 dst_object)
7919 struct coding_system *coding;
7920 Lisp_Object src_object;
7921 EMACS_INT from, from_byte, to, to_byte;
7922 Lisp_Object dst_object;
7923 {
7924 int count = specpdl_ptr - specpdl;
7925 EMACS_INT chars = to - from;
7926 EMACS_INT bytes = to_byte - from_byte;
7927 Lisp_Object attrs;
7928 int saved_pt = -1, saved_pt_byte;
7929 int need_marker_adjustment = 0;
7930 int kill_src_buffer = 0;
7931 Lisp_Object old_deactivate_mark;
7932
7933 old_deactivate_mark = Vdeactivate_mark;
7934
7935 coding->src_object = src_object;
7936 coding->src_chars = chars;
7937 coding->src_bytes = bytes;
7938 coding->src_multibyte = chars < bytes;
7939
7940 attrs = CODING_ID_ATTRS (coding->id);
7941
7942 if (EQ (src_object, dst_object))
7943 {
7944 struct Lisp_Marker *tail;
7945
7946 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7947 {
7948 tail->need_adjustment
7949 = tail->charpos == (tail->insertion_type ? from : to);
7950 need_marker_adjustment |= tail->need_adjustment;
7951 }
7952 }
7953
7954 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7955 {
7956 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7957 set_buffer_internal (XBUFFER (coding->src_object));
7958 if (STRINGP (src_object))
7959 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7960 else if (BUFFERP (src_object))
7961 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7962 else
7963 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7964
7965 if (EQ (src_object, dst_object))
7966 {
7967 set_buffer_internal (XBUFFER (src_object));
7968 saved_pt = PT, saved_pt_byte = PT_BYTE;
7969 del_range_both (from, from_byte, to, to_byte, 1);
7970 set_buffer_internal (XBUFFER (coding->src_object));
7971 }
7972
7973 {
7974 Lisp_Object args[3];
7975 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7976
7977 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7978 old_deactivate_mark);
7979 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7980 args[1] = make_number (BEG);
7981 args[2] = make_number (Z);
7982 safe_call (3, args);
7983 UNGCPRO;
7984 }
7985 if (XBUFFER (coding->src_object) != current_buffer)
7986 kill_src_buffer = 1;
7987 coding->src_object = Fcurrent_buffer ();
7988 if (BEG != GPT)
7989 move_gap_both (BEG, BEG_BYTE);
7990 coding->src_chars = Z - BEG;
7991 coding->src_bytes = Z_BYTE - BEG_BYTE;
7992 coding->src_pos = BEG;
7993 coding->src_pos_byte = BEG_BYTE;
7994 coding->src_multibyte = Z < Z_BYTE;
7995 }
7996 else if (STRINGP (src_object))
7997 {
7998 code_conversion_save (0, 0);
7999 coding->src_pos = from;
8000 coding->src_pos_byte = from_byte;
8001 }
8002 else if (BUFFERP (src_object))
8003 {
8004 code_conversion_save (0, 0);
8005 set_buffer_internal (XBUFFER (src_object));
8006 if (EQ (src_object, dst_object))
8007 {
8008 saved_pt = PT, saved_pt_byte = PT_BYTE;
8009 coding->src_object = del_range_1 (from, to, 1, 1);
8010 coding->src_pos = 0;
8011 coding->src_pos_byte = 0;
8012 }
8013 else
8014 {
8015 if (from < GPT && to >= GPT)
8016 move_gap_both (from, from_byte);
8017 coding->src_pos = from;
8018 coding->src_pos_byte = from_byte;
8019 }
8020 }
8021 else
8022 code_conversion_save (0, 0);
8023
8024 if (BUFFERP (dst_object))
8025 {
8026 coding->dst_object = dst_object;
8027 if (EQ (src_object, dst_object))
8028 {
8029 coding->dst_pos = from;
8030 coding->dst_pos_byte = from_byte;
8031 }
8032 else
8033 {
8034 struct buffer *current = current_buffer;
8035
8036 set_buffer_temp (XBUFFER (dst_object));
8037 coding->dst_pos = PT;
8038 coding->dst_pos_byte = PT_BYTE;
8039 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8040 set_buffer_temp (current);
8041 }
8042 coding->dst_multibyte
8043 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8044 }
8045 else if (EQ (dst_object, Qt))
8046 {
8047 coding->dst_object = Qnil;
8048 coding->dst_bytes = coding->src_chars;
8049 if (coding->dst_bytes == 0)
8050 coding->dst_bytes = 1;
8051 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8052 coding->dst_multibyte = 0;
8053 }
8054 else
8055 {
8056 coding->dst_object = Qnil;
8057 coding->dst_multibyte = 0;
8058 }
8059
8060 encode_coding (coding);
8061
8062 if (EQ (dst_object, Qt))
8063 {
8064 if (BUFFERP (coding->dst_object))
8065 coding->dst_object = Fbuffer_string ();
8066 else
8067 {
8068 coding->dst_object
8069 = make_unibyte_string ((char *) coding->destination,
8070 coding->produced);
8071 xfree (coding->destination);
8072 }
8073 }
8074
8075 if (saved_pt >= 0)
8076 {
8077 /* This is the case of:
8078 (BUFFERP (src_object) && EQ (src_object, dst_object))
8079 As we have moved PT while replacing the original buffer
8080 contents, we must recover it now. */
8081 set_buffer_internal (XBUFFER (src_object));
8082 if (saved_pt < from)
8083 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8084 else if (saved_pt < from + chars)
8085 TEMP_SET_PT_BOTH (from, from_byte);
8086 else if (! NILP (current_buffer->enable_multibyte_characters))
8087 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8088 saved_pt_byte + (coding->produced - bytes));
8089 else
8090 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8091 saved_pt_byte + (coding->produced - bytes));
8092
8093 if (need_marker_adjustment)
8094 {
8095 struct Lisp_Marker *tail;
8096
8097 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8098 if (tail->need_adjustment)
8099 {
8100 tail->need_adjustment = 0;
8101 if (tail->insertion_type)
8102 {
8103 tail->bytepos = from_byte;
8104 tail->charpos = from;
8105 }
8106 else
8107 {
8108 tail->bytepos = from_byte + coding->produced;
8109 tail->charpos
8110 = (NILP (current_buffer->enable_multibyte_characters)
8111 ? tail->bytepos : from + coding->produced_char);
8112 }
8113 }
8114 }
8115 }
8116
8117 if (kill_src_buffer)
8118 Fkill_buffer (coding->src_object);
8119
8120 Vdeactivate_mark = old_deactivate_mark;
8121 unbind_to (count, Qnil);
8122 }
8123
8124
8125 Lisp_Object
8126 preferred_coding_system ()
8127 {
8128 int id = coding_categories[coding_priorities[0]].id;
8129
8130 return CODING_ID_NAME (id);
8131 }
8132
8133 \f
8134 #ifdef emacs
8135 /*** 8. Emacs Lisp library functions ***/
8136
8137 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8138 doc: /* Return t if OBJECT is nil or a coding-system.
8139 See the documentation of `define-coding-system' for information
8140 about coding-system objects. */)
8141 (object)
8142 Lisp_Object object;
8143 {
8144 if (NILP (object)
8145 || CODING_SYSTEM_ID (object) >= 0)
8146 return Qt;
8147 if (! SYMBOLP (object)
8148 || NILP (Fget (object, Qcoding_system_define_form)))
8149 return Qnil;
8150 return Qt;
8151 }
8152
8153 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8154 Sread_non_nil_coding_system, 1, 1, 0,
8155 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8156 (prompt)
8157 Lisp_Object prompt;
8158 {
8159 Lisp_Object val;
8160 do
8161 {
8162 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8163 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8164 }
8165 while (SCHARS (val) == 0);
8166 return (Fintern (val, Qnil));
8167 }
8168
8169 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8170 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8171 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8172 Ignores case when completing coding systems (all Emacs coding systems
8173 are lower-case). */)
8174 (prompt, default_coding_system)
8175 Lisp_Object prompt, default_coding_system;
8176 {
8177 Lisp_Object val;
8178 int count = SPECPDL_INDEX ();
8179
8180 if (SYMBOLP (default_coding_system))
8181 default_coding_system = SYMBOL_NAME (default_coding_system);
8182 specbind (Qcompletion_ignore_case, Qt);
8183 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8184 Qt, Qnil, Qcoding_system_history,
8185 default_coding_system, Qnil);
8186 unbind_to (count, Qnil);
8187 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8188 }
8189
8190 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8191 1, 1, 0,
8192 doc: /* Check validity of CODING-SYSTEM.
8193 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8194 It is valid if it is nil or a symbol defined as a coding system by the
8195 function `define-coding-system'. */)
8196 (coding_system)
8197 Lisp_Object coding_system;
8198 {
8199 Lisp_Object define_form;
8200
8201 define_form = Fget (coding_system, Qcoding_system_define_form);
8202 if (! NILP (define_form))
8203 {
8204 Fput (coding_system, Qcoding_system_define_form, Qnil);
8205 safe_eval (define_form);
8206 }
8207 if (!NILP (Fcoding_system_p (coding_system)))
8208 return coding_system;
8209 xsignal1 (Qcoding_system_error, coding_system);
8210 }
8211
8212 \f
8213 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8214 HIGHEST is nonzero, return the coding system of the highest
8215 priority among the detected coding systems. Otherwize return a
8216 list of detected coding systems sorted by their priorities. If
8217 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8218 multibyte form but contains only ASCII and eight-bit chars.
8219 Otherwise, the bytes are raw bytes.
8220
8221 CODING-SYSTEM controls the detection as below:
8222
8223 If it is nil, detect both text-format and eol-format. If the
8224 text-format part of CODING-SYSTEM is already specified
8225 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8226 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8227 detect only text-format. */
8228
8229 Lisp_Object
8230 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8231 coding_system)
8232 const unsigned char *src;
8233 EMACS_INT src_chars, src_bytes;
8234 int highest;
8235 int multibytep;
8236 Lisp_Object coding_system;
8237 {
8238 const unsigned char *src_end = src + src_bytes;
8239 Lisp_Object attrs, eol_type;
8240 Lisp_Object val = Qnil;
8241 struct coding_system coding;
8242 int id;
8243 struct coding_detection_info detect_info;
8244 enum coding_category base_category;
8245 int null_byte_found = 0, eight_bit_found = 0;
8246
8247 if (NILP (coding_system))
8248 coding_system = Qundecided;
8249 setup_coding_system (coding_system, &coding);
8250 attrs = CODING_ID_ATTRS (coding.id);
8251 eol_type = CODING_ID_EOL_TYPE (coding.id);
8252 coding_system = CODING_ATTR_BASE_NAME (attrs);
8253
8254 coding.source = src;
8255 coding.src_chars = src_chars;
8256 coding.src_bytes = src_bytes;
8257 coding.src_multibyte = multibytep;
8258 coding.consumed = 0;
8259 coding.mode |= CODING_MODE_LAST_BLOCK;
8260 coding.head_ascii = 0;
8261
8262 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8263
8264 /* At first, detect text-format if necessary. */
8265 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8266 if (base_category == coding_category_undecided)
8267 {
8268 enum coding_category category;
8269 struct coding_system *this;
8270 int c, i;
8271
8272 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8273 for (; src < src_end; src++)
8274 {
8275 c = *src;
8276 if (c & 0x80)
8277 {
8278 eight_bit_found = 1;
8279 if (null_byte_found)
8280 break;
8281 }
8282 else if (c < 0x20)
8283 {
8284 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8285 && ! inhibit_iso_escape_detection
8286 && ! detect_info.checked)
8287 {
8288 if (detect_coding_iso_2022 (&coding, &detect_info))
8289 {
8290 /* We have scanned the whole data. */
8291 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8292 {
8293 /* We didn't find an 8-bit code. We may
8294 have found a null-byte, but it's very
8295 rare that a binary file confirm to
8296 ISO-2022. */
8297 src = src_end;
8298 coding.head_ascii = src - coding.source;
8299 }
8300 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8301 break;
8302 }
8303 }
8304 else if (! c && !inhibit_null_byte_detection)
8305 {
8306 null_byte_found = 1;
8307 if (eight_bit_found)
8308 break;
8309 }
8310 if (! eight_bit_found)
8311 coding.head_ascii++;
8312 }
8313 else if (! eight_bit_found)
8314 coding.head_ascii++;
8315 }
8316
8317 if (null_byte_found || eight_bit_found
8318 || coding.head_ascii < coding.src_bytes
8319 || detect_info.found)
8320 {
8321 if (coding.head_ascii == coding.src_bytes)
8322 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8323 for (i = 0; i < coding_category_raw_text; i++)
8324 {
8325 category = coding_priorities[i];
8326 this = coding_categories + category;
8327 if (detect_info.found & (1 << category))
8328 break;
8329 }
8330 else
8331 {
8332 if (null_byte_found)
8333 {
8334 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8335 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8336 }
8337 for (i = 0; i < coding_category_raw_text; i++)
8338 {
8339 category = coding_priorities[i];
8340 this = coding_categories + category;
8341
8342 if (this->id < 0)
8343 {
8344 /* No coding system of this category is defined. */
8345 detect_info.rejected |= (1 << category);
8346 }
8347 else if (category >= coding_category_raw_text)
8348 continue;
8349 else if (detect_info.checked & (1 << category))
8350 {
8351 if (highest
8352 && (detect_info.found & (1 << category)))
8353 break;
8354 }
8355 else if ((*(this->detector)) (&coding, &detect_info)
8356 && highest
8357 && (detect_info.found & (1 << category)))
8358 {
8359 if (category == coding_category_utf_16_auto)
8360 {
8361 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8362 category = coding_category_utf_16_le;
8363 else
8364 category = coding_category_utf_16_be;
8365 }
8366 break;
8367 }
8368 }
8369 }
8370 }
8371
8372 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8373 || null_byte_found)
8374 {
8375 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8376 id = CODING_SYSTEM_ID (Qno_conversion);
8377 val = Fcons (make_number (id), Qnil);
8378 }
8379 else if (! detect_info.rejected && ! detect_info.found)
8380 {
8381 detect_info.found = CATEGORY_MASK_ANY;
8382 id = coding_categories[coding_category_undecided].id;
8383 val = Fcons (make_number (id), Qnil);
8384 }
8385 else if (highest)
8386 {
8387 if (detect_info.found)
8388 {
8389 detect_info.found = 1 << category;
8390 val = Fcons (make_number (this->id), Qnil);
8391 }
8392 else
8393 for (i = 0; i < coding_category_raw_text; i++)
8394 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8395 {
8396 detect_info.found = 1 << coding_priorities[i];
8397 id = coding_categories[coding_priorities[i]].id;
8398 val = Fcons (make_number (id), Qnil);
8399 break;
8400 }
8401 }
8402 else
8403 {
8404 int mask = detect_info.rejected | detect_info.found;
8405 int found = 0;
8406
8407 for (i = coding_category_raw_text - 1; i >= 0; i--)
8408 {
8409 category = coding_priorities[i];
8410 if (! (mask & (1 << category)))
8411 {
8412 found |= 1 << category;
8413 id = coding_categories[category].id;
8414 if (id >= 0)
8415 val = Fcons (make_number (id), val);
8416 }
8417 }
8418 for (i = coding_category_raw_text - 1; i >= 0; i--)
8419 {
8420 category = coding_priorities[i];
8421 if (detect_info.found & (1 << category))
8422 {
8423 id = coding_categories[category].id;
8424 val = Fcons (make_number (id), val);
8425 }
8426 }
8427 detect_info.found |= found;
8428 }
8429 }
8430 else if (base_category == coding_category_utf_8_auto)
8431 {
8432 if (detect_coding_utf_8 (&coding, &detect_info))
8433 {
8434 struct coding_system *this;
8435
8436 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8437 this = coding_categories + coding_category_utf_8_sig;
8438 else
8439 this = coding_categories + coding_category_utf_8_nosig;
8440 val = Fcons (make_number (this->id), Qnil);
8441 }
8442 }
8443 else if (base_category == coding_category_utf_16_auto)
8444 {
8445 if (detect_coding_utf_16 (&coding, &detect_info))
8446 {
8447 struct coding_system *this;
8448
8449 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8450 this = coding_categories + coding_category_utf_16_le;
8451 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8452 this = coding_categories + coding_category_utf_16_be;
8453 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8454 this = coding_categories + coding_category_utf_16_be_nosig;
8455 else
8456 this = coding_categories + coding_category_utf_16_le_nosig;
8457 val = Fcons (make_number (this->id), Qnil);
8458 }
8459 }
8460 else
8461 {
8462 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8463 val = Fcons (make_number (coding.id), Qnil);
8464 }
8465
8466 /* Then, detect eol-format if necessary. */
8467 {
8468 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8469 Lisp_Object tail;
8470
8471 if (VECTORP (eol_type))
8472 {
8473 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8474 {
8475 if (null_byte_found)
8476 normal_eol = EOL_SEEN_LF;
8477 else
8478 normal_eol = detect_eol (coding.source, src_bytes,
8479 coding_category_raw_text);
8480 }
8481 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8482 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8483 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8484 coding_category_utf_16_be);
8485 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8486 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8487 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8488 coding_category_utf_16_le);
8489 }
8490 else
8491 {
8492 if (EQ (eol_type, Qunix))
8493 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8494 else if (EQ (eol_type, Qdos))
8495 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8496 else
8497 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8498 }
8499
8500 for (tail = val; CONSP (tail); tail = XCDR (tail))
8501 {
8502 enum coding_category category;
8503 int this_eol;
8504
8505 id = XINT (XCAR (tail));
8506 attrs = CODING_ID_ATTRS (id);
8507 category = XINT (CODING_ATTR_CATEGORY (attrs));
8508 eol_type = CODING_ID_EOL_TYPE (id);
8509 if (VECTORP (eol_type))
8510 {
8511 if (category == coding_category_utf_16_be
8512 || category == coding_category_utf_16_be_nosig)
8513 this_eol = utf_16_be_eol;
8514 else if (category == coding_category_utf_16_le
8515 || category == coding_category_utf_16_le_nosig)
8516 this_eol = utf_16_le_eol;
8517 else
8518 this_eol = normal_eol;
8519
8520 if (this_eol == EOL_SEEN_LF)
8521 XSETCAR (tail, AREF (eol_type, 0));
8522 else if (this_eol == EOL_SEEN_CRLF)
8523 XSETCAR (tail, AREF (eol_type, 1));
8524 else if (this_eol == EOL_SEEN_CR)
8525 XSETCAR (tail, AREF (eol_type, 2));
8526 else
8527 XSETCAR (tail, CODING_ID_NAME (id));
8528 }
8529 else
8530 XSETCAR (tail, CODING_ID_NAME (id));
8531 }
8532 }
8533
8534 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8535 }
8536
8537
8538 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8539 2, 3, 0,
8540 doc: /* Detect coding system of the text in the region between START and END.
8541 Return a list of possible coding systems ordered by priority.
8542 The coding systems to try and their priorities follows what
8543 the function `coding-system-priority-list' (which see) returns.
8544
8545 If only ASCII characters are found (except for such ISO-2022 control
8546 characters as ESC), it returns a list of single element `undecided'
8547 or its subsidiary coding system according to a detected end-of-line
8548 format.
8549
8550 If optional argument HIGHEST is non-nil, return the coding system of
8551 highest priority. */)
8552 (start, end, highest)
8553 Lisp_Object start, end, highest;
8554 {
8555 int from, to;
8556 int from_byte, to_byte;
8557
8558 CHECK_NUMBER_COERCE_MARKER (start);
8559 CHECK_NUMBER_COERCE_MARKER (end);
8560
8561 validate_region (&start, &end);
8562 from = XINT (start), to = XINT (end);
8563 from_byte = CHAR_TO_BYTE (from);
8564 to_byte = CHAR_TO_BYTE (to);
8565
8566 if (from < GPT && to >= GPT)
8567 move_gap_both (to, to_byte);
8568
8569 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8570 to - from, to_byte - from_byte,
8571 !NILP (highest),
8572 !NILP (current_buffer
8573 ->enable_multibyte_characters),
8574 Qnil);
8575 }
8576
8577 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8578 1, 2, 0,
8579 doc: /* Detect coding system of the text in STRING.
8580 Return a list of possible coding systems ordered by priority.
8581 The coding systems to try and their priorities follows what
8582 the function `coding-system-priority-list' (which see) returns.
8583
8584 If only ASCII characters are found (except for such ISO-2022 control
8585 characters as ESC), it returns a list of single element `undecided'
8586 or its subsidiary coding system according to a detected end-of-line
8587 format.
8588
8589 If optional argument HIGHEST is non-nil, return the coding system of
8590 highest priority. */)
8591 (string, highest)
8592 Lisp_Object string, highest;
8593 {
8594 CHECK_STRING (string);
8595
8596 return detect_coding_system (SDATA (string),
8597 SCHARS (string), SBYTES (string),
8598 !NILP (highest), STRING_MULTIBYTE (string),
8599 Qnil);
8600 }
8601
8602
8603 static INLINE int
8604 char_encodable_p (c, attrs)
8605 int c;
8606 Lisp_Object attrs;
8607 {
8608 Lisp_Object tail;
8609 struct charset *charset;
8610 Lisp_Object translation_table;
8611
8612 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8613 if (! NILP (translation_table))
8614 c = translate_char (translation_table, c);
8615 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8616 CONSP (tail); tail = XCDR (tail))
8617 {
8618 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8619 if (CHAR_CHARSET_P (c, charset))
8620 break;
8621 }
8622 return (! NILP (tail));
8623 }
8624
8625
8626 /* Return a list of coding systems that safely encode the text between
8627 START and END. If EXCLUDE is non-nil, it is a list of coding
8628 systems not to check. The returned list doesn't contain any such
8629 coding systems. In any case, if the text contains only ASCII or is
8630 unibyte, return t. */
8631
8632 DEFUN ("find-coding-systems-region-internal",
8633 Ffind_coding_systems_region_internal,
8634 Sfind_coding_systems_region_internal, 2, 3, 0,
8635 doc: /* Internal use only. */)
8636 (start, end, exclude)
8637 Lisp_Object start, end, exclude;
8638 {
8639 Lisp_Object coding_attrs_list, safe_codings;
8640 EMACS_INT start_byte, end_byte;
8641 const unsigned char *p, *pbeg, *pend;
8642 int c;
8643 Lisp_Object tail, elt, work_table;
8644
8645 if (STRINGP (start))
8646 {
8647 if (!STRING_MULTIBYTE (start)
8648 || SCHARS (start) == SBYTES (start))
8649 return Qt;
8650 start_byte = 0;
8651 end_byte = SBYTES (start);
8652 }
8653 else
8654 {
8655 CHECK_NUMBER_COERCE_MARKER (start);
8656 CHECK_NUMBER_COERCE_MARKER (end);
8657 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8658 args_out_of_range (start, end);
8659 if (NILP (current_buffer->enable_multibyte_characters))
8660 return Qt;
8661 start_byte = CHAR_TO_BYTE (XINT (start));
8662 end_byte = CHAR_TO_BYTE (XINT (end));
8663 if (XINT (end) - XINT (start) == end_byte - start_byte)
8664 return Qt;
8665
8666 if (XINT (start) < GPT && XINT (end) > GPT)
8667 {
8668 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8669 move_gap_both (XINT (start), start_byte);
8670 else
8671 move_gap_both (XINT (end), end_byte);
8672 }
8673 }
8674
8675 coding_attrs_list = Qnil;
8676 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8677 if (NILP (exclude)
8678 || NILP (Fmemq (XCAR (tail), exclude)))
8679 {
8680 Lisp_Object attrs;
8681
8682 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8683 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8684 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8685 {
8686 ASET (attrs, coding_attr_trans_tbl,
8687 get_translation_table (attrs, 1, NULL));
8688 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8689 }
8690 }
8691
8692 if (STRINGP (start))
8693 p = pbeg = SDATA (start);
8694 else
8695 p = pbeg = BYTE_POS_ADDR (start_byte);
8696 pend = p + (end_byte - start_byte);
8697
8698 while (p < pend && ASCII_BYTE_P (*p)) p++;
8699 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8700
8701 work_table = Fmake_char_table (Qnil, Qnil);
8702 while (p < pend)
8703 {
8704 if (ASCII_BYTE_P (*p))
8705 p++;
8706 else
8707 {
8708 c = STRING_CHAR_ADVANCE (p);
8709 if (!NILP (char_table_ref (work_table, c)))
8710 /* This character was already checked. Ignore it. */
8711 continue;
8712
8713 charset_map_loaded = 0;
8714 for (tail = coding_attrs_list; CONSP (tail);)
8715 {
8716 elt = XCAR (tail);
8717 if (NILP (elt))
8718 tail = XCDR (tail);
8719 else if (char_encodable_p (c, elt))
8720 tail = XCDR (tail);
8721 else if (CONSP (XCDR (tail)))
8722 {
8723 XSETCAR (tail, XCAR (XCDR (tail)));
8724 XSETCDR (tail, XCDR (XCDR (tail)));
8725 }
8726 else
8727 {
8728 XSETCAR (tail, Qnil);
8729 tail = XCDR (tail);
8730 }
8731 }
8732 if (charset_map_loaded)
8733 {
8734 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8735
8736 if (STRINGP (start))
8737 pbeg = SDATA (start);
8738 else
8739 pbeg = BYTE_POS_ADDR (start_byte);
8740 p = pbeg + p_offset;
8741 pend = pbeg + pend_offset;
8742 }
8743 char_table_set (work_table, c, Qt);
8744 }
8745 }
8746
8747 safe_codings = list2 (Qraw_text, Qno_conversion);
8748 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8749 if (! NILP (XCAR (tail)))
8750 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8751
8752 return safe_codings;
8753 }
8754
8755
8756 DEFUN ("unencodable-char-position", Funencodable_char_position,
8757 Sunencodable_char_position, 3, 5, 0,
8758 doc: /*
8759 Return position of first un-encodable character in a region.
8760 START and END specify the region and CODING-SYSTEM specifies the
8761 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8762
8763 If optional 4th argument COUNT is non-nil, it specifies at most how
8764 many un-encodable characters to search. In this case, the value is a
8765 list of positions.
8766
8767 If optional 5th argument STRING is non-nil, it is a string to search
8768 for un-encodable characters. In that case, START and END are indexes
8769 to the string. */)
8770 (start, end, coding_system, count, string)
8771 Lisp_Object start, end, coding_system, count, string;
8772 {
8773 int n;
8774 struct coding_system coding;
8775 Lisp_Object attrs, charset_list, translation_table;
8776 Lisp_Object positions;
8777 int from, to;
8778 const unsigned char *p, *stop, *pend;
8779 int ascii_compatible;
8780
8781 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8782 attrs = CODING_ID_ATTRS (coding.id);
8783 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8784 return Qnil;
8785 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8786 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8787 translation_table = get_translation_table (attrs, 1, NULL);
8788
8789 if (NILP (string))
8790 {
8791 validate_region (&start, &end);
8792 from = XINT (start);
8793 to = XINT (end);
8794 if (NILP (current_buffer->enable_multibyte_characters)
8795 || (ascii_compatible
8796 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8797 return Qnil;
8798 p = CHAR_POS_ADDR (from);
8799 pend = CHAR_POS_ADDR (to);
8800 if (from < GPT && to >= GPT)
8801 stop = GPT_ADDR;
8802 else
8803 stop = pend;
8804 }
8805 else
8806 {
8807 CHECK_STRING (string);
8808 CHECK_NATNUM (start);
8809 CHECK_NATNUM (end);
8810 from = XINT (start);
8811 to = XINT (end);
8812 if (from > to
8813 || to > SCHARS (string))
8814 args_out_of_range_3 (string, start, end);
8815 if (! STRING_MULTIBYTE (string))
8816 return Qnil;
8817 p = SDATA (string) + string_char_to_byte (string, from);
8818 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8819 if (ascii_compatible && (to - from) == (pend - p))
8820 return Qnil;
8821 }
8822
8823 if (NILP (count))
8824 n = 1;
8825 else
8826 {
8827 CHECK_NATNUM (count);
8828 n = XINT (count);
8829 }
8830
8831 positions = Qnil;
8832 while (1)
8833 {
8834 int c;
8835
8836 if (ascii_compatible)
8837 while (p < stop && ASCII_BYTE_P (*p))
8838 p++, from++;
8839 if (p >= stop)
8840 {
8841 if (p >= pend)
8842 break;
8843 stop = pend;
8844 p = GAP_END_ADDR;
8845 }
8846
8847 c = STRING_CHAR_ADVANCE (p);
8848 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8849 && ! char_charset (translate_char (translation_table, c),
8850 charset_list, NULL))
8851 {
8852 positions = Fcons (make_number (from), positions);
8853 n--;
8854 if (n == 0)
8855 break;
8856 }
8857
8858 from++;
8859 }
8860
8861 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8862 }
8863
8864
8865 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8866 Scheck_coding_systems_region, 3, 3, 0,
8867 doc: /* Check if the region is encodable by coding systems.
8868
8869 START and END are buffer positions specifying the region.
8870 CODING-SYSTEM-LIST is a list of coding systems to check.
8871
8872 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8873 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8874 whole region, POS0, POS1, ... are buffer positions where non-encodable
8875 characters are found.
8876
8877 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8878 value is nil.
8879
8880 START may be a string. In that case, check if the string is
8881 encodable, and the value contains indices to the string instead of
8882 buffer positions. END is ignored.
8883
8884 If the current buffer (or START if it is a string) is unibyte, the value
8885 is nil. */)
8886 (start, end, coding_system_list)
8887 Lisp_Object start, end, coding_system_list;
8888 {
8889 Lisp_Object list;
8890 EMACS_INT start_byte, end_byte;
8891 int pos;
8892 const unsigned char *p, *pbeg, *pend;
8893 int c;
8894 Lisp_Object tail, elt, attrs;
8895
8896 if (STRINGP (start))
8897 {
8898 if (!STRING_MULTIBYTE (start)
8899 || SCHARS (start) == SBYTES (start))
8900 return Qnil;
8901 start_byte = 0;
8902 end_byte = SBYTES (start);
8903 pos = 0;
8904 }
8905 else
8906 {
8907 CHECK_NUMBER_COERCE_MARKER (start);
8908 CHECK_NUMBER_COERCE_MARKER (end);
8909 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8910 args_out_of_range (start, end);
8911 if (NILP (current_buffer->enable_multibyte_characters))
8912 return Qnil;
8913 start_byte = CHAR_TO_BYTE (XINT (start));
8914 end_byte = CHAR_TO_BYTE (XINT (end));
8915 if (XINT (end) - XINT (start) == end_byte - start_byte)
8916 return Qnil;
8917
8918 if (XINT (start) < GPT && XINT (end) > GPT)
8919 {
8920 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8921 move_gap_both (XINT (start), start_byte);
8922 else
8923 move_gap_both (XINT (end), end_byte);
8924 }
8925 pos = XINT (start);
8926 }
8927
8928 list = Qnil;
8929 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8930 {
8931 elt = XCAR (tail);
8932 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8933 ASET (attrs, coding_attr_trans_tbl,
8934 get_translation_table (attrs, 1, NULL));
8935 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8936 }
8937
8938 if (STRINGP (start))
8939 p = pbeg = SDATA (start);
8940 else
8941 p = pbeg = BYTE_POS_ADDR (start_byte);
8942 pend = p + (end_byte - start_byte);
8943
8944 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8945 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8946
8947 while (p < pend)
8948 {
8949 if (ASCII_BYTE_P (*p))
8950 p++;
8951 else
8952 {
8953 c = STRING_CHAR_ADVANCE (p);
8954
8955 charset_map_loaded = 0;
8956 for (tail = list; CONSP (tail); tail = XCDR (tail))
8957 {
8958 elt = XCDR (XCAR (tail));
8959 if (! char_encodable_p (c, XCAR (elt)))
8960 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8961 }
8962 if (charset_map_loaded)
8963 {
8964 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8965
8966 if (STRINGP (start))
8967 pbeg = SDATA (start);
8968 else
8969 pbeg = BYTE_POS_ADDR (start_byte);
8970 p = pbeg + p_offset;
8971 pend = pbeg + pend_offset;
8972 }
8973 }
8974 pos++;
8975 }
8976
8977 tail = list;
8978 list = Qnil;
8979 for (; CONSP (tail); tail = XCDR (tail))
8980 {
8981 elt = XCAR (tail);
8982 if (CONSP (XCDR (XCDR (elt))))
8983 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8984 list);
8985 }
8986
8987 return list;
8988 }
8989
8990
8991 Lisp_Object
8992 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8993 Lisp_Object start, end, coding_system, dst_object;
8994 int encodep, norecord;
8995 {
8996 struct coding_system coding;
8997 EMACS_INT from, from_byte, to, to_byte;
8998 Lisp_Object src_object;
8999
9000 CHECK_NUMBER_COERCE_MARKER (start);
9001 CHECK_NUMBER_COERCE_MARKER (end);
9002 if (NILP (coding_system))
9003 coding_system = Qno_conversion;
9004 else
9005 CHECK_CODING_SYSTEM (coding_system);
9006 src_object = Fcurrent_buffer ();
9007 if (NILP (dst_object))
9008 dst_object = src_object;
9009 else if (! EQ (dst_object, Qt))
9010 CHECK_BUFFER (dst_object);
9011
9012 validate_region (&start, &end);
9013 from = XFASTINT (start);
9014 from_byte = CHAR_TO_BYTE (from);
9015 to = XFASTINT (end);
9016 to_byte = CHAR_TO_BYTE (to);
9017
9018 setup_coding_system (coding_system, &coding);
9019 coding.mode |= CODING_MODE_LAST_BLOCK;
9020
9021 if (encodep)
9022 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9023 dst_object);
9024 else
9025 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9026 dst_object);
9027 if (! norecord)
9028 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9029
9030 return (BUFFERP (dst_object)
9031 ? make_number (coding.produced_char)
9032 : coding.dst_object);
9033 }
9034
9035
9036 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9037 3, 4, "r\nzCoding system: ",
9038 doc: /* Decode the current region from the specified coding system.
9039 When called from a program, takes four arguments:
9040 START, END, CODING-SYSTEM, and DESTINATION.
9041 START and END are buffer positions.
9042
9043 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9044 If nil, the region between START and END is replaced by the decoded text.
9045 If buffer, the decoded text is inserted in that buffer after point (point
9046 does not move).
9047 In those cases, the length of the decoded text is returned.
9048 If DESTINATION is t, the decoded text is returned.
9049
9050 This function sets `last-coding-system-used' to the precise coding system
9051 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9052 not fully specified.) */)
9053 (start, end, coding_system, destination)
9054 Lisp_Object start, end, coding_system, destination;
9055 {
9056 return code_convert_region (start, end, coding_system, destination, 0, 0);
9057 }
9058
9059 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9060 3, 4, "r\nzCoding system: ",
9061 doc: /* Encode the current region by specified coding system.
9062 When called from a program, takes four arguments:
9063 START, END, CODING-SYSTEM and DESTINATION.
9064 START and END are buffer positions.
9065
9066 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9067 If nil, the region between START and END is replace by the encoded text.
9068 If buffer, the encoded text is inserted in that buffer after point (point
9069 does not move).
9070 In those cases, the length of the encoded text is returned.
9071 If DESTINATION is t, the encoded text is returned.
9072
9073 This function sets `last-coding-system-used' to the precise coding system
9074 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9075 not fully specified.) */)
9076 (start, end, coding_system, destination)
9077 Lisp_Object start, end, coding_system, destination;
9078 {
9079 return code_convert_region (start, end, coding_system, destination, 1, 0);
9080 }
9081
9082 Lisp_Object
9083 code_convert_string (string, coding_system, dst_object,
9084 encodep, nocopy, norecord)
9085 Lisp_Object string, coding_system, dst_object;
9086 int encodep, nocopy, norecord;
9087 {
9088 struct coding_system coding;
9089 EMACS_INT chars, bytes;
9090
9091 CHECK_STRING (string);
9092 if (NILP (coding_system))
9093 {
9094 if (! norecord)
9095 Vlast_coding_system_used = Qno_conversion;
9096 if (NILP (dst_object))
9097 return (nocopy ? Fcopy_sequence (string) : string);
9098 }
9099
9100 if (NILP (coding_system))
9101 coding_system = Qno_conversion;
9102 else
9103 CHECK_CODING_SYSTEM (coding_system);
9104 if (NILP (dst_object))
9105 dst_object = Qt;
9106 else if (! EQ (dst_object, Qt))
9107 CHECK_BUFFER (dst_object);
9108
9109 setup_coding_system (coding_system, &coding);
9110 coding.mode |= CODING_MODE_LAST_BLOCK;
9111 chars = SCHARS (string);
9112 bytes = SBYTES (string);
9113 if (encodep)
9114 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9115 else
9116 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9117 if (! norecord)
9118 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9119
9120 return (BUFFERP (dst_object)
9121 ? make_number (coding.produced_char)
9122 : coding.dst_object);
9123 }
9124
9125
9126 /* Encode or decode STRING according to CODING_SYSTEM.
9127 Do not set Vlast_coding_system_used.
9128
9129 This function is called only from macros DECODE_FILE and
9130 ENCODE_FILE, thus we ignore character composition. */
9131
9132 Lisp_Object
9133 code_convert_string_norecord (string, coding_system, encodep)
9134 Lisp_Object string, coding_system;
9135 int encodep;
9136 {
9137 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9138 }
9139
9140
9141 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9142 2, 4, 0,
9143 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9144
9145 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9146 if the decoding operation is trivial.
9147
9148 Optional fourth arg BUFFER non-nil means that the decoded text is
9149 inserted in that buffer after point (point does not move). In this
9150 case, the return value is the length of the decoded text.
9151
9152 This function sets `last-coding-system-used' to the precise coding system
9153 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9154 not fully specified.) */)
9155 (string, coding_system, nocopy, buffer)
9156 Lisp_Object string, coding_system, nocopy, buffer;
9157 {
9158 return code_convert_string (string, coding_system, buffer,
9159 0, ! NILP (nocopy), 0);
9160 }
9161
9162 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9163 2, 4, 0,
9164 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9165
9166 Optional third arg NOCOPY non-nil means it is OK to return STRING
9167 itself if the encoding operation is trivial.
9168
9169 Optional fourth arg BUFFER non-nil means that the encoded text is
9170 inserted in that buffer after point (point does not move). In this
9171 case, the return value is the length of the encoded text.
9172
9173 This function sets `last-coding-system-used' to the precise coding system
9174 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9175 not fully specified.) */)
9176 (string, coding_system, nocopy, buffer)
9177 Lisp_Object string, coding_system, nocopy, buffer;
9178 {
9179 return code_convert_string (string, coding_system, buffer,
9180 1, ! NILP (nocopy), 1);
9181 }
9182
9183 \f
9184 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9185 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9186 Return the corresponding character. */)
9187 (code)
9188 Lisp_Object code;
9189 {
9190 Lisp_Object spec, attrs, val;
9191 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9192 int c;
9193
9194 CHECK_NATNUM (code);
9195 c = XFASTINT (code);
9196 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9197 attrs = AREF (spec, 0);
9198
9199 if (ASCII_BYTE_P (c)
9200 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9201 return code;
9202
9203 val = CODING_ATTR_CHARSET_LIST (attrs);
9204 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9205 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9206 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9207
9208 if (c <= 0x7F)
9209 charset = charset_roman;
9210 else if (c >= 0xA0 && c < 0xDF)
9211 {
9212 charset = charset_kana;
9213 c -= 0x80;
9214 }
9215 else
9216 {
9217 int s1 = c >> 8, s2 = c & 0xFF;
9218
9219 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9220 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9221 error ("Invalid code: %d", code);
9222 SJIS_TO_JIS (c);
9223 charset = charset_kanji;
9224 }
9225 c = DECODE_CHAR (charset, c);
9226 if (c < 0)
9227 error ("Invalid code: %d", code);
9228 return make_number (c);
9229 }
9230
9231
9232 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9233 doc: /* Encode a Japanese character CH to shift_jis encoding.
9234 Return the corresponding code in SJIS. */)
9235 (ch)
9236 Lisp_Object ch;
9237 {
9238 Lisp_Object spec, attrs, charset_list;
9239 int c;
9240 struct charset *charset;
9241 unsigned code;
9242
9243 CHECK_CHARACTER (ch);
9244 c = XFASTINT (ch);
9245 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9246 attrs = AREF (spec, 0);
9247
9248 if (ASCII_CHAR_P (c)
9249 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9250 return ch;
9251
9252 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9253 charset = char_charset (c, charset_list, &code);
9254 if (code == CHARSET_INVALID_CODE (charset))
9255 error ("Can't encode by shift_jis encoding: %d", c);
9256 JIS_TO_SJIS (code);
9257
9258 return make_number (code);
9259 }
9260
9261 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9262 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9263 Return the corresponding character. */)
9264 (code)
9265 Lisp_Object code;
9266 {
9267 Lisp_Object spec, attrs, val;
9268 struct charset *charset_roman, *charset_big5, *charset;
9269 int c;
9270
9271 CHECK_NATNUM (code);
9272 c = XFASTINT (code);
9273 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9274 attrs = AREF (spec, 0);
9275
9276 if (ASCII_BYTE_P (c)
9277 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9278 return code;
9279
9280 val = CODING_ATTR_CHARSET_LIST (attrs);
9281 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9282 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9283
9284 if (c <= 0x7F)
9285 charset = charset_roman;
9286 else
9287 {
9288 int b1 = c >> 8, b2 = c & 0x7F;
9289 if (b1 < 0xA1 || b1 > 0xFE
9290 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9291 error ("Invalid code: %d", code);
9292 charset = charset_big5;
9293 }
9294 c = DECODE_CHAR (charset, (unsigned )c);
9295 if (c < 0)
9296 error ("Invalid code: %d", code);
9297 return make_number (c);
9298 }
9299
9300 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9301 doc: /* Encode the Big5 character CH to BIG5 coding system.
9302 Return the corresponding character code in Big5. */)
9303 (ch)
9304 Lisp_Object ch;
9305 {
9306 Lisp_Object spec, attrs, charset_list;
9307 struct charset *charset;
9308 int c;
9309 unsigned code;
9310
9311 CHECK_CHARACTER (ch);
9312 c = XFASTINT (ch);
9313 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9314 attrs = AREF (spec, 0);
9315 if (ASCII_CHAR_P (c)
9316 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9317 return ch;
9318
9319 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9320 charset = char_charset (c, charset_list, &code);
9321 if (code == CHARSET_INVALID_CODE (charset))
9322 error ("Can't encode by Big5 encoding: %d", c);
9323
9324 return make_number (code);
9325 }
9326
9327 \f
9328 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9329 Sset_terminal_coding_system_internal, 1, 2, 0,
9330 doc: /* Internal use only. */)
9331 (coding_system, terminal)
9332 Lisp_Object coding_system;
9333 Lisp_Object terminal;
9334 {
9335 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9336 CHECK_SYMBOL (coding_system);
9337 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9338 /* We had better not send unsafe characters to terminal. */
9339 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9340 /* Characer composition should be disabled. */
9341 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9342 terminal_coding->src_multibyte = 1;
9343 terminal_coding->dst_multibyte = 0;
9344 return Qnil;
9345 }
9346
9347 DEFUN ("set-safe-terminal-coding-system-internal",
9348 Fset_safe_terminal_coding_system_internal,
9349 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9350 doc: /* Internal use only. */)
9351 (coding_system)
9352 Lisp_Object coding_system;
9353 {
9354 CHECK_SYMBOL (coding_system);
9355 setup_coding_system (Fcheck_coding_system (coding_system),
9356 &safe_terminal_coding);
9357 /* Characer composition should be disabled. */
9358 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9359 safe_terminal_coding.src_multibyte = 1;
9360 safe_terminal_coding.dst_multibyte = 0;
9361 return Qnil;
9362 }
9363
9364 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9365 Sterminal_coding_system, 0, 1, 0,
9366 doc: /* Return coding system specified for terminal output on the given terminal.
9367 TERMINAL may be a terminal object, a frame, or nil for the selected
9368 frame's terminal device. */)
9369 (terminal)
9370 Lisp_Object terminal;
9371 {
9372 struct coding_system *terminal_coding
9373 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9374 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9375
9376 /* For backward compatibility, return nil if it is `undecided'. */
9377 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9378 }
9379
9380 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9381 Sset_keyboard_coding_system_internal, 1, 2, 0,
9382 doc: /* Internal use only. */)
9383 (coding_system, terminal)
9384 Lisp_Object coding_system;
9385 Lisp_Object terminal;
9386 {
9387 struct terminal *t = get_terminal (terminal, 1);
9388 CHECK_SYMBOL (coding_system);
9389 setup_coding_system (Fcheck_coding_system (coding_system),
9390 TERMINAL_KEYBOARD_CODING (t));
9391 /* Characer composition should be disabled. */
9392 TERMINAL_KEYBOARD_CODING (t)->common_flags
9393 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9394 return Qnil;
9395 }
9396
9397 DEFUN ("keyboard-coding-system",
9398 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9399 doc: /* Return coding system specified for decoding keyboard input. */)
9400 (terminal)
9401 Lisp_Object terminal;
9402 {
9403 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9404 (get_terminal (terminal, 1))->id);
9405 }
9406
9407 \f
9408 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9409 Sfind_operation_coding_system, 1, MANY, 0,
9410 doc: /* Choose a coding system for an operation based on the target name.
9411 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9412 DECODING-SYSTEM is the coding system to use for decoding
9413 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9414 for encoding (in case OPERATION does encoding).
9415
9416 The first argument OPERATION specifies an I/O primitive:
9417 For file I/O, `insert-file-contents' or `write-region'.
9418 For process I/O, `call-process', `call-process-region', or `start-process'.
9419 For network I/O, `open-network-stream'.
9420
9421 The remaining arguments should be the same arguments that were passed
9422 to the primitive. Depending on which primitive, one of those arguments
9423 is selected as the TARGET. For example, if OPERATION does file I/O,
9424 whichever argument specifies the file name is TARGET.
9425
9426 TARGET has a meaning which depends on OPERATION:
9427 For file I/O, TARGET is a file name (except for the special case below).
9428 For process I/O, TARGET is a process name.
9429 For network I/O, TARGET is a service name or a port number.
9430
9431 This function looks up what is specified for TARGET in
9432 `file-coding-system-alist', `process-coding-system-alist',
9433 or `network-coding-system-alist' depending on OPERATION.
9434 They may specify a coding system, a cons of coding systems,
9435 or a function symbol to call.
9436 In the last case, we call the function with one argument,
9437 which is a list of all the arguments given to this function.
9438 If the function can't decide a coding system, it can return
9439 `undecided' so that the normal code-detection is performed.
9440
9441 If OPERATION is `insert-file-contents', the argument corresponding to
9442 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9443 file name to look up, and BUFFER is a buffer that contains the file's
9444 contents (not yet decoded). If `file-coding-system-alist' specifies a
9445 function to call for FILENAME, that function should examine the
9446 contents of BUFFER instead of reading the file.
9447
9448 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9449 (nargs, args)
9450 int nargs;
9451 Lisp_Object *args;
9452 {
9453 Lisp_Object operation, target_idx, target, val;
9454 register Lisp_Object chain;
9455
9456 if (nargs < 2)
9457 error ("Too few arguments");
9458 operation = args[0];
9459 if (!SYMBOLP (operation)
9460 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9461 error ("Invalid first argument");
9462 if (nargs < 1 + XINT (target_idx))
9463 error ("Too few arguments for operation: %s",
9464 SDATA (SYMBOL_NAME (operation)));
9465 target = args[XINT (target_idx) + 1];
9466 if (!(STRINGP (target)
9467 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9468 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9469 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9470 error ("Invalid %dth argument", XINT (target_idx) + 1);
9471 if (CONSP (target))
9472 target = XCAR (target);
9473
9474 chain = ((EQ (operation, Qinsert_file_contents)
9475 || EQ (operation, Qwrite_region))
9476 ? Vfile_coding_system_alist
9477 : (EQ (operation, Qopen_network_stream)
9478 ? Vnetwork_coding_system_alist
9479 : Vprocess_coding_system_alist));
9480 if (NILP (chain))
9481 return Qnil;
9482
9483 for (; CONSP (chain); chain = XCDR (chain))
9484 {
9485 Lisp_Object elt;
9486
9487 elt = XCAR (chain);
9488 if (CONSP (elt)
9489 && ((STRINGP (target)
9490 && STRINGP (XCAR (elt))
9491 && fast_string_match (XCAR (elt), target) >= 0)
9492 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9493 {
9494 val = XCDR (elt);
9495 /* Here, if VAL is both a valid coding system and a valid
9496 function symbol, we return VAL as a coding system. */
9497 if (CONSP (val))
9498 return val;
9499 if (! SYMBOLP (val))
9500 return Qnil;
9501 if (! NILP (Fcoding_system_p (val)))
9502 return Fcons (val, val);
9503 if (! NILP (Ffboundp (val)))
9504 {
9505 /* We use call1 rather than safe_call1
9506 so as to get bug reports about functions called here
9507 which don't handle the current interface. */
9508 val = call1 (val, Flist (nargs, args));
9509 if (CONSP (val))
9510 return val;
9511 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9512 return Fcons (val, val);
9513 }
9514 return Qnil;
9515 }
9516 }
9517 return Qnil;
9518 }
9519
9520 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9521 Sset_coding_system_priority, 0, MANY, 0,
9522 doc: /* Assign higher priority to the coding systems given as arguments.
9523 If multiple coding systems belong to the same category,
9524 all but the first one are ignored.
9525
9526 usage: (set-coding-system-priority &rest coding-systems) */)
9527 (nargs, args)
9528 int nargs;
9529 Lisp_Object *args;
9530 {
9531 int i, j;
9532 int changed[coding_category_max];
9533 enum coding_category priorities[coding_category_max];
9534
9535 bzero (changed, sizeof changed);
9536
9537 for (i = j = 0; i < nargs; i++)
9538 {
9539 enum coding_category category;
9540 Lisp_Object spec, attrs;
9541
9542 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9543 attrs = AREF (spec, 0);
9544 category = XINT (CODING_ATTR_CATEGORY (attrs));
9545 if (changed[category])
9546 /* Ignore this coding system because a coding system of the
9547 same category already had a higher priority. */
9548 continue;
9549 changed[category] = 1;
9550 priorities[j++] = category;
9551 if (coding_categories[category].id >= 0
9552 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9553 setup_coding_system (args[i], &coding_categories[category]);
9554 Fset (AREF (Vcoding_category_table, category), args[i]);
9555 }
9556
9557 /* Now we have decided top J priorities. Reflect the order of the
9558 original priorities to the remaining priorities. */
9559
9560 for (i = j, j = 0; i < coding_category_max; i++, j++)
9561 {
9562 while (j < coding_category_max
9563 && changed[coding_priorities[j]])
9564 j++;
9565 if (j == coding_category_max)
9566 abort ();
9567 priorities[i] = coding_priorities[j];
9568 }
9569
9570 bcopy (priorities, coding_priorities, sizeof priorities);
9571
9572 /* Update `coding-category-list'. */
9573 Vcoding_category_list = Qnil;
9574 for (i = coding_category_max - 1; i >= 0; i--)
9575 Vcoding_category_list
9576 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9577 Vcoding_category_list);
9578
9579 return Qnil;
9580 }
9581
9582 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9583 Scoding_system_priority_list, 0, 1, 0,
9584 doc: /* Return a list of coding systems ordered by their priorities.
9585 The list contains a subset of coding systems; i.e. coding systems
9586 assigned to each coding category (see `coding-category-list').
9587
9588 HIGHESTP non-nil means just return the highest priority one. */)
9589 (highestp)
9590 Lisp_Object highestp;
9591 {
9592 int i;
9593 Lisp_Object val;
9594
9595 for (i = 0, val = Qnil; i < coding_category_max; i++)
9596 {
9597 enum coding_category category = coding_priorities[i];
9598 int id = coding_categories[category].id;
9599 Lisp_Object attrs;
9600
9601 if (id < 0)
9602 continue;
9603 attrs = CODING_ID_ATTRS (id);
9604 if (! NILP (highestp))
9605 return CODING_ATTR_BASE_NAME (attrs);
9606 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9607 }
9608 return Fnreverse (val);
9609 }
9610
9611 static char *suffixes[] = { "-unix", "-dos", "-mac" };
9612
9613 static Lisp_Object
9614 make_subsidiaries (base)
9615 Lisp_Object base;
9616 {
9617 Lisp_Object subsidiaries;
9618 int base_name_len = SBYTES (SYMBOL_NAME (base));
9619 char *buf = (char *) alloca (base_name_len + 6);
9620 int i;
9621
9622 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9623 subsidiaries = Fmake_vector (make_number (3), Qnil);
9624 for (i = 0; i < 3; i++)
9625 {
9626 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9627 ASET (subsidiaries, i, intern (buf));
9628 }
9629 return subsidiaries;
9630 }
9631
9632
9633 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9634 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9635 doc: /* For internal use only.
9636 usage: (define-coding-system-internal ...) */)
9637 (nargs, args)
9638 int nargs;
9639 Lisp_Object *args;
9640 {
9641 Lisp_Object name;
9642 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9643 Lisp_Object attrs; /* Vector of attributes. */
9644 Lisp_Object eol_type;
9645 Lisp_Object aliases;
9646 Lisp_Object coding_type, charset_list, safe_charsets;
9647 enum coding_category category;
9648 Lisp_Object tail, val;
9649 int max_charset_id = 0;
9650 int i;
9651
9652 if (nargs < coding_arg_max)
9653 goto short_args;
9654
9655 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9656
9657 name = args[coding_arg_name];
9658 CHECK_SYMBOL (name);
9659 CODING_ATTR_BASE_NAME (attrs) = name;
9660
9661 val = args[coding_arg_mnemonic];
9662 if (! STRINGP (val))
9663 CHECK_CHARACTER (val);
9664 CODING_ATTR_MNEMONIC (attrs) = val;
9665
9666 coding_type = args[coding_arg_coding_type];
9667 CHECK_SYMBOL (coding_type);
9668 CODING_ATTR_TYPE (attrs) = coding_type;
9669
9670 charset_list = args[coding_arg_charset_list];
9671 if (SYMBOLP (charset_list))
9672 {
9673 if (EQ (charset_list, Qiso_2022))
9674 {
9675 if (! EQ (coding_type, Qiso_2022))
9676 error ("Invalid charset-list");
9677 charset_list = Viso_2022_charset_list;
9678 }
9679 else if (EQ (charset_list, Qemacs_mule))
9680 {
9681 if (! EQ (coding_type, Qemacs_mule))
9682 error ("Invalid charset-list");
9683 charset_list = Vemacs_mule_charset_list;
9684 }
9685 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9686 if (max_charset_id < XFASTINT (XCAR (tail)))
9687 max_charset_id = XFASTINT (XCAR (tail));
9688 }
9689 else
9690 {
9691 charset_list = Fcopy_sequence (charset_list);
9692 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9693 {
9694 struct charset *charset;
9695
9696 val = XCAR (tail);
9697 CHECK_CHARSET_GET_CHARSET (val, charset);
9698 if (EQ (coding_type, Qiso_2022)
9699 ? CHARSET_ISO_FINAL (charset) < 0
9700 : EQ (coding_type, Qemacs_mule)
9701 ? CHARSET_EMACS_MULE_ID (charset) < 0
9702 : 0)
9703 error ("Can't handle charset `%s'",
9704 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9705
9706 XSETCAR (tail, make_number (charset->id));
9707 if (max_charset_id < charset->id)
9708 max_charset_id = charset->id;
9709 }
9710 }
9711 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9712
9713 safe_charsets = make_uninit_string (max_charset_id + 1);
9714 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9715 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9716 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9717 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9718
9719 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9720
9721 val = args[coding_arg_decode_translation_table];
9722 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9723 CHECK_SYMBOL (val);
9724 CODING_ATTR_DECODE_TBL (attrs) = val;
9725
9726 val = args[coding_arg_encode_translation_table];
9727 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9728 CHECK_SYMBOL (val);
9729 CODING_ATTR_ENCODE_TBL (attrs) = val;
9730
9731 val = args[coding_arg_post_read_conversion];
9732 CHECK_SYMBOL (val);
9733 CODING_ATTR_POST_READ (attrs) = val;
9734
9735 val = args[coding_arg_pre_write_conversion];
9736 CHECK_SYMBOL (val);
9737 CODING_ATTR_PRE_WRITE (attrs) = val;
9738
9739 val = args[coding_arg_default_char];
9740 if (NILP (val))
9741 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9742 else
9743 {
9744 CHECK_CHARACTER (val);
9745 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9746 }
9747
9748 val = args[coding_arg_for_unibyte];
9749 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9750
9751 val = args[coding_arg_plist];
9752 CHECK_LIST (val);
9753 CODING_ATTR_PLIST (attrs) = val;
9754
9755 if (EQ (coding_type, Qcharset))
9756 {
9757 /* Generate a lisp vector of 256 elements. Each element is nil,
9758 integer, or a list of charset IDs.
9759
9760 If Nth element is nil, the byte code N is invalid in this
9761 coding system.
9762
9763 If Nth element is a number NUM, N is the first byte of a
9764 charset whose ID is NUM.
9765
9766 If Nth element is a list of charset IDs, N is the first byte
9767 of one of them. The list is sorted by dimensions of the
9768 charsets. A charset of smaller dimension comes firtst. */
9769 val = Fmake_vector (make_number (256), Qnil);
9770
9771 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9772 {
9773 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9774 int dim = CHARSET_DIMENSION (charset);
9775 int idx = (dim - 1) * 4;
9776
9777 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9778 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9779
9780 for (i = charset->code_space[idx];
9781 i <= charset->code_space[idx + 1]; i++)
9782 {
9783 Lisp_Object tmp, tmp2;
9784 int dim2;
9785
9786 tmp = AREF (val, i);
9787 if (NILP (tmp))
9788 tmp = XCAR (tail);
9789 else if (NUMBERP (tmp))
9790 {
9791 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9792 if (dim < dim2)
9793 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9794 else
9795 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9796 }
9797 else
9798 {
9799 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9800 {
9801 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9802 if (dim < dim2)
9803 break;
9804 }
9805 if (NILP (tmp2))
9806 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9807 else
9808 {
9809 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9810 XSETCAR (tmp2, XCAR (tail));
9811 }
9812 }
9813 ASET (val, i, tmp);
9814 }
9815 }
9816 ASET (attrs, coding_attr_charset_valids, val);
9817 category = coding_category_charset;
9818 }
9819 else if (EQ (coding_type, Qccl))
9820 {
9821 Lisp_Object valids;
9822
9823 if (nargs < coding_arg_ccl_max)
9824 goto short_args;
9825
9826 val = args[coding_arg_ccl_decoder];
9827 CHECK_CCL_PROGRAM (val);
9828 if (VECTORP (val))
9829 val = Fcopy_sequence (val);
9830 ASET (attrs, coding_attr_ccl_decoder, val);
9831
9832 val = args[coding_arg_ccl_encoder];
9833 CHECK_CCL_PROGRAM (val);
9834 if (VECTORP (val))
9835 val = Fcopy_sequence (val);
9836 ASET (attrs, coding_attr_ccl_encoder, val);
9837
9838 val = args[coding_arg_ccl_valids];
9839 valids = Fmake_string (make_number (256), make_number (0));
9840 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9841 {
9842 int from, to;
9843
9844 val = Fcar (tail);
9845 if (INTEGERP (val))
9846 {
9847 from = to = XINT (val);
9848 if (from < 0 || from > 255)
9849 args_out_of_range_3 (val, make_number (0), make_number (255));
9850 }
9851 else
9852 {
9853 CHECK_CONS (val);
9854 CHECK_NATNUM_CAR (val);
9855 CHECK_NATNUM_CDR (val);
9856 from = XINT (XCAR (val));
9857 if (from > 255)
9858 args_out_of_range_3 (XCAR (val),
9859 make_number (0), make_number (255));
9860 to = XINT (XCDR (val));
9861 if (to < from || to > 255)
9862 args_out_of_range_3 (XCDR (val),
9863 XCAR (val), make_number (255));
9864 }
9865 for (i = from; i <= to; i++)
9866 SSET (valids, i, 1);
9867 }
9868 ASET (attrs, coding_attr_ccl_valids, valids);
9869
9870 category = coding_category_ccl;
9871 }
9872 else if (EQ (coding_type, Qutf_16))
9873 {
9874 Lisp_Object bom, endian;
9875
9876 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9877
9878 if (nargs < coding_arg_utf16_max)
9879 goto short_args;
9880
9881 bom = args[coding_arg_utf16_bom];
9882 if (! NILP (bom) && ! EQ (bom, Qt))
9883 {
9884 CHECK_CONS (bom);
9885 val = XCAR (bom);
9886 CHECK_CODING_SYSTEM (val);
9887 val = XCDR (bom);
9888 CHECK_CODING_SYSTEM (val);
9889 }
9890 ASET (attrs, coding_attr_utf_bom, bom);
9891
9892 endian = args[coding_arg_utf16_endian];
9893 CHECK_SYMBOL (endian);
9894 if (NILP (endian))
9895 endian = Qbig;
9896 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9897 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9898 ASET (attrs, coding_attr_utf_16_endian, endian);
9899
9900 category = (CONSP (bom)
9901 ? coding_category_utf_16_auto
9902 : NILP (bom)
9903 ? (EQ (endian, Qbig)
9904 ? coding_category_utf_16_be_nosig
9905 : coding_category_utf_16_le_nosig)
9906 : (EQ (endian, Qbig)
9907 ? coding_category_utf_16_be
9908 : coding_category_utf_16_le));
9909 }
9910 else if (EQ (coding_type, Qiso_2022))
9911 {
9912 Lisp_Object initial, reg_usage, request, flags;
9913 int i;
9914
9915 if (nargs < coding_arg_iso2022_max)
9916 goto short_args;
9917
9918 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9919 CHECK_VECTOR (initial);
9920 for (i = 0; i < 4; i++)
9921 {
9922 val = Faref (initial, make_number (i));
9923 if (! NILP (val))
9924 {
9925 struct charset *charset;
9926
9927 CHECK_CHARSET_GET_CHARSET (val, charset);
9928 ASET (initial, i, make_number (CHARSET_ID (charset)));
9929 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9930 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9931 }
9932 else
9933 ASET (initial, i, make_number (-1));
9934 }
9935
9936 reg_usage = args[coding_arg_iso2022_reg_usage];
9937 CHECK_CONS (reg_usage);
9938 CHECK_NUMBER_CAR (reg_usage);
9939 CHECK_NUMBER_CDR (reg_usage);
9940
9941 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9942 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9943 {
9944 int id;
9945 Lisp_Object tmp;
9946
9947 val = Fcar (tail);
9948 CHECK_CONS (val);
9949 tmp = XCAR (val);
9950 CHECK_CHARSET_GET_ID (tmp, id);
9951 CHECK_NATNUM_CDR (val);
9952 if (XINT (XCDR (val)) >= 4)
9953 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9954 XSETCAR (val, make_number (id));
9955 }
9956
9957 flags = args[coding_arg_iso2022_flags];
9958 CHECK_NATNUM (flags);
9959 i = XINT (flags);
9960 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9961 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9962
9963 ASET (attrs, coding_attr_iso_initial, initial);
9964 ASET (attrs, coding_attr_iso_usage, reg_usage);
9965 ASET (attrs, coding_attr_iso_request, request);
9966 ASET (attrs, coding_attr_iso_flags, flags);
9967 setup_iso_safe_charsets (attrs);
9968
9969 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9970 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9971 | CODING_ISO_FLAG_SINGLE_SHIFT))
9972 ? coding_category_iso_7_else
9973 : EQ (args[coding_arg_charset_list], Qiso_2022)
9974 ? coding_category_iso_7
9975 : coding_category_iso_7_tight);
9976 else
9977 {
9978 int id = XINT (AREF (initial, 1));
9979
9980 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9981 || EQ (args[coding_arg_charset_list], Qiso_2022)
9982 || id < 0)
9983 ? coding_category_iso_8_else
9984 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9985 ? coding_category_iso_8_1
9986 : coding_category_iso_8_2);
9987 }
9988 if (category != coding_category_iso_8_1
9989 && category != coding_category_iso_8_2)
9990 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9991 }
9992 else if (EQ (coding_type, Qemacs_mule))
9993 {
9994 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9995 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9996 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9997 category = coding_category_emacs_mule;
9998 }
9999 else if (EQ (coding_type, Qshift_jis))
10000 {
10001
10002 struct charset *charset;
10003
10004 if (XINT (Flength (charset_list)) != 3
10005 && XINT (Flength (charset_list)) != 4)
10006 error ("There should be three or four charsets");
10007
10008 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10009 if (CHARSET_DIMENSION (charset) != 1)
10010 error ("Dimension of charset %s is not one",
10011 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10012 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10013 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10014
10015 charset_list = XCDR (charset_list);
10016 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10017 if (CHARSET_DIMENSION (charset) != 1)
10018 error ("Dimension of charset %s is not one",
10019 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10020
10021 charset_list = XCDR (charset_list);
10022 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10023 if (CHARSET_DIMENSION (charset) != 2)
10024 error ("Dimension of charset %s is not two",
10025 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10026
10027 charset_list = XCDR (charset_list);
10028 if (! NILP (charset_list))
10029 {
10030 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10031 if (CHARSET_DIMENSION (charset) != 2)
10032 error ("Dimension of charset %s is not two",
10033 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10034 }
10035
10036 category = coding_category_sjis;
10037 Vsjis_coding_system = name;
10038 }
10039 else if (EQ (coding_type, Qbig5))
10040 {
10041 struct charset *charset;
10042
10043 if (XINT (Flength (charset_list)) != 2)
10044 error ("There should be just two charsets");
10045
10046 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10047 if (CHARSET_DIMENSION (charset) != 1)
10048 error ("Dimension of charset %s is not one",
10049 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10050 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10051 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10052
10053 charset_list = XCDR (charset_list);
10054 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10055 if (CHARSET_DIMENSION (charset) != 2)
10056 error ("Dimension of charset %s is not two",
10057 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10058
10059 category = coding_category_big5;
10060 Vbig5_coding_system = name;
10061 }
10062 else if (EQ (coding_type, Qraw_text))
10063 {
10064 category = coding_category_raw_text;
10065 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10066 }
10067 else if (EQ (coding_type, Qutf_8))
10068 {
10069 Lisp_Object bom;
10070
10071 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10072
10073 if (nargs < coding_arg_utf8_max)
10074 goto short_args;
10075
10076 bom = args[coding_arg_utf8_bom];
10077 if (! NILP (bom) && ! EQ (bom, Qt))
10078 {
10079 CHECK_CONS (bom);
10080 val = XCAR (bom);
10081 CHECK_CODING_SYSTEM (val);
10082 val = XCDR (bom);
10083 CHECK_CODING_SYSTEM (val);
10084 }
10085 ASET (attrs, coding_attr_utf_bom, bom);
10086
10087 category = (CONSP (bom) ? coding_category_utf_8_auto
10088 : NILP (bom) ? coding_category_utf_8_nosig
10089 : coding_category_utf_8_sig);
10090 }
10091 else if (EQ (coding_type, Qundecided))
10092 category = coding_category_undecided;
10093 else
10094 error ("Invalid coding system type: %s",
10095 SDATA (SYMBOL_NAME (coding_type)));
10096
10097 CODING_ATTR_CATEGORY (attrs) = make_number (category);
10098 CODING_ATTR_PLIST (attrs)
10099 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10100 CODING_ATTR_PLIST (attrs)));
10101 CODING_ATTR_PLIST (attrs)
10102 = Fcons (QCascii_compatible_p,
10103 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10104 CODING_ATTR_PLIST (attrs)));
10105
10106 eol_type = args[coding_arg_eol_type];
10107 if (! NILP (eol_type)
10108 && ! EQ (eol_type, Qunix)
10109 && ! EQ (eol_type, Qdos)
10110 && ! EQ (eol_type, Qmac))
10111 error ("Invalid eol-type");
10112
10113 aliases = Fcons (name, Qnil);
10114
10115 if (NILP (eol_type))
10116 {
10117 eol_type = make_subsidiaries (name);
10118 for (i = 0; i < 3; i++)
10119 {
10120 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10121
10122 this_name = AREF (eol_type, i);
10123 this_aliases = Fcons (this_name, Qnil);
10124 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10125 this_spec = Fmake_vector (make_number (3), attrs);
10126 ASET (this_spec, 1, this_aliases);
10127 ASET (this_spec, 2, this_eol_type);
10128 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10129 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10130 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10131 if (NILP (val))
10132 Vcoding_system_alist
10133 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10134 Vcoding_system_alist);
10135 }
10136 }
10137
10138 spec_vec = Fmake_vector (make_number (3), attrs);
10139 ASET (spec_vec, 1, aliases);
10140 ASET (spec_vec, 2, eol_type);
10141
10142 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10143 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10144 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10145 if (NILP (val))
10146 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10147 Vcoding_system_alist);
10148
10149 {
10150 int id = coding_categories[category].id;
10151
10152 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10153 setup_coding_system (name, &coding_categories[category]);
10154 }
10155
10156 return Qnil;
10157
10158 short_args:
10159 return Fsignal (Qwrong_number_of_arguments,
10160 Fcons (intern ("define-coding-system-internal"),
10161 make_number (nargs)));
10162 }
10163
10164
10165 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10166 3, 3, 0,
10167 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10168 (coding_system, prop, val)
10169 Lisp_Object coding_system, prop, val;
10170 {
10171 Lisp_Object spec, attrs;
10172
10173 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10174 attrs = AREF (spec, 0);
10175 if (EQ (prop, QCmnemonic))
10176 {
10177 if (! STRINGP (val))
10178 CHECK_CHARACTER (val);
10179 CODING_ATTR_MNEMONIC (attrs) = val;
10180 }
10181 else if (EQ (prop, QCdefault_char))
10182 {
10183 if (NILP (val))
10184 val = make_number (' ');
10185 else
10186 CHECK_CHARACTER (val);
10187 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10188 }
10189 else if (EQ (prop, QCdecode_translation_table))
10190 {
10191 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10192 CHECK_SYMBOL (val);
10193 CODING_ATTR_DECODE_TBL (attrs) = val;
10194 }
10195 else if (EQ (prop, QCencode_translation_table))
10196 {
10197 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10198 CHECK_SYMBOL (val);
10199 CODING_ATTR_ENCODE_TBL (attrs) = val;
10200 }
10201 else if (EQ (prop, QCpost_read_conversion))
10202 {
10203 CHECK_SYMBOL (val);
10204 CODING_ATTR_POST_READ (attrs) = val;
10205 }
10206 else if (EQ (prop, QCpre_write_conversion))
10207 {
10208 CHECK_SYMBOL (val);
10209 CODING_ATTR_PRE_WRITE (attrs) = val;
10210 }
10211 else if (EQ (prop, QCascii_compatible_p))
10212 {
10213 CODING_ATTR_ASCII_COMPAT (attrs) = val;
10214 }
10215
10216 CODING_ATTR_PLIST (attrs)
10217 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10218 return val;
10219 }
10220
10221
10222 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10223 Sdefine_coding_system_alias, 2, 2, 0,
10224 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10225 (alias, coding_system)
10226 Lisp_Object alias, coding_system;
10227 {
10228 Lisp_Object spec, aliases, eol_type, val;
10229
10230 CHECK_SYMBOL (alias);
10231 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10232 aliases = AREF (spec, 1);
10233 /* ALIASES should be a list of length more than zero, and the first
10234 element is a base coding system. Append ALIAS at the tail of the
10235 list. */
10236 while (!NILP (XCDR (aliases)))
10237 aliases = XCDR (aliases);
10238 XSETCDR (aliases, Fcons (alias, Qnil));
10239
10240 eol_type = AREF (spec, 2);
10241 if (VECTORP (eol_type))
10242 {
10243 Lisp_Object subsidiaries;
10244 int i;
10245
10246 subsidiaries = make_subsidiaries (alias);
10247 for (i = 0; i < 3; i++)
10248 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10249 AREF (eol_type, i));
10250 }
10251
10252 Fputhash (alias, spec, Vcoding_system_hash_table);
10253 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10254 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10255 if (NILP (val))
10256 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10257 Vcoding_system_alist);
10258
10259 return Qnil;
10260 }
10261
10262 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10263 1, 1, 0,
10264 doc: /* Return the base of CODING-SYSTEM.
10265 Any alias or subsidiary coding system is not a base coding system. */)
10266 (coding_system)
10267 Lisp_Object coding_system;
10268 {
10269 Lisp_Object spec, attrs;
10270
10271 if (NILP (coding_system))
10272 return (Qno_conversion);
10273 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10274 attrs = AREF (spec, 0);
10275 return CODING_ATTR_BASE_NAME (attrs);
10276 }
10277
10278 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10279 1, 1, 0,
10280 doc: "Return the property list of CODING-SYSTEM.")
10281 (coding_system)
10282 Lisp_Object coding_system;
10283 {
10284 Lisp_Object spec, attrs;
10285
10286 if (NILP (coding_system))
10287 coding_system = Qno_conversion;
10288 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10289 attrs = AREF (spec, 0);
10290 return CODING_ATTR_PLIST (attrs);
10291 }
10292
10293
10294 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10295 1, 1, 0,
10296 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10297 (coding_system)
10298 Lisp_Object coding_system;
10299 {
10300 Lisp_Object spec;
10301
10302 if (NILP (coding_system))
10303 coding_system = Qno_conversion;
10304 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10305 return AREF (spec, 1);
10306 }
10307
10308 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10309 Scoding_system_eol_type, 1, 1, 0,
10310 doc: /* Return eol-type of CODING-SYSTEM.
10311 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10312
10313 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10314 and CR respectively.
10315
10316 A vector value indicates that a format of end-of-line should be
10317 detected automatically. Nth element of the vector is the subsidiary
10318 coding system whose eol-type is N. */)
10319 (coding_system)
10320 Lisp_Object coding_system;
10321 {
10322 Lisp_Object spec, eol_type;
10323 int n;
10324
10325 if (NILP (coding_system))
10326 coding_system = Qno_conversion;
10327 if (! CODING_SYSTEM_P (coding_system))
10328 return Qnil;
10329 spec = CODING_SYSTEM_SPEC (coding_system);
10330 eol_type = AREF (spec, 2);
10331 if (VECTORP (eol_type))
10332 return Fcopy_sequence (eol_type);
10333 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10334 return make_number (n);
10335 }
10336
10337 #endif /* emacs */
10338
10339 \f
10340 /*** 9. Post-amble ***/
10341
10342 void
10343 init_coding_once ()
10344 {
10345 int i;
10346
10347 for (i = 0; i < coding_category_max; i++)
10348 {
10349 coding_categories[i].id = -1;
10350 coding_priorities[i] = i;
10351 }
10352
10353 /* ISO2022 specific initialize routine. */
10354 for (i = 0; i < 0x20; i++)
10355 iso_code_class[i] = ISO_control_0;
10356 for (i = 0x21; i < 0x7F; i++)
10357 iso_code_class[i] = ISO_graphic_plane_0;
10358 for (i = 0x80; i < 0xA0; i++)
10359 iso_code_class[i] = ISO_control_1;
10360 for (i = 0xA1; i < 0xFF; i++)
10361 iso_code_class[i] = ISO_graphic_plane_1;
10362 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10363 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10364 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10365 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10366 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10367 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10368 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10369 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10370 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10371
10372 for (i = 0; i < 256; i++)
10373 {
10374 emacs_mule_bytes[i] = 1;
10375 }
10376 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10377 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10378 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10379 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10380 }
10381
10382 #ifdef emacs
10383
10384 void
10385 syms_of_coding ()
10386 {
10387 staticpro (&Vcoding_system_hash_table);
10388 {
10389 Lisp_Object args[2];
10390 args[0] = QCtest;
10391 args[1] = Qeq;
10392 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10393 }
10394
10395 staticpro (&Vsjis_coding_system);
10396 Vsjis_coding_system = Qnil;
10397
10398 staticpro (&Vbig5_coding_system);
10399 Vbig5_coding_system = Qnil;
10400
10401 staticpro (&Vcode_conversion_reused_workbuf);
10402 Vcode_conversion_reused_workbuf = Qnil;
10403
10404 staticpro (&Vcode_conversion_workbuf_name);
10405 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
10406
10407 reused_workbuf_in_use = 0;
10408
10409 DEFSYM (Qcharset, "charset");
10410 DEFSYM (Qtarget_idx, "target-idx");
10411 DEFSYM (Qcoding_system_history, "coding-system-history");
10412 Fset (Qcoding_system_history, Qnil);
10413
10414 /* Target FILENAME is the first argument. */
10415 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10416 /* Target FILENAME is the third argument. */
10417 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10418
10419 DEFSYM (Qcall_process, "call-process");
10420 /* Target PROGRAM is the first argument. */
10421 Fput (Qcall_process, Qtarget_idx, make_number (0));
10422
10423 DEFSYM (Qcall_process_region, "call-process-region");
10424 /* Target PROGRAM is the third argument. */
10425 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10426
10427 DEFSYM (Qstart_process, "start-process");
10428 /* Target PROGRAM is the third argument. */
10429 Fput (Qstart_process, Qtarget_idx, make_number (2));
10430
10431 DEFSYM (Qopen_network_stream, "open-network-stream");
10432 /* Target SERVICE is the fourth argument. */
10433 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10434
10435 DEFSYM (Qcoding_system, "coding-system");
10436 DEFSYM (Qcoding_aliases, "coding-aliases");
10437
10438 DEFSYM (Qeol_type, "eol-type");
10439 DEFSYM (Qunix, "unix");
10440 DEFSYM (Qdos, "dos");
10441
10442 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10443 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10444 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10445 DEFSYM (Qdefault_char, "default-char");
10446 DEFSYM (Qundecided, "undecided");
10447 DEFSYM (Qno_conversion, "no-conversion");
10448 DEFSYM (Qraw_text, "raw-text");
10449
10450 DEFSYM (Qiso_2022, "iso-2022");
10451
10452 DEFSYM (Qutf_8, "utf-8");
10453 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10454
10455 DEFSYM (Qutf_16, "utf-16");
10456 DEFSYM (Qbig, "big");
10457 DEFSYM (Qlittle, "little");
10458
10459 DEFSYM (Qshift_jis, "shift-jis");
10460 DEFSYM (Qbig5, "big5");
10461
10462 DEFSYM (Qcoding_system_p, "coding-system-p");
10463
10464 DEFSYM (Qcoding_system_error, "coding-system-error");
10465 Fput (Qcoding_system_error, Qerror_conditions,
10466 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
10467 Fput (Qcoding_system_error, Qerror_message,
10468 build_string ("Invalid coding system"));
10469
10470 /* Intern this now in case it isn't already done.
10471 Setting this variable twice is harmless.
10472 But don't staticpro it here--that is done in alloc.c. */
10473 Qchar_table_extra_slots = intern ("char-table-extra-slots");
10474
10475 DEFSYM (Qtranslation_table, "translation-table");
10476 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10477 DEFSYM (Qtranslation_table_id, "translation-table-id");
10478 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10479 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10480
10481 DEFSYM (Qvalid_codes, "valid-codes");
10482
10483 DEFSYM (Qemacs_mule, "emacs-mule");
10484
10485 DEFSYM (QCcategory, ":category");
10486 DEFSYM (QCmnemonic, ":mnemonic");
10487 DEFSYM (QCdefault_char, ":default-char");
10488 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10489 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10490 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10491 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10492 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10493
10494 Vcoding_category_table
10495 = Fmake_vector (make_number (coding_category_max), Qnil);
10496 staticpro (&Vcoding_category_table);
10497 /* Followings are target of code detection. */
10498 ASET (Vcoding_category_table, coding_category_iso_7,
10499 intern ("coding-category-iso-7"));
10500 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10501 intern ("coding-category-iso-7-tight"));
10502 ASET (Vcoding_category_table, coding_category_iso_8_1,
10503 intern ("coding-category-iso-8-1"));
10504 ASET (Vcoding_category_table, coding_category_iso_8_2,
10505 intern ("coding-category-iso-8-2"));
10506 ASET (Vcoding_category_table, coding_category_iso_7_else,
10507 intern ("coding-category-iso-7-else"));
10508 ASET (Vcoding_category_table, coding_category_iso_8_else,
10509 intern ("coding-category-iso-8-else"));
10510 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10511 intern ("coding-category-utf-8-auto"));
10512 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10513 intern ("coding-category-utf-8"));
10514 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10515 intern ("coding-category-utf-8-sig"));
10516 ASET (Vcoding_category_table, coding_category_utf_16_be,
10517 intern ("coding-category-utf-16-be"));
10518 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10519 intern ("coding-category-utf-16-auto"));
10520 ASET (Vcoding_category_table, coding_category_utf_16_le,
10521 intern ("coding-category-utf-16-le"));
10522 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10523 intern ("coding-category-utf-16-be-nosig"));
10524 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10525 intern ("coding-category-utf-16-le-nosig"));
10526 ASET (Vcoding_category_table, coding_category_charset,
10527 intern ("coding-category-charset"));
10528 ASET (Vcoding_category_table, coding_category_sjis,
10529 intern ("coding-category-sjis"));
10530 ASET (Vcoding_category_table, coding_category_big5,
10531 intern ("coding-category-big5"));
10532 ASET (Vcoding_category_table, coding_category_ccl,
10533 intern ("coding-category-ccl"));
10534 ASET (Vcoding_category_table, coding_category_emacs_mule,
10535 intern ("coding-category-emacs-mule"));
10536 /* Followings are NOT target of code detection. */
10537 ASET (Vcoding_category_table, coding_category_raw_text,
10538 intern ("coding-category-raw-text"));
10539 ASET (Vcoding_category_table, coding_category_undecided,
10540 intern ("coding-category-undecided"));
10541
10542 DEFSYM (Qinsufficient_source, "insufficient-source");
10543 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10544 DEFSYM (Qinvalid_source, "invalid-source");
10545 DEFSYM (Qinterrupted, "interrupted");
10546 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10547 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10548
10549 defsubr (&Scoding_system_p);
10550 defsubr (&Sread_coding_system);
10551 defsubr (&Sread_non_nil_coding_system);
10552 defsubr (&Scheck_coding_system);
10553 defsubr (&Sdetect_coding_region);
10554 defsubr (&Sdetect_coding_string);
10555 defsubr (&Sfind_coding_systems_region_internal);
10556 defsubr (&Sunencodable_char_position);
10557 defsubr (&Scheck_coding_systems_region);
10558 defsubr (&Sdecode_coding_region);
10559 defsubr (&Sencode_coding_region);
10560 defsubr (&Sdecode_coding_string);
10561 defsubr (&Sencode_coding_string);
10562 defsubr (&Sdecode_sjis_char);
10563 defsubr (&Sencode_sjis_char);
10564 defsubr (&Sdecode_big5_char);
10565 defsubr (&Sencode_big5_char);
10566 defsubr (&Sset_terminal_coding_system_internal);
10567 defsubr (&Sset_safe_terminal_coding_system_internal);
10568 defsubr (&Sterminal_coding_system);
10569 defsubr (&Sset_keyboard_coding_system_internal);
10570 defsubr (&Skeyboard_coding_system);
10571 defsubr (&Sfind_operation_coding_system);
10572 defsubr (&Sset_coding_system_priority);
10573 defsubr (&Sdefine_coding_system_internal);
10574 defsubr (&Sdefine_coding_system_alias);
10575 defsubr (&Scoding_system_put);
10576 defsubr (&Scoding_system_base);
10577 defsubr (&Scoding_system_plist);
10578 defsubr (&Scoding_system_aliases);
10579 defsubr (&Scoding_system_eol_type);
10580 defsubr (&Scoding_system_priority_list);
10581
10582 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10583 doc: /* List of coding systems.
10584
10585 Do not alter the value of this variable manually. This variable should be
10586 updated by the functions `define-coding-system' and
10587 `define-coding-system-alias'. */);
10588 Vcoding_system_list = Qnil;
10589
10590 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10591 doc: /* Alist of coding system names.
10592 Each element is one element list of coding system name.
10593 This variable is given to `completing-read' as COLLECTION argument.
10594
10595 Do not alter the value of this variable manually. This variable should be
10596 updated by the functions `make-coding-system' and
10597 `define-coding-system-alias'. */);
10598 Vcoding_system_alist = Qnil;
10599
10600 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10601 doc: /* List of coding-categories (symbols) ordered by priority.
10602
10603 On detecting a coding system, Emacs tries code detection algorithms
10604 associated with each coding-category one by one in this order. When
10605 one algorithm agrees with a byte sequence of source text, the coding
10606 system bound to the corresponding coding-category is selected.
10607
10608 Don't modify this variable directly, but use `set-coding-priority'. */);
10609 {
10610 int i;
10611
10612 Vcoding_category_list = Qnil;
10613 for (i = coding_category_max - 1; i >= 0; i--)
10614 Vcoding_category_list
10615 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10616 Vcoding_category_list);
10617 }
10618
10619 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10620 doc: /* Specify the coding system for read operations.
10621 It is useful to bind this variable with `let', but do not set it globally.
10622 If the value is a coding system, it is used for decoding on read operation.
10623 If not, an appropriate element is used from one of the coding system alists.
10624 There are three such tables: `file-coding-system-alist',
10625 `process-coding-system-alist', and `network-coding-system-alist'. */);
10626 Vcoding_system_for_read = Qnil;
10627
10628 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10629 doc: /* Specify the coding system for write operations.
10630 Programs bind this variable with `let', but you should not set it globally.
10631 If the value is a coding system, it is used for encoding of output,
10632 when writing it to a file and when sending it to a file or subprocess.
10633
10634 If this does not specify a coding system, an appropriate element
10635 is used from one of the coding system alists.
10636 There are three such tables: `file-coding-system-alist',
10637 `process-coding-system-alist', and `network-coding-system-alist'.
10638 For output to files, if the above procedure does not specify a coding system,
10639 the value of `buffer-file-coding-system' is used. */);
10640 Vcoding_system_for_write = Qnil;
10641
10642 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10643 doc: /*
10644 Coding system used in the latest file or process I/O. */);
10645 Vlast_coding_system_used = Qnil;
10646
10647 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10648 doc: /*
10649 Error status of the last code conversion.
10650
10651 When an error was detected in the last code conversion, this variable
10652 is set to one of the following symbols.
10653 `insufficient-source'
10654 `inconsistent-eol'
10655 `invalid-source'
10656 `interrupted'
10657 `insufficient-memory'
10658 When no error was detected, the value doesn't change. So, to check
10659 the error status of a code conversion by this variable, you must
10660 explicitly set this variable to nil before performing code
10661 conversion. */);
10662 Vlast_code_conversion_error = Qnil;
10663
10664 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10665 doc: /*
10666 *Non-nil means always inhibit code conversion of end-of-line format.
10667 See info node `Coding Systems' and info node `Text and Binary' concerning
10668 such conversion. */);
10669 inhibit_eol_conversion = 0;
10670
10671 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10672 doc: /*
10673 Non-nil means process buffer inherits coding system of process output.
10674 Bind it to t if the process output is to be treated as if it were a file
10675 read from some filesystem. */);
10676 inherit_process_coding_system = 0;
10677
10678 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10679 doc: /*
10680 Alist to decide a coding system to use for a file I/O operation.
10681 The format is ((PATTERN . VAL) ...),
10682 where PATTERN is a regular expression matching a file name,
10683 VAL is a coding system, a cons of coding systems, or a function symbol.
10684 If VAL is a coding system, it is used for both decoding and encoding
10685 the file contents.
10686 If VAL is a cons of coding systems, the car part is used for decoding,
10687 and the cdr part is used for encoding.
10688 If VAL is a function symbol, the function must return a coding system
10689 or a cons of coding systems which are used as above. The function is
10690 called with an argument that is a list of the arguments with which
10691 `find-operation-coding-system' was called. If the function can't decide
10692 a coding system, it can return `undecided' so that the normal
10693 code-detection is performed.
10694
10695 See also the function `find-operation-coding-system'
10696 and the variable `auto-coding-alist'. */);
10697 Vfile_coding_system_alist = Qnil;
10698
10699 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10700 doc: /*
10701 Alist to decide a coding system to use for a process I/O operation.
10702 The format is ((PATTERN . VAL) ...),
10703 where PATTERN is a regular expression matching a program name,
10704 VAL is a coding system, a cons of coding systems, or a function symbol.
10705 If VAL is a coding system, it is used for both decoding what received
10706 from the program and encoding what sent to the program.
10707 If VAL is a cons of coding systems, the car part is used for decoding,
10708 and the cdr part is used for encoding.
10709 If VAL is a function symbol, the function must return a coding system
10710 or a cons of coding systems which are used as above.
10711
10712 See also the function `find-operation-coding-system'. */);
10713 Vprocess_coding_system_alist = Qnil;
10714
10715 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10716 doc: /*
10717 Alist to decide a coding system to use for a network I/O operation.
10718 The format is ((PATTERN . VAL) ...),
10719 where PATTERN is a regular expression matching a network service name
10720 or is a port number to connect to,
10721 VAL is a coding system, a cons of coding systems, or a function symbol.
10722 If VAL is a coding system, it is used for both decoding what received
10723 from the network stream and encoding what sent to the network stream.
10724 If VAL is a cons of coding systems, the car part is used for decoding,
10725 and the cdr part is used for encoding.
10726 If VAL is a function symbol, the function must return a coding system
10727 or a cons of coding systems which are used as above.
10728
10729 See also the function `find-operation-coding-system'. */);
10730 Vnetwork_coding_system_alist = Qnil;
10731
10732 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10733 doc: /* Coding system to use with system messages.
10734 Also used for decoding keyboard input on X Window system. */);
10735 Vlocale_coding_system = Qnil;
10736
10737 /* The eol mnemonics are reset in startup.el system-dependently. */
10738 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10739 doc: /*
10740 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10741 eol_mnemonic_unix = build_string (":");
10742
10743 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10744 doc: /*
10745 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10746 eol_mnemonic_dos = build_string ("\\");
10747
10748 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10749 doc: /*
10750 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10751 eol_mnemonic_mac = build_string ("/");
10752
10753 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10754 doc: /*
10755 *String displayed in mode line when end-of-line format is not yet determined. */);
10756 eol_mnemonic_undecided = build_string (":");
10757
10758 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10759 doc: /*
10760 *Non-nil enables character translation while encoding and decoding. */);
10761 Venable_character_translation = Qt;
10762
10763 DEFVAR_LISP ("standard-translation-table-for-decode",
10764 &Vstandard_translation_table_for_decode,
10765 doc: /* Table for translating characters while decoding. */);
10766 Vstandard_translation_table_for_decode = Qnil;
10767
10768 DEFVAR_LISP ("standard-translation-table-for-encode",
10769 &Vstandard_translation_table_for_encode,
10770 doc: /* Table for translating characters while encoding. */);
10771 Vstandard_translation_table_for_encode = Qnil;
10772
10773 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10774 doc: /* Alist of charsets vs revision numbers.
10775 While encoding, if a charset (car part of an element) is found,
10776 designate it with the escape sequence identifying revision (cdr part
10777 of the element). */);
10778 Vcharset_revision_table = Qnil;
10779
10780 DEFVAR_LISP ("default-process-coding-system",
10781 &Vdefault_process_coding_system,
10782 doc: /* Cons of coding systems used for process I/O by default.
10783 The car part is used for decoding a process output,
10784 the cdr part is used for encoding a text to be sent to a process. */);
10785 Vdefault_process_coding_system = Qnil;
10786
10787 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10788 doc: /*
10789 Table of extra Latin codes in the range 128..159 (inclusive).
10790 This is a vector of length 256.
10791 If Nth element is non-nil, the existence of code N in a file
10792 \(or output of subprocess) doesn't prevent it to be detected as
10793 a coding system of ISO 2022 variant which has a flag
10794 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10795 or reading output of a subprocess.
10796 Only 128th through 159th elements have a meaning. */);
10797 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10798
10799 DEFVAR_LISP ("select-safe-coding-system-function",
10800 &Vselect_safe_coding_system_function,
10801 doc: /*
10802 Function to call to select safe coding system for encoding a text.
10803
10804 If set, this function is called to force a user to select a proper
10805 coding system which can encode the text in the case that a default
10806 coding system used in each operation can't encode the text. The
10807 function should take care that the buffer is not modified while
10808 the coding system is being selected.
10809
10810 The default value is `select-safe-coding-system' (which see). */);
10811 Vselect_safe_coding_system_function = Qnil;
10812
10813 DEFVAR_BOOL ("coding-system-require-warning",
10814 &coding_system_require_warning,
10815 doc: /* Internal use only.
10816 If non-nil, on writing a file, `select-safe-coding-system-function' is
10817 called even if `coding-system-for-write' is non-nil. The command
10818 `universal-coding-system-argument' binds this variable to t temporarily. */);
10819 coding_system_require_warning = 0;
10820
10821
10822 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10823 &inhibit_iso_escape_detection,
10824 doc: /*
10825 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10826
10827 When Emacs reads text, it tries to detect how the text is encoded.
10828 This code detection is sensitive to escape sequences. If Emacs sees
10829 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10830 of the ISO2022 encodings, and decodes text by the corresponding coding
10831 system (e.g. `iso-2022-7bit').
10832
10833 However, there may be a case that you want to read escape sequences in
10834 a file as is. In such a case, you can set this variable to non-nil.
10835 Then the code detection will ignore any escape sequences, and no text is
10836 detected as encoded in some ISO-2022 encoding. The result is that all
10837 escape sequences become visible in a buffer.
10838
10839 The default value is nil, and it is strongly recommended not to change
10840 it. That is because many Emacs Lisp source files that contain
10841 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10842 in Emacs's distribution, and they won't be decoded correctly on
10843 reading if you suppress escape sequence detection.
10844
10845 The other way to read escape sequences in a file without decoding is
10846 to explicitly specify some coding system that doesn't use ISO-2022
10847 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10848 inhibit_iso_escape_detection = 0;
10849
10850 DEFVAR_BOOL ("inhibit-null-byte-detection",
10851 &inhibit_null_byte_detection,
10852 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10853 By default, Emacs treats it as binary data, and does not attempt to
10854 decode it. The effect is as if you specified `no-conversion' for
10855 reading that text.
10856
10857 Set this to non-nil when a regular text happens to include null bytes.
10858 Examples are Index nodes of Info files and null-byte delimited output
10859 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10860 decode text as usual. */);
10861 inhibit_null_byte_detection = 0;
10862
10863 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10864 doc: /* Char table for translating self-inserting characters.
10865 This is applied to the result of input methods, not their input.
10866 See also `keyboard-translate-table'.
10867
10868 Use of this variable for character code unification was rendered
10869 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10870 internal character representation. */);
10871 Vtranslation_table_for_input = Qnil;
10872
10873 {
10874 Lisp_Object args[coding_arg_max];
10875 Lisp_Object plist[16];
10876 int i;
10877
10878 for (i = 0; i < coding_arg_max; i++)
10879 args[i] = Qnil;
10880
10881 plist[0] = intern (":name");
10882 plist[1] = args[coding_arg_name] = Qno_conversion;
10883 plist[2] = intern (":mnemonic");
10884 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10885 plist[4] = intern (":coding-type");
10886 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10887 plist[6] = intern (":ascii-compatible-p");
10888 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10889 plist[8] = intern (":default-char");
10890 plist[9] = args[coding_arg_default_char] = make_number (0);
10891 plist[10] = intern (":for-unibyte");
10892 plist[11] = args[coding_arg_for_unibyte] = Qt;
10893 plist[12] = intern (":docstring");
10894 plist[13] = build_string ("Do no conversion.\n\
10895 \n\
10896 When you visit a file with this coding, the file is read into a\n\
10897 unibyte buffer as is, thus each byte of a file is treated as a\n\
10898 character.");
10899 plist[14] = intern (":eol-type");
10900 plist[15] = args[coding_arg_eol_type] = Qunix;
10901 args[coding_arg_plist] = Flist (16, plist);
10902 Fdefine_coding_system_internal (coding_arg_max, args);
10903
10904 plist[1] = args[coding_arg_name] = Qundecided;
10905 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10906 plist[5] = args[coding_arg_coding_type] = Qundecided;
10907 /* This is already set.
10908 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10909 plist[8] = intern (":charset-list");
10910 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10911 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10912 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10913 plist[15] = args[coding_arg_eol_type] = Qnil;
10914 args[coding_arg_plist] = Flist (16, plist);
10915 Fdefine_coding_system_internal (coding_arg_max, args);
10916 }
10917
10918 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10919
10920 {
10921 int i;
10922
10923 for (i = 0; i < coding_category_max; i++)
10924 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10925 }
10926 #if defined (MSDOS) || defined (WINDOWSNT)
10927 system_eol_type = Qdos;
10928 #else
10929 system_eol_type = Qunix;
10930 #endif
10931 staticpro (&system_eol_type);
10932 }
10933
10934 char *
10935 emacs_strerror (error_number)
10936 int error_number;
10937 {
10938 char *str;
10939
10940 synchronize_system_messages_locale ();
10941 str = strerror (error_number);
10942
10943 if (! NILP (Vlocale_coding_system))
10944 {
10945 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10946 Vlocale_coding_system,
10947 0);
10948 str = (char *) SDATA (dec);
10949 }
10950
10951 return str;
10952 }
10953
10954 #endif /* emacs */
10955
10956 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10957 (do not change this comment) */