* coding.c (Fdetect_coding_region): Do not check start and end with
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2012 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
10
11 This file is part of GNU Emacs.
12
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
17
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return true if the byte sequence conforms to XXX.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
158 {
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
165
166 while (1)
167 {
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
171
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
176 }
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
180
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
185 }
186 #endif
187
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
194
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
199
200 Below is the template of these functions. */
201
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
205 {
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
216
217 while (1)
218 {
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
225 }
226
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
239 }
240 #endif
241
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
248
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
253
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
257
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
262 {
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
270
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
272 {
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
275 }
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
280 }
281 #endif
282
283 \f
284 /*** 1. Preamble ***/
285
286 #include <config.h>
287 #include <stdio.h>
288
289 #include "lisp.h"
290 #include "character.h"
291 #include "buffer.h"
292 #include "charset.h"
293 #include "ccl.h"
294 #include "composite.h"
295 #include "coding.h"
296 #include "window.h"
297 #include "frame.h"
298 #include "termhooks.h"
299
300 Lisp_Object Vcoding_system_hash_table;
301
302 static Lisp_Object Qcoding_system, Qeol_type;
303 static Lisp_Object Qcoding_aliases;
304 Lisp_Object Qunix, Qdos;
305 Lisp_Object Qbuffer_file_coding_system;
306 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
307 static Lisp_Object Qdefault_char;
308 Lisp_Object Qno_conversion, Qundecided;
309 Lisp_Object Qcharset, Qutf_8;
310 static Lisp_Object Qiso_2022;
311 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
312 static Lisp_Object Qbig, Qlittle;
313 static Lisp_Object Qcoding_system_history;
314 static Lisp_Object Qvalid_codes;
315 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
316 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
317 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
318 static Lisp_Object QCascii_compatible_p;
319
320 Lisp_Object Qcall_process, Qcall_process_region;
321 Lisp_Object Qstart_process, Qopen_network_stream;
322 static Lisp_Object Qtarget_idx;
323
324 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
325 static Lisp_Object Qinterrupted, Qinsufficient_memory;
326
327 /* If a symbol has this property, evaluate the value to define the
328 symbol as a coding system. */
329 static Lisp_Object Qcoding_system_define_form;
330
331 /* Format of end-of-line decided by system. This is Qunix on
332 Unix and Mac, Qdos on DOS/Windows.
333 This has an effect only for external encoding (i.e. for output to
334 file and process), not for in-buffer or Lisp string encoding. */
335 static Lisp_Object system_eol_type;
336
337 #ifdef emacs
338
339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
340
341 /* Coding system emacs-mule and raw-text are for converting only
342 end-of-line format. */
343 Lisp_Object Qemacs_mule, Qraw_text;
344 Lisp_Object Qutf_8_emacs;
345
346 #if defined (WINDOWSNT) || defined (CYGWIN)
347 static Lisp_Object Qutf_16le;
348 #endif
349
350 /* Coding-systems are handed between Emacs Lisp programs and C internal
351 routines by the following three variables. */
352 /* Coding system to be used to encode text for terminal display when
353 terminal coding system is nil. */
354 struct coding_system safe_terminal_coding;
355
356 #endif /* emacs */
357
358 Lisp_Object Qtranslation_table;
359 Lisp_Object Qtranslation_table_id;
360 static Lisp_Object Qtranslation_table_for_decode;
361 static Lisp_Object Qtranslation_table_for_encode;
362
363 /* Two special coding systems. */
364 static Lisp_Object Vsjis_coding_system;
365 static Lisp_Object Vbig5_coding_system;
366
367 /* ISO2022 section */
368
369 #define CODING_ISO_INITIAL(coding, reg) \
370 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
371 coding_attr_iso_initial), \
372 reg)))
373
374
375 #define CODING_ISO_REQUEST(coding, charset_id) \
376 (((charset_id) <= (coding)->max_charset_id \
377 ? ((coding)->safe_charsets[charset_id] != 255 \
378 ? (coding)->safe_charsets[charset_id] \
379 : -1) \
380 : -1))
381
382
383 #define CODING_ISO_FLAGS(coding) \
384 ((coding)->spec.iso_2022.flags)
385 #define CODING_ISO_DESIGNATION(coding, reg) \
386 ((coding)->spec.iso_2022.current_designation[reg])
387 #define CODING_ISO_INVOCATION(coding, plane) \
388 ((coding)->spec.iso_2022.current_invocation[plane])
389 #define CODING_ISO_SINGLE_SHIFTING(coding) \
390 ((coding)->spec.iso_2022.single_shifting)
391 #define CODING_ISO_BOL(coding) \
392 ((coding)->spec.iso_2022.bol)
393 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
394 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
395 #define CODING_ISO_CMP_STATUS(coding) \
396 (&(coding)->spec.iso_2022.cmp_status)
397 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
398 ((coding)->spec.iso_2022.ctext_extended_segment_len)
399 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
400 ((coding)->spec.iso_2022.embedded_utf_8)
401
402 /* Control characters of ISO2022. */
403 /* code */ /* function */
404 #define ISO_CODE_SO 0x0E /* shift-out */
405 #define ISO_CODE_SI 0x0F /* shift-in */
406 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
407 #define ISO_CODE_ESC 0x1B /* escape */
408 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
409 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
410 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
411
412 /* All code (1-byte) of ISO2022 is classified into one of the
413 followings. */
414 enum iso_code_class_type
415 {
416 ISO_control_0, /* Control codes in the range
417 0x00..0x1F and 0x7F, except for the
418 following 5 codes. */
419 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
420 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
421 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
422 ISO_escape, /* ISO_CODE_ESC (0x1B) */
423 ISO_control_1, /* Control codes in the range
424 0x80..0x9F, except for the
425 following 3 codes. */
426 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
427 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
428 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
429 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
430 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
431 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
432 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
433 };
434
435 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
436 `iso-flags' attribute of an iso2022 coding system. */
437
438 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
439 instead of the correct short-form sequence (e.g. ESC $ A). */
440 #define CODING_ISO_FLAG_LONG_FORM 0x0001
441
442 /* If set, reset graphic planes and registers at end-of-line to the
443 initial state. */
444 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
445
446 /* If set, reset graphic planes and registers before any control
447 characters to the initial state. */
448 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
449
450 /* If set, encode by 7-bit environment. */
451 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
452
453 /* If set, use locking-shift function. */
454 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
455
456 /* If set, use single-shift function. Overwrite
457 CODING_ISO_FLAG_LOCKING_SHIFT. */
458 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
459
460 /* If set, use designation escape sequence. */
461 #define CODING_ISO_FLAG_DESIGNATION 0x0040
462
463 /* If set, produce revision number sequence. */
464 #define CODING_ISO_FLAG_REVISION 0x0080
465
466 /* If set, produce ISO6429's direction specifying sequence. */
467 #define CODING_ISO_FLAG_DIRECTION 0x0100
468
469 /* If set, assume designation states are reset at beginning of line on
470 output. */
471 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
472
473 /* If set, designation sequence should be placed at beginning of line
474 on output. */
475 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
476
477 /* If set, do not encode unsafe characters on output. */
478 #define CODING_ISO_FLAG_SAFE 0x0800
479
480 /* If set, extra latin codes (128..159) are accepted as a valid code
481 on input. */
482 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
483
484 #define CODING_ISO_FLAG_COMPOSITION 0x2000
485
486 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
487
488 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
489
490 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
491
492 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
493
494 /* A character to be produced on output if encoding of the original
495 character is prohibited by CODING_ISO_FLAG_SAFE. */
496 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
497
498 /* UTF-8 section */
499 #define CODING_UTF_8_BOM(coding) \
500 ((coding)->spec.utf_8_bom)
501
502 /* UTF-16 section */
503 #define CODING_UTF_16_BOM(coding) \
504 ((coding)->spec.utf_16.bom)
505
506 #define CODING_UTF_16_ENDIAN(coding) \
507 ((coding)->spec.utf_16.endian)
508
509 #define CODING_UTF_16_SURROGATE(coding) \
510 ((coding)->spec.utf_16.surrogate)
511
512
513 /* CCL section */
514 #define CODING_CCL_DECODER(coding) \
515 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
516 #define CODING_CCL_ENCODER(coding) \
517 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
518 #define CODING_CCL_VALIDS(coding) \
519 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
520
521 /* Index for each coding category in `coding_categories' */
522
523 enum coding_category
524 {
525 coding_category_iso_7,
526 coding_category_iso_7_tight,
527 coding_category_iso_8_1,
528 coding_category_iso_8_2,
529 coding_category_iso_7_else,
530 coding_category_iso_8_else,
531 coding_category_utf_8_auto,
532 coding_category_utf_8_nosig,
533 coding_category_utf_8_sig,
534 coding_category_utf_16_auto,
535 coding_category_utf_16_be,
536 coding_category_utf_16_le,
537 coding_category_utf_16_be_nosig,
538 coding_category_utf_16_le_nosig,
539 coding_category_charset,
540 coding_category_sjis,
541 coding_category_big5,
542 coding_category_ccl,
543 coding_category_emacs_mule,
544 /* All above are targets of code detection. */
545 coding_category_raw_text,
546 coding_category_undecided,
547 coding_category_max
548 };
549
550 /* Definitions of flag bits used in detect_coding_XXXX. */
551 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
552 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
553 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
554 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
555 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
556 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
557 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
558 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
559 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
560 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
561 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
562 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
563 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
564 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
565 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
566 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
567 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
568 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
569 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
570 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
571
572 /* This value is returned if detect_coding_mask () find nothing other
573 than ASCII characters. */
574 #define CATEGORY_MASK_ANY \
575 (CATEGORY_MASK_ISO_7 \
576 | CATEGORY_MASK_ISO_7_TIGHT \
577 | CATEGORY_MASK_ISO_8_1 \
578 | CATEGORY_MASK_ISO_8_2 \
579 | CATEGORY_MASK_ISO_7_ELSE \
580 | CATEGORY_MASK_ISO_8_ELSE \
581 | CATEGORY_MASK_UTF_8_AUTO \
582 | CATEGORY_MASK_UTF_8_NOSIG \
583 | CATEGORY_MASK_UTF_8_SIG \
584 | CATEGORY_MASK_UTF_16_AUTO \
585 | CATEGORY_MASK_UTF_16_BE \
586 | CATEGORY_MASK_UTF_16_LE \
587 | CATEGORY_MASK_UTF_16_BE_NOSIG \
588 | CATEGORY_MASK_UTF_16_LE_NOSIG \
589 | CATEGORY_MASK_CHARSET \
590 | CATEGORY_MASK_SJIS \
591 | CATEGORY_MASK_BIG5 \
592 | CATEGORY_MASK_CCL \
593 | CATEGORY_MASK_EMACS_MULE)
594
595
596 #define CATEGORY_MASK_ISO_7BIT \
597 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
598
599 #define CATEGORY_MASK_ISO_8BIT \
600 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
601
602 #define CATEGORY_MASK_ISO_ELSE \
603 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
604
605 #define CATEGORY_MASK_ISO_ESCAPE \
606 (CATEGORY_MASK_ISO_7 \
607 | CATEGORY_MASK_ISO_7_TIGHT \
608 | CATEGORY_MASK_ISO_7_ELSE \
609 | CATEGORY_MASK_ISO_8_ELSE)
610
611 #define CATEGORY_MASK_ISO \
612 ( CATEGORY_MASK_ISO_7BIT \
613 | CATEGORY_MASK_ISO_8BIT \
614 | CATEGORY_MASK_ISO_ELSE)
615
616 #define CATEGORY_MASK_UTF_16 \
617 (CATEGORY_MASK_UTF_16_AUTO \
618 | CATEGORY_MASK_UTF_16_BE \
619 | CATEGORY_MASK_UTF_16_LE \
620 | CATEGORY_MASK_UTF_16_BE_NOSIG \
621 | CATEGORY_MASK_UTF_16_LE_NOSIG)
622
623 #define CATEGORY_MASK_UTF_8 \
624 (CATEGORY_MASK_UTF_8_AUTO \
625 | CATEGORY_MASK_UTF_8_NOSIG \
626 | CATEGORY_MASK_UTF_8_SIG)
627
628 /* Table of coding categories (Lisp symbols). This variable is for
629 internal use only. */
630 static Lisp_Object Vcoding_category_table;
631
632 /* Table of coding-categories ordered by priority. */
633 static enum coding_category coding_priorities[coding_category_max];
634
635 /* Nth element is a coding context for the coding system bound to the
636 Nth coding category. */
637 static struct coding_system coding_categories[coding_category_max];
638
639 /*** Commonly used macros and functions ***/
640
641 #ifndef min
642 #define min(a, b) ((a) < (b) ? (a) : (b))
643 #endif
644 #ifndef max
645 #define max(a, b) ((a) > (b) ? (a) : (b))
646 #endif
647
648 #define CODING_GET_INFO(coding, attrs, charset_list) \
649 do { \
650 (attrs) = CODING_ID_ATTRS ((coding)->id); \
651 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
652 } while (0)
653
654
655 /* Safely get one byte from the source text pointed by SRC which ends
656 at SRC_END, and set C to that byte. If there are not enough bytes
657 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
658 and a multibyte character is found at SRC, set C to the
659 negative value of the character code. The caller should declare
660 and set these variables appropriately in advance:
661 src, src_end, multibytep */
662
663 #define ONE_MORE_BYTE(c) \
664 do { \
665 if (src == src_end) \
666 { \
667 if (src_base < src) \
668 record_conversion_result \
669 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
670 goto no_more_source; \
671 } \
672 c = *src++; \
673 if (multibytep && (c & 0x80)) \
674 { \
675 if ((c & 0xFE) == 0xC0) \
676 c = ((c & 1) << 6) | *src++; \
677 else \
678 { \
679 src--; \
680 c = - string_char (src, &src, NULL); \
681 record_conversion_result \
682 (coding, CODING_RESULT_INVALID_SRC); \
683 } \
684 } \
685 consumed_chars++; \
686 } while (0)
687
688 /* Safely get two bytes from the source text pointed by SRC which ends
689 at SRC_END, and set C1 and C2 to those bytes while skipping the
690 heading multibyte characters. If there are not enough bytes in the
691 source, it jumps to 'no_more_source'. If MULTIBYTEP and
692 a multibyte character is found for C2, set C2 to the negative value
693 of the character code. The caller should declare and set these
694 variables appropriately in advance:
695 src, src_end, multibytep
696 It is intended that this macro is used in detect_coding_utf_16. */
697
698 #define TWO_MORE_BYTES(c1, c2) \
699 do { \
700 do { \
701 if (src == src_end) \
702 goto no_more_source; \
703 c1 = *src++; \
704 if (multibytep && (c1 & 0x80)) \
705 { \
706 if ((c1 & 0xFE) == 0xC0) \
707 c1 = ((c1 & 1) << 6) | *src++; \
708 else \
709 { \
710 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
711 c1 = -1; \
712 } \
713 } \
714 } while (c1 < 0); \
715 if (src == src_end) \
716 goto no_more_source; \
717 c2 = *src++; \
718 if (multibytep && (c2 & 0x80)) \
719 { \
720 if ((c2 & 0xFE) == 0xC0) \
721 c2 = ((c2 & 1) << 6) | *src++; \
722 else \
723 c2 = -1; \
724 } \
725 } while (0)
726
727
728 /* Store a byte C in the place pointed by DST and increment DST to the
729 next free point, and increment PRODUCED_CHARS. The caller should
730 assure that C is 0..127, and declare and set the variable `dst'
731 appropriately in advance.
732 */
733
734
735 #define EMIT_ONE_ASCII_BYTE(c) \
736 do { \
737 produced_chars++; \
738 *dst++ = (c); \
739 } while (0)
740
741
742 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
743
744 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
745 do { \
746 produced_chars += 2; \
747 *dst++ = (c1), *dst++ = (c2); \
748 } while (0)
749
750
751 /* Store a byte C in the place pointed by DST and increment DST to the
752 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
753 store in an appropriate multibyte form. The caller should
754 declare and set the variables `dst' and `multibytep' appropriately
755 in advance. */
756
757 #define EMIT_ONE_BYTE(c) \
758 do { \
759 produced_chars++; \
760 if (multibytep) \
761 { \
762 unsigned ch = (c); \
763 if (ch >= 0x80) \
764 ch = BYTE8_TO_CHAR (ch); \
765 CHAR_STRING_ADVANCE (ch, dst); \
766 } \
767 else \
768 *dst++ = (c); \
769 } while (0)
770
771
772 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
773
774 #define EMIT_TWO_BYTES(c1, c2) \
775 do { \
776 produced_chars += 2; \
777 if (multibytep) \
778 { \
779 unsigned ch; \
780 \
781 ch = (c1); \
782 if (ch >= 0x80) \
783 ch = BYTE8_TO_CHAR (ch); \
784 CHAR_STRING_ADVANCE (ch, dst); \
785 ch = (c2); \
786 if (ch >= 0x80) \
787 ch = BYTE8_TO_CHAR (ch); \
788 CHAR_STRING_ADVANCE (ch, dst); \
789 } \
790 else \
791 { \
792 *dst++ = (c1); \
793 *dst++ = (c2); \
794 } \
795 } while (0)
796
797
798 #define EMIT_THREE_BYTES(c1, c2, c3) \
799 do { \
800 EMIT_ONE_BYTE (c1); \
801 EMIT_TWO_BYTES (c2, c3); \
802 } while (0)
803
804
805 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
806 do { \
807 EMIT_TWO_BYTES (c1, c2); \
808 EMIT_TWO_BYTES (c3, c4); \
809 } while (0)
810
811
812 static void
813 record_conversion_result (struct coding_system *coding,
814 enum coding_result_code result)
815 {
816 coding->result = result;
817 switch (result)
818 {
819 case CODING_RESULT_INSUFFICIENT_SRC:
820 Vlast_code_conversion_error = Qinsufficient_source;
821 break;
822 case CODING_RESULT_INCONSISTENT_EOL:
823 Vlast_code_conversion_error = Qinconsistent_eol;
824 break;
825 case CODING_RESULT_INVALID_SRC:
826 Vlast_code_conversion_error = Qinvalid_source;
827 break;
828 case CODING_RESULT_INTERRUPT:
829 Vlast_code_conversion_error = Qinterrupted;
830 break;
831 case CODING_RESULT_INSUFFICIENT_MEM:
832 Vlast_code_conversion_error = Qinsufficient_memory;
833 break;
834 case CODING_RESULT_INSUFFICIENT_DST:
835 /* Don't record this error in Vlast_code_conversion_error
836 because it happens just temporarily and is resolved when the
837 whole conversion is finished. */
838 break;
839 case CODING_RESULT_SUCCESS:
840 break;
841 default:
842 Vlast_code_conversion_error = intern ("Unknown error");
843 }
844 }
845
846 /* These wrapper macros are used to preserve validity of pointers into
847 buffer text across calls to decode_char, encode_char, etc, which
848 could cause relocation of buffers if it loads a charset map,
849 because loading a charset map allocates large structures. */
850
851 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
852 do { \
853 ptrdiff_t offset; \
854 \
855 charset_map_loaded = 0; \
856 c = DECODE_CHAR (charset, code); \
857 if (charset_map_loaded \
858 && (offset = coding_change_source (coding))) \
859 { \
860 src += offset; \
861 src_base += offset; \
862 src_end += offset; \
863 } \
864 } while (0)
865
866 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
867 do { \
868 ptrdiff_t offset; \
869 \
870 charset_map_loaded = 0; \
871 code = ENCODE_CHAR (charset, c); \
872 if (charset_map_loaded \
873 && (offset = coding_change_destination (coding))) \
874 { \
875 dst += offset; \
876 dst_end += offset; \
877 } \
878 } while (0)
879
880 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
881 do { \
882 ptrdiff_t offset; \
883 \
884 charset_map_loaded = 0; \
885 charset = char_charset (c, charset_list, code_return); \
886 if (charset_map_loaded \
887 && (offset = coding_change_destination (coding))) \
888 { \
889 dst += offset; \
890 dst_end += offset; \
891 } \
892 } while (0)
893
894 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
895 do { \
896 ptrdiff_t offset; \
897 \
898 charset_map_loaded = 0; \
899 result = CHAR_CHARSET_P (c, charset); \
900 if (charset_map_loaded \
901 && (offset = coding_change_destination (coding))) \
902 { \
903 dst += offset; \
904 dst_end += offset; \
905 } \
906 } while (0)
907
908
909 /* If there are at least BYTES length of room at dst, allocate memory
910 for coding->destination and update dst and dst_end. We don't have
911 to take care of coding->source which will be relocated. It is
912 handled by calling coding_set_source in encode_coding. */
913
914 #define ASSURE_DESTINATION(bytes) \
915 do { \
916 if (dst + (bytes) >= dst_end) \
917 { \
918 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
919 \
920 dst = alloc_destination (coding, more_bytes, dst); \
921 dst_end = coding->destination + coding->dst_bytes; \
922 } \
923 } while (0)
924
925
926 /* Store multibyte form of the character C in P, and advance P to the
927 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
928 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
929 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
930
931 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
932
933 /* Return the character code of character whose multibyte form is at
934 P, and advance P to the end of the multibyte form. This used to be
935 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
936 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
937
938 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
939
940 /* Set coding->source from coding->src_object. */
941
942 static void
943 coding_set_source (struct coding_system *coding)
944 {
945 if (BUFFERP (coding->src_object))
946 {
947 struct buffer *buf = XBUFFER (coding->src_object);
948
949 if (coding->src_pos < 0)
950 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
951 else
952 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
953 }
954 else if (STRINGP (coding->src_object))
955 {
956 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
957 }
958 else
959 {
960 /* Otherwise, the source is C string and is never relocated
961 automatically. Thus we don't have to update anything. */
962 }
963 }
964
965
966 /* Set coding->source from coding->src_object, and return how many
967 bytes coding->source was changed. */
968
969 static ptrdiff_t
970 coding_change_source (struct coding_system *coding)
971 {
972 const unsigned char *orig = coding->source;
973 coding_set_source (coding);
974 return coding->source - orig;
975 }
976
977
978 /* Set coding->destination from coding->dst_object. */
979
980 static void
981 coding_set_destination (struct coding_system *coding)
982 {
983 if (BUFFERP (coding->dst_object))
984 {
985 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
986 {
987 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
988 coding->dst_bytes = (GAP_END_ADDR
989 - (coding->src_bytes - coding->consumed)
990 - coding->destination);
991 }
992 else
993 {
994 /* We are sure that coding->dst_pos_byte is before the gap
995 of the buffer. */
996 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
997 + coding->dst_pos_byte - BEG_BYTE);
998 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
999 - coding->destination);
1000 }
1001 }
1002 else
1003 {
1004 /* Otherwise, the destination is C string and is never relocated
1005 automatically. Thus we don't have to update anything. */
1006 }
1007 }
1008
1009
1010 /* Set coding->destination from coding->dst_object, and return how
1011 many bytes coding->destination was changed. */
1012
1013 static ptrdiff_t
1014 coding_change_destination (struct coding_system *coding)
1015 {
1016 const unsigned char *orig = coding->destination;
1017 coding_set_destination (coding);
1018 return coding->destination - orig;
1019 }
1020
1021
1022 static void
1023 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1024 {
1025 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1026 string_overflow ();
1027 coding->destination = xrealloc (coding->destination,
1028 coding->dst_bytes + bytes);
1029 coding->dst_bytes += bytes;
1030 }
1031
1032 static void
1033 coding_alloc_by_making_gap (struct coding_system *coding,
1034 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1035 {
1036 if (EQ (coding->src_object, coding->dst_object))
1037 {
1038 /* The gap may contain the produced data at the head and not-yet
1039 consumed data at the tail. To preserve those data, we at
1040 first make the gap size to zero, then increase the gap
1041 size. */
1042 ptrdiff_t add = GAP_SIZE;
1043
1044 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1045 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1046 make_gap (bytes);
1047 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1048 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1049 }
1050 else
1051 {
1052 Lisp_Object this_buffer;
1053
1054 this_buffer = Fcurrent_buffer ();
1055 set_buffer_internal (XBUFFER (coding->dst_object));
1056 make_gap (bytes);
1057 set_buffer_internal (XBUFFER (this_buffer));
1058 }
1059 }
1060
1061
1062 static unsigned char *
1063 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1064 unsigned char *dst)
1065 {
1066 ptrdiff_t offset = dst - coding->destination;
1067
1068 if (BUFFERP (coding->dst_object))
1069 {
1070 struct buffer *buf = XBUFFER (coding->dst_object);
1071
1072 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1073 }
1074 else
1075 coding_alloc_by_realloc (coding, nbytes);
1076 coding_set_destination (coding);
1077 dst = coding->destination + offset;
1078 return dst;
1079 }
1080
1081 /** Macros for annotations. */
1082
1083 /* An annotation data is stored in the array coding->charbuf in this
1084 format:
1085 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1086 LENGTH is the number of elements in the annotation.
1087 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1088 NCHARS is the number of characters in the text annotated.
1089
1090 The format of the following elements depend on ANNOTATION_MASK.
1091
1092 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1093 follows:
1094 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1095
1096 NBYTES is the number of bytes specified in the header part of
1097 old-style emacs-mule encoding, or 0 for the other kind of
1098 composition.
1099
1100 METHOD is one of enum composition_method.
1101
1102 Optional COMPOSITION-COMPONENTS are characters and composition
1103 rules.
1104
1105 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1106 follows.
1107
1108 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1109 recover from an invalid annotation, and should be skipped by
1110 produce_annotation. */
1111
1112 /* Maximum length of the header of annotation data. */
1113 #define MAX_ANNOTATION_LENGTH 5
1114
1115 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1116 do { \
1117 *(buf)++ = -(len); \
1118 *(buf)++ = (mask); \
1119 *(buf)++ = (nchars); \
1120 coding->annotated = 1; \
1121 } while (0);
1122
1123 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1124 do { \
1125 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1126 *buf++ = nbytes; \
1127 *buf++ = method; \
1128 } while (0)
1129
1130
1131 #define ADD_CHARSET_DATA(buf, nchars, id) \
1132 do { \
1133 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1134 *buf++ = id; \
1135 } while (0)
1136
1137 \f
1138 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1139
1140
1141
1142 \f
1143 /*** 3. UTF-8 ***/
1144
1145 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1146 Return true if a text is encoded in UTF-8. */
1147
1148 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1149 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1150 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1151 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1152 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1153 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1154
1155 #define UTF_8_BOM_1 0xEF
1156 #define UTF_8_BOM_2 0xBB
1157 #define UTF_8_BOM_3 0xBF
1158
1159 static bool
1160 detect_coding_utf_8 (struct coding_system *coding,
1161 struct coding_detection_info *detect_info)
1162 {
1163 const unsigned char *src = coding->source, *src_base;
1164 const unsigned char *src_end = coding->source + coding->src_bytes;
1165 bool multibytep = coding->src_multibyte;
1166 ptrdiff_t consumed_chars = 0;
1167 bool bom_found = 0;
1168 bool found = 0;
1169
1170 detect_info->checked |= CATEGORY_MASK_UTF_8;
1171 /* A coding system of this category is always ASCII compatible. */
1172 src += coding->head_ascii;
1173
1174 while (1)
1175 {
1176 int c, c1, c2, c3, c4;
1177
1178 src_base = src;
1179 ONE_MORE_BYTE (c);
1180 if (c < 0 || UTF_8_1_OCTET_P (c))
1181 continue;
1182 ONE_MORE_BYTE (c1);
1183 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1184 break;
1185 if (UTF_8_2_OCTET_LEADING_P (c))
1186 {
1187 found = 1;
1188 continue;
1189 }
1190 ONE_MORE_BYTE (c2);
1191 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1192 break;
1193 if (UTF_8_3_OCTET_LEADING_P (c))
1194 {
1195 found = 1;
1196 if (src_base == coding->source
1197 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1198 bom_found = 1;
1199 continue;
1200 }
1201 ONE_MORE_BYTE (c3);
1202 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1203 break;
1204 if (UTF_8_4_OCTET_LEADING_P (c))
1205 {
1206 found = 1;
1207 continue;
1208 }
1209 ONE_MORE_BYTE (c4);
1210 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1211 break;
1212 if (UTF_8_5_OCTET_LEADING_P (c))
1213 {
1214 found = 1;
1215 continue;
1216 }
1217 break;
1218 }
1219 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1220 return 0;
1221
1222 no_more_source:
1223 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1224 {
1225 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1226 return 0;
1227 }
1228 if (bom_found)
1229 {
1230 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1231 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1232 }
1233 else
1234 {
1235 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1236 if (found)
1237 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1238 }
1239 return 1;
1240 }
1241
1242
1243 static void
1244 decode_coding_utf_8 (struct coding_system *coding)
1245 {
1246 const unsigned char *src = coding->source + coding->consumed;
1247 const unsigned char *src_end = coding->source + coding->src_bytes;
1248 const unsigned char *src_base;
1249 int *charbuf = coding->charbuf + coding->charbuf_used;
1250 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1251 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1252 bool multibytep = coding->src_multibyte;
1253 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1254 bool eol_dos
1255 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1256 int byte_after_cr = -1;
1257
1258 if (bom != utf_without_bom)
1259 {
1260 int c1, c2, c3;
1261
1262 src_base = src;
1263 ONE_MORE_BYTE (c1);
1264 if (! UTF_8_3_OCTET_LEADING_P (c1))
1265 src = src_base;
1266 else
1267 {
1268 ONE_MORE_BYTE (c2);
1269 if (! UTF_8_EXTRA_OCTET_P (c2))
1270 src = src_base;
1271 else
1272 {
1273 ONE_MORE_BYTE (c3);
1274 if (! UTF_8_EXTRA_OCTET_P (c3))
1275 src = src_base;
1276 else
1277 {
1278 if ((c1 != UTF_8_BOM_1)
1279 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1280 src = src_base;
1281 else
1282 CODING_UTF_8_BOM (coding) = utf_without_bom;
1283 }
1284 }
1285 }
1286 }
1287 CODING_UTF_8_BOM (coding) = utf_without_bom;
1288
1289 while (1)
1290 {
1291 int c, c1, c2, c3, c4, c5;
1292
1293 src_base = src;
1294 consumed_chars_base = consumed_chars;
1295
1296 if (charbuf >= charbuf_end)
1297 {
1298 if (byte_after_cr >= 0)
1299 src_base--;
1300 break;
1301 }
1302
1303 if (byte_after_cr >= 0)
1304 c1 = byte_after_cr, byte_after_cr = -1;
1305 else
1306 ONE_MORE_BYTE (c1);
1307 if (c1 < 0)
1308 {
1309 c = - c1;
1310 }
1311 else if (UTF_8_1_OCTET_P (c1))
1312 {
1313 if (eol_dos && c1 == '\r')
1314 ONE_MORE_BYTE (byte_after_cr);
1315 c = c1;
1316 }
1317 else
1318 {
1319 ONE_MORE_BYTE (c2);
1320 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1321 goto invalid_code;
1322 if (UTF_8_2_OCTET_LEADING_P (c1))
1323 {
1324 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1325 /* Reject overlong sequences here and below. Encoders
1326 producing them are incorrect, they can be misleading,
1327 and they mess up read/write invariance. */
1328 if (c < 128)
1329 goto invalid_code;
1330 }
1331 else
1332 {
1333 ONE_MORE_BYTE (c3);
1334 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1335 goto invalid_code;
1336 if (UTF_8_3_OCTET_LEADING_P (c1))
1337 {
1338 c = (((c1 & 0xF) << 12)
1339 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1340 if (c < 0x800
1341 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1342 goto invalid_code;
1343 }
1344 else
1345 {
1346 ONE_MORE_BYTE (c4);
1347 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1348 goto invalid_code;
1349 if (UTF_8_4_OCTET_LEADING_P (c1))
1350 {
1351 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1352 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1353 if (c < 0x10000)
1354 goto invalid_code;
1355 }
1356 else
1357 {
1358 ONE_MORE_BYTE (c5);
1359 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1360 goto invalid_code;
1361 if (UTF_8_5_OCTET_LEADING_P (c1))
1362 {
1363 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1364 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1365 | (c5 & 0x3F));
1366 if ((c > MAX_CHAR) || (c < 0x200000))
1367 goto invalid_code;
1368 }
1369 else
1370 goto invalid_code;
1371 }
1372 }
1373 }
1374 }
1375
1376 *charbuf++ = c;
1377 continue;
1378
1379 invalid_code:
1380 src = src_base;
1381 consumed_chars = consumed_chars_base;
1382 ONE_MORE_BYTE (c);
1383 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1384 coding->errors++;
1385 }
1386
1387 no_more_source:
1388 coding->consumed_char += consumed_chars_base;
1389 coding->consumed = src_base - coding->source;
1390 coding->charbuf_used = charbuf - coding->charbuf;
1391 }
1392
1393
1394 static bool
1395 encode_coding_utf_8 (struct coding_system *coding)
1396 {
1397 bool multibytep = coding->dst_multibyte;
1398 int *charbuf = coding->charbuf;
1399 int *charbuf_end = charbuf + coding->charbuf_used;
1400 unsigned char *dst = coding->destination + coding->produced;
1401 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1402 ptrdiff_t produced_chars = 0;
1403 int c;
1404
1405 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1406 {
1407 ASSURE_DESTINATION (3);
1408 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1409 CODING_UTF_8_BOM (coding) = utf_without_bom;
1410 }
1411
1412 if (multibytep)
1413 {
1414 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1415
1416 while (charbuf < charbuf_end)
1417 {
1418 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1419
1420 ASSURE_DESTINATION (safe_room);
1421 c = *charbuf++;
1422 if (CHAR_BYTE8_P (c))
1423 {
1424 c = CHAR_TO_BYTE8 (c);
1425 EMIT_ONE_BYTE (c);
1426 }
1427 else
1428 {
1429 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1430 for (p = str; p < pend; p++)
1431 EMIT_ONE_BYTE (*p);
1432 }
1433 }
1434 }
1435 else
1436 {
1437 int safe_room = MAX_MULTIBYTE_LENGTH;
1438
1439 while (charbuf < charbuf_end)
1440 {
1441 ASSURE_DESTINATION (safe_room);
1442 c = *charbuf++;
1443 if (CHAR_BYTE8_P (c))
1444 *dst++ = CHAR_TO_BYTE8 (c);
1445 else
1446 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1447 produced_chars++;
1448 }
1449 }
1450 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1451 coding->produced_char += produced_chars;
1452 coding->produced = dst - coding->destination;
1453 return 0;
1454 }
1455
1456
1457 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1458 Return true if a text is encoded in one of UTF-16 based coding systems. */
1459
1460 #define UTF_16_HIGH_SURROGATE_P(val) \
1461 (((val) & 0xFC00) == 0xD800)
1462
1463 #define UTF_16_LOW_SURROGATE_P(val) \
1464 (((val) & 0xFC00) == 0xDC00)
1465
1466
1467 static bool
1468 detect_coding_utf_16 (struct coding_system *coding,
1469 struct coding_detection_info *detect_info)
1470 {
1471 const unsigned char *src = coding->source;
1472 const unsigned char *src_end = coding->source + coding->src_bytes;
1473 bool multibytep = coding->src_multibyte;
1474 int c1, c2;
1475
1476 detect_info->checked |= CATEGORY_MASK_UTF_16;
1477 if (coding->mode & CODING_MODE_LAST_BLOCK
1478 && (coding->src_chars & 1))
1479 {
1480 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1481 return 0;
1482 }
1483
1484 TWO_MORE_BYTES (c1, c2);
1485 if ((c1 == 0xFF) && (c2 == 0xFE))
1486 {
1487 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1488 | CATEGORY_MASK_UTF_16_AUTO);
1489 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1490 | CATEGORY_MASK_UTF_16_BE_NOSIG
1491 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1492 }
1493 else if ((c1 == 0xFE) && (c2 == 0xFF))
1494 {
1495 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1496 | CATEGORY_MASK_UTF_16_AUTO);
1497 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1498 | CATEGORY_MASK_UTF_16_BE_NOSIG
1499 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1500 }
1501 else if (c2 < 0)
1502 {
1503 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1504 return 0;
1505 }
1506 else
1507 {
1508 /* We check the dispersion of Eth and Oth bytes where E is even and
1509 O is odd. If both are high, we assume binary data.*/
1510 unsigned char e[256], o[256];
1511 unsigned e_num = 1, o_num = 1;
1512
1513 memset (e, 0, 256);
1514 memset (o, 0, 256);
1515 e[c1] = 1;
1516 o[c2] = 1;
1517
1518 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1519 |CATEGORY_MASK_UTF_16_BE
1520 | CATEGORY_MASK_UTF_16_LE);
1521
1522 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1523 != CATEGORY_MASK_UTF_16)
1524 {
1525 TWO_MORE_BYTES (c1, c2);
1526 if (c2 < 0)
1527 break;
1528 if (! e[c1])
1529 {
1530 e[c1] = 1;
1531 e_num++;
1532 if (e_num >= 128)
1533 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1534 }
1535 if (! o[c2])
1536 {
1537 o[c2] = 1;
1538 o_num++;
1539 if (o_num >= 128)
1540 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1541 }
1542 }
1543 return 0;
1544 }
1545
1546 no_more_source:
1547 return 1;
1548 }
1549
1550 static void
1551 decode_coding_utf_16 (struct coding_system *coding)
1552 {
1553 const unsigned char *src = coding->source + coding->consumed;
1554 const unsigned char *src_end = coding->source + coding->src_bytes;
1555 const unsigned char *src_base;
1556 int *charbuf = coding->charbuf + coding->charbuf_used;
1557 /* We may produces at most 3 chars in one loop. */
1558 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1559 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1560 bool multibytep = coding->src_multibyte;
1561 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1562 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1563 int surrogate = CODING_UTF_16_SURROGATE (coding);
1564 bool eol_dos
1565 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1566 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1567
1568 if (bom == utf_with_bom)
1569 {
1570 int c, c1, c2;
1571
1572 src_base = src;
1573 ONE_MORE_BYTE (c1);
1574 ONE_MORE_BYTE (c2);
1575 c = (c1 << 8) | c2;
1576
1577 if (endian == utf_16_big_endian
1578 ? c != 0xFEFF : c != 0xFFFE)
1579 {
1580 /* The first two bytes are not BOM. Treat them as bytes
1581 for a normal character. */
1582 src = src_base;
1583 coding->errors++;
1584 }
1585 CODING_UTF_16_BOM (coding) = utf_without_bom;
1586 }
1587 else if (bom == utf_detect_bom)
1588 {
1589 /* We have already tried to detect BOM and failed in
1590 detect_coding. */
1591 CODING_UTF_16_BOM (coding) = utf_without_bom;
1592 }
1593
1594 while (1)
1595 {
1596 int c, c1, c2;
1597
1598 src_base = src;
1599 consumed_chars_base = consumed_chars;
1600
1601 if (charbuf >= charbuf_end)
1602 {
1603 if (byte_after_cr1 >= 0)
1604 src_base -= 2;
1605 break;
1606 }
1607
1608 if (byte_after_cr1 >= 0)
1609 c1 = byte_after_cr1, byte_after_cr1 = -1;
1610 else
1611 ONE_MORE_BYTE (c1);
1612 if (c1 < 0)
1613 {
1614 *charbuf++ = -c1;
1615 continue;
1616 }
1617 if (byte_after_cr2 >= 0)
1618 c2 = byte_after_cr2, byte_after_cr2 = -1;
1619 else
1620 ONE_MORE_BYTE (c2);
1621 if (c2 < 0)
1622 {
1623 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1624 *charbuf++ = -c2;
1625 continue;
1626 }
1627 c = (endian == utf_16_big_endian
1628 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1629
1630 if (surrogate)
1631 {
1632 if (! UTF_16_LOW_SURROGATE_P (c))
1633 {
1634 if (endian == utf_16_big_endian)
1635 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1636 else
1637 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1638 *charbuf++ = c1;
1639 *charbuf++ = c2;
1640 coding->errors++;
1641 if (UTF_16_HIGH_SURROGATE_P (c))
1642 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1643 else
1644 *charbuf++ = c;
1645 }
1646 else
1647 {
1648 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1649 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1650 *charbuf++ = 0x10000 + c;
1651 }
1652 }
1653 else
1654 {
1655 if (UTF_16_HIGH_SURROGATE_P (c))
1656 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1657 else
1658 {
1659 if (eol_dos && c == '\r')
1660 {
1661 ONE_MORE_BYTE (byte_after_cr1);
1662 ONE_MORE_BYTE (byte_after_cr2);
1663 }
1664 *charbuf++ = c;
1665 }
1666 }
1667 }
1668
1669 no_more_source:
1670 coding->consumed_char += consumed_chars_base;
1671 coding->consumed = src_base - coding->source;
1672 coding->charbuf_used = charbuf - coding->charbuf;
1673 }
1674
1675 static bool
1676 encode_coding_utf_16 (struct coding_system *coding)
1677 {
1678 bool multibytep = coding->dst_multibyte;
1679 int *charbuf = coding->charbuf;
1680 int *charbuf_end = charbuf + coding->charbuf_used;
1681 unsigned char *dst = coding->destination + coding->produced;
1682 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1683 int safe_room = 8;
1684 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1685 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1686 ptrdiff_t produced_chars = 0;
1687 int c;
1688
1689 if (bom != utf_without_bom)
1690 {
1691 ASSURE_DESTINATION (safe_room);
1692 if (big_endian)
1693 EMIT_TWO_BYTES (0xFE, 0xFF);
1694 else
1695 EMIT_TWO_BYTES (0xFF, 0xFE);
1696 CODING_UTF_16_BOM (coding) = utf_without_bom;
1697 }
1698
1699 while (charbuf < charbuf_end)
1700 {
1701 ASSURE_DESTINATION (safe_room);
1702 c = *charbuf++;
1703 if (c > MAX_UNICODE_CHAR)
1704 c = coding->default_char;
1705
1706 if (c < 0x10000)
1707 {
1708 if (big_endian)
1709 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1710 else
1711 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1712 }
1713 else
1714 {
1715 int c1, c2;
1716
1717 c -= 0x10000;
1718 c1 = (c >> 10) + 0xD800;
1719 c2 = (c & 0x3FF) + 0xDC00;
1720 if (big_endian)
1721 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1722 else
1723 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1724 }
1725 }
1726 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1727 coding->produced = dst - coding->destination;
1728 coding->produced_char += produced_chars;
1729 return 0;
1730 }
1731
1732 \f
1733 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1734
1735 /* Emacs' internal format for representation of multiple character
1736 sets is a kind of multi-byte encoding, i.e. characters are
1737 represented by variable-length sequences of one-byte codes.
1738
1739 ASCII characters and control characters (e.g. `tab', `newline') are
1740 represented by one-byte sequences which are their ASCII codes, in
1741 the range 0x00 through 0x7F.
1742
1743 8-bit characters of the range 0x80..0x9F are represented by
1744 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1745 code + 0x20).
1746
1747 8-bit characters of the range 0xA0..0xFF are represented by
1748 one-byte sequences which are their 8-bit code.
1749
1750 The other characters are represented by a sequence of `base
1751 leading-code', optional `extended leading-code', and one or two
1752 `position-code's. The length of the sequence is determined by the
1753 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1754 whereas extended leading-code and position-code take the range 0xA0
1755 through 0xFF. See `charset.h' for more details about leading-code
1756 and position-code.
1757
1758 --- CODE RANGE of Emacs' internal format ---
1759 character set range
1760 ------------- -----
1761 ascii 0x00..0x7F
1762 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1763 eight-bit-graphic 0xA0..0xBF
1764 ELSE 0x81..0x9D + [0xA0..0xFF]+
1765 ---------------------------------------------
1766
1767 As this is the internal character representation, the format is
1768 usually not used externally (i.e. in a file or in a data sent to a
1769 process). But, it is possible to have a text externally in this
1770 format (i.e. by encoding by the coding system `emacs-mule').
1771
1772 In that case, a sequence of one-byte codes has a slightly different
1773 form.
1774
1775 At first, all characters in eight-bit-control are represented by
1776 one-byte sequences which are their 8-bit code.
1777
1778 Next, character composition data are represented by the byte
1779 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1780 where,
1781 METHOD is 0xF2 plus one of composition method (enum
1782 composition_method),
1783
1784 BYTES is 0xA0 plus a byte length of this composition data,
1785
1786 CHARS is 0xA0 plus a number of characters composed by this
1787 data,
1788
1789 COMPONENTs are characters of multibyte form or composition
1790 rules encoded by two-byte of ASCII codes.
1791
1792 In addition, for backward compatibility, the following formats are
1793 also recognized as composition data on decoding.
1794
1795 0x80 MSEQ ...
1796 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1797
1798 Here,
1799 MSEQ is a multibyte form but in these special format:
1800 ASCII: 0xA0 ASCII_CODE+0x80,
1801 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1802 RULE is a one byte code of the range 0xA0..0xF0 that
1803 represents a composition rule.
1804 */
1805
1806 char emacs_mule_bytes[256];
1807
1808
1809 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1810 Return true if a text is encoded in 'emacs-mule'. */
1811
1812 static bool
1813 detect_coding_emacs_mule (struct coding_system *coding,
1814 struct coding_detection_info *detect_info)
1815 {
1816 const unsigned char *src = coding->source, *src_base;
1817 const unsigned char *src_end = coding->source + coding->src_bytes;
1818 bool multibytep = coding->src_multibyte;
1819 ptrdiff_t consumed_chars = 0;
1820 int c;
1821 int found = 0;
1822
1823 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1824 /* A coding system of this category is always ASCII compatible. */
1825 src += coding->head_ascii;
1826
1827 while (1)
1828 {
1829 src_base = src;
1830 ONE_MORE_BYTE (c);
1831 if (c < 0)
1832 continue;
1833 if (c == 0x80)
1834 {
1835 /* Perhaps the start of composite character. We simply skip
1836 it because analyzing it is too heavy for detecting. But,
1837 at least, we check that the composite character
1838 constitutes of more than 4 bytes. */
1839 const unsigned char *src_start;
1840
1841 repeat:
1842 src_start = src;
1843 do
1844 {
1845 ONE_MORE_BYTE (c);
1846 }
1847 while (c >= 0xA0);
1848
1849 if (src - src_start <= 4)
1850 break;
1851 found = CATEGORY_MASK_EMACS_MULE;
1852 if (c == 0x80)
1853 goto repeat;
1854 }
1855
1856 if (c < 0x80)
1857 {
1858 if (c < 0x20
1859 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1860 break;
1861 }
1862 else
1863 {
1864 int more_bytes = emacs_mule_bytes[c] - 1;
1865
1866 while (more_bytes > 0)
1867 {
1868 ONE_MORE_BYTE (c);
1869 if (c < 0xA0)
1870 {
1871 src--; /* Unread the last byte. */
1872 break;
1873 }
1874 more_bytes--;
1875 }
1876 if (more_bytes != 0)
1877 break;
1878 found = CATEGORY_MASK_EMACS_MULE;
1879 }
1880 }
1881 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1882 return 0;
1883
1884 no_more_source:
1885 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1886 {
1887 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1888 return 0;
1889 }
1890 detect_info->found |= found;
1891 return 1;
1892 }
1893
1894
1895 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1896 character. If CMP_STATUS indicates that we must expect MSEQ or
1897 RULE described above, decode it and return the negative value of
1898 the decoded character or rule. If an invalid byte is found, return
1899 -1. If SRC is too short, return -2. */
1900
1901 static int
1902 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1903 int *nbytes, int *nchars, int *id,
1904 struct composition_status *cmp_status)
1905 {
1906 const unsigned char *src_end = coding->source + coding->src_bytes;
1907 const unsigned char *src_base = src;
1908 bool multibytep = coding->src_multibyte;
1909 int charset_ID;
1910 unsigned code;
1911 int c;
1912 int consumed_chars = 0;
1913 bool mseq_found = 0;
1914
1915 ONE_MORE_BYTE (c);
1916 if (c < 0)
1917 {
1918 c = -c;
1919 charset_ID = emacs_mule_charset[0];
1920 }
1921 else
1922 {
1923 if (c >= 0xA0)
1924 {
1925 if (cmp_status->state != COMPOSING_NO
1926 && cmp_status->old_form)
1927 {
1928 if (cmp_status->state == COMPOSING_CHAR)
1929 {
1930 if (c == 0xA0)
1931 {
1932 ONE_MORE_BYTE (c);
1933 c -= 0x80;
1934 if (c < 0)
1935 goto invalid_code;
1936 }
1937 else
1938 c -= 0x20;
1939 mseq_found = 1;
1940 }
1941 else
1942 {
1943 *nbytes = src - src_base;
1944 *nchars = consumed_chars;
1945 return -c;
1946 }
1947 }
1948 else
1949 goto invalid_code;
1950 }
1951
1952 switch (emacs_mule_bytes[c])
1953 {
1954 case 2:
1955 if ((charset_ID = emacs_mule_charset[c]) < 0)
1956 goto invalid_code;
1957 ONE_MORE_BYTE (c);
1958 if (c < 0xA0)
1959 goto invalid_code;
1960 code = c & 0x7F;
1961 break;
1962
1963 case 3:
1964 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1965 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1966 {
1967 ONE_MORE_BYTE (c);
1968 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1969 goto invalid_code;
1970 ONE_MORE_BYTE (c);
1971 if (c < 0xA0)
1972 goto invalid_code;
1973 code = c & 0x7F;
1974 }
1975 else
1976 {
1977 if ((charset_ID = emacs_mule_charset[c]) < 0)
1978 goto invalid_code;
1979 ONE_MORE_BYTE (c);
1980 if (c < 0xA0)
1981 goto invalid_code;
1982 code = (c & 0x7F) << 8;
1983 ONE_MORE_BYTE (c);
1984 if (c < 0xA0)
1985 goto invalid_code;
1986 code |= c & 0x7F;
1987 }
1988 break;
1989
1990 case 4:
1991 ONE_MORE_BYTE (c);
1992 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1993 goto invalid_code;
1994 ONE_MORE_BYTE (c);
1995 if (c < 0xA0)
1996 goto invalid_code;
1997 code = (c & 0x7F) << 8;
1998 ONE_MORE_BYTE (c);
1999 if (c < 0xA0)
2000 goto invalid_code;
2001 code |= c & 0x7F;
2002 break;
2003
2004 case 1:
2005 code = c;
2006 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2007 break;
2008
2009 default:
2010 emacs_abort ();
2011 }
2012 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2013 CHARSET_FROM_ID (charset_ID), code, c);
2014 if (c < 0)
2015 goto invalid_code;
2016 }
2017 *nbytes = src - src_base;
2018 *nchars = consumed_chars;
2019 if (id)
2020 *id = charset_ID;
2021 return (mseq_found ? -c : c);
2022
2023 no_more_source:
2024 return -2;
2025
2026 invalid_code:
2027 return -1;
2028 }
2029
2030
2031 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2032
2033 /* Handle these composition sequence ('|': the end of header elements,
2034 BYTES and CHARS >= 0xA0):
2035
2036 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2037 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2038 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2039
2040 and these old form:
2041
2042 (4) relative composition: 0x80 | MSEQ ... MSEQ
2043 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2044
2045 When the starter 0x80 and the following header elements are found,
2046 this annotation header is produced.
2047
2048 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2049
2050 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2051 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2052
2053 Then, upon reading the following elements, these codes are produced
2054 until the composition end is found:
2055
2056 (1) CHAR ... CHAR
2057 (2) ALT ... ALT CHAR ... CHAR
2058 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2059 (4) CHAR ... CHAR
2060 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2061
2062 When the composition end is found, LENGTH and NCHARS in the
2063 annotation header is updated as below:
2064
2065 (1) LENGTH: unchanged, NCHARS: unchanged
2066 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2067 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2068 (4) LENGTH: unchanged, NCHARS: number of CHARs
2069 (5) LENGTH: unchanged, NCHARS: number of CHARs
2070
2071 If an error is found while composing, the annotation header is
2072 changed to the original composition header (plus filler -1s) as
2073 below:
2074
2075 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2076 (5) [ 0x80 0xFF -1 -1- -1 ]
2077
2078 and the sequence [ -2 DECODED-RULE ] is changed to the original
2079 byte sequence as below:
2080 o the original byte sequence is B: [ B -1 ]
2081 o the original byte sequence is B1 B2: [ B1 B2 ]
2082
2083 Most of the routines are implemented by macros because many
2084 variables and labels in the caller decode_coding_emacs_mule must be
2085 accessible, and they are usually called just once (thus doesn't
2086 increase the size of compiled object). */
2087
2088 /* Decode a composition rule represented by C as a component of
2089 composition sequence of Emacs 20 style. Set RULE to the decoded
2090 rule. */
2091
2092 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2093 do { \
2094 int gref, nref; \
2095 \
2096 c -= 0xA0; \
2097 if (c < 0 || c >= 81) \
2098 goto invalid_code; \
2099 gref = c / 9, nref = c % 9; \
2100 if (gref == 4) gref = 10; \
2101 if (nref == 4) nref = 10; \
2102 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2103 } while (0)
2104
2105
2106 /* Decode a composition rule represented by C and the following byte
2107 at SRC as a component of composition sequence of Emacs 21 style.
2108 Set RULE to the decoded rule. */
2109
2110 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2111 do { \
2112 int gref, nref; \
2113 \
2114 gref = c - 0x20; \
2115 if (gref < 0 || gref >= 81) \
2116 goto invalid_code; \
2117 ONE_MORE_BYTE (c); \
2118 nref = c - 0x20; \
2119 if (nref < 0 || nref >= 81) \
2120 goto invalid_code; \
2121 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2122 } while (0)
2123
2124
2125 /* Start of Emacs 21 style format. The first three bytes at SRC are
2126 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2127 byte length of this composition information, CHARS is the number of
2128 characters composed by this composition. */
2129
2130 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2131 do { \
2132 enum composition_method method = c - 0xF2; \
2133 int nbytes, nchars; \
2134 \
2135 ONE_MORE_BYTE (c); \
2136 if (c < 0) \
2137 goto invalid_code; \
2138 nbytes = c - 0xA0; \
2139 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2140 goto invalid_code; \
2141 ONE_MORE_BYTE (c); \
2142 nchars = c - 0xA0; \
2143 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2144 goto invalid_code; \
2145 cmp_status->old_form = 0; \
2146 cmp_status->method = method; \
2147 if (method == COMPOSITION_RELATIVE) \
2148 cmp_status->state = COMPOSING_CHAR; \
2149 else \
2150 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2151 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2152 cmp_status->nchars = nchars; \
2153 cmp_status->ncomps = nbytes - 4; \
2154 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2155 } while (0)
2156
2157
2158 /* Start of Emacs 20 style format for relative composition. */
2159
2160 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2161 do { \
2162 cmp_status->old_form = 1; \
2163 cmp_status->method = COMPOSITION_RELATIVE; \
2164 cmp_status->state = COMPOSING_CHAR; \
2165 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2166 cmp_status->nchars = cmp_status->ncomps = 0; \
2167 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2168 } while (0)
2169
2170
2171 /* Start of Emacs 20 style format for rule-base composition. */
2172
2173 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2174 do { \
2175 cmp_status->old_form = 1; \
2176 cmp_status->method = COMPOSITION_WITH_RULE; \
2177 cmp_status->state = COMPOSING_CHAR; \
2178 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2179 cmp_status->nchars = cmp_status->ncomps = 0; \
2180 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2181 } while (0)
2182
2183
2184 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2185 do { \
2186 const unsigned char *current_src = src; \
2187 \
2188 ONE_MORE_BYTE (c); \
2189 if (c < 0) \
2190 goto invalid_code; \
2191 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2192 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2193 DECODE_EMACS_MULE_21_COMPOSITION (); \
2194 else if (c < 0xA0) \
2195 goto invalid_code; \
2196 else if (c < 0xC0) \
2197 { \
2198 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2199 /* Re-read C as a composition component. */ \
2200 src = current_src; \
2201 } \
2202 else if (c == 0xFF) \
2203 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2204 else \
2205 goto invalid_code; \
2206 } while (0)
2207
2208 #define EMACS_MULE_COMPOSITION_END() \
2209 do { \
2210 int idx = - cmp_status->length; \
2211 \
2212 if (cmp_status->old_form) \
2213 charbuf[idx + 2] = cmp_status->nchars; \
2214 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2215 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2216 cmp_status->state = COMPOSING_NO; \
2217 } while (0)
2218
2219
2220 static int
2221 emacs_mule_finish_composition (int *charbuf,
2222 struct composition_status *cmp_status)
2223 {
2224 int idx = - cmp_status->length;
2225 int new_chars;
2226
2227 if (cmp_status->old_form && cmp_status->nchars > 0)
2228 {
2229 charbuf[idx + 2] = cmp_status->nchars;
2230 new_chars = 0;
2231 if (cmp_status->method == COMPOSITION_WITH_RULE
2232 && cmp_status->state == COMPOSING_CHAR)
2233 {
2234 /* The last rule was invalid. */
2235 int rule = charbuf[-1] + 0xA0;
2236
2237 charbuf[-2] = BYTE8_TO_CHAR (rule);
2238 charbuf[-1] = -1;
2239 new_chars = 1;
2240 }
2241 }
2242 else
2243 {
2244 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2245
2246 if (cmp_status->method == COMPOSITION_WITH_RULE)
2247 {
2248 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2249 charbuf[idx++] = -3;
2250 charbuf[idx++] = 0;
2251 new_chars = 1;
2252 }
2253 else
2254 {
2255 int nchars = charbuf[idx + 1] + 0xA0;
2256 int nbytes = charbuf[idx + 2] + 0xA0;
2257
2258 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2259 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2260 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2261 charbuf[idx++] = -1;
2262 new_chars = 4;
2263 }
2264 }
2265 cmp_status->state = COMPOSING_NO;
2266 return new_chars;
2267 }
2268
2269 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2270 do { \
2271 if (cmp_status->state != COMPOSING_NO) \
2272 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2273 } while (0)
2274
2275
2276 static void
2277 decode_coding_emacs_mule (struct coding_system *coding)
2278 {
2279 const unsigned char *src = coding->source + coding->consumed;
2280 const unsigned char *src_end = coding->source + coding->src_bytes;
2281 const unsigned char *src_base;
2282 int *charbuf = coding->charbuf + coding->charbuf_used;
2283 /* We may produce two annotations (charset and composition) in one
2284 loop and one more charset annotation at the end. */
2285 int *charbuf_end
2286 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2287 /* We can produce up to 2 characters in a loop. */
2288 - 1;
2289 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2290 bool multibytep = coding->src_multibyte;
2291 ptrdiff_t char_offset = coding->produced_char;
2292 ptrdiff_t last_offset = char_offset;
2293 int last_id = charset_ascii;
2294 bool eol_dos
2295 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2296 int byte_after_cr = -1;
2297 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2298
2299 if (cmp_status->state != COMPOSING_NO)
2300 {
2301 int i;
2302
2303 if (charbuf_end - charbuf < cmp_status->length)
2304 emacs_abort ();
2305 for (i = 0; i < cmp_status->length; i++)
2306 *charbuf++ = cmp_status->carryover[i];
2307 coding->annotated = 1;
2308 }
2309
2310 while (1)
2311 {
2312 int c, id IF_LINT (= 0);
2313
2314 src_base = src;
2315 consumed_chars_base = consumed_chars;
2316
2317 if (charbuf >= charbuf_end)
2318 {
2319 if (byte_after_cr >= 0)
2320 src_base--;
2321 break;
2322 }
2323
2324 if (byte_after_cr >= 0)
2325 c = byte_after_cr, byte_after_cr = -1;
2326 else
2327 ONE_MORE_BYTE (c);
2328
2329 if (c < 0 || c == 0x80)
2330 {
2331 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2332 if (c < 0)
2333 {
2334 *charbuf++ = -c;
2335 char_offset++;
2336 }
2337 else
2338 DECODE_EMACS_MULE_COMPOSITION_START ();
2339 continue;
2340 }
2341
2342 if (c < 0x80)
2343 {
2344 if (eol_dos && c == '\r')
2345 ONE_MORE_BYTE (byte_after_cr);
2346 id = charset_ascii;
2347 if (cmp_status->state != COMPOSING_NO)
2348 {
2349 if (cmp_status->old_form)
2350 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2351 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2352 cmp_status->ncomps--;
2353 }
2354 }
2355 else
2356 {
2357 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2358 /* emacs_mule_char can load a charset map from a file, which
2359 allocates a large structure and might cause buffer text
2360 to be relocated as result. Thus, we need to remember the
2361 original pointer to buffer text, and fix up all related
2362 pointers after the call. */
2363 const unsigned char *orig = coding->source;
2364 ptrdiff_t offset;
2365
2366 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2367 cmp_status);
2368 offset = coding->source - orig;
2369 if (offset)
2370 {
2371 src += offset;
2372 src_base += offset;
2373 src_end += offset;
2374 }
2375 if (c < 0)
2376 {
2377 if (c == -1)
2378 goto invalid_code;
2379 if (c == -2)
2380 break;
2381 }
2382 src = src_base + nbytes;
2383 consumed_chars = consumed_chars_base + nchars;
2384 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2385 cmp_status->ncomps -= nchars;
2386 }
2387
2388 /* Now if C >= 0, we found a normally encoded character, if C <
2389 0, we found an old-style composition component character or
2390 rule. */
2391
2392 if (cmp_status->state == COMPOSING_NO)
2393 {
2394 if (last_id != id)
2395 {
2396 if (last_id != charset_ascii)
2397 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2398 last_id);
2399 last_id = id;
2400 last_offset = char_offset;
2401 }
2402 *charbuf++ = c;
2403 char_offset++;
2404 }
2405 else if (cmp_status->state == COMPOSING_CHAR)
2406 {
2407 if (cmp_status->old_form)
2408 {
2409 if (c >= 0)
2410 {
2411 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2412 *charbuf++ = c;
2413 char_offset++;
2414 }
2415 else
2416 {
2417 *charbuf++ = -c;
2418 cmp_status->nchars++;
2419 cmp_status->length++;
2420 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2421 EMACS_MULE_COMPOSITION_END ();
2422 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2423 cmp_status->state = COMPOSING_RULE;
2424 }
2425 }
2426 else
2427 {
2428 *charbuf++ = c;
2429 cmp_status->length++;
2430 cmp_status->nchars--;
2431 if (cmp_status->nchars == 0)
2432 EMACS_MULE_COMPOSITION_END ();
2433 }
2434 }
2435 else if (cmp_status->state == COMPOSING_RULE)
2436 {
2437 int rule;
2438
2439 if (c >= 0)
2440 {
2441 EMACS_MULE_COMPOSITION_END ();
2442 *charbuf++ = c;
2443 char_offset++;
2444 }
2445 else
2446 {
2447 c = -c;
2448 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2449 if (rule < 0)
2450 goto invalid_code;
2451 *charbuf++ = -2;
2452 *charbuf++ = rule;
2453 cmp_status->length += 2;
2454 cmp_status->state = COMPOSING_CHAR;
2455 }
2456 }
2457 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2458 {
2459 *charbuf++ = c;
2460 cmp_status->length++;
2461 if (cmp_status->ncomps == 0)
2462 cmp_status->state = COMPOSING_CHAR;
2463 else if (cmp_status->ncomps > 0)
2464 {
2465 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2466 cmp_status->state = COMPOSING_COMPONENT_RULE;
2467 }
2468 else
2469 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2470 }
2471 else /* COMPOSING_COMPONENT_RULE */
2472 {
2473 int rule;
2474
2475 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2476 if (rule < 0)
2477 goto invalid_code;
2478 *charbuf++ = -2;
2479 *charbuf++ = rule;
2480 cmp_status->length += 2;
2481 cmp_status->ncomps--;
2482 if (cmp_status->ncomps > 0)
2483 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2484 else
2485 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2486 }
2487 continue;
2488
2489 invalid_code:
2490 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2491 src = src_base;
2492 consumed_chars = consumed_chars_base;
2493 ONE_MORE_BYTE (c);
2494 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2495 char_offset++;
2496 coding->errors++;
2497 }
2498
2499 no_more_source:
2500 if (cmp_status->state != COMPOSING_NO)
2501 {
2502 if (coding->mode & CODING_MODE_LAST_BLOCK)
2503 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2504 else
2505 {
2506 int i;
2507
2508 charbuf -= cmp_status->length;
2509 for (i = 0; i < cmp_status->length; i++)
2510 cmp_status->carryover[i] = charbuf[i];
2511 }
2512 }
2513 if (last_id != charset_ascii)
2514 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2515 coding->consumed_char += consumed_chars_base;
2516 coding->consumed = src_base - coding->source;
2517 coding->charbuf_used = charbuf - coding->charbuf;
2518 }
2519
2520
2521 #define EMACS_MULE_LEADING_CODES(id, codes) \
2522 do { \
2523 if (id < 0xA0) \
2524 codes[0] = id, codes[1] = 0; \
2525 else if (id < 0xE0) \
2526 codes[0] = 0x9A, codes[1] = id; \
2527 else if (id < 0xF0) \
2528 codes[0] = 0x9B, codes[1] = id; \
2529 else if (id < 0xF5) \
2530 codes[0] = 0x9C, codes[1] = id; \
2531 else \
2532 codes[0] = 0x9D, codes[1] = id; \
2533 } while (0);
2534
2535
2536 static bool
2537 encode_coding_emacs_mule (struct coding_system *coding)
2538 {
2539 bool multibytep = coding->dst_multibyte;
2540 int *charbuf = coding->charbuf;
2541 int *charbuf_end = charbuf + coding->charbuf_used;
2542 unsigned char *dst = coding->destination + coding->produced;
2543 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2544 int safe_room = 8;
2545 ptrdiff_t produced_chars = 0;
2546 Lisp_Object attrs, charset_list;
2547 int c;
2548 int preferred_charset_id = -1;
2549
2550 CODING_GET_INFO (coding, attrs, charset_list);
2551 if (! EQ (charset_list, Vemacs_mule_charset_list))
2552 {
2553 charset_list = Vemacs_mule_charset_list;
2554 ASET (attrs, coding_attr_charset_list, charset_list);
2555 }
2556
2557 while (charbuf < charbuf_end)
2558 {
2559 ASSURE_DESTINATION (safe_room);
2560 c = *charbuf++;
2561
2562 if (c < 0)
2563 {
2564 /* Handle an annotation. */
2565 switch (*charbuf)
2566 {
2567 case CODING_ANNOTATE_COMPOSITION_MASK:
2568 /* Not yet implemented. */
2569 break;
2570 case CODING_ANNOTATE_CHARSET_MASK:
2571 preferred_charset_id = charbuf[3];
2572 if (preferred_charset_id >= 0
2573 && NILP (Fmemq (make_number (preferred_charset_id),
2574 charset_list)))
2575 preferred_charset_id = -1;
2576 break;
2577 default:
2578 emacs_abort ();
2579 }
2580 charbuf += -c - 1;
2581 continue;
2582 }
2583
2584 if (ASCII_CHAR_P (c))
2585 EMIT_ONE_ASCII_BYTE (c);
2586 else if (CHAR_BYTE8_P (c))
2587 {
2588 c = CHAR_TO_BYTE8 (c);
2589 EMIT_ONE_BYTE (c);
2590 }
2591 else
2592 {
2593 struct charset *charset;
2594 unsigned code;
2595 int dimension;
2596 int emacs_mule_id;
2597 unsigned char leading_codes[2];
2598
2599 if (preferred_charset_id >= 0)
2600 {
2601 bool result;
2602
2603 charset = CHARSET_FROM_ID (preferred_charset_id);
2604 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2605 if (result)
2606 code = ENCODE_CHAR (charset, c);
2607 else
2608 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2609 &code, charset);
2610 }
2611 else
2612 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2613 &code, charset);
2614 if (! charset)
2615 {
2616 c = coding->default_char;
2617 if (ASCII_CHAR_P (c))
2618 {
2619 EMIT_ONE_ASCII_BYTE (c);
2620 continue;
2621 }
2622 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2623 &code, charset);
2624 }
2625 dimension = CHARSET_DIMENSION (charset);
2626 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2627 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2628 EMIT_ONE_BYTE (leading_codes[0]);
2629 if (leading_codes[1])
2630 EMIT_ONE_BYTE (leading_codes[1]);
2631 if (dimension == 1)
2632 EMIT_ONE_BYTE (code | 0x80);
2633 else
2634 {
2635 code |= 0x8080;
2636 EMIT_ONE_BYTE (code >> 8);
2637 EMIT_ONE_BYTE (code & 0xFF);
2638 }
2639 }
2640 }
2641 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2642 coding->produced_char += produced_chars;
2643 coding->produced = dst - coding->destination;
2644 return 0;
2645 }
2646
2647 \f
2648 /*** 7. ISO2022 handlers ***/
2649
2650 /* The following note describes the coding system ISO2022 briefly.
2651 Since the intention of this note is to help understand the
2652 functions in this file, some parts are NOT ACCURATE or are OVERLY
2653 SIMPLIFIED. For thorough understanding, please refer to the
2654 original document of ISO2022. This is equivalent to the standard
2655 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2656
2657 ISO2022 provides many mechanisms to encode several character sets
2658 in 7-bit and 8-bit environments. For 7-bit environments, all text
2659 is encoded using bytes less than 128. This may make the encoded
2660 text a little bit longer, but the text passes more easily through
2661 several types of gateway, some of which strip off the MSB (Most
2662 Significant Bit).
2663
2664 There are two kinds of character sets: control character sets and
2665 graphic character sets. The former contain control characters such
2666 as `newline' and `escape' to provide control functions (control
2667 functions are also provided by escape sequences). The latter
2668 contain graphic characters such as 'A' and '-'. Emacs recognizes
2669 two control character sets and many graphic character sets.
2670
2671 Graphic character sets are classified into one of the following
2672 four classes, according to the number of bytes (DIMENSION) and
2673 number of characters in one dimension (CHARS) of the set:
2674 - DIMENSION1_CHARS94
2675 - DIMENSION1_CHARS96
2676 - DIMENSION2_CHARS94
2677 - DIMENSION2_CHARS96
2678
2679 In addition, each character set is assigned an identification tag,
2680 unique for each set, called the "final character" (denoted as <F>
2681 hereafter). The <F> of each character set is decided by ECMA(*)
2682 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2683 (0x30..0x3F are for private use only).
2684
2685 Note (*): ECMA = European Computer Manufacturers Association
2686
2687 Here are examples of graphic character sets [NAME(<F>)]:
2688 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2689 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2690 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2691 o DIMENSION2_CHARS96 -- none for the moment
2692
2693 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2694 C0 [0x00..0x1F] -- control character plane 0
2695 GL [0x20..0x7F] -- graphic character plane 0
2696 C1 [0x80..0x9F] -- control character plane 1
2697 GR [0xA0..0xFF] -- graphic character plane 1
2698
2699 A control character set is directly designated and invoked to C0 or
2700 C1 by an escape sequence. The most common case is that:
2701 - ISO646's control character set is designated/invoked to C0, and
2702 - ISO6429's control character set is designated/invoked to C1,
2703 and usually these designations/invocations are omitted in encoded
2704 text. In a 7-bit environment, only C0 can be used, and a control
2705 character for C1 is encoded by an appropriate escape sequence to
2706 fit into the environment. All control characters for C1 are
2707 defined to have corresponding escape sequences.
2708
2709 A graphic character set is at first designated to one of four
2710 graphic registers (G0 through G3), then these graphic registers are
2711 invoked to GL or GR. These designations and invocations can be
2712 done independently. The most common case is that G0 is invoked to
2713 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2714 these invocations and designations are omitted in encoded text.
2715 In a 7-bit environment, only GL can be used.
2716
2717 When a graphic character set of CHARS94 is invoked to GL, codes
2718 0x20 and 0x7F of the GL area work as control characters SPACE and
2719 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2720 be used.
2721
2722 There are two ways of invocation: locking-shift and single-shift.
2723 With locking-shift, the invocation lasts until the next different
2724 invocation, whereas with single-shift, the invocation affects the
2725 following character only and doesn't affect the locking-shift
2726 state. Invocations are done by the following control characters or
2727 escape sequences:
2728
2729 ----------------------------------------------------------------------
2730 abbrev function cntrl escape seq description
2731 ----------------------------------------------------------------------
2732 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2733 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2734 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2735 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2736 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2737 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2738 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2739 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2740 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2741 ----------------------------------------------------------------------
2742 (*) These are not used by any known coding system.
2743
2744 Control characters for these functions are defined by macros
2745 ISO_CODE_XXX in `coding.h'.
2746
2747 Designations are done by the following escape sequences:
2748 ----------------------------------------------------------------------
2749 escape sequence description
2750 ----------------------------------------------------------------------
2751 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2752 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2753 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2754 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2755 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2756 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2757 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2758 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2759 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2760 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2761 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2762 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2763 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2764 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2765 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2766 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2767 ----------------------------------------------------------------------
2768
2769 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2770 of dimension 1, chars 94, and final character <F>, etc...
2771
2772 Note (*): Although these designations are not allowed in ISO2022,
2773 Emacs accepts them on decoding, and produces them on encoding
2774 CHARS96 character sets in a coding system which is characterized as
2775 7-bit environment, non-locking-shift, and non-single-shift.
2776
2777 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2778 '(' must be omitted. We refer to this as "short-form" hereafter.
2779
2780 Now you may notice that there are a lot of ways of encoding the
2781 same multilingual text in ISO2022. Actually, there exist many
2782 coding systems such as Compound Text (used in X11's inter client
2783 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2784 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2785 localized platforms), and all of these are variants of ISO2022.
2786
2787 In addition to the above, Emacs handles two more kinds of escape
2788 sequences: ISO6429's direction specification and Emacs' private
2789 sequence for specifying character composition.
2790
2791 ISO6429's direction specification takes the following form:
2792 o CSI ']' -- end of the current direction
2793 o CSI '0' ']' -- end of the current direction
2794 o CSI '1' ']' -- start of left-to-right text
2795 o CSI '2' ']' -- start of right-to-left text
2796 The control character CSI (0x9B: control sequence introducer) is
2797 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2798
2799 Character composition specification takes the following form:
2800 o ESC '0' -- start relative composition
2801 o ESC '1' -- end composition
2802 o ESC '2' -- start rule-base composition (*)
2803 o ESC '3' -- start relative composition with alternate chars (**)
2804 o ESC '4' -- start rule-base composition with alternate chars (**)
2805 Since these are not standard escape sequences of any ISO standard,
2806 the use of them with these meanings is restricted to Emacs only.
2807
2808 (*) This form is used only in Emacs 20.7 and older versions,
2809 but newer versions can safely decode it.
2810 (**) This form is used only in Emacs 21.1 and newer versions,
2811 and older versions can't decode it.
2812
2813 Here's a list of example usages of these composition escape
2814 sequences (categorized by `enum composition_method').
2815
2816 COMPOSITION_RELATIVE:
2817 ESC 0 CHAR [ CHAR ] ESC 1
2818 COMPOSITION_WITH_RULE:
2819 ESC 2 CHAR [ RULE CHAR ] ESC 1
2820 COMPOSITION_WITH_ALTCHARS:
2821 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2822 COMPOSITION_WITH_RULE_ALTCHARS:
2823 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2824
2825 static enum iso_code_class_type iso_code_class[256];
2826
2827 #define SAFE_CHARSET_P(coding, id) \
2828 ((id) <= (coding)->max_charset_id \
2829 && (coding)->safe_charsets[id] != 255)
2830
2831 static void
2832 setup_iso_safe_charsets (Lisp_Object attrs)
2833 {
2834 Lisp_Object charset_list, safe_charsets;
2835 Lisp_Object request;
2836 Lisp_Object reg_usage;
2837 Lisp_Object tail;
2838 EMACS_INT reg94, reg96;
2839 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2840 int max_charset_id;
2841
2842 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2843 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2844 && ! EQ (charset_list, Viso_2022_charset_list))
2845 {
2846 charset_list = Viso_2022_charset_list;
2847 ASET (attrs, coding_attr_charset_list, charset_list);
2848 ASET (attrs, coding_attr_safe_charsets, Qnil);
2849 }
2850
2851 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2852 return;
2853
2854 max_charset_id = 0;
2855 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2856 {
2857 int id = XINT (XCAR (tail));
2858 if (max_charset_id < id)
2859 max_charset_id = id;
2860 }
2861
2862 safe_charsets = make_uninit_string (max_charset_id + 1);
2863 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2864 request = AREF (attrs, coding_attr_iso_request);
2865 reg_usage = AREF (attrs, coding_attr_iso_usage);
2866 reg94 = XINT (XCAR (reg_usage));
2867 reg96 = XINT (XCDR (reg_usage));
2868
2869 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2870 {
2871 Lisp_Object id;
2872 Lisp_Object reg;
2873 struct charset *charset;
2874
2875 id = XCAR (tail);
2876 charset = CHARSET_FROM_ID (XINT (id));
2877 reg = Fcdr (Fassq (id, request));
2878 if (! NILP (reg))
2879 SSET (safe_charsets, XINT (id), XINT (reg));
2880 else if (charset->iso_chars_96)
2881 {
2882 if (reg96 < 4)
2883 SSET (safe_charsets, XINT (id), reg96);
2884 }
2885 else
2886 {
2887 if (reg94 < 4)
2888 SSET (safe_charsets, XINT (id), reg94);
2889 }
2890 }
2891 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2892 }
2893
2894
2895 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2896 Return true if a text is encoded in one of ISO-2022 based coding
2897 systems. */
2898
2899 static bool
2900 detect_coding_iso_2022 (struct coding_system *coding,
2901 struct coding_detection_info *detect_info)
2902 {
2903 const unsigned char *src = coding->source, *src_base = src;
2904 const unsigned char *src_end = coding->source + coding->src_bytes;
2905 bool multibytep = coding->src_multibyte;
2906 bool single_shifting = 0;
2907 int id;
2908 int c, c1;
2909 ptrdiff_t consumed_chars = 0;
2910 int i;
2911 int rejected = 0;
2912 int found = 0;
2913 int composition_count = -1;
2914
2915 detect_info->checked |= CATEGORY_MASK_ISO;
2916
2917 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2918 {
2919 struct coding_system *this = &(coding_categories[i]);
2920 Lisp_Object attrs, val;
2921
2922 if (this->id < 0)
2923 continue;
2924 attrs = CODING_ID_ATTRS (this->id);
2925 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2926 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2927 setup_iso_safe_charsets (attrs);
2928 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2929 this->max_charset_id = SCHARS (val) - 1;
2930 this->safe_charsets = SDATA (val);
2931 }
2932
2933 /* A coding system of this category is always ASCII compatible. */
2934 src += coding->head_ascii;
2935
2936 while (rejected != CATEGORY_MASK_ISO)
2937 {
2938 src_base = src;
2939 ONE_MORE_BYTE (c);
2940 switch (c)
2941 {
2942 case ISO_CODE_ESC:
2943 if (inhibit_iso_escape_detection)
2944 break;
2945 single_shifting = 0;
2946 ONE_MORE_BYTE (c);
2947 if (c == 'N' || c == 'O')
2948 {
2949 /* ESC <Fe> for SS2 or SS3. */
2950 single_shifting = 1;
2951 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2952 }
2953 else if (c == '1')
2954 {
2955 /* End of composition. */
2956 if (composition_count < 0
2957 || composition_count > MAX_COMPOSITION_COMPONENTS)
2958 /* Invalid */
2959 break;
2960 composition_count = -1;
2961 found |= CATEGORY_MASK_ISO;
2962 }
2963 else if (c >= '0' && c <= '4')
2964 {
2965 /* ESC <Fp> for start/end composition. */
2966 composition_count = 0;
2967 }
2968 else
2969 {
2970 if (c >= '(' && c <= '/')
2971 {
2972 /* Designation sequence for a charset of dimension 1. */
2973 ONE_MORE_BYTE (c1);
2974 if (c1 < ' ' || c1 >= 0x80
2975 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2976 /* Invalid designation sequence. Just ignore. */
2977 break;
2978 }
2979 else if (c == '$')
2980 {
2981 /* Designation sequence for a charset of dimension 2. */
2982 ONE_MORE_BYTE (c);
2983 if (c >= '@' && c <= 'B')
2984 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2985 id = iso_charset_table[1][0][c];
2986 else if (c >= '(' && c <= '/')
2987 {
2988 ONE_MORE_BYTE (c1);
2989 if (c1 < ' ' || c1 >= 0x80
2990 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2991 /* Invalid designation sequence. Just ignore. */
2992 break;
2993 }
2994 else
2995 /* Invalid designation sequence. Just ignore it. */
2996 break;
2997 }
2998 else
2999 {
3000 /* Invalid escape sequence. Just ignore it. */
3001 break;
3002 }
3003
3004 /* We found a valid designation sequence for CHARSET. */
3005 rejected |= CATEGORY_MASK_ISO_8BIT;
3006 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3007 id))
3008 found |= CATEGORY_MASK_ISO_7;
3009 else
3010 rejected |= CATEGORY_MASK_ISO_7;
3011 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3012 id))
3013 found |= CATEGORY_MASK_ISO_7_TIGHT;
3014 else
3015 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3016 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3017 id))
3018 found |= CATEGORY_MASK_ISO_7_ELSE;
3019 else
3020 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3021 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3022 id))
3023 found |= CATEGORY_MASK_ISO_8_ELSE;
3024 else
3025 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3026 }
3027 break;
3028
3029 case ISO_CODE_SO:
3030 case ISO_CODE_SI:
3031 /* Locking shift out/in. */
3032 if (inhibit_iso_escape_detection)
3033 break;
3034 single_shifting = 0;
3035 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3036 break;
3037
3038 case ISO_CODE_CSI:
3039 /* Control sequence introducer. */
3040 single_shifting = 0;
3041 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3042 found |= CATEGORY_MASK_ISO_8_ELSE;
3043 goto check_extra_latin;
3044
3045 case ISO_CODE_SS2:
3046 case ISO_CODE_SS3:
3047 /* Single shift. */
3048 if (inhibit_iso_escape_detection)
3049 break;
3050 single_shifting = 0;
3051 rejected |= CATEGORY_MASK_ISO_7BIT;
3052 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3053 & CODING_ISO_FLAG_SINGLE_SHIFT)
3054 {
3055 found |= CATEGORY_MASK_ISO_8_1;
3056 single_shifting = 1;
3057 }
3058 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3059 & CODING_ISO_FLAG_SINGLE_SHIFT)
3060 {
3061 found |= CATEGORY_MASK_ISO_8_2;
3062 single_shifting = 1;
3063 }
3064 if (single_shifting)
3065 break;
3066 check_extra_latin:
3067 if (! VECTORP (Vlatin_extra_code_table)
3068 || NILP (AREF (Vlatin_extra_code_table, c)))
3069 {
3070 rejected = CATEGORY_MASK_ISO;
3071 break;
3072 }
3073 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3074 & CODING_ISO_FLAG_LATIN_EXTRA)
3075 found |= CATEGORY_MASK_ISO_8_1;
3076 else
3077 rejected |= CATEGORY_MASK_ISO_8_1;
3078 rejected |= CATEGORY_MASK_ISO_8_2;
3079 break;
3080
3081 default:
3082 if (c < 0)
3083 continue;
3084 if (c < 0x80)
3085 {
3086 if (composition_count >= 0)
3087 composition_count++;
3088 single_shifting = 0;
3089 break;
3090 }
3091 if (c >= 0xA0)
3092 {
3093 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3094 found |= CATEGORY_MASK_ISO_8_1;
3095 /* Check the length of succeeding codes of the range
3096 0xA0..0FF. If the byte length is even, we include
3097 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3098 only when we are not single shifting. */
3099 if (! single_shifting
3100 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3101 {
3102 int len = 1;
3103 while (src < src_end)
3104 {
3105 src_base = src;
3106 ONE_MORE_BYTE (c);
3107 if (c < 0xA0)
3108 {
3109 src = src_base;
3110 break;
3111 }
3112 len++;
3113 }
3114
3115 if (len & 1 && src < src_end)
3116 {
3117 rejected |= CATEGORY_MASK_ISO_8_2;
3118 if (composition_count >= 0)
3119 composition_count += len;
3120 }
3121 else
3122 {
3123 found |= CATEGORY_MASK_ISO_8_2;
3124 if (composition_count >= 0)
3125 composition_count += len / 2;
3126 }
3127 }
3128 break;
3129 }
3130 }
3131 }
3132 detect_info->rejected |= CATEGORY_MASK_ISO;
3133 return 0;
3134
3135 no_more_source:
3136 detect_info->rejected |= rejected;
3137 detect_info->found |= (found & ~rejected);
3138 return 1;
3139 }
3140
3141
3142 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3143 escape sequence should be kept. */
3144 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3145 do { \
3146 int id, prev; \
3147 \
3148 if (final < '0' || final >= 128 \
3149 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3150 || !SAFE_CHARSET_P (coding, id)) \
3151 { \
3152 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3153 chars_96 = -1; \
3154 break; \
3155 } \
3156 prev = CODING_ISO_DESIGNATION (coding, reg); \
3157 if (id == charset_jisx0201_roman) \
3158 { \
3159 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3160 id = charset_ascii; \
3161 } \
3162 else if (id == charset_jisx0208_1978) \
3163 { \
3164 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3165 id = charset_jisx0208; \
3166 } \
3167 CODING_ISO_DESIGNATION (coding, reg) = id; \
3168 /* If there was an invalid designation to REG previously, and this \
3169 designation is ASCII to REG, we should keep this designation \
3170 sequence. */ \
3171 if (prev == -2 && id == charset_ascii) \
3172 chars_96 = -1; \
3173 } while (0)
3174
3175
3176 /* Handle these composition sequence (ALT: alternate char):
3177
3178 (1) relative composition: ESC 0 CHAR ... ESC 1
3179 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3180 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3181 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3182
3183 When the start sequence (ESC 0/2/3/4) is found, this annotation
3184 header is produced.
3185
3186 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3187
3188 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3189 produced until the end sequence (ESC 1) is found:
3190
3191 (1) CHAR ... CHAR
3192 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3193 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3194 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3195
3196 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3197 annotation header is updated as below:
3198
3199 (1) LENGTH: unchanged, NCHARS: number of CHARs
3200 (2) LENGTH: unchanged, NCHARS: number of CHARs
3201 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3202 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3203
3204 If an error is found while composing, the annotation header is
3205 changed to:
3206
3207 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3208
3209 and the sequence [ -2 DECODED-RULE ] is changed to the original
3210 byte sequence as below:
3211 o the original byte sequence is B: [ B -1 ]
3212 o the original byte sequence is B1 B2: [ B1 B2 ]
3213 and the sequence [ -1 -1 ] is changed to the original byte
3214 sequence:
3215 [ ESC '0' ]
3216 */
3217
3218 /* Decode a composition rule C1 and maybe one more byte from the
3219 source, and set RULE to the encoded composition rule. If the rule
3220 is invalid, goto invalid_code. */
3221
3222 #define DECODE_COMPOSITION_RULE(rule) \
3223 do { \
3224 rule = c1 - 32; \
3225 if (rule < 0) \
3226 goto invalid_code; \
3227 if (rule < 81) /* old format (before ver.21) */ \
3228 { \
3229 int gref = (rule) / 9; \
3230 int nref = (rule) % 9; \
3231 if (gref == 4) gref = 10; \
3232 if (nref == 4) nref = 10; \
3233 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3234 } \
3235 else /* new format (after ver.21) */ \
3236 { \
3237 int b; \
3238 \
3239 ONE_MORE_BYTE (b); \
3240 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3241 goto invalid_code; \
3242 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3243 rule += 0x100; /* Distinguish it from the old format. */ \
3244 } \
3245 } while (0)
3246
3247 #define ENCODE_COMPOSITION_RULE(rule) \
3248 do { \
3249 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3250 \
3251 if (rule < 0x100) /* old format */ \
3252 { \
3253 if (gref == 10) gref = 4; \
3254 if (nref == 10) nref = 4; \
3255 charbuf[idx] = 32 + gref * 9 + nref; \
3256 charbuf[idx + 1] = -1; \
3257 new_chars++; \
3258 } \
3259 else /* new format */ \
3260 { \
3261 charbuf[idx] = 32 + 81 + gref; \
3262 charbuf[idx + 1] = 32 + nref; \
3263 new_chars += 2; \
3264 } \
3265 } while (0)
3266
3267 /* Finish the current composition as invalid. */
3268
3269 static int
3270 finish_composition (int *charbuf, struct composition_status *cmp_status)
3271 {
3272 int idx = - cmp_status->length;
3273 int new_chars;
3274
3275 /* Recover the original ESC sequence */
3276 charbuf[idx++] = ISO_CODE_ESC;
3277 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3278 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3279 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3280 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3281 : '4');
3282 charbuf[idx++] = -2;
3283 charbuf[idx++] = 0;
3284 charbuf[idx++] = -1;
3285 new_chars = cmp_status->nchars;
3286 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3287 for (; idx < 0; idx++)
3288 {
3289 int elt = charbuf[idx];
3290
3291 if (elt == -2)
3292 {
3293 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3294 idx++;
3295 }
3296 else if (elt == -1)
3297 {
3298 charbuf[idx++] = ISO_CODE_ESC;
3299 charbuf[idx] = '0';
3300 new_chars += 2;
3301 }
3302 }
3303 cmp_status->state = COMPOSING_NO;
3304 return new_chars;
3305 }
3306
3307 /* If characters are under composition, finish the composition. */
3308 #define MAYBE_FINISH_COMPOSITION() \
3309 do { \
3310 if (cmp_status->state != COMPOSING_NO) \
3311 char_offset += finish_composition (charbuf, cmp_status); \
3312 } while (0)
3313
3314 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3315
3316 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3317 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3318 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3319 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3320
3321 Produce this annotation sequence now:
3322
3323 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3324 */
3325
3326 #define DECODE_COMPOSITION_START(c1) \
3327 do { \
3328 if (c1 == '0' \
3329 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3330 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3331 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3332 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3333 { \
3334 *charbuf++ = -1; \
3335 *charbuf++= -1; \
3336 cmp_status->state = COMPOSING_CHAR; \
3337 cmp_status->length += 2; \
3338 } \
3339 else \
3340 { \
3341 MAYBE_FINISH_COMPOSITION (); \
3342 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3343 : c1 == '2' ? COMPOSITION_WITH_RULE \
3344 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3345 : COMPOSITION_WITH_RULE_ALTCHARS); \
3346 cmp_status->state \
3347 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3348 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3349 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3350 cmp_status->nchars = cmp_status->ncomps = 0; \
3351 coding->annotated = 1; \
3352 } \
3353 } while (0)
3354
3355
3356 /* Handle composition end sequence ESC 1. */
3357
3358 #define DECODE_COMPOSITION_END() \
3359 do { \
3360 if (cmp_status->nchars == 0 \
3361 || ((cmp_status->state == COMPOSING_CHAR) \
3362 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3363 { \
3364 MAYBE_FINISH_COMPOSITION (); \
3365 goto invalid_code; \
3366 } \
3367 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3368 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3369 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3370 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3371 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3372 char_offset += cmp_status->nchars; \
3373 cmp_status->state = COMPOSING_NO; \
3374 } while (0)
3375
3376 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3377
3378 #define STORE_COMPOSITION_RULE(rule) \
3379 do { \
3380 *charbuf++ = -2; \
3381 *charbuf++ = rule; \
3382 cmp_status->length += 2; \
3383 cmp_status->state--; \
3384 } while (0)
3385
3386 /* Store a composed char or a component char C in charbuf, and update
3387 cmp_status. */
3388
3389 #define STORE_COMPOSITION_CHAR(c) \
3390 do { \
3391 *charbuf++ = (c); \
3392 cmp_status->length++; \
3393 if (cmp_status->state == COMPOSING_CHAR) \
3394 cmp_status->nchars++; \
3395 else \
3396 cmp_status->ncomps++; \
3397 if (cmp_status->method == COMPOSITION_WITH_RULE \
3398 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3399 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3400 cmp_status->state++; \
3401 } while (0)
3402
3403
3404 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3405
3406 static void
3407 decode_coding_iso_2022 (struct coding_system *coding)
3408 {
3409 const unsigned char *src = coding->source + coding->consumed;
3410 const unsigned char *src_end = coding->source + coding->src_bytes;
3411 const unsigned char *src_base;
3412 int *charbuf = coding->charbuf + coding->charbuf_used;
3413 /* We may produce two annotations (charset and composition) in one
3414 loop and one more charset annotation at the end. */
3415 int *charbuf_end
3416 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3417 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3418 bool multibytep = coding->src_multibyte;
3419 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3420 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3421 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3422 int charset_id_2, charset_id_3;
3423 struct charset *charset;
3424 int c;
3425 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3426 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3427 ptrdiff_t char_offset = coding->produced_char;
3428 ptrdiff_t last_offset = char_offset;
3429 int last_id = charset_ascii;
3430 bool eol_dos
3431 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3432 int byte_after_cr = -1;
3433 int i;
3434
3435 setup_iso_safe_charsets (attrs);
3436 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3437
3438 if (cmp_status->state != COMPOSING_NO)
3439 {
3440 if (charbuf_end - charbuf < cmp_status->length)
3441 emacs_abort ();
3442 for (i = 0; i < cmp_status->length; i++)
3443 *charbuf++ = cmp_status->carryover[i];
3444 coding->annotated = 1;
3445 }
3446
3447 while (1)
3448 {
3449 int c1, c2, c3;
3450
3451 src_base = src;
3452 consumed_chars_base = consumed_chars;
3453
3454 if (charbuf >= charbuf_end)
3455 {
3456 if (byte_after_cr >= 0)
3457 src_base--;
3458 break;
3459 }
3460
3461 if (byte_after_cr >= 0)
3462 c1 = byte_after_cr, byte_after_cr = -1;
3463 else
3464 ONE_MORE_BYTE (c1);
3465 if (c1 < 0)
3466 goto invalid_code;
3467
3468 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3469 {
3470 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3471 char_offset++;
3472 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3473 continue;
3474 }
3475
3476 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3477 {
3478 if (c1 == ISO_CODE_ESC)
3479 {
3480 if (src + 1 >= src_end)
3481 goto no_more_source;
3482 *charbuf++ = ISO_CODE_ESC;
3483 char_offset++;
3484 if (src[0] == '%' && src[1] == '@')
3485 {
3486 src += 2;
3487 consumed_chars += 2;
3488 char_offset += 2;
3489 /* We are sure charbuf can contain two more chars. */
3490 *charbuf++ = '%';
3491 *charbuf++ = '@';
3492 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3493 }
3494 }
3495 else
3496 {
3497 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3498 char_offset++;
3499 }
3500 continue;
3501 }
3502
3503 if ((cmp_status->state == COMPOSING_RULE
3504 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3505 && c1 != ISO_CODE_ESC)
3506 {
3507 int rule;
3508
3509 DECODE_COMPOSITION_RULE (rule);
3510 STORE_COMPOSITION_RULE (rule);
3511 continue;
3512 }
3513
3514 /* We produce at most one character. */
3515 switch (iso_code_class [c1])
3516 {
3517 case ISO_0x20_or_0x7F:
3518 if (charset_id_0 < 0
3519 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3520 /* This is SPACE or DEL. */
3521 charset = CHARSET_FROM_ID (charset_ascii);
3522 else
3523 charset = CHARSET_FROM_ID (charset_id_0);
3524 break;
3525
3526 case ISO_graphic_plane_0:
3527 if (charset_id_0 < 0)
3528 charset = CHARSET_FROM_ID (charset_ascii);
3529 else
3530 charset = CHARSET_FROM_ID (charset_id_0);
3531 break;
3532
3533 case ISO_0xA0_or_0xFF:
3534 if (charset_id_1 < 0
3535 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3536 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3537 goto invalid_code;
3538 /* This is a graphic character, we fall down ... */
3539
3540 case ISO_graphic_plane_1:
3541 if (charset_id_1 < 0)
3542 goto invalid_code;
3543 charset = CHARSET_FROM_ID (charset_id_1);
3544 break;
3545
3546 case ISO_control_0:
3547 if (eol_dos && c1 == '\r')
3548 ONE_MORE_BYTE (byte_after_cr);
3549 MAYBE_FINISH_COMPOSITION ();
3550 charset = CHARSET_FROM_ID (charset_ascii);
3551 break;
3552
3553 case ISO_control_1:
3554 goto invalid_code;
3555
3556 case ISO_shift_out:
3557 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3558 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3559 goto invalid_code;
3560 CODING_ISO_INVOCATION (coding, 0) = 1;
3561 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3562 continue;
3563
3564 case ISO_shift_in:
3565 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3566 goto invalid_code;
3567 CODING_ISO_INVOCATION (coding, 0) = 0;
3568 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3569 continue;
3570
3571 case ISO_single_shift_2_7:
3572 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3573 goto invalid_code;
3574 case ISO_single_shift_2:
3575 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3576 goto invalid_code;
3577 /* SS2 is handled as an escape sequence of ESC 'N' */
3578 c1 = 'N';
3579 goto label_escape_sequence;
3580
3581 case ISO_single_shift_3:
3582 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3583 goto invalid_code;
3584 /* SS2 is handled as an escape sequence of ESC 'O' */
3585 c1 = 'O';
3586 goto label_escape_sequence;
3587
3588 case ISO_control_sequence_introducer:
3589 /* CSI is handled as an escape sequence of ESC '[' ... */
3590 c1 = '[';
3591 goto label_escape_sequence;
3592
3593 case ISO_escape:
3594 ONE_MORE_BYTE (c1);
3595 label_escape_sequence:
3596 /* Escape sequences handled here are invocation,
3597 designation, direction specification, and character
3598 composition specification. */
3599 switch (c1)
3600 {
3601 case '&': /* revision of following character set */
3602 ONE_MORE_BYTE (c1);
3603 if (!(c1 >= '@' && c1 <= '~'))
3604 goto invalid_code;
3605 ONE_MORE_BYTE (c1);
3606 if (c1 != ISO_CODE_ESC)
3607 goto invalid_code;
3608 ONE_MORE_BYTE (c1);
3609 goto label_escape_sequence;
3610
3611 case '$': /* designation of 2-byte character set */
3612 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3613 goto invalid_code;
3614 {
3615 int reg, chars96;
3616
3617 ONE_MORE_BYTE (c1);
3618 if (c1 >= '@' && c1 <= 'B')
3619 { /* designation of JISX0208.1978, GB2312.1980,
3620 or JISX0208.1980 */
3621 reg = 0, chars96 = 0;
3622 }
3623 else if (c1 >= 0x28 && c1 <= 0x2B)
3624 { /* designation of DIMENSION2_CHARS94 character set */
3625 reg = c1 - 0x28, chars96 = 0;
3626 ONE_MORE_BYTE (c1);
3627 }
3628 else if (c1 >= 0x2C && c1 <= 0x2F)
3629 { /* designation of DIMENSION2_CHARS96 character set */
3630 reg = c1 - 0x2C, chars96 = 1;
3631 ONE_MORE_BYTE (c1);
3632 }
3633 else
3634 goto invalid_code;
3635 DECODE_DESIGNATION (reg, 2, chars96, c1);
3636 /* We must update these variables now. */
3637 if (reg == 0)
3638 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3639 else if (reg == 1)
3640 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3641 if (chars96 < 0)
3642 goto invalid_code;
3643 }
3644 continue;
3645
3646 case 'n': /* invocation of locking-shift-2 */
3647 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3648 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3649 goto invalid_code;
3650 CODING_ISO_INVOCATION (coding, 0) = 2;
3651 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3652 continue;
3653
3654 case 'o': /* invocation of locking-shift-3 */
3655 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3656 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3657 goto invalid_code;
3658 CODING_ISO_INVOCATION (coding, 0) = 3;
3659 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3660 continue;
3661
3662 case 'N': /* invocation of single-shift-2 */
3663 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3664 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3665 goto invalid_code;
3666 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3667 if (charset_id_2 < 0)
3668 charset = CHARSET_FROM_ID (charset_ascii);
3669 else
3670 charset = CHARSET_FROM_ID (charset_id_2);
3671 ONE_MORE_BYTE (c1);
3672 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3673 goto invalid_code;
3674 break;
3675
3676 case 'O': /* invocation of single-shift-3 */
3677 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3678 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3679 goto invalid_code;
3680 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3681 if (charset_id_3 < 0)
3682 charset = CHARSET_FROM_ID (charset_ascii);
3683 else
3684 charset = CHARSET_FROM_ID (charset_id_3);
3685 ONE_MORE_BYTE (c1);
3686 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3687 goto invalid_code;
3688 break;
3689
3690 case '0': case '2': case '3': case '4': /* start composition */
3691 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3692 goto invalid_code;
3693 if (last_id != charset_ascii)
3694 {
3695 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3696 last_id = charset_ascii;
3697 last_offset = char_offset;
3698 }
3699 DECODE_COMPOSITION_START (c1);
3700 continue;
3701
3702 case '1': /* end composition */
3703 if (cmp_status->state == COMPOSING_NO)
3704 goto invalid_code;
3705 DECODE_COMPOSITION_END ();
3706 continue;
3707
3708 case '[': /* specification of direction */
3709 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3710 goto invalid_code;
3711 /* For the moment, nested direction is not supported.
3712 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3713 left-to-right, and nonzero means right-to-left. */
3714 ONE_MORE_BYTE (c1);
3715 switch (c1)
3716 {
3717 case ']': /* end of the current direction */
3718 coding->mode &= ~CODING_MODE_DIRECTION;
3719
3720 case '0': /* end of the current direction */
3721 case '1': /* start of left-to-right direction */
3722 ONE_MORE_BYTE (c1);
3723 if (c1 == ']')
3724 coding->mode &= ~CODING_MODE_DIRECTION;
3725 else
3726 goto invalid_code;
3727 break;
3728
3729 case '2': /* start of right-to-left direction */
3730 ONE_MORE_BYTE (c1);
3731 if (c1 == ']')
3732 coding->mode |= CODING_MODE_DIRECTION;
3733 else
3734 goto invalid_code;
3735 break;
3736
3737 default:
3738 goto invalid_code;
3739 }
3740 continue;
3741
3742 case '%':
3743 ONE_MORE_BYTE (c1);
3744 if (c1 == '/')
3745 {
3746 /* CTEXT extended segment:
3747 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3748 We keep these bytes as is for the moment.
3749 They may be decoded by post-read-conversion. */
3750 int dim, M, L;
3751 int size;
3752
3753 ONE_MORE_BYTE (dim);
3754 if (dim < '0' || dim > '4')
3755 goto invalid_code;
3756 ONE_MORE_BYTE (M);
3757 if (M < 128)
3758 goto invalid_code;
3759 ONE_MORE_BYTE (L);
3760 if (L < 128)
3761 goto invalid_code;
3762 size = ((M - 128) * 128) + (L - 128);
3763 if (charbuf + 6 > charbuf_end)
3764 goto break_loop;
3765 *charbuf++ = ISO_CODE_ESC;
3766 *charbuf++ = '%';
3767 *charbuf++ = '/';
3768 *charbuf++ = dim;
3769 *charbuf++ = BYTE8_TO_CHAR (M);
3770 *charbuf++ = BYTE8_TO_CHAR (L);
3771 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3772 }
3773 else if (c1 == 'G')
3774 {
3775 /* XFree86 extension for embedding UTF-8 in CTEXT:
3776 ESC % G --UTF-8-BYTES-- ESC % @
3777 We keep these bytes as is for the moment.
3778 They may be decoded by post-read-conversion. */
3779 if (charbuf + 3 > charbuf_end)
3780 goto break_loop;
3781 *charbuf++ = ISO_CODE_ESC;
3782 *charbuf++ = '%';
3783 *charbuf++ = 'G';
3784 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3785 }
3786 else
3787 goto invalid_code;
3788 continue;
3789 break;
3790
3791 default:
3792 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3793 goto invalid_code;
3794 {
3795 int reg, chars96;
3796
3797 if (c1 >= 0x28 && c1 <= 0x2B)
3798 { /* designation of DIMENSION1_CHARS94 character set */
3799 reg = c1 - 0x28, chars96 = 0;
3800 ONE_MORE_BYTE (c1);
3801 }
3802 else if (c1 >= 0x2C && c1 <= 0x2F)
3803 { /* designation of DIMENSION1_CHARS96 character set */
3804 reg = c1 - 0x2C, chars96 = 1;
3805 ONE_MORE_BYTE (c1);
3806 }
3807 else
3808 goto invalid_code;
3809 DECODE_DESIGNATION (reg, 1, chars96, c1);
3810 /* We must update these variables now. */
3811 if (reg == 0)
3812 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3813 else if (reg == 1)
3814 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3815 if (chars96 < 0)
3816 goto invalid_code;
3817 }
3818 continue;
3819 }
3820 break;
3821
3822 default:
3823 emacs_abort ();
3824 }
3825
3826 if (cmp_status->state == COMPOSING_NO
3827 && charset->id != charset_ascii
3828 && last_id != charset->id)
3829 {
3830 if (last_id != charset_ascii)
3831 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3832 last_id = charset->id;
3833 last_offset = char_offset;
3834 }
3835
3836 /* Now we know CHARSET and 1st position code C1 of a character.
3837 Produce a decoded character while getting 2nd and 3rd
3838 position codes C2, C3 if necessary. */
3839 if (CHARSET_DIMENSION (charset) > 1)
3840 {
3841 ONE_MORE_BYTE (c2);
3842 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3843 || ((c1 & 0x80) != (c2 & 0x80)))
3844 /* C2 is not in a valid range. */
3845 goto invalid_code;
3846 if (CHARSET_DIMENSION (charset) == 2)
3847 c1 = (c1 << 8) | c2;
3848 else
3849 {
3850 ONE_MORE_BYTE (c3);
3851 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3852 || ((c1 & 0x80) != (c3 & 0x80)))
3853 /* C3 is not in a valid range. */
3854 goto invalid_code;
3855 c1 = (c1 << 16) | (c2 << 8) | c2;
3856 }
3857 }
3858 c1 &= 0x7F7F7F;
3859 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3860 if (c < 0)
3861 {
3862 MAYBE_FINISH_COMPOSITION ();
3863 for (; src_base < src; src_base++, char_offset++)
3864 {
3865 if (ASCII_BYTE_P (*src_base))
3866 *charbuf++ = *src_base;
3867 else
3868 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3869 }
3870 }
3871 else if (cmp_status->state == COMPOSING_NO)
3872 {
3873 *charbuf++ = c;
3874 char_offset++;
3875 }
3876 else if ((cmp_status->state == COMPOSING_CHAR
3877 ? cmp_status->nchars
3878 : cmp_status->ncomps)
3879 >= MAX_COMPOSITION_COMPONENTS)
3880 {
3881 /* Too long composition. */
3882 MAYBE_FINISH_COMPOSITION ();
3883 *charbuf++ = c;
3884 char_offset++;
3885 }
3886 else
3887 STORE_COMPOSITION_CHAR (c);
3888 continue;
3889
3890 invalid_code:
3891 MAYBE_FINISH_COMPOSITION ();
3892 src = src_base;
3893 consumed_chars = consumed_chars_base;
3894 ONE_MORE_BYTE (c);
3895 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3896 char_offset++;
3897 coding->errors++;
3898 continue;
3899
3900 break_loop:
3901 break;
3902 }
3903
3904 no_more_source:
3905 if (cmp_status->state != COMPOSING_NO)
3906 {
3907 if (coding->mode & CODING_MODE_LAST_BLOCK)
3908 MAYBE_FINISH_COMPOSITION ();
3909 else
3910 {
3911 charbuf -= cmp_status->length;
3912 for (i = 0; i < cmp_status->length; i++)
3913 cmp_status->carryover[i] = charbuf[i];
3914 }
3915 }
3916 else if (last_id != charset_ascii)
3917 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3918 coding->consumed_char += consumed_chars_base;
3919 coding->consumed = src_base - coding->source;
3920 coding->charbuf_used = charbuf - coding->charbuf;
3921 }
3922
3923
3924 /* ISO2022 encoding stuff. */
3925
3926 /*
3927 It is not enough to say just "ISO2022" on encoding, we have to
3928 specify more details. In Emacs, each coding system of ISO2022
3929 variant has the following specifications:
3930 1. Initial designation to G0 thru G3.
3931 2. Allows short-form designation?
3932 3. ASCII should be designated to G0 before control characters?
3933 4. ASCII should be designated to G0 at end of line?
3934 5. 7-bit environment or 8-bit environment?
3935 6. Use locking-shift?
3936 7. Use Single-shift?
3937 And the following two are only for Japanese:
3938 8. Use ASCII in place of JIS0201-1976-Roman?
3939 9. Use JISX0208-1983 in place of JISX0208-1978?
3940 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3941 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3942 details.
3943 */
3944
3945 /* Produce codes (escape sequence) for designating CHARSET to graphic
3946 register REG at DST, and increment DST. If <final-char> of CHARSET is
3947 '@', 'A', or 'B' and the coding system CODING allows, produce
3948 designation sequence of short-form. */
3949
3950 #define ENCODE_DESIGNATION(charset, reg, coding) \
3951 do { \
3952 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3953 const char *intermediate_char_94 = "()*+"; \
3954 const char *intermediate_char_96 = ",-./"; \
3955 int revision = -1; \
3956 \
3957 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3958 revision = CHARSET_ISO_REVISION (charset); \
3959 \
3960 if (revision >= 0) \
3961 { \
3962 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3963 EMIT_ONE_BYTE ('@' + revision); \
3964 } \
3965 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3966 if (CHARSET_DIMENSION (charset) == 1) \
3967 { \
3968 int b; \
3969 if (! CHARSET_ISO_CHARS_96 (charset)) \
3970 b = intermediate_char_94[reg]; \
3971 else \
3972 b = intermediate_char_96[reg]; \
3973 EMIT_ONE_ASCII_BYTE (b); \
3974 } \
3975 else \
3976 { \
3977 EMIT_ONE_ASCII_BYTE ('$'); \
3978 if (! CHARSET_ISO_CHARS_96 (charset)) \
3979 { \
3980 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3981 || reg != 0 \
3982 || final_char < '@' || final_char > 'B') \
3983 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3984 } \
3985 else \
3986 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3987 } \
3988 EMIT_ONE_ASCII_BYTE (final_char); \
3989 \
3990 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3991 } while (0)
3992
3993
3994 /* The following two macros produce codes (control character or escape
3995 sequence) for ISO2022 single-shift functions (single-shift-2 and
3996 single-shift-3). */
3997
3998 #define ENCODE_SINGLE_SHIFT_2 \
3999 do { \
4000 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4001 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4002 else \
4003 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4004 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4005 } while (0)
4006
4007
4008 #define ENCODE_SINGLE_SHIFT_3 \
4009 do { \
4010 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4011 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4012 else \
4013 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4014 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4015 } while (0)
4016
4017
4018 /* The following four macros produce codes (control character or
4019 escape sequence) for ISO2022 locking-shift functions (shift-in,
4020 shift-out, locking-shift-2, and locking-shift-3). */
4021
4022 #define ENCODE_SHIFT_IN \
4023 do { \
4024 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4025 CODING_ISO_INVOCATION (coding, 0) = 0; \
4026 } while (0)
4027
4028
4029 #define ENCODE_SHIFT_OUT \
4030 do { \
4031 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4032 CODING_ISO_INVOCATION (coding, 0) = 1; \
4033 } while (0)
4034
4035
4036 #define ENCODE_LOCKING_SHIFT_2 \
4037 do { \
4038 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4039 CODING_ISO_INVOCATION (coding, 0) = 2; \
4040 } while (0)
4041
4042
4043 #define ENCODE_LOCKING_SHIFT_3 \
4044 do { \
4045 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4046 CODING_ISO_INVOCATION (coding, 0) = 3; \
4047 } while (0)
4048
4049
4050 /* Produce codes for a DIMENSION1 character whose character set is
4051 CHARSET and whose position-code is C1. Designation and invocation
4052 sequences are also produced in advance if necessary. */
4053
4054 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4055 do { \
4056 int id = CHARSET_ID (charset); \
4057 \
4058 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4059 && id == charset_ascii) \
4060 { \
4061 id = charset_jisx0201_roman; \
4062 charset = CHARSET_FROM_ID (id); \
4063 } \
4064 \
4065 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4066 { \
4067 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4068 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4069 else \
4070 EMIT_ONE_BYTE (c1 | 0x80); \
4071 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4072 break; \
4073 } \
4074 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4075 { \
4076 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4077 break; \
4078 } \
4079 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4080 { \
4081 EMIT_ONE_BYTE (c1 | 0x80); \
4082 break; \
4083 } \
4084 else \
4085 /* Since CHARSET is not yet invoked to any graphic planes, we \
4086 must invoke it, or, at first, designate it to some graphic \
4087 register. Then repeat the loop to actually produce the \
4088 character. */ \
4089 dst = encode_invocation_designation (charset, coding, dst, \
4090 &produced_chars); \
4091 } while (1)
4092
4093
4094 /* Produce codes for a DIMENSION2 character whose character set is
4095 CHARSET and whose position-codes are C1 and C2. Designation and
4096 invocation codes are also produced in advance if necessary. */
4097
4098 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4099 do { \
4100 int id = CHARSET_ID (charset); \
4101 \
4102 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4103 && id == charset_jisx0208) \
4104 { \
4105 id = charset_jisx0208_1978; \
4106 charset = CHARSET_FROM_ID (id); \
4107 } \
4108 \
4109 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4110 { \
4111 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4112 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4113 else \
4114 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4115 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4116 break; \
4117 } \
4118 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4119 { \
4120 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4121 break; \
4122 } \
4123 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4124 { \
4125 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4126 break; \
4127 } \
4128 else \
4129 /* Since CHARSET is not yet invoked to any graphic planes, we \
4130 must invoke it, or, at first, designate it to some graphic \
4131 register. Then repeat the loop to actually produce the \
4132 character. */ \
4133 dst = encode_invocation_designation (charset, coding, dst, \
4134 &produced_chars); \
4135 } while (1)
4136
4137
4138 #define ENCODE_ISO_CHARACTER(charset, c) \
4139 do { \
4140 unsigned code; \
4141 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4142 \
4143 if (CHARSET_DIMENSION (charset) == 1) \
4144 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4145 else \
4146 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4147 } while (0)
4148
4149
4150 /* Produce designation and invocation codes at a place pointed by DST
4151 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4152 Return new DST. */
4153
4154 static unsigned char *
4155 encode_invocation_designation (struct charset *charset,
4156 struct coding_system *coding,
4157 unsigned char *dst, ptrdiff_t *p_nchars)
4158 {
4159 bool multibytep = coding->dst_multibyte;
4160 ptrdiff_t produced_chars = *p_nchars;
4161 int reg; /* graphic register number */
4162 int id = CHARSET_ID (charset);
4163
4164 /* At first, check designations. */
4165 for (reg = 0; reg < 4; reg++)
4166 if (id == CODING_ISO_DESIGNATION (coding, reg))
4167 break;
4168
4169 if (reg >= 4)
4170 {
4171 /* CHARSET is not yet designated to any graphic registers. */
4172 /* At first check the requested designation. */
4173 reg = CODING_ISO_REQUEST (coding, id);
4174 if (reg < 0)
4175 /* Since CHARSET requests no special designation, designate it
4176 to graphic register 0. */
4177 reg = 0;
4178
4179 ENCODE_DESIGNATION (charset, reg, coding);
4180 }
4181
4182 if (CODING_ISO_INVOCATION (coding, 0) != reg
4183 && CODING_ISO_INVOCATION (coding, 1) != reg)
4184 {
4185 /* Since the graphic register REG is not invoked to any graphic
4186 planes, invoke it to graphic plane 0. */
4187 switch (reg)
4188 {
4189 case 0: /* graphic register 0 */
4190 ENCODE_SHIFT_IN;
4191 break;
4192
4193 case 1: /* graphic register 1 */
4194 ENCODE_SHIFT_OUT;
4195 break;
4196
4197 case 2: /* graphic register 2 */
4198 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4199 ENCODE_SINGLE_SHIFT_2;
4200 else
4201 ENCODE_LOCKING_SHIFT_2;
4202 break;
4203
4204 case 3: /* graphic register 3 */
4205 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4206 ENCODE_SINGLE_SHIFT_3;
4207 else
4208 ENCODE_LOCKING_SHIFT_3;
4209 break;
4210 }
4211 }
4212
4213 *p_nchars = produced_chars;
4214 return dst;
4215 }
4216
4217
4218 /* Produce codes for designation and invocation to reset the graphic
4219 planes and registers to initial state. */
4220 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4221 do { \
4222 int reg; \
4223 struct charset *charset; \
4224 \
4225 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4226 ENCODE_SHIFT_IN; \
4227 for (reg = 0; reg < 4; reg++) \
4228 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4229 && (CODING_ISO_DESIGNATION (coding, reg) \
4230 != CODING_ISO_INITIAL (coding, reg))) \
4231 { \
4232 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4233 ENCODE_DESIGNATION (charset, reg, coding); \
4234 } \
4235 } while (0)
4236
4237
4238 /* Produce designation sequences of charsets in the line started from
4239 CHARBUF to a place pointed by DST, and return the number of
4240 produced bytes. DST should not directly point a buffer text area
4241 which may be relocated by char_charset call.
4242
4243 If the current block ends before any end-of-line, we may fail to
4244 find all the necessary designations. */
4245
4246 static ptrdiff_t
4247 encode_designation_at_bol (struct coding_system *coding,
4248 int *charbuf, int *charbuf_end,
4249 unsigned char *dst)
4250 {
4251 unsigned char *orig = dst;
4252 struct charset *charset;
4253 /* Table of charsets to be designated to each graphic register. */
4254 int r[4];
4255 int c, found = 0, reg;
4256 ptrdiff_t produced_chars = 0;
4257 bool multibytep = coding->dst_multibyte;
4258 Lisp_Object attrs;
4259 Lisp_Object charset_list;
4260
4261 attrs = CODING_ID_ATTRS (coding->id);
4262 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4263 if (EQ (charset_list, Qiso_2022))
4264 charset_list = Viso_2022_charset_list;
4265
4266 for (reg = 0; reg < 4; reg++)
4267 r[reg] = -1;
4268
4269 while (charbuf < charbuf_end && found < 4)
4270 {
4271 int id;
4272
4273 c = *charbuf++;
4274 if (c == '\n')
4275 break;
4276 charset = char_charset (c, charset_list, NULL);
4277 id = CHARSET_ID (charset);
4278 reg = CODING_ISO_REQUEST (coding, id);
4279 if (reg >= 0 && r[reg] < 0)
4280 {
4281 found++;
4282 r[reg] = id;
4283 }
4284 }
4285
4286 if (found)
4287 {
4288 for (reg = 0; reg < 4; reg++)
4289 if (r[reg] >= 0
4290 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4291 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4292 }
4293
4294 return dst - orig;
4295 }
4296
4297 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4298
4299 static bool
4300 encode_coding_iso_2022 (struct coding_system *coding)
4301 {
4302 bool multibytep = coding->dst_multibyte;
4303 int *charbuf = coding->charbuf;
4304 int *charbuf_end = charbuf + coding->charbuf_used;
4305 unsigned char *dst = coding->destination + coding->produced;
4306 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4307 int safe_room = 16;
4308 bool bol_designation
4309 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4310 && CODING_ISO_BOL (coding));
4311 ptrdiff_t produced_chars = 0;
4312 Lisp_Object attrs, eol_type, charset_list;
4313 bool ascii_compatible;
4314 int c;
4315 int preferred_charset_id = -1;
4316
4317 CODING_GET_INFO (coding, attrs, charset_list);
4318 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4319 if (VECTORP (eol_type))
4320 eol_type = Qunix;
4321
4322 setup_iso_safe_charsets (attrs);
4323 /* Charset list may have been changed. */
4324 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4325 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4326
4327 ascii_compatible
4328 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4329 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4330 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4331
4332 while (charbuf < charbuf_end)
4333 {
4334 ASSURE_DESTINATION (safe_room);
4335
4336 if (bol_designation)
4337 {
4338 /* We have to produce designation sequences if any now. */
4339 unsigned char desig_buf[16];
4340 int nbytes;
4341 ptrdiff_t offset;
4342
4343 charset_map_loaded = 0;
4344 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4345 desig_buf);
4346 if (charset_map_loaded
4347 && (offset = coding_change_destination (coding)))
4348 {
4349 dst += offset;
4350 dst_end += offset;
4351 }
4352 memcpy (dst, desig_buf, nbytes);
4353 dst += nbytes;
4354 /* We are sure that designation sequences are all ASCII bytes. */
4355 produced_chars += nbytes;
4356 bol_designation = 0;
4357 ASSURE_DESTINATION (safe_room);
4358 }
4359
4360 c = *charbuf++;
4361
4362 if (c < 0)
4363 {
4364 /* Handle an annotation. */
4365 switch (*charbuf)
4366 {
4367 case CODING_ANNOTATE_COMPOSITION_MASK:
4368 /* Not yet implemented. */
4369 break;
4370 case CODING_ANNOTATE_CHARSET_MASK:
4371 preferred_charset_id = charbuf[2];
4372 if (preferred_charset_id >= 0
4373 && NILP (Fmemq (make_number (preferred_charset_id),
4374 charset_list)))
4375 preferred_charset_id = -1;
4376 break;
4377 default:
4378 emacs_abort ();
4379 }
4380 charbuf += -c - 1;
4381 continue;
4382 }
4383
4384 /* Now encode the character C. */
4385 if (c < 0x20 || c == 0x7F)
4386 {
4387 if (c == '\n'
4388 || (c == '\r' && EQ (eol_type, Qmac)))
4389 {
4390 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4391 ENCODE_RESET_PLANE_AND_REGISTER ();
4392 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4393 {
4394 int i;
4395
4396 for (i = 0; i < 4; i++)
4397 CODING_ISO_DESIGNATION (coding, i)
4398 = CODING_ISO_INITIAL (coding, i);
4399 }
4400 bol_designation = ((CODING_ISO_FLAGS (coding)
4401 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4402 != 0);
4403 }
4404 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4405 ENCODE_RESET_PLANE_AND_REGISTER ();
4406 EMIT_ONE_ASCII_BYTE (c);
4407 }
4408 else if (ASCII_CHAR_P (c))
4409 {
4410 if (ascii_compatible)
4411 EMIT_ONE_ASCII_BYTE (c);
4412 else
4413 {
4414 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4415 ENCODE_ISO_CHARACTER (charset, c);
4416 }
4417 }
4418 else if (CHAR_BYTE8_P (c))
4419 {
4420 c = CHAR_TO_BYTE8 (c);
4421 EMIT_ONE_BYTE (c);
4422 }
4423 else
4424 {
4425 struct charset *charset;
4426
4427 if (preferred_charset_id >= 0)
4428 {
4429 bool result;
4430
4431 charset = CHARSET_FROM_ID (preferred_charset_id);
4432 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4433 if (! result)
4434 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4435 NULL, charset);
4436 }
4437 else
4438 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4439 NULL, charset);
4440 if (!charset)
4441 {
4442 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4443 {
4444 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4445 charset = CHARSET_FROM_ID (charset_ascii);
4446 }
4447 else
4448 {
4449 c = coding->default_char;
4450 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4451 charset_list, NULL, charset);
4452 }
4453 }
4454 ENCODE_ISO_CHARACTER (charset, c);
4455 }
4456 }
4457
4458 if (coding->mode & CODING_MODE_LAST_BLOCK
4459 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4460 {
4461 ASSURE_DESTINATION (safe_room);
4462 ENCODE_RESET_PLANE_AND_REGISTER ();
4463 }
4464 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4465 CODING_ISO_BOL (coding) = bol_designation;
4466 coding->produced_char += produced_chars;
4467 coding->produced = dst - coding->destination;
4468 return 0;
4469 }
4470
4471 \f
4472 /*** 8,9. SJIS and BIG5 handlers ***/
4473
4474 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4475 quite widely. So, for the moment, Emacs supports them in the bare
4476 C code. But, in the future, they may be supported only by CCL. */
4477
4478 /* SJIS is a coding system encoding three character sets: ASCII, right
4479 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4480 as is. A character of charset katakana-jisx0201 is encoded by
4481 "position-code + 0x80". A character of charset japanese-jisx0208
4482 is encoded in 2-byte but two position-codes are divided and shifted
4483 so that it fit in the range below.
4484
4485 --- CODE RANGE of SJIS ---
4486 (character set) (range)
4487 ASCII 0x00 .. 0x7F
4488 KATAKANA-JISX0201 0xA0 .. 0xDF
4489 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4490 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4491 -------------------------------
4492
4493 */
4494
4495 /* BIG5 is a coding system encoding two character sets: ASCII and
4496 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4497 character set and is encoded in two-byte.
4498
4499 --- CODE RANGE of BIG5 ---
4500 (character set) (range)
4501 ASCII 0x00 .. 0x7F
4502 Big5 (1st byte) 0xA1 .. 0xFE
4503 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4504 --------------------------
4505
4506 */
4507
4508 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4509 Return true if a text is encoded in SJIS. */
4510
4511 static bool
4512 detect_coding_sjis (struct coding_system *coding,
4513 struct coding_detection_info *detect_info)
4514 {
4515 const unsigned char *src = coding->source, *src_base;
4516 const unsigned char *src_end = coding->source + coding->src_bytes;
4517 bool multibytep = coding->src_multibyte;
4518 ptrdiff_t consumed_chars = 0;
4519 int found = 0;
4520 int c;
4521 Lisp_Object attrs, charset_list;
4522 int max_first_byte_of_2_byte_code;
4523
4524 CODING_GET_INFO (coding, attrs, charset_list);
4525 max_first_byte_of_2_byte_code
4526 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4527
4528 detect_info->checked |= CATEGORY_MASK_SJIS;
4529 /* A coding system of this category is always ASCII compatible. */
4530 src += coding->head_ascii;
4531
4532 while (1)
4533 {
4534 src_base = src;
4535 ONE_MORE_BYTE (c);
4536 if (c < 0x80)
4537 continue;
4538 if ((c >= 0x81 && c <= 0x9F)
4539 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4540 {
4541 ONE_MORE_BYTE (c);
4542 if (c < 0x40 || c == 0x7F || c > 0xFC)
4543 break;
4544 found = CATEGORY_MASK_SJIS;
4545 }
4546 else if (c >= 0xA0 && c < 0xE0)
4547 found = CATEGORY_MASK_SJIS;
4548 else
4549 break;
4550 }
4551 detect_info->rejected |= CATEGORY_MASK_SJIS;
4552 return 0;
4553
4554 no_more_source:
4555 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4556 {
4557 detect_info->rejected |= CATEGORY_MASK_SJIS;
4558 return 0;
4559 }
4560 detect_info->found |= found;
4561 return 1;
4562 }
4563
4564 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4565 Return true if a text is encoded in BIG5. */
4566
4567 static bool
4568 detect_coding_big5 (struct coding_system *coding,
4569 struct coding_detection_info *detect_info)
4570 {
4571 const unsigned char *src = coding->source, *src_base;
4572 const unsigned char *src_end = coding->source + coding->src_bytes;
4573 bool multibytep = coding->src_multibyte;
4574 ptrdiff_t consumed_chars = 0;
4575 int found = 0;
4576 int c;
4577
4578 detect_info->checked |= CATEGORY_MASK_BIG5;
4579 /* A coding system of this category is always ASCII compatible. */
4580 src += coding->head_ascii;
4581
4582 while (1)
4583 {
4584 src_base = src;
4585 ONE_MORE_BYTE (c);
4586 if (c < 0x80)
4587 continue;
4588 if (c >= 0xA1)
4589 {
4590 ONE_MORE_BYTE (c);
4591 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4592 return 0;
4593 found = CATEGORY_MASK_BIG5;
4594 }
4595 else
4596 break;
4597 }
4598 detect_info->rejected |= CATEGORY_MASK_BIG5;
4599 return 0;
4600
4601 no_more_source:
4602 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4603 {
4604 detect_info->rejected |= CATEGORY_MASK_BIG5;
4605 return 0;
4606 }
4607 detect_info->found |= found;
4608 return 1;
4609 }
4610
4611 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4612
4613 static void
4614 decode_coding_sjis (struct coding_system *coding)
4615 {
4616 const unsigned char *src = coding->source + coding->consumed;
4617 const unsigned char *src_end = coding->source + coding->src_bytes;
4618 const unsigned char *src_base;
4619 int *charbuf = coding->charbuf + coding->charbuf_used;
4620 /* We may produce one charset annotation in one loop and one more at
4621 the end. */
4622 int *charbuf_end
4623 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4624 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4625 bool multibytep = coding->src_multibyte;
4626 struct charset *charset_roman, *charset_kanji, *charset_kana;
4627 struct charset *charset_kanji2;
4628 Lisp_Object attrs, charset_list, val;
4629 ptrdiff_t char_offset = coding->produced_char;
4630 ptrdiff_t last_offset = char_offset;
4631 int last_id = charset_ascii;
4632 bool eol_dos
4633 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4634 int byte_after_cr = -1;
4635
4636 CODING_GET_INFO (coding, attrs, charset_list);
4637
4638 val = charset_list;
4639 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4640 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4641 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4642 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4643
4644 while (1)
4645 {
4646 int c, c1;
4647 struct charset *charset;
4648
4649 src_base = src;
4650 consumed_chars_base = consumed_chars;
4651
4652 if (charbuf >= charbuf_end)
4653 {
4654 if (byte_after_cr >= 0)
4655 src_base--;
4656 break;
4657 }
4658
4659 if (byte_after_cr >= 0)
4660 c = byte_after_cr, byte_after_cr = -1;
4661 else
4662 ONE_MORE_BYTE (c);
4663 if (c < 0)
4664 goto invalid_code;
4665 if (c < 0x80)
4666 {
4667 if (eol_dos && c == '\r')
4668 ONE_MORE_BYTE (byte_after_cr);
4669 charset = charset_roman;
4670 }
4671 else if (c == 0x80 || c == 0xA0)
4672 goto invalid_code;
4673 else if (c >= 0xA1 && c <= 0xDF)
4674 {
4675 /* SJIS -> JISX0201-Kana */
4676 c &= 0x7F;
4677 charset = charset_kana;
4678 }
4679 else if (c <= 0xEF)
4680 {
4681 /* SJIS -> JISX0208 */
4682 ONE_MORE_BYTE (c1);
4683 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4684 goto invalid_code;
4685 c = (c << 8) | c1;
4686 SJIS_TO_JIS (c);
4687 charset = charset_kanji;
4688 }
4689 else if (c <= 0xFC && charset_kanji2)
4690 {
4691 /* SJIS -> JISX0213-2 */
4692 ONE_MORE_BYTE (c1);
4693 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4694 goto invalid_code;
4695 c = (c << 8) | c1;
4696 SJIS_TO_JIS2 (c);
4697 charset = charset_kanji2;
4698 }
4699 else
4700 goto invalid_code;
4701 if (charset->id != charset_ascii
4702 && last_id != charset->id)
4703 {
4704 if (last_id != charset_ascii)
4705 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4706 last_id = charset->id;
4707 last_offset = char_offset;
4708 }
4709 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4710 *charbuf++ = c;
4711 char_offset++;
4712 continue;
4713
4714 invalid_code:
4715 src = src_base;
4716 consumed_chars = consumed_chars_base;
4717 ONE_MORE_BYTE (c);
4718 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4719 char_offset++;
4720 coding->errors++;
4721 }
4722
4723 no_more_source:
4724 if (last_id != charset_ascii)
4725 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4726 coding->consumed_char += consumed_chars_base;
4727 coding->consumed = src_base - coding->source;
4728 coding->charbuf_used = charbuf - coding->charbuf;
4729 }
4730
4731 static void
4732 decode_coding_big5 (struct coding_system *coding)
4733 {
4734 const unsigned char *src = coding->source + coding->consumed;
4735 const unsigned char *src_end = coding->source + coding->src_bytes;
4736 const unsigned char *src_base;
4737 int *charbuf = coding->charbuf + coding->charbuf_used;
4738 /* We may produce one charset annotation in one loop and one more at
4739 the end. */
4740 int *charbuf_end
4741 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4742 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4743 bool multibytep = coding->src_multibyte;
4744 struct charset *charset_roman, *charset_big5;
4745 Lisp_Object attrs, charset_list, val;
4746 ptrdiff_t char_offset = coding->produced_char;
4747 ptrdiff_t last_offset = char_offset;
4748 int last_id = charset_ascii;
4749 bool eol_dos
4750 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4751 int byte_after_cr = -1;
4752
4753 CODING_GET_INFO (coding, attrs, charset_list);
4754 val = charset_list;
4755 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4756 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4757
4758 while (1)
4759 {
4760 int c, c1;
4761 struct charset *charset;
4762
4763 src_base = src;
4764 consumed_chars_base = consumed_chars;
4765
4766 if (charbuf >= charbuf_end)
4767 {
4768 if (byte_after_cr >= 0)
4769 src_base--;
4770 break;
4771 }
4772
4773 if (byte_after_cr >= 0)
4774 c = byte_after_cr, byte_after_cr = -1;
4775 else
4776 ONE_MORE_BYTE (c);
4777
4778 if (c < 0)
4779 goto invalid_code;
4780 if (c < 0x80)
4781 {
4782 if (eol_dos && c == '\r')
4783 ONE_MORE_BYTE (byte_after_cr);
4784 charset = charset_roman;
4785 }
4786 else
4787 {
4788 /* BIG5 -> Big5 */
4789 if (c < 0xA1 || c > 0xFE)
4790 goto invalid_code;
4791 ONE_MORE_BYTE (c1);
4792 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4793 goto invalid_code;
4794 c = c << 8 | c1;
4795 charset = charset_big5;
4796 }
4797 if (charset->id != charset_ascii
4798 && last_id != charset->id)
4799 {
4800 if (last_id != charset_ascii)
4801 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4802 last_id = charset->id;
4803 last_offset = char_offset;
4804 }
4805 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4806 *charbuf++ = c;
4807 char_offset++;
4808 continue;
4809
4810 invalid_code:
4811 src = src_base;
4812 consumed_chars = consumed_chars_base;
4813 ONE_MORE_BYTE (c);
4814 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4815 char_offset++;
4816 coding->errors++;
4817 }
4818
4819 no_more_source:
4820 if (last_id != charset_ascii)
4821 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4822 coding->consumed_char += consumed_chars_base;
4823 coding->consumed = src_base - coding->source;
4824 coding->charbuf_used = charbuf - coding->charbuf;
4825 }
4826
4827 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4828 This function can encode charsets `ascii', `katakana-jisx0201',
4829 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4830 are sure that all these charsets are registered as official charset
4831 (i.e. do not have extended leading-codes). Characters of other
4832 charsets are produced without any encoding. */
4833
4834 static bool
4835 encode_coding_sjis (struct coding_system *coding)
4836 {
4837 bool multibytep = coding->dst_multibyte;
4838 int *charbuf = coding->charbuf;
4839 int *charbuf_end = charbuf + coding->charbuf_used;
4840 unsigned char *dst = coding->destination + coding->produced;
4841 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4842 int safe_room = 4;
4843 ptrdiff_t produced_chars = 0;
4844 Lisp_Object attrs, charset_list, val;
4845 bool ascii_compatible;
4846 struct charset *charset_kanji, *charset_kana;
4847 struct charset *charset_kanji2;
4848 int c;
4849
4850 CODING_GET_INFO (coding, attrs, charset_list);
4851 val = XCDR (charset_list);
4852 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4853 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4854 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4855
4856 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4857
4858 while (charbuf < charbuf_end)
4859 {
4860 ASSURE_DESTINATION (safe_room);
4861 c = *charbuf++;
4862 /* Now encode the character C. */
4863 if (ASCII_CHAR_P (c) && ascii_compatible)
4864 EMIT_ONE_ASCII_BYTE (c);
4865 else if (CHAR_BYTE8_P (c))
4866 {
4867 c = CHAR_TO_BYTE8 (c);
4868 EMIT_ONE_BYTE (c);
4869 }
4870 else
4871 {
4872 unsigned code;
4873 struct charset *charset;
4874 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4875 &code, charset);
4876
4877 if (!charset)
4878 {
4879 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4880 {
4881 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4882 charset = CHARSET_FROM_ID (charset_ascii);
4883 }
4884 else
4885 {
4886 c = coding->default_char;
4887 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4888 charset_list, &code, charset);
4889 }
4890 }
4891 if (code == CHARSET_INVALID_CODE (charset))
4892 emacs_abort ();
4893 if (charset == charset_kanji)
4894 {
4895 int c1, c2;
4896 JIS_TO_SJIS (code);
4897 c1 = code >> 8, c2 = code & 0xFF;
4898 EMIT_TWO_BYTES (c1, c2);
4899 }
4900 else if (charset == charset_kana)
4901 EMIT_ONE_BYTE (code | 0x80);
4902 else if (charset_kanji2 && charset == charset_kanji2)
4903 {
4904 int c1, c2;
4905
4906 c1 = code >> 8;
4907 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4908 || c1 == 0x28
4909 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4910 {
4911 JIS_TO_SJIS2 (code);
4912 c1 = code >> 8, c2 = code & 0xFF;
4913 EMIT_TWO_BYTES (c1, c2);
4914 }
4915 else
4916 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4917 }
4918 else
4919 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4920 }
4921 }
4922 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4923 coding->produced_char += produced_chars;
4924 coding->produced = dst - coding->destination;
4925 return 0;
4926 }
4927
4928 static bool
4929 encode_coding_big5 (struct coding_system *coding)
4930 {
4931 bool multibytep = coding->dst_multibyte;
4932 int *charbuf = coding->charbuf;
4933 int *charbuf_end = charbuf + coding->charbuf_used;
4934 unsigned char *dst = coding->destination + coding->produced;
4935 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4936 int safe_room = 4;
4937 ptrdiff_t produced_chars = 0;
4938 Lisp_Object attrs, charset_list, val;
4939 bool ascii_compatible;
4940 struct charset *charset_big5;
4941 int c;
4942
4943 CODING_GET_INFO (coding, attrs, charset_list);
4944 val = XCDR (charset_list);
4945 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4946 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4947
4948 while (charbuf < charbuf_end)
4949 {
4950 ASSURE_DESTINATION (safe_room);
4951 c = *charbuf++;
4952 /* Now encode the character C. */
4953 if (ASCII_CHAR_P (c) && ascii_compatible)
4954 EMIT_ONE_ASCII_BYTE (c);
4955 else if (CHAR_BYTE8_P (c))
4956 {
4957 c = CHAR_TO_BYTE8 (c);
4958 EMIT_ONE_BYTE (c);
4959 }
4960 else
4961 {
4962 unsigned code;
4963 struct charset *charset;
4964 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4965 &code, charset);
4966
4967 if (! charset)
4968 {
4969 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4970 {
4971 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4972 charset = CHARSET_FROM_ID (charset_ascii);
4973 }
4974 else
4975 {
4976 c = coding->default_char;
4977 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4978 charset_list, &code, charset);
4979 }
4980 }
4981 if (code == CHARSET_INVALID_CODE (charset))
4982 emacs_abort ();
4983 if (charset == charset_big5)
4984 {
4985 int c1, c2;
4986
4987 c1 = code >> 8, c2 = code & 0xFF;
4988 EMIT_TWO_BYTES (c1, c2);
4989 }
4990 else
4991 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4992 }
4993 }
4994 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4995 coding->produced_char += produced_chars;
4996 coding->produced = dst - coding->destination;
4997 return 0;
4998 }
4999
5000 \f
5001 /*** 10. CCL handlers ***/
5002
5003 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5004 Return true if a text is encoded in a coding system of which
5005 encoder/decoder are written in CCL program. */
5006
5007 static bool
5008 detect_coding_ccl (struct coding_system *coding,
5009 struct coding_detection_info *detect_info)
5010 {
5011 const unsigned char *src = coding->source, *src_base;
5012 const unsigned char *src_end = coding->source + coding->src_bytes;
5013 bool multibytep = coding->src_multibyte;
5014 ptrdiff_t consumed_chars = 0;
5015 int found = 0;
5016 unsigned char *valids;
5017 ptrdiff_t head_ascii = coding->head_ascii;
5018 Lisp_Object attrs;
5019
5020 detect_info->checked |= CATEGORY_MASK_CCL;
5021
5022 coding = &coding_categories[coding_category_ccl];
5023 valids = CODING_CCL_VALIDS (coding);
5024 attrs = CODING_ID_ATTRS (coding->id);
5025 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5026 src += head_ascii;
5027
5028 while (1)
5029 {
5030 int c;
5031
5032 src_base = src;
5033 ONE_MORE_BYTE (c);
5034 if (c < 0 || ! valids[c])
5035 break;
5036 if ((valids[c] > 1))
5037 found = CATEGORY_MASK_CCL;
5038 }
5039 detect_info->rejected |= CATEGORY_MASK_CCL;
5040 return 0;
5041
5042 no_more_source:
5043 detect_info->found |= found;
5044 return 1;
5045 }
5046
5047 static void
5048 decode_coding_ccl (struct coding_system *coding)
5049 {
5050 const unsigned char *src = coding->source + coding->consumed;
5051 const unsigned char *src_end = coding->source + coding->src_bytes;
5052 int *charbuf = coding->charbuf + coding->charbuf_used;
5053 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5054 ptrdiff_t consumed_chars = 0;
5055 bool multibytep = coding->src_multibyte;
5056 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5057 int source_charbuf[1024];
5058 int source_byteidx[1025];
5059 Lisp_Object attrs, charset_list;
5060
5061 CODING_GET_INFO (coding, attrs, charset_list);
5062
5063 while (1)
5064 {
5065 const unsigned char *p = src;
5066 ptrdiff_t offset;
5067 int i = 0;
5068
5069 if (multibytep)
5070 {
5071 while (i < 1024 && p < src_end)
5072 {
5073 source_byteidx[i] = p - src;
5074 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5075 }
5076 source_byteidx[i] = p - src;
5077 }
5078 else
5079 while (i < 1024 && p < src_end)
5080 source_charbuf[i++] = *p++;
5081
5082 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5083 ccl->last_block = 1;
5084 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5085 charset_map_loaded = 0;
5086 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5087 charset_list);
5088 if (charset_map_loaded
5089 && (offset = coding_change_source (coding)))
5090 {
5091 p += offset;
5092 src += offset;
5093 src_end += offset;
5094 }
5095 charbuf += ccl->produced;
5096 if (multibytep)
5097 src += source_byteidx[ccl->consumed];
5098 else
5099 src += ccl->consumed;
5100 consumed_chars += ccl->consumed;
5101 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5102 break;
5103 }
5104
5105 switch (ccl->status)
5106 {
5107 case CCL_STAT_SUSPEND_BY_SRC:
5108 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5109 break;
5110 case CCL_STAT_SUSPEND_BY_DST:
5111 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5112 break;
5113 case CCL_STAT_QUIT:
5114 case CCL_STAT_INVALID_CMD:
5115 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5116 break;
5117 default:
5118 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5119 break;
5120 }
5121 coding->consumed_char += consumed_chars;
5122 coding->consumed = src - coding->source;
5123 coding->charbuf_used = charbuf - coding->charbuf;
5124 }
5125
5126 static bool
5127 encode_coding_ccl (struct coding_system *coding)
5128 {
5129 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5130 bool multibytep = coding->dst_multibyte;
5131 int *charbuf = coding->charbuf;
5132 int *charbuf_end = charbuf + coding->charbuf_used;
5133 unsigned char *dst = coding->destination + coding->produced;
5134 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5135 int destination_charbuf[1024];
5136 ptrdiff_t produced_chars = 0;
5137 int i;
5138 Lisp_Object attrs, charset_list;
5139
5140 CODING_GET_INFO (coding, attrs, charset_list);
5141 if (coding->consumed_char == coding->src_chars
5142 && coding->mode & CODING_MODE_LAST_BLOCK)
5143 ccl->last_block = 1;
5144
5145 do
5146 {
5147 ptrdiff_t offset;
5148
5149 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5150 charset_map_loaded = 0;
5151 ccl_driver (ccl, charbuf, destination_charbuf,
5152 charbuf_end - charbuf, 1024, charset_list);
5153 if (charset_map_loaded
5154 && (offset = coding_change_destination (coding)))
5155 dst += offset;
5156 if (multibytep)
5157 {
5158 ASSURE_DESTINATION (ccl->produced * 2);
5159 for (i = 0; i < ccl->produced; i++)
5160 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5161 }
5162 else
5163 {
5164 ASSURE_DESTINATION (ccl->produced);
5165 for (i = 0; i < ccl->produced; i++)
5166 *dst++ = destination_charbuf[i] & 0xFF;
5167 produced_chars += ccl->produced;
5168 }
5169 charbuf += ccl->consumed;
5170 if (ccl->status == CCL_STAT_QUIT
5171 || ccl->status == CCL_STAT_INVALID_CMD)
5172 break;
5173 }
5174 while (charbuf < charbuf_end);
5175
5176 switch (ccl->status)
5177 {
5178 case CCL_STAT_SUSPEND_BY_SRC:
5179 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5180 break;
5181 case CCL_STAT_SUSPEND_BY_DST:
5182 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5183 break;
5184 case CCL_STAT_QUIT:
5185 case CCL_STAT_INVALID_CMD:
5186 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5187 break;
5188 default:
5189 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5190 break;
5191 }
5192
5193 coding->produced_char += produced_chars;
5194 coding->produced = dst - coding->destination;
5195 return 0;
5196 }
5197
5198 \f
5199 /*** 10, 11. no-conversion handlers ***/
5200
5201 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5202
5203 static void
5204 decode_coding_raw_text (struct coding_system *coding)
5205 {
5206 bool eol_dos
5207 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5208
5209 coding->chars_at_source = 1;
5210 coding->consumed_char = coding->src_chars;
5211 coding->consumed = coding->src_bytes;
5212 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5213 {
5214 coding->consumed_char--;
5215 coding->consumed--;
5216 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5217 }
5218 else
5219 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5220 }
5221
5222 static bool
5223 encode_coding_raw_text (struct coding_system *coding)
5224 {
5225 bool multibytep = coding->dst_multibyte;
5226 int *charbuf = coding->charbuf;
5227 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5228 unsigned char *dst = coding->destination + coding->produced;
5229 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5230 ptrdiff_t produced_chars = 0;
5231 int c;
5232
5233 if (multibytep)
5234 {
5235 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5236
5237 if (coding->src_multibyte)
5238 while (charbuf < charbuf_end)
5239 {
5240 ASSURE_DESTINATION (safe_room);
5241 c = *charbuf++;
5242 if (ASCII_CHAR_P (c))
5243 EMIT_ONE_ASCII_BYTE (c);
5244 else if (CHAR_BYTE8_P (c))
5245 {
5246 c = CHAR_TO_BYTE8 (c);
5247 EMIT_ONE_BYTE (c);
5248 }
5249 else
5250 {
5251 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5252
5253 CHAR_STRING_ADVANCE (c, p1);
5254 do
5255 {
5256 EMIT_ONE_BYTE (*p0);
5257 p0++;
5258 }
5259 while (p0 < p1);
5260 }
5261 }
5262 else
5263 while (charbuf < charbuf_end)
5264 {
5265 ASSURE_DESTINATION (safe_room);
5266 c = *charbuf++;
5267 EMIT_ONE_BYTE (c);
5268 }
5269 }
5270 else
5271 {
5272 if (coding->src_multibyte)
5273 {
5274 int safe_room = MAX_MULTIBYTE_LENGTH;
5275
5276 while (charbuf < charbuf_end)
5277 {
5278 ASSURE_DESTINATION (safe_room);
5279 c = *charbuf++;
5280 if (ASCII_CHAR_P (c))
5281 *dst++ = c;
5282 else if (CHAR_BYTE8_P (c))
5283 *dst++ = CHAR_TO_BYTE8 (c);
5284 else
5285 CHAR_STRING_ADVANCE (c, dst);
5286 }
5287 }
5288 else
5289 {
5290 ASSURE_DESTINATION (charbuf_end - charbuf);
5291 while (charbuf < charbuf_end && dst < dst_end)
5292 *dst++ = *charbuf++;
5293 }
5294 produced_chars = dst - (coding->destination + coding->produced);
5295 }
5296 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5297 coding->produced_char += produced_chars;
5298 coding->produced = dst - coding->destination;
5299 return 0;
5300 }
5301
5302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5303 Return true if a text is encoded in a charset-based coding system. */
5304
5305 static bool
5306 detect_coding_charset (struct coding_system *coding,
5307 struct coding_detection_info *detect_info)
5308 {
5309 const unsigned char *src = coding->source, *src_base;
5310 const unsigned char *src_end = coding->source + coding->src_bytes;
5311 bool multibytep = coding->src_multibyte;
5312 ptrdiff_t consumed_chars = 0;
5313 Lisp_Object attrs, valids, name;
5314 int found = 0;
5315 ptrdiff_t head_ascii = coding->head_ascii;
5316 bool check_latin_extra = 0;
5317
5318 detect_info->checked |= CATEGORY_MASK_CHARSET;
5319
5320 coding = &coding_categories[coding_category_charset];
5321 attrs = CODING_ID_ATTRS (coding->id);
5322 valids = AREF (attrs, coding_attr_charset_valids);
5323 name = CODING_ID_NAME (coding->id);
5324 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5325 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5326 || strncmp (SSDATA (SYMBOL_NAME (name)),
5327 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5328 check_latin_extra = 1;
5329
5330 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5331 src += head_ascii;
5332
5333 while (1)
5334 {
5335 int c;
5336 Lisp_Object val;
5337 struct charset *charset;
5338 int dim, idx;
5339
5340 src_base = src;
5341 ONE_MORE_BYTE (c);
5342 if (c < 0)
5343 continue;
5344 val = AREF (valids, c);
5345 if (NILP (val))
5346 break;
5347 if (c >= 0x80)
5348 {
5349 if (c < 0xA0
5350 && check_latin_extra
5351 && (!VECTORP (Vlatin_extra_code_table)
5352 || NILP (AREF (Vlatin_extra_code_table, c))))
5353 break;
5354 found = CATEGORY_MASK_CHARSET;
5355 }
5356 if (INTEGERP (val))
5357 {
5358 charset = CHARSET_FROM_ID (XFASTINT (val));
5359 dim = CHARSET_DIMENSION (charset);
5360 for (idx = 1; idx < dim; idx++)
5361 {
5362 if (src == src_end)
5363 goto too_short;
5364 ONE_MORE_BYTE (c);
5365 if (c < charset->code_space[(dim - 1 - idx) * 4]
5366 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5367 break;
5368 }
5369 if (idx < dim)
5370 break;
5371 }
5372 else
5373 {
5374 idx = 1;
5375 for (; CONSP (val); val = XCDR (val))
5376 {
5377 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5378 dim = CHARSET_DIMENSION (charset);
5379 while (idx < dim)
5380 {
5381 if (src == src_end)
5382 goto too_short;
5383 ONE_MORE_BYTE (c);
5384 if (c < charset->code_space[(dim - 1 - idx) * 4]
5385 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5386 break;
5387 idx++;
5388 }
5389 if (idx == dim)
5390 {
5391 val = Qnil;
5392 break;
5393 }
5394 }
5395 if (CONSP (val))
5396 break;
5397 }
5398 }
5399 too_short:
5400 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5401 return 0;
5402
5403 no_more_source:
5404 detect_info->found |= found;
5405 return 1;
5406 }
5407
5408 static void
5409 decode_coding_charset (struct coding_system *coding)
5410 {
5411 const unsigned char *src = coding->source + coding->consumed;
5412 const unsigned char *src_end = coding->source + coding->src_bytes;
5413 const unsigned char *src_base;
5414 int *charbuf = coding->charbuf + coding->charbuf_used;
5415 /* We may produce one charset annotation in one loop and one more at
5416 the end. */
5417 int *charbuf_end
5418 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5419 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5420 bool multibytep = coding->src_multibyte;
5421 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5422 Lisp_Object valids;
5423 ptrdiff_t char_offset = coding->produced_char;
5424 ptrdiff_t last_offset = char_offset;
5425 int last_id = charset_ascii;
5426 bool eol_dos
5427 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5428 int byte_after_cr = -1;
5429
5430 valids = AREF (attrs, coding_attr_charset_valids);
5431
5432 while (1)
5433 {
5434 int c;
5435 Lisp_Object val;
5436 struct charset *charset;
5437 int dim;
5438 int len = 1;
5439 unsigned code;
5440
5441 src_base = src;
5442 consumed_chars_base = consumed_chars;
5443
5444 if (charbuf >= charbuf_end)
5445 {
5446 if (byte_after_cr >= 0)
5447 src_base--;
5448 break;
5449 }
5450
5451 if (byte_after_cr >= 0)
5452 {
5453 c = byte_after_cr;
5454 byte_after_cr = -1;
5455 }
5456 else
5457 {
5458 ONE_MORE_BYTE (c);
5459 if (eol_dos && c == '\r')
5460 ONE_MORE_BYTE (byte_after_cr);
5461 }
5462 if (c < 0)
5463 goto invalid_code;
5464 code = c;
5465
5466 val = AREF (valids, c);
5467 if (! INTEGERP (val) && ! CONSP (val))
5468 goto invalid_code;
5469 if (INTEGERP (val))
5470 {
5471 charset = CHARSET_FROM_ID (XFASTINT (val));
5472 dim = CHARSET_DIMENSION (charset);
5473 while (len < dim)
5474 {
5475 ONE_MORE_BYTE (c);
5476 code = (code << 8) | c;
5477 len++;
5478 }
5479 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5480 charset, code, c);
5481 }
5482 else
5483 {
5484 /* VAL is a list of charset IDs. It is assured that the
5485 list is sorted by charset dimensions (smaller one
5486 comes first). */
5487 while (CONSP (val))
5488 {
5489 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5490 dim = CHARSET_DIMENSION (charset);
5491 while (len < dim)
5492 {
5493 ONE_MORE_BYTE (c);
5494 code = (code << 8) | c;
5495 len++;
5496 }
5497 CODING_DECODE_CHAR (coding, src, src_base,
5498 src_end, charset, code, c);
5499 if (c >= 0)
5500 break;
5501 val = XCDR (val);
5502 }
5503 }
5504 if (c < 0)
5505 goto invalid_code;
5506 if (charset->id != charset_ascii
5507 && last_id != charset->id)
5508 {
5509 if (last_id != charset_ascii)
5510 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5511 last_id = charset->id;
5512 last_offset = char_offset;
5513 }
5514
5515 *charbuf++ = c;
5516 char_offset++;
5517 continue;
5518
5519 invalid_code:
5520 src = src_base;
5521 consumed_chars = consumed_chars_base;
5522 ONE_MORE_BYTE (c);
5523 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5524 char_offset++;
5525 coding->errors++;
5526 }
5527
5528 no_more_source:
5529 if (last_id != charset_ascii)
5530 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5531 coding->consumed_char += consumed_chars_base;
5532 coding->consumed = src_base - coding->source;
5533 coding->charbuf_used = charbuf - coding->charbuf;
5534 }
5535
5536 static bool
5537 encode_coding_charset (struct coding_system *coding)
5538 {
5539 bool multibytep = coding->dst_multibyte;
5540 int *charbuf = coding->charbuf;
5541 int *charbuf_end = charbuf + coding->charbuf_used;
5542 unsigned char *dst = coding->destination + coding->produced;
5543 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5544 int safe_room = MAX_MULTIBYTE_LENGTH;
5545 ptrdiff_t produced_chars = 0;
5546 Lisp_Object attrs, charset_list;
5547 bool ascii_compatible;
5548 int c;
5549
5550 CODING_GET_INFO (coding, attrs, charset_list);
5551 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5552
5553 while (charbuf < charbuf_end)
5554 {
5555 struct charset *charset;
5556 unsigned code;
5557
5558 ASSURE_DESTINATION (safe_room);
5559 c = *charbuf++;
5560 if (ascii_compatible && ASCII_CHAR_P (c))
5561 EMIT_ONE_ASCII_BYTE (c);
5562 else if (CHAR_BYTE8_P (c))
5563 {
5564 c = CHAR_TO_BYTE8 (c);
5565 EMIT_ONE_BYTE (c);
5566 }
5567 else
5568 {
5569 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5570 &code, charset);
5571
5572 if (charset)
5573 {
5574 if (CHARSET_DIMENSION (charset) == 1)
5575 EMIT_ONE_BYTE (code);
5576 else if (CHARSET_DIMENSION (charset) == 2)
5577 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5578 else if (CHARSET_DIMENSION (charset) == 3)
5579 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5580 else
5581 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5582 (code >> 8) & 0xFF, code & 0xFF);
5583 }
5584 else
5585 {
5586 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5587 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5588 else
5589 c = coding->default_char;
5590 EMIT_ONE_BYTE (c);
5591 }
5592 }
5593 }
5594
5595 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5596 coding->produced_char += produced_chars;
5597 coding->produced = dst - coding->destination;
5598 return 0;
5599 }
5600
5601 \f
5602 /*** 7. C library functions ***/
5603
5604 /* Setup coding context CODING from information about CODING_SYSTEM.
5605 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5606 CODING_SYSTEM is invalid, signal an error. */
5607
5608 void
5609 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5610 {
5611 Lisp_Object attrs;
5612 Lisp_Object eol_type;
5613 Lisp_Object coding_type;
5614 Lisp_Object val;
5615
5616 if (NILP (coding_system))
5617 coding_system = Qundecided;
5618
5619 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5620
5621 attrs = CODING_ID_ATTRS (coding->id);
5622 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5623
5624 coding->mode = 0;
5625 coding->head_ascii = -1;
5626 if (VECTORP (eol_type))
5627 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5628 | CODING_REQUIRE_DETECTION_MASK);
5629 else if (! EQ (eol_type, Qunix))
5630 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5631 | CODING_REQUIRE_ENCODING_MASK);
5632 else
5633 coding->common_flags = 0;
5634 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5635 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5636 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5637 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5638 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5639 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5640
5641 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5642 coding->max_charset_id = SCHARS (val) - 1;
5643 coding->safe_charsets = SDATA (val);
5644 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5645 coding->carryover_bytes = 0;
5646
5647 coding_type = CODING_ATTR_TYPE (attrs);
5648 if (EQ (coding_type, Qundecided))
5649 {
5650 coding->detector = NULL;
5651 coding->decoder = decode_coding_raw_text;
5652 coding->encoder = encode_coding_raw_text;
5653 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5654 }
5655 else if (EQ (coding_type, Qiso_2022))
5656 {
5657 int i;
5658 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5659
5660 /* Invoke graphic register 0 to plane 0. */
5661 CODING_ISO_INVOCATION (coding, 0) = 0;
5662 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5663 CODING_ISO_INVOCATION (coding, 1)
5664 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5665 /* Setup the initial status of designation. */
5666 for (i = 0; i < 4; i++)
5667 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5668 /* Not single shifting initially. */
5669 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5670 /* Beginning of buffer should also be regarded as bol. */
5671 CODING_ISO_BOL (coding) = 1;
5672 coding->detector = detect_coding_iso_2022;
5673 coding->decoder = decode_coding_iso_2022;
5674 coding->encoder = encode_coding_iso_2022;
5675 if (flags & CODING_ISO_FLAG_SAFE)
5676 coding->mode |= CODING_MODE_SAFE_ENCODING;
5677 coding->common_flags
5678 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5679 | CODING_REQUIRE_FLUSHING_MASK);
5680 if (flags & CODING_ISO_FLAG_COMPOSITION)
5681 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5682 if (flags & CODING_ISO_FLAG_DESIGNATION)
5683 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5684 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5685 {
5686 setup_iso_safe_charsets (attrs);
5687 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5688 coding->max_charset_id = SCHARS (val) - 1;
5689 coding->safe_charsets = SDATA (val);
5690 }
5691 CODING_ISO_FLAGS (coding) = flags;
5692 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5693 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5694 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5695 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5696 }
5697 else if (EQ (coding_type, Qcharset))
5698 {
5699 coding->detector = detect_coding_charset;
5700 coding->decoder = decode_coding_charset;
5701 coding->encoder = encode_coding_charset;
5702 coding->common_flags
5703 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5704 }
5705 else if (EQ (coding_type, Qutf_8))
5706 {
5707 val = AREF (attrs, coding_attr_utf_bom);
5708 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5709 : EQ (val, Qt) ? utf_with_bom
5710 : utf_without_bom);
5711 coding->detector = detect_coding_utf_8;
5712 coding->decoder = decode_coding_utf_8;
5713 coding->encoder = encode_coding_utf_8;
5714 coding->common_flags
5715 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5716 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5717 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5718 }
5719 else if (EQ (coding_type, Qutf_16))
5720 {
5721 val = AREF (attrs, coding_attr_utf_bom);
5722 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5723 : EQ (val, Qt) ? utf_with_bom
5724 : utf_without_bom);
5725 val = AREF (attrs, coding_attr_utf_16_endian);
5726 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5727 : utf_16_little_endian);
5728 CODING_UTF_16_SURROGATE (coding) = 0;
5729 coding->detector = detect_coding_utf_16;
5730 coding->decoder = decode_coding_utf_16;
5731 coding->encoder = encode_coding_utf_16;
5732 coding->common_flags
5733 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5734 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5735 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5736 }
5737 else if (EQ (coding_type, Qccl))
5738 {
5739 coding->detector = detect_coding_ccl;
5740 coding->decoder = decode_coding_ccl;
5741 coding->encoder = encode_coding_ccl;
5742 coding->common_flags
5743 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5744 | CODING_REQUIRE_FLUSHING_MASK);
5745 }
5746 else if (EQ (coding_type, Qemacs_mule))
5747 {
5748 coding->detector = detect_coding_emacs_mule;
5749 coding->decoder = decode_coding_emacs_mule;
5750 coding->encoder = encode_coding_emacs_mule;
5751 coding->common_flags
5752 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5753 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5754 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5755 {
5756 Lisp_Object tail, safe_charsets;
5757 int max_charset_id = 0;
5758
5759 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5760 tail = XCDR (tail))
5761 if (max_charset_id < XFASTINT (XCAR (tail)))
5762 max_charset_id = XFASTINT (XCAR (tail));
5763 safe_charsets = make_uninit_string (max_charset_id + 1);
5764 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5765 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5766 tail = XCDR (tail))
5767 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5768 coding->max_charset_id = max_charset_id;
5769 coding->safe_charsets = SDATA (safe_charsets);
5770 }
5771 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5772 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5773 }
5774 else if (EQ (coding_type, Qshift_jis))
5775 {
5776 coding->detector = detect_coding_sjis;
5777 coding->decoder = decode_coding_sjis;
5778 coding->encoder = encode_coding_sjis;
5779 coding->common_flags
5780 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5781 }
5782 else if (EQ (coding_type, Qbig5))
5783 {
5784 coding->detector = detect_coding_big5;
5785 coding->decoder = decode_coding_big5;
5786 coding->encoder = encode_coding_big5;
5787 coding->common_flags
5788 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5789 }
5790 else /* EQ (coding_type, Qraw_text) */
5791 {
5792 coding->detector = NULL;
5793 coding->decoder = decode_coding_raw_text;
5794 coding->encoder = encode_coding_raw_text;
5795 if (! EQ (eol_type, Qunix))
5796 {
5797 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5798 if (! VECTORP (eol_type))
5799 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5800 }
5801
5802 }
5803
5804 return;
5805 }
5806
5807 /* Return a list of charsets supported by CODING. */
5808
5809 Lisp_Object
5810 coding_charset_list (struct coding_system *coding)
5811 {
5812 Lisp_Object attrs, charset_list;
5813
5814 CODING_GET_INFO (coding, attrs, charset_list);
5815 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5816 {
5817 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5818
5819 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5820 charset_list = Viso_2022_charset_list;
5821 }
5822 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5823 {
5824 charset_list = Vemacs_mule_charset_list;
5825 }
5826 return charset_list;
5827 }
5828
5829
5830 /* Return a list of charsets supported by CODING-SYSTEM. */
5831
5832 Lisp_Object
5833 coding_system_charset_list (Lisp_Object coding_system)
5834 {
5835 ptrdiff_t id;
5836 Lisp_Object attrs, charset_list;
5837
5838 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5839 attrs = CODING_ID_ATTRS (id);
5840
5841 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5842 {
5843 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5844
5845 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5846 charset_list = Viso_2022_charset_list;
5847 else
5848 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5849 }
5850 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5851 {
5852 charset_list = Vemacs_mule_charset_list;
5853 }
5854 else
5855 {
5856 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5857 }
5858 return charset_list;
5859 }
5860
5861
5862 /* Return raw-text or one of its subsidiaries that has the same
5863 eol_type as CODING-SYSTEM. */
5864
5865 Lisp_Object
5866 raw_text_coding_system (Lisp_Object coding_system)
5867 {
5868 Lisp_Object spec, attrs;
5869 Lisp_Object eol_type, raw_text_eol_type;
5870
5871 if (NILP (coding_system))
5872 return Qraw_text;
5873 spec = CODING_SYSTEM_SPEC (coding_system);
5874 attrs = AREF (spec, 0);
5875
5876 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5877 return coding_system;
5878
5879 eol_type = AREF (spec, 2);
5880 if (VECTORP (eol_type))
5881 return Qraw_text;
5882 spec = CODING_SYSTEM_SPEC (Qraw_text);
5883 raw_text_eol_type = AREF (spec, 2);
5884 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5885 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5886 : AREF (raw_text_eol_type, 2));
5887 }
5888
5889
5890 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5891 the subsidiary that has the same eol-spec as PARENT (if it is not
5892 nil and specifies end-of-line format) or the system's setting
5893 (system_eol_type). */
5894
5895 Lisp_Object
5896 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5897 {
5898 Lisp_Object spec, eol_type;
5899
5900 if (NILP (coding_system))
5901 coding_system = Qraw_text;
5902 spec = CODING_SYSTEM_SPEC (coding_system);
5903 eol_type = AREF (spec, 2);
5904 if (VECTORP (eol_type))
5905 {
5906 Lisp_Object parent_eol_type;
5907
5908 if (! NILP (parent))
5909 {
5910 Lisp_Object parent_spec;
5911
5912 parent_spec = CODING_SYSTEM_SPEC (parent);
5913 parent_eol_type = AREF (parent_spec, 2);
5914 if (VECTORP (parent_eol_type))
5915 parent_eol_type = system_eol_type;
5916 }
5917 else
5918 parent_eol_type = system_eol_type;
5919 if (EQ (parent_eol_type, Qunix))
5920 coding_system = AREF (eol_type, 0);
5921 else if (EQ (parent_eol_type, Qdos))
5922 coding_system = AREF (eol_type, 1);
5923 else if (EQ (parent_eol_type, Qmac))
5924 coding_system = AREF (eol_type, 2);
5925 }
5926 return coding_system;
5927 }
5928
5929
5930 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5931 decided for writing to a process. If not, complement them, and
5932 return a new coding system. */
5933
5934 Lisp_Object
5935 complement_process_encoding_system (Lisp_Object coding_system)
5936 {
5937 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5938 Lisp_Object spec, attrs;
5939 int i;
5940
5941 for (i = 0; i < 3; i++)
5942 {
5943 if (i == 1)
5944 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5945 else if (i == 2)
5946 coding_system = preferred_coding_system ();
5947 spec = CODING_SYSTEM_SPEC (coding_system);
5948 if (NILP (spec))
5949 continue;
5950 attrs = AREF (spec, 0);
5951 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5952 coding_base = CODING_ATTR_BASE_NAME (attrs);
5953 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5954 eol_base = coding_system;
5955 if (! NILP (coding_base) && ! NILP (eol_base))
5956 break;
5957 }
5958
5959 if (i > 0)
5960 /* The original CODING_SYSTEM didn't specify text-conversion or
5961 eol-conversion. Be sure that we return a fully complemented
5962 coding system. */
5963 coding_system = coding_inherit_eol_type (coding_base, eol_base);
5964 return coding_system;
5965 }
5966
5967
5968 /* Emacs has a mechanism to automatically detect a coding system if it
5969 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5970 it's impossible to distinguish some coding systems accurately
5971 because they use the same range of codes. So, at first, coding
5972 systems are categorized into 7, those are:
5973
5974 o coding-category-emacs-mule
5975
5976 The category for a coding system which has the same code range
5977 as Emacs' internal format. Assigned the coding-system (Lisp
5978 symbol) `emacs-mule' by default.
5979
5980 o coding-category-sjis
5981
5982 The category for a coding system which has the same code range
5983 as SJIS. Assigned the coding-system (Lisp
5984 symbol) `japanese-shift-jis' by default.
5985
5986 o coding-category-iso-7
5987
5988 The category for a coding system which has the same code range
5989 as ISO2022 of 7-bit environment. This doesn't use any locking
5990 shift and single shift functions. This can encode/decode all
5991 charsets. Assigned the coding-system (Lisp symbol)
5992 `iso-2022-7bit' by default.
5993
5994 o coding-category-iso-7-tight
5995
5996 Same as coding-category-iso-7 except that this can
5997 encode/decode only the specified charsets.
5998
5999 o coding-category-iso-8-1
6000
6001 The category for a coding system which has the same code range
6002 as ISO2022 of 8-bit environment and graphic plane 1 used only
6003 for DIMENSION1 charset. This doesn't use any locking shift
6004 and single shift functions. Assigned the coding-system (Lisp
6005 symbol) `iso-latin-1' by default.
6006
6007 o coding-category-iso-8-2
6008
6009 The category for a coding system which has the same code range
6010 as ISO2022 of 8-bit environment and graphic plane 1 used only
6011 for DIMENSION2 charset. This doesn't use any locking shift
6012 and single shift functions. Assigned the coding-system (Lisp
6013 symbol) `japanese-iso-8bit' by default.
6014
6015 o coding-category-iso-7-else
6016
6017 The category for a coding system which has the same code range
6018 as ISO2022 of 7-bit environment but uses locking shift or
6019 single shift functions. Assigned the coding-system (Lisp
6020 symbol) `iso-2022-7bit-lock' by default.
6021
6022 o coding-category-iso-8-else
6023
6024 The category for a coding system which has the same code range
6025 as ISO2022 of 8-bit environment but uses locking shift or
6026 single shift functions. Assigned the coding-system (Lisp
6027 symbol) `iso-2022-8bit-ss2' by default.
6028
6029 o coding-category-big5
6030
6031 The category for a coding system which has the same code range
6032 as BIG5. Assigned the coding-system (Lisp symbol)
6033 `cn-big5' by default.
6034
6035 o coding-category-utf-8
6036
6037 The category for a coding system which has the same code range
6038 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6039 symbol) `utf-8' by default.
6040
6041 o coding-category-utf-16-be
6042
6043 The category for a coding system in which a text has an
6044 Unicode signature (cf. Unicode Standard) in the order of BIG
6045 endian at the head. Assigned the coding-system (Lisp symbol)
6046 `utf-16-be' by default.
6047
6048 o coding-category-utf-16-le
6049
6050 The category for a coding system in which a text has an
6051 Unicode signature (cf. Unicode Standard) in the order of
6052 LITTLE endian at the head. Assigned the coding-system (Lisp
6053 symbol) `utf-16-le' by default.
6054
6055 o coding-category-ccl
6056
6057 The category for a coding system of which encoder/decoder is
6058 written in CCL programs. The default value is nil, i.e., no
6059 coding system is assigned.
6060
6061 o coding-category-binary
6062
6063 The category for a coding system not categorized in any of the
6064 above. Assigned the coding-system (Lisp symbol)
6065 `no-conversion' by default.
6066
6067 Each of them is a Lisp symbol and the value is an actual
6068 `coding-system's (this is also a Lisp symbol) assigned by a user.
6069 What Emacs does actually is to detect a category of coding system.
6070 Then, it uses a `coding-system' assigned to it. If Emacs can't
6071 decide only one possible category, it selects a category of the
6072 highest priority. Priorities of categories are also specified by a
6073 user in a Lisp variable `coding-category-list'.
6074
6075 */
6076
6077 #define EOL_SEEN_NONE 0
6078 #define EOL_SEEN_LF 1
6079 #define EOL_SEEN_CR 2
6080 #define EOL_SEEN_CRLF 4
6081
6082 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6083 SOURCE is encoded. If CATEGORY is one of
6084 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6085 two-byte, else they are encoded by one-byte.
6086
6087 Return one of EOL_SEEN_XXX. */
6088
6089 #define MAX_EOL_CHECK_COUNT 3
6090
6091 static int
6092 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6093 enum coding_category category)
6094 {
6095 const unsigned char *src = source, *src_end = src + src_bytes;
6096 unsigned char c;
6097 int total = 0;
6098 int eol_seen = EOL_SEEN_NONE;
6099
6100 if ((1 << category) & CATEGORY_MASK_UTF_16)
6101 {
6102 bool msb = category == (coding_category_utf_16_le
6103 | coding_category_utf_16_le_nosig);
6104 bool lsb = !msb;
6105
6106 while (src + 1 < src_end)
6107 {
6108 c = src[lsb];
6109 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6110 {
6111 int this_eol;
6112
6113 if (c == '\n')
6114 this_eol = EOL_SEEN_LF;
6115 else if (src + 3 >= src_end
6116 || src[msb + 2] != 0
6117 || src[lsb + 2] != '\n')
6118 this_eol = EOL_SEEN_CR;
6119 else
6120 {
6121 this_eol = EOL_SEEN_CRLF;
6122 src += 2;
6123 }
6124
6125 if (eol_seen == EOL_SEEN_NONE)
6126 /* This is the first end-of-line. */
6127 eol_seen = this_eol;
6128 else if (eol_seen != this_eol)
6129 {
6130 /* The found type is different from what found before.
6131 Allow for stray ^M characters in DOS EOL files. */
6132 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6133 || (eol_seen == EOL_SEEN_CRLF
6134 && this_eol == EOL_SEEN_CR))
6135 eol_seen = EOL_SEEN_CRLF;
6136 else
6137 {
6138 eol_seen = EOL_SEEN_LF;
6139 break;
6140 }
6141 }
6142 if (++total == MAX_EOL_CHECK_COUNT)
6143 break;
6144 }
6145 src += 2;
6146 }
6147 }
6148 else
6149 while (src < src_end)
6150 {
6151 c = *src++;
6152 if (c == '\n' || c == '\r')
6153 {
6154 int this_eol;
6155
6156 if (c == '\n')
6157 this_eol = EOL_SEEN_LF;
6158 else if (src >= src_end || *src != '\n')
6159 this_eol = EOL_SEEN_CR;
6160 else
6161 this_eol = EOL_SEEN_CRLF, src++;
6162
6163 if (eol_seen == EOL_SEEN_NONE)
6164 /* This is the first end-of-line. */
6165 eol_seen = this_eol;
6166 else if (eol_seen != this_eol)
6167 {
6168 /* The found type is different from what found before.
6169 Allow for stray ^M characters in DOS EOL files. */
6170 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6171 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6172 eol_seen = EOL_SEEN_CRLF;
6173 else
6174 {
6175 eol_seen = EOL_SEEN_LF;
6176 break;
6177 }
6178 }
6179 if (++total == MAX_EOL_CHECK_COUNT)
6180 break;
6181 }
6182 }
6183 return eol_seen;
6184 }
6185
6186
6187 static Lisp_Object
6188 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6189 {
6190 Lisp_Object eol_type;
6191
6192 eol_type = CODING_ID_EOL_TYPE (coding->id);
6193 if (eol_seen & EOL_SEEN_LF)
6194 {
6195 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6196 eol_type = Qunix;
6197 }
6198 else if (eol_seen & EOL_SEEN_CRLF)
6199 {
6200 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6201 eol_type = Qdos;
6202 }
6203 else if (eol_seen & EOL_SEEN_CR)
6204 {
6205 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6206 eol_type = Qmac;
6207 }
6208 return eol_type;
6209 }
6210
6211 /* Detect how a text specified in CODING is encoded. If a coding
6212 system is detected, update fields of CODING by the detected coding
6213 system. */
6214
6215 static void
6216 detect_coding (struct coding_system *coding)
6217 {
6218 const unsigned char *src, *src_end;
6219 unsigned int saved_mode = coding->mode;
6220
6221 coding->consumed = coding->consumed_char = 0;
6222 coding->produced = coding->produced_char = 0;
6223 coding_set_source (coding);
6224
6225 src_end = coding->source + coding->src_bytes;
6226 coding->head_ascii = 0;
6227
6228 /* If we have not yet decided the text encoding type, detect it
6229 now. */
6230 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6231 {
6232 int c, i;
6233 struct coding_detection_info detect_info;
6234 bool null_byte_found = 0, eight_bit_found = 0;
6235
6236 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6237 for (src = coding->source; src < src_end; src++)
6238 {
6239 c = *src;
6240 if (c & 0x80)
6241 {
6242 eight_bit_found = 1;
6243 if (null_byte_found)
6244 break;
6245 }
6246 else if (c < 0x20)
6247 {
6248 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6249 && ! inhibit_iso_escape_detection
6250 && ! detect_info.checked)
6251 {
6252 if (detect_coding_iso_2022 (coding, &detect_info))
6253 {
6254 /* We have scanned the whole data. */
6255 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6256 {
6257 /* We didn't find an 8-bit code. We may
6258 have found a null-byte, but it's very
6259 rare that a binary file conforms to
6260 ISO-2022. */
6261 src = src_end;
6262 coding->head_ascii = src - coding->source;
6263 }
6264 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6265 break;
6266 }
6267 }
6268 else if (! c && !inhibit_null_byte_detection)
6269 {
6270 null_byte_found = 1;
6271 if (eight_bit_found)
6272 break;
6273 }
6274 if (! eight_bit_found)
6275 coding->head_ascii++;
6276 }
6277 else if (! eight_bit_found)
6278 coding->head_ascii++;
6279 }
6280
6281 if (null_byte_found || eight_bit_found
6282 || coding->head_ascii < coding->src_bytes
6283 || detect_info.found)
6284 {
6285 enum coding_category category;
6286 struct coding_system *this;
6287
6288 if (coding->head_ascii == coding->src_bytes)
6289 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6290 for (i = 0; i < coding_category_raw_text; i++)
6291 {
6292 category = coding_priorities[i];
6293 this = coding_categories + category;
6294 if (detect_info.found & (1 << category))
6295 break;
6296 }
6297 else
6298 {
6299 if (null_byte_found)
6300 {
6301 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6302 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6303 }
6304 for (i = 0; i < coding_category_raw_text; i++)
6305 {
6306 category = coding_priorities[i];
6307 this = coding_categories + category;
6308 /* Some of this->detector (e.g. detect_coding_sjis)
6309 require this information. */
6310 coding->id = this->id;
6311 if (this->id < 0)
6312 {
6313 /* No coding system of this category is defined. */
6314 detect_info.rejected |= (1 << category);
6315 }
6316 else if (category >= coding_category_raw_text)
6317 continue;
6318 else if (detect_info.checked & (1 << category))
6319 {
6320 if (detect_info.found & (1 << category))
6321 break;
6322 }
6323 else if ((*(this->detector)) (coding, &detect_info)
6324 && detect_info.found & (1 << category))
6325 {
6326 if (category == coding_category_utf_16_auto)
6327 {
6328 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6329 category = coding_category_utf_16_le;
6330 else
6331 category = coding_category_utf_16_be;
6332 }
6333 break;
6334 }
6335 }
6336 }
6337
6338 if (i < coding_category_raw_text)
6339 setup_coding_system (CODING_ID_NAME (this->id), coding);
6340 else if (null_byte_found)
6341 setup_coding_system (Qno_conversion, coding);
6342 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6343 == CATEGORY_MASK_ANY)
6344 setup_coding_system (Qraw_text, coding);
6345 else if (detect_info.rejected)
6346 for (i = 0; i < coding_category_raw_text; i++)
6347 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6348 {
6349 this = coding_categories + coding_priorities[i];
6350 setup_coding_system (CODING_ID_NAME (this->id), coding);
6351 break;
6352 }
6353 }
6354 }
6355 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6356 == coding_category_utf_8_auto)
6357 {
6358 Lisp_Object coding_systems;
6359 struct coding_detection_info detect_info;
6360
6361 coding_systems
6362 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6363 detect_info.found = detect_info.rejected = 0;
6364 coding->head_ascii = 0;
6365 if (CONSP (coding_systems)
6366 && detect_coding_utf_8 (coding, &detect_info))
6367 {
6368 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6369 setup_coding_system (XCAR (coding_systems), coding);
6370 else
6371 setup_coding_system (XCDR (coding_systems), coding);
6372 }
6373 }
6374 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6375 == coding_category_utf_16_auto)
6376 {
6377 Lisp_Object coding_systems;
6378 struct coding_detection_info detect_info;
6379
6380 coding_systems
6381 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6382 detect_info.found = detect_info.rejected = 0;
6383 coding->head_ascii = 0;
6384 if (CONSP (coding_systems)
6385 && detect_coding_utf_16 (coding, &detect_info))
6386 {
6387 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6388 setup_coding_system (XCAR (coding_systems), coding);
6389 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6390 setup_coding_system (XCDR (coding_systems), coding);
6391 }
6392 }
6393 coding->mode = saved_mode;
6394 }
6395
6396
6397 static void
6398 decode_eol (struct coding_system *coding)
6399 {
6400 Lisp_Object eol_type;
6401 unsigned char *p, *pbeg, *pend;
6402
6403 eol_type = CODING_ID_EOL_TYPE (coding->id);
6404 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6405 return;
6406
6407 if (NILP (coding->dst_object))
6408 pbeg = coding->destination;
6409 else
6410 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6411 pend = pbeg + coding->produced;
6412
6413 if (VECTORP (eol_type))
6414 {
6415 int eol_seen = EOL_SEEN_NONE;
6416
6417 for (p = pbeg; p < pend; p++)
6418 {
6419 if (*p == '\n')
6420 eol_seen |= EOL_SEEN_LF;
6421 else if (*p == '\r')
6422 {
6423 if (p + 1 < pend && *(p + 1) == '\n')
6424 {
6425 eol_seen |= EOL_SEEN_CRLF;
6426 p++;
6427 }
6428 else
6429 eol_seen |= EOL_SEEN_CR;
6430 }
6431 }
6432 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6433 if ((eol_seen & EOL_SEEN_CRLF) != 0
6434 && (eol_seen & EOL_SEEN_CR) != 0
6435 && (eol_seen & EOL_SEEN_LF) == 0)
6436 eol_seen = EOL_SEEN_CRLF;
6437 else if (eol_seen != EOL_SEEN_NONE
6438 && eol_seen != EOL_SEEN_LF
6439 && eol_seen != EOL_SEEN_CRLF
6440 && eol_seen != EOL_SEEN_CR)
6441 eol_seen = EOL_SEEN_LF;
6442 if (eol_seen != EOL_SEEN_NONE)
6443 eol_type = adjust_coding_eol_type (coding, eol_seen);
6444 }
6445
6446 if (EQ (eol_type, Qmac))
6447 {
6448 for (p = pbeg; p < pend; p++)
6449 if (*p == '\r')
6450 *p = '\n';
6451 }
6452 else if (EQ (eol_type, Qdos))
6453 {
6454 ptrdiff_t n = 0;
6455
6456 if (NILP (coding->dst_object))
6457 {
6458 /* Start deleting '\r' from the tail to minimize the memory
6459 movement. */
6460 for (p = pend - 2; p >= pbeg; p--)
6461 if (*p == '\r')
6462 {
6463 memmove (p, p + 1, pend-- - p - 1);
6464 n++;
6465 }
6466 }
6467 else
6468 {
6469 ptrdiff_t pos_byte = coding->dst_pos_byte;
6470 ptrdiff_t pos = coding->dst_pos;
6471 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6472
6473 while (pos < pos_end)
6474 {
6475 p = BYTE_POS_ADDR (pos_byte);
6476 if (*p == '\r' && p[1] == '\n')
6477 {
6478 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6479 n++;
6480 pos_end--;
6481 }
6482 pos++;
6483 if (coding->dst_multibyte)
6484 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6485 else
6486 pos_byte++;
6487 }
6488 }
6489 coding->produced -= n;
6490 coding->produced_char -= n;
6491 }
6492 }
6493
6494
6495 /* Return a translation table (or list of them) from coding system
6496 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6497 not ENCODEP). */
6498
6499 static Lisp_Object
6500 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6501 {
6502 Lisp_Object standard, translation_table;
6503 Lisp_Object val;
6504
6505 if (NILP (Venable_character_translation))
6506 {
6507 if (max_lookup)
6508 *max_lookup = 0;
6509 return Qnil;
6510 }
6511 if (encodep)
6512 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6513 standard = Vstandard_translation_table_for_encode;
6514 else
6515 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6516 standard = Vstandard_translation_table_for_decode;
6517 if (NILP (translation_table))
6518 translation_table = standard;
6519 else
6520 {
6521 if (SYMBOLP (translation_table))
6522 translation_table = Fget (translation_table, Qtranslation_table);
6523 else if (CONSP (translation_table))
6524 {
6525 translation_table = Fcopy_sequence (translation_table);
6526 for (val = translation_table; CONSP (val); val = XCDR (val))
6527 if (SYMBOLP (XCAR (val)))
6528 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6529 }
6530 if (CHAR_TABLE_P (standard))
6531 {
6532 if (CONSP (translation_table))
6533 translation_table = nconc2 (translation_table,
6534 Fcons (standard, Qnil));
6535 else
6536 translation_table = Fcons (translation_table,
6537 Fcons (standard, Qnil));
6538 }
6539 }
6540
6541 if (max_lookup)
6542 {
6543 *max_lookup = 1;
6544 if (CHAR_TABLE_P (translation_table)
6545 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6546 {
6547 val = XCHAR_TABLE (translation_table)->extras[1];
6548 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6549 *max_lookup = XFASTINT (val);
6550 }
6551 else if (CONSP (translation_table))
6552 {
6553 Lisp_Object tail;
6554
6555 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6556 if (CHAR_TABLE_P (XCAR (tail))
6557 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6558 {
6559 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6560 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6561 *max_lookup = XFASTINT (tailval);
6562 }
6563 }
6564 }
6565 return translation_table;
6566 }
6567
6568 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6569 do { \
6570 trans = Qnil; \
6571 if (CHAR_TABLE_P (table)) \
6572 { \
6573 trans = CHAR_TABLE_REF (table, c); \
6574 if (CHARACTERP (trans)) \
6575 c = XFASTINT (trans), trans = Qnil; \
6576 } \
6577 else if (CONSP (table)) \
6578 { \
6579 Lisp_Object tail; \
6580 \
6581 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6582 if (CHAR_TABLE_P (XCAR (tail))) \
6583 { \
6584 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6585 if (CHARACTERP (trans)) \
6586 c = XFASTINT (trans), trans = Qnil; \
6587 else if (! NILP (trans)) \
6588 break; \
6589 } \
6590 } \
6591 } while (0)
6592
6593
6594 /* Return a translation of character(s) at BUF according to TRANS.
6595 TRANS is TO-CHAR or ((FROM . TO) ...) where
6596 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6597 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6598 translation is found, and Qnil if not found..
6599 If BUF is too short to lookup characters in FROM, return Qt. */
6600
6601 static Lisp_Object
6602 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6603 {
6604
6605 if (INTEGERP (trans))
6606 return trans;
6607 for (; CONSP (trans); trans = XCDR (trans))
6608 {
6609 Lisp_Object val = XCAR (trans);
6610 Lisp_Object from = XCAR (val);
6611 ptrdiff_t len = ASIZE (from);
6612 ptrdiff_t i;
6613
6614 for (i = 0; i < len; i++)
6615 {
6616 if (buf + i == buf_end)
6617 return Qt;
6618 if (XINT (AREF (from, i)) != buf[i])
6619 break;
6620 }
6621 if (i == len)
6622 return val;
6623 }
6624 return Qnil;
6625 }
6626
6627
6628 static int
6629 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6630 bool last_block)
6631 {
6632 unsigned char *dst = coding->destination + coding->produced;
6633 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6634 ptrdiff_t produced;
6635 ptrdiff_t produced_chars = 0;
6636 int carryover = 0;
6637
6638 if (! coding->chars_at_source)
6639 {
6640 /* Source characters are in coding->charbuf. */
6641 int *buf = coding->charbuf;
6642 int *buf_end = buf + coding->charbuf_used;
6643
6644 if (EQ (coding->src_object, coding->dst_object))
6645 {
6646 coding_set_source (coding);
6647 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6648 }
6649
6650 while (buf < buf_end)
6651 {
6652 int c = *buf;
6653 ptrdiff_t i;
6654
6655 if (c >= 0)
6656 {
6657 ptrdiff_t from_nchars = 1, to_nchars = 1;
6658 Lisp_Object trans = Qnil;
6659
6660 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6661 if (! NILP (trans))
6662 {
6663 trans = get_translation (trans, buf, buf_end);
6664 if (INTEGERP (trans))
6665 c = XINT (trans);
6666 else if (CONSP (trans))
6667 {
6668 from_nchars = ASIZE (XCAR (trans));
6669 trans = XCDR (trans);
6670 if (INTEGERP (trans))
6671 c = XINT (trans);
6672 else
6673 {
6674 to_nchars = ASIZE (trans);
6675 c = XINT (AREF (trans, 0));
6676 }
6677 }
6678 else if (EQ (trans, Qt) && ! last_block)
6679 break;
6680 }
6681
6682 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6683 {
6684 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6685 / MAX_MULTIBYTE_LENGTH)
6686 < to_nchars)
6687 memory_full (SIZE_MAX);
6688 dst = alloc_destination (coding,
6689 buf_end - buf
6690 + MAX_MULTIBYTE_LENGTH * to_nchars,
6691 dst);
6692 if (EQ (coding->src_object, coding->dst_object))
6693 {
6694 coding_set_source (coding);
6695 dst_end = (((unsigned char *) coding->source)
6696 + coding->consumed);
6697 }
6698 else
6699 dst_end = coding->destination + coding->dst_bytes;
6700 }
6701
6702 for (i = 0; i < to_nchars; i++)
6703 {
6704 if (i > 0)
6705 c = XINT (AREF (trans, i));
6706 if (coding->dst_multibyte
6707 || ! CHAR_BYTE8_P (c))
6708 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6709 else
6710 *dst++ = CHAR_TO_BYTE8 (c);
6711 }
6712 produced_chars += to_nchars;
6713 buf += from_nchars;
6714 }
6715 else
6716 /* This is an annotation datum. (-C) is the length. */
6717 buf += -c;
6718 }
6719 carryover = buf_end - buf;
6720 }
6721 else
6722 {
6723 /* Source characters are at coding->source. */
6724 const unsigned char *src = coding->source;
6725 const unsigned char *src_end = src + coding->consumed;
6726
6727 if (EQ (coding->dst_object, coding->src_object))
6728 dst_end = (unsigned char *) src;
6729 if (coding->src_multibyte != coding->dst_multibyte)
6730 {
6731 if (coding->src_multibyte)
6732 {
6733 bool multibytep = 1;
6734 ptrdiff_t consumed_chars = 0;
6735
6736 while (1)
6737 {
6738 const unsigned char *src_base = src;
6739 int c;
6740
6741 ONE_MORE_BYTE (c);
6742 if (dst == dst_end)
6743 {
6744 if (EQ (coding->src_object, coding->dst_object))
6745 dst_end = (unsigned char *) src;
6746 if (dst == dst_end)
6747 {
6748 ptrdiff_t offset = src - coding->source;
6749
6750 dst = alloc_destination (coding, src_end - src + 1,
6751 dst);
6752 dst_end = coding->destination + coding->dst_bytes;
6753 coding_set_source (coding);
6754 src = coding->source + offset;
6755 src_end = coding->source + coding->consumed;
6756 if (EQ (coding->src_object, coding->dst_object))
6757 dst_end = (unsigned char *) src;
6758 }
6759 }
6760 *dst++ = c;
6761 produced_chars++;
6762 }
6763 no_more_source:
6764 ;
6765 }
6766 else
6767 while (src < src_end)
6768 {
6769 bool multibytep = 1;
6770 int c = *src++;
6771
6772 if (dst >= dst_end - 1)
6773 {
6774 if (EQ (coding->src_object, coding->dst_object))
6775 dst_end = (unsigned char *) src;
6776 if (dst >= dst_end - 1)
6777 {
6778 ptrdiff_t offset = src - coding->source;
6779 ptrdiff_t more_bytes;
6780
6781 if (EQ (coding->src_object, coding->dst_object))
6782 more_bytes = ((src_end - src) / 2) + 2;
6783 else
6784 more_bytes = src_end - src + 2;
6785 dst = alloc_destination (coding, more_bytes, dst);
6786 dst_end = coding->destination + coding->dst_bytes;
6787 coding_set_source (coding);
6788 src = coding->source + offset;
6789 src_end = coding->source + coding->consumed;
6790 if (EQ (coding->src_object, coding->dst_object))
6791 dst_end = (unsigned char *) src;
6792 }
6793 }
6794 EMIT_ONE_BYTE (c);
6795 }
6796 }
6797 else
6798 {
6799 if (!EQ (coding->src_object, coding->dst_object))
6800 {
6801 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6802
6803 if (require > 0)
6804 {
6805 ptrdiff_t offset = src - coding->source;
6806
6807 dst = alloc_destination (coding, require, dst);
6808 coding_set_source (coding);
6809 src = coding->source + offset;
6810 src_end = coding->source + coding->consumed;
6811 }
6812 }
6813 produced_chars = coding->consumed_char;
6814 while (src < src_end)
6815 *dst++ = *src++;
6816 }
6817 }
6818
6819 produced = dst - (coding->destination + coding->produced);
6820 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6821 insert_from_gap (produced_chars, produced);
6822 coding->produced += produced;
6823 coding->produced_char += produced_chars;
6824 return carryover;
6825 }
6826
6827 /* Compose text in CODING->object according to the annotation data at
6828 CHARBUF. CHARBUF is an array:
6829 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6830 */
6831
6832 static void
6833 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6834 {
6835 int len;
6836 ptrdiff_t to;
6837 enum composition_method method;
6838 Lisp_Object components;
6839
6840 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6841 to = pos + charbuf[2];
6842 method = (enum composition_method) (charbuf[4]);
6843
6844 if (method == COMPOSITION_RELATIVE)
6845 components = Qnil;
6846 else
6847 {
6848 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6849 int i, j;
6850
6851 if (method == COMPOSITION_WITH_RULE)
6852 len = charbuf[2] * 3 - 2;
6853 charbuf += MAX_ANNOTATION_LENGTH;
6854 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6855 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6856 {
6857 if (charbuf[i] >= 0)
6858 args[j] = make_number (charbuf[i]);
6859 else
6860 {
6861 i++;
6862 args[j] = make_number (charbuf[i] % 0x100);
6863 }
6864 }
6865 components = (i == j ? Fstring (j, args) : Fvector (j, args));
6866 }
6867 compose_text (pos, to, components, Qnil, coding->dst_object);
6868 }
6869
6870
6871 /* Put `charset' property on text in CODING->object according to
6872 the annotation data at CHARBUF. CHARBUF is an array:
6873 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6874 */
6875
6876 static void
6877 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6878 {
6879 ptrdiff_t from = pos - charbuf[2];
6880 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6881
6882 Fput_text_property (make_number (from), make_number (pos),
6883 Qcharset, CHARSET_NAME (charset),
6884 coding->dst_object);
6885 }
6886
6887
6888 #define CHARBUF_SIZE 0x4000
6889
6890 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6891 do { \
6892 int size = CHARBUF_SIZE; \
6893 \
6894 coding->charbuf = NULL; \
6895 while (size > 1024) \
6896 { \
6897 coding->charbuf = alloca (sizeof (int) * size); \
6898 if (coding->charbuf) \
6899 break; \
6900 size >>= 1; \
6901 } \
6902 if (! coding->charbuf) \
6903 { \
6904 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6905 return; \
6906 } \
6907 coding->charbuf_size = size; \
6908 } while (0)
6909
6910
6911 static void
6912 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6913 {
6914 int *charbuf = coding->charbuf;
6915 int *charbuf_end = charbuf + coding->charbuf_used;
6916
6917 if (NILP (coding->dst_object))
6918 return;
6919
6920 while (charbuf < charbuf_end)
6921 {
6922 if (*charbuf >= 0)
6923 pos++, charbuf++;
6924 else
6925 {
6926 int len = -*charbuf;
6927
6928 if (len > 2)
6929 switch (charbuf[1])
6930 {
6931 case CODING_ANNOTATE_COMPOSITION_MASK:
6932 produce_composition (coding, charbuf, pos);
6933 break;
6934 case CODING_ANNOTATE_CHARSET_MASK:
6935 produce_charset (coding, charbuf, pos);
6936 break;
6937 }
6938 charbuf += len;
6939 }
6940 }
6941 }
6942
6943 /* Decode the data at CODING->src_object into CODING->dst_object.
6944 CODING->src_object is a buffer, a string, or nil.
6945 CODING->dst_object is a buffer.
6946
6947 If CODING->src_object is a buffer, it must be the current buffer.
6948 In this case, if CODING->src_pos is positive, it is a position of
6949 the source text in the buffer, otherwise, the source text is in the
6950 gap area of the buffer, and CODING->src_pos specifies the offset of
6951 the text from GPT (which must be the same as PT). If this is the
6952 same buffer as CODING->dst_object, CODING->src_pos must be
6953 negative.
6954
6955 If CODING->src_object is a string, CODING->src_pos is an index to
6956 that string.
6957
6958 If CODING->src_object is nil, CODING->source must already point to
6959 the non-relocatable memory area. In this case, CODING->src_pos is
6960 an offset from CODING->source.
6961
6962 The decoded data is inserted at the current point of the buffer
6963 CODING->dst_object.
6964 */
6965
6966 static void
6967 decode_coding (struct coding_system *coding)
6968 {
6969 Lisp_Object attrs;
6970 Lisp_Object undo_list;
6971 Lisp_Object translation_table;
6972 struct ccl_spec cclspec;
6973 int carryover;
6974 int i;
6975
6976 if (BUFFERP (coding->src_object)
6977 && coding->src_pos > 0
6978 && coding->src_pos < GPT
6979 && coding->src_pos + coding->src_chars > GPT)
6980 move_gap_both (coding->src_pos, coding->src_pos_byte);
6981
6982 undo_list = Qt;
6983 if (BUFFERP (coding->dst_object))
6984 {
6985 set_buffer_internal (XBUFFER (coding->dst_object));
6986 if (GPT != PT)
6987 move_gap_both (PT, PT_BYTE);
6988
6989 /* We must disable undo_list in order to record the whole insert
6990 transaction via record_insert at the end. But doing so also
6991 disables the recording of the first change to the undo_list.
6992 Therefore we check for first change here and record it via
6993 record_first_change if needed. */
6994 if (MODIFF <= SAVE_MODIFF)
6995 record_first_change ();
6996
6997 undo_list = BVAR (current_buffer, undo_list);
6998 bset_undo_list (current_buffer, Qt);
6999 }
7000
7001 coding->consumed = coding->consumed_char = 0;
7002 coding->produced = coding->produced_char = 0;
7003 coding->chars_at_source = 0;
7004 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7005 coding->errors = 0;
7006
7007 ALLOC_CONVERSION_WORK_AREA (coding);
7008
7009 attrs = CODING_ID_ATTRS (coding->id);
7010 translation_table = get_translation_table (attrs, 0, NULL);
7011
7012 carryover = 0;
7013 if (coding->decoder == decode_coding_ccl)
7014 {
7015 coding->spec.ccl = &cclspec;
7016 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7017 }
7018 do
7019 {
7020 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7021
7022 coding_set_source (coding);
7023 coding->annotated = 0;
7024 coding->charbuf_used = carryover;
7025 (*(coding->decoder)) (coding);
7026 coding_set_destination (coding);
7027 carryover = produce_chars (coding, translation_table, 0);
7028 if (coding->annotated)
7029 produce_annotation (coding, pos);
7030 for (i = 0; i < carryover; i++)
7031 coding->charbuf[i]
7032 = coding->charbuf[coding->charbuf_used - carryover + i];
7033 }
7034 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7035 || (coding->consumed < coding->src_bytes
7036 && (coding->result == CODING_RESULT_SUCCESS
7037 || coding->result == CODING_RESULT_INVALID_SRC)));
7038
7039 if (carryover > 0)
7040 {
7041 coding_set_destination (coding);
7042 coding->charbuf_used = carryover;
7043 produce_chars (coding, translation_table, 1);
7044 }
7045
7046 coding->carryover_bytes = 0;
7047 if (coding->consumed < coding->src_bytes)
7048 {
7049 int nbytes = coding->src_bytes - coding->consumed;
7050 const unsigned char *src;
7051
7052 coding_set_source (coding);
7053 coding_set_destination (coding);
7054 src = coding->source + coding->consumed;
7055
7056 if (coding->mode & CODING_MODE_LAST_BLOCK)
7057 {
7058 /* Flush out unprocessed data as binary chars. We are sure
7059 that the number of data is less than the size of
7060 coding->charbuf. */
7061 coding->charbuf_used = 0;
7062 coding->chars_at_source = 0;
7063
7064 while (nbytes-- > 0)
7065 {
7066 int c = *src++;
7067
7068 if (c & 0x80)
7069 c = BYTE8_TO_CHAR (c);
7070 coding->charbuf[coding->charbuf_used++] = c;
7071 }
7072 produce_chars (coding, Qnil, 1);
7073 }
7074 else
7075 {
7076 /* Record unprocessed bytes in coding->carryover. We are
7077 sure that the number of data is less than the size of
7078 coding->carryover. */
7079 unsigned char *p = coding->carryover;
7080
7081 if (nbytes > sizeof coding->carryover)
7082 nbytes = sizeof coding->carryover;
7083 coding->carryover_bytes = nbytes;
7084 while (nbytes-- > 0)
7085 *p++ = *src++;
7086 }
7087 coding->consumed = coding->src_bytes;
7088 }
7089
7090 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7091 && !inhibit_eol_conversion)
7092 decode_eol (coding);
7093 if (BUFFERP (coding->dst_object))
7094 {
7095 bset_undo_list (current_buffer, undo_list);
7096 record_insert (coding->dst_pos, coding->produced_char);
7097 }
7098 }
7099
7100
7101 /* Extract an annotation datum from a composition starting at POS and
7102 ending before LIMIT of CODING->src_object (buffer or string), store
7103 the data in BUF, set *STOP to a starting position of the next
7104 composition (if any) or to LIMIT, and return the address of the
7105 next element of BUF.
7106
7107 If such an annotation is not found, set *STOP to a starting
7108 position of a composition after POS (if any) or to LIMIT, and
7109 return BUF. */
7110
7111 static int *
7112 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7113 struct coding_system *coding, int *buf,
7114 ptrdiff_t *stop)
7115 {
7116 ptrdiff_t start, end;
7117 Lisp_Object prop;
7118
7119 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7120 || end > limit)
7121 *stop = limit;
7122 else if (start > pos)
7123 *stop = start;
7124 else
7125 {
7126 if (start == pos)
7127 {
7128 /* We found a composition. Store the corresponding
7129 annotation data in BUF. */
7130 int *head = buf;
7131 enum composition_method method = COMPOSITION_METHOD (prop);
7132 int nchars = COMPOSITION_LENGTH (prop);
7133
7134 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7135 if (method != COMPOSITION_RELATIVE)
7136 {
7137 Lisp_Object components;
7138 ptrdiff_t i, len, i_byte;
7139
7140 components = COMPOSITION_COMPONENTS (prop);
7141 if (VECTORP (components))
7142 {
7143 len = ASIZE (components);
7144 for (i = 0; i < len; i++)
7145 *buf++ = XINT (AREF (components, i));
7146 }
7147 else if (STRINGP (components))
7148 {
7149 len = SCHARS (components);
7150 i = i_byte = 0;
7151 while (i < len)
7152 {
7153 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7154 buf++;
7155 }
7156 }
7157 else if (INTEGERP (components))
7158 {
7159 len = 1;
7160 *buf++ = XINT (components);
7161 }
7162 else if (CONSP (components))
7163 {
7164 for (len = 0; CONSP (components);
7165 len++, components = XCDR (components))
7166 *buf++ = XINT (XCAR (components));
7167 }
7168 else
7169 emacs_abort ();
7170 *head -= len;
7171 }
7172 }
7173
7174 if (find_composition (end, limit, &start, &end, &prop,
7175 coding->src_object)
7176 && end <= limit)
7177 *stop = start;
7178 else
7179 *stop = limit;
7180 }
7181 return buf;
7182 }
7183
7184
7185 /* Extract an annotation datum from a text property `charset' at POS of
7186 CODING->src_object (buffer of string), store the data in BUF, set
7187 *STOP to the position where the value of `charset' property changes
7188 (limiting by LIMIT), and return the address of the next element of
7189 BUF.
7190
7191 If the property value is nil, set *STOP to the position where the
7192 property value is non-nil (limiting by LIMIT), and return BUF. */
7193
7194 static int *
7195 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7196 struct coding_system *coding, int *buf,
7197 ptrdiff_t *stop)
7198 {
7199 Lisp_Object val, next;
7200 int id;
7201
7202 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7203 if (! NILP (val) && CHARSETP (val))
7204 id = XINT (CHARSET_SYMBOL_ID (val));
7205 else
7206 id = -1;
7207 ADD_CHARSET_DATA (buf, 0, id);
7208 next = Fnext_single_property_change (make_number (pos), Qcharset,
7209 coding->src_object,
7210 make_number (limit));
7211 *stop = XINT (next);
7212 return buf;
7213 }
7214
7215
7216 static void
7217 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7218 int max_lookup)
7219 {
7220 int *buf = coding->charbuf;
7221 int *buf_end = coding->charbuf + coding->charbuf_size;
7222 const unsigned char *src = coding->source + coding->consumed;
7223 const unsigned char *src_end = coding->source + coding->src_bytes;
7224 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7225 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7226 bool multibytep = coding->src_multibyte;
7227 Lisp_Object eol_type;
7228 int c;
7229 ptrdiff_t stop, stop_composition, stop_charset;
7230 int *lookup_buf = NULL;
7231
7232 if (! NILP (translation_table))
7233 lookup_buf = alloca (sizeof (int) * max_lookup);
7234
7235 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7236 if (VECTORP (eol_type))
7237 eol_type = Qunix;
7238
7239 /* Note: composition handling is not yet implemented. */
7240 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7241
7242 if (NILP (coding->src_object))
7243 stop = stop_composition = stop_charset = end_pos;
7244 else
7245 {
7246 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7247 stop = stop_composition = pos;
7248 else
7249 stop = stop_composition = end_pos;
7250 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7251 stop = stop_charset = pos;
7252 else
7253 stop_charset = end_pos;
7254 }
7255
7256 /* Compensate for CRLF and conversion. */
7257 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7258 while (buf < buf_end)
7259 {
7260 Lisp_Object trans;
7261
7262 if (pos == stop)
7263 {
7264 if (pos == end_pos)
7265 break;
7266 if (pos == stop_composition)
7267 buf = handle_composition_annotation (pos, end_pos, coding,
7268 buf, &stop_composition);
7269 if (pos == stop_charset)
7270 buf = handle_charset_annotation (pos, end_pos, coding,
7271 buf, &stop_charset);
7272 stop = (stop_composition < stop_charset
7273 ? stop_composition : stop_charset);
7274 }
7275
7276 if (! multibytep)
7277 {
7278 int bytes;
7279
7280 if (coding->encoder == encode_coding_raw_text
7281 || coding->encoder == encode_coding_ccl)
7282 c = *src++, pos++;
7283 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7284 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7285 else
7286 c = BYTE8_TO_CHAR (*src), src++, pos++;
7287 }
7288 else
7289 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7290 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7291 c = '\n';
7292 if (! EQ (eol_type, Qunix))
7293 {
7294 if (c == '\n')
7295 {
7296 if (EQ (eol_type, Qdos))
7297 *buf++ = '\r';
7298 else
7299 c = '\r';
7300 }
7301 }
7302
7303 trans = Qnil;
7304 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7305 if (NILP (trans))
7306 *buf++ = c;
7307 else
7308 {
7309 ptrdiff_t from_nchars = 1, to_nchars = 1;
7310 int *lookup_buf_end;
7311 const unsigned char *p = src;
7312 int i;
7313
7314 lookup_buf[0] = c;
7315 for (i = 1; i < max_lookup && p < src_end; i++)
7316 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7317 lookup_buf_end = lookup_buf + i;
7318 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7319 if (INTEGERP (trans))
7320 c = XINT (trans);
7321 else if (CONSP (trans))
7322 {
7323 from_nchars = ASIZE (XCAR (trans));
7324 trans = XCDR (trans);
7325 if (INTEGERP (trans))
7326 c = XINT (trans);
7327 else
7328 {
7329 to_nchars = ASIZE (trans);
7330 if (buf_end - buf < to_nchars)
7331 break;
7332 c = XINT (AREF (trans, 0));
7333 }
7334 }
7335 else
7336 break;
7337 *buf++ = c;
7338 for (i = 1; i < to_nchars; i++)
7339 *buf++ = XINT (AREF (trans, i));
7340 for (i = 1; i < from_nchars; i++, pos++)
7341 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7342 }
7343 }
7344
7345 coding->consumed = src - coding->source;
7346 coding->consumed_char = pos - coding->src_pos;
7347 coding->charbuf_used = buf - coding->charbuf;
7348 coding->chars_at_source = 0;
7349 }
7350
7351
7352 /* Encode the text at CODING->src_object into CODING->dst_object.
7353 CODING->src_object is a buffer or a string.
7354 CODING->dst_object is a buffer or nil.
7355
7356 If CODING->src_object is a buffer, it must be the current buffer.
7357 In this case, if CODING->src_pos is positive, it is a position of
7358 the source text in the buffer, otherwise. the source text is in the
7359 gap area of the buffer, and coding->src_pos specifies the offset of
7360 the text from GPT (which must be the same as PT). If this is the
7361 same buffer as CODING->dst_object, CODING->src_pos must be
7362 negative and CODING should not have `pre-write-conversion'.
7363
7364 If CODING->src_object is a string, CODING should not have
7365 `pre-write-conversion'.
7366
7367 If CODING->dst_object is a buffer, the encoded data is inserted at
7368 the current point of that buffer.
7369
7370 If CODING->dst_object is nil, the encoded data is placed at the
7371 memory area specified by CODING->destination. */
7372
7373 static void
7374 encode_coding (struct coding_system *coding)
7375 {
7376 Lisp_Object attrs;
7377 Lisp_Object translation_table;
7378 int max_lookup;
7379 struct ccl_spec cclspec;
7380
7381 attrs = CODING_ID_ATTRS (coding->id);
7382 if (coding->encoder == encode_coding_raw_text)
7383 translation_table = Qnil, max_lookup = 0;
7384 else
7385 translation_table = get_translation_table (attrs, 1, &max_lookup);
7386
7387 if (BUFFERP (coding->dst_object))
7388 {
7389 set_buffer_internal (XBUFFER (coding->dst_object));
7390 coding->dst_multibyte
7391 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7392 }
7393
7394 coding->consumed = coding->consumed_char = 0;
7395 coding->produced = coding->produced_char = 0;
7396 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7397 coding->errors = 0;
7398
7399 ALLOC_CONVERSION_WORK_AREA (coding);
7400
7401 if (coding->encoder == encode_coding_ccl)
7402 {
7403 coding->spec.ccl = &cclspec;
7404 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7405 }
7406 do {
7407 coding_set_source (coding);
7408 consume_chars (coding, translation_table, max_lookup);
7409 coding_set_destination (coding);
7410 (*(coding->encoder)) (coding);
7411 } while (coding->consumed_char < coding->src_chars);
7412
7413 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7414 insert_from_gap (coding->produced_char, coding->produced);
7415 }
7416
7417
7418 /* Name (or base name) of work buffer for code conversion. */
7419 static Lisp_Object Vcode_conversion_workbuf_name;
7420
7421 /* A working buffer used by the top level conversion. Once it is
7422 created, it is never destroyed. It has the name
7423 Vcode_conversion_workbuf_name. The other working buffers are
7424 destroyed after the use is finished, and their names are modified
7425 versions of Vcode_conversion_workbuf_name. */
7426 static Lisp_Object Vcode_conversion_reused_workbuf;
7427
7428 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7429 static bool reused_workbuf_in_use;
7430
7431
7432 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7433 multibyteness of returning buffer. */
7434
7435 static Lisp_Object
7436 make_conversion_work_buffer (bool multibyte)
7437 {
7438 Lisp_Object name, workbuf;
7439 struct buffer *current;
7440
7441 if (reused_workbuf_in_use)
7442 {
7443 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7444 workbuf = Fget_buffer_create (name);
7445 }
7446 else
7447 {
7448 reused_workbuf_in_use = 1;
7449 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7450 Vcode_conversion_reused_workbuf
7451 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7452 workbuf = Vcode_conversion_reused_workbuf;
7453 }
7454 current = current_buffer;
7455 set_buffer_internal (XBUFFER (workbuf));
7456 /* We can't allow modification hooks to run in the work buffer. For
7457 instance, directory_files_internal assumes that file decoding
7458 doesn't compile new regexps. */
7459 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7460 Ferase_buffer ();
7461 bset_undo_list (current_buffer, Qt);
7462 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7463 set_buffer_internal (current);
7464 return workbuf;
7465 }
7466
7467
7468 static Lisp_Object
7469 code_conversion_restore (Lisp_Object arg)
7470 {
7471 Lisp_Object current, workbuf;
7472 struct gcpro gcpro1;
7473
7474 GCPRO1 (arg);
7475 current = XCAR (arg);
7476 workbuf = XCDR (arg);
7477 if (! NILP (workbuf))
7478 {
7479 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7480 reused_workbuf_in_use = 0;
7481 else
7482 Fkill_buffer (workbuf);
7483 }
7484 set_buffer_internal (XBUFFER (current));
7485 UNGCPRO;
7486 return Qnil;
7487 }
7488
7489 Lisp_Object
7490 code_conversion_save (bool with_work_buf, bool multibyte)
7491 {
7492 Lisp_Object workbuf = Qnil;
7493
7494 if (with_work_buf)
7495 workbuf = make_conversion_work_buffer (multibyte);
7496 record_unwind_protect (code_conversion_restore,
7497 Fcons (Fcurrent_buffer (), workbuf));
7498 return workbuf;
7499 }
7500
7501 void
7502 decode_coding_gap (struct coding_system *coding,
7503 ptrdiff_t chars, ptrdiff_t bytes)
7504 {
7505 ptrdiff_t count = SPECPDL_INDEX ();
7506 Lisp_Object attrs;
7507
7508 code_conversion_save (0, 0);
7509
7510 coding->src_object = Fcurrent_buffer ();
7511 coding->src_chars = chars;
7512 coding->src_bytes = bytes;
7513 coding->src_pos = -chars;
7514 coding->src_pos_byte = -bytes;
7515 coding->src_multibyte = chars < bytes;
7516 coding->dst_object = coding->src_object;
7517 coding->dst_pos = PT;
7518 coding->dst_pos_byte = PT_BYTE;
7519 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7520
7521 if (CODING_REQUIRE_DETECTION (coding))
7522 detect_coding (coding);
7523
7524 coding->mode |= CODING_MODE_LAST_BLOCK;
7525 current_buffer->text->inhibit_shrinking = 1;
7526 decode_coding (coding);
7527 current_buffer->text->inhibit_shrinking = 0;
7528
7529 attrs = CODING_ID_ATTRS (coding->id);
7530 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7531 {
7532 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7533 Lisp_Object val;
7534
7535 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7536 val = call1 (CODING_ATTR_POST_READ (attrs),
7537 make_number (coding->produced_char));
7538 CHECK_NATNUM (val);
7539 coding->produced_char += Z - prev_Z;
7540 coding->produced += Z_BYTE - prev_Z_BYTE;
7541 }
7542
7543 unbind_to (count, Qnil);
7544 }
7545
7546
7547 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7548 SRC_OBJECT into DST_OBJECT by coding context CODING.
7549
7550 SRC_OBJECT is a buffer, a string, or Qnil.
7551
7552 If it is a buffer, the text is at point of the buffer. FROM and TO
7553 are positions in the buffer.
7554
7555 If it is a string, the text is at the beginning of the string.
7556 FROM and TO are indices to the string.
7557
7558 If it is nil, the text is at coding->source. FROM and TO are
7559 indices to coding->source.
7560
7561 DST_OBJECT is a buffer, Qt, or Qnil.
7562
7563 If it is a buffer, the decoded text is inserted at point of the
7564 buffer. If the buffer is the same as SRC_OBJECT, the source text
7565 is deleted.
7566
7567 If it is Qt, a string is made from the decoded text, and
7568 set in CODING->dst_object.
7569
7570 If it is Qnil, the decoded text is stored at CODING->destination.
7571 The caller must allocate CODING->dst_bytes bytes at
7572 CODING->destination by xmalloc. If the decoded text is longer than
7573 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7574 */
7575
7576 void
7577 decode_coding_object (struct coding_system *coding,
7578 Lisp_Object src_object,
7579 ptrdiff_t from, ptrdiff_t from_byte,
7580 ptrdiff_t to, ptrdiff_t to_byte,
7581 Lisp_Object dst_object)
7582 {
7583 ptrdiff_t count = SPECPDL_INDEX ();
7584 unsigned char *destination IF_LINT (= NULL);
7585 ptrdiff_t dst_bytes IF_LINT (= 0);
7586 ptrdiff_t chars = to - from;
7587 ptrdiff_t bytes = to_byte - from_byte;
7588 Lisp_Object attrs;
7589 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7590 bool need_marker_adjustment = 0;
7591 Lisp_Object old_deactivate_mark;
7592
7593 old_deactivate_mark = Vdeactivate_mark;
7594
7595 if (NILP (dst_object))
7596 {
7597 destination = coding->destination;
7598 dst_bytes = coding->dst_bytes;
7599 }
7600
7601 coding->src_object = src_object;
7602 coding->src_chars = chars;
7603 coding->src_bytes = bytes;
7604 coding->src_multibyte = chars < bytes;
7605
7606 if (STRINGP (src_object))
7607 {
7608 coding->src_pos = from;
7609 coding->src_pos_byte = from_byte;
7610 }
7611 else if (BUFFERP (src_object))
7612 {
7613 set_buffer_internal (XBUFFER (src_object));
7614 if (from != GPT)
7615 move_gap_both (from, from_byte);
7616 if (EQ (src_object, dst_object))
7617 {
7618 struct Lisp_Marker *tail;
7619
7620 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7621 {
7622 tail->need_adjustment
7623 = tail->charpos == (tail->insertion_type ? from : to);
7624 need_marker_adjustment |= tail->need_adjustment;
7625 }
7626 saved_pt = PT, saved_pt_byte = PT_BYTE;
7627 TEMP_SET_PT_BOTH (from, from_byte);
7628 current_buffer->text->inhibit_shrinking = 1;
7629 del_range_both (from, from_byte, to, to_byte, 1);
7630 coding->src_pos = -chars;
7631 coding->src_pos_byte = -bytes;
7632 }
7633 else
7634 {
7635 coding->src_pos = from;
7636 coding->src_pos_byte = from_byte;
7637 }
7638 }
7639
7640 if (CODING_REQUIRE_DETECTION (coding))
7641 detect_coding (coding);
7642 attrs = CODING_ID_ATTRS (coding->id);
7643
7644 if (EQ (dst_object, Qt)
7645 || (! NILP (CODING_ATTR_POST_READ (attrs))
7646 && NILP (dst_object)))
7647 {
7648 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7649 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7650 coding->dst_pos = BEG;
7651 coding->dst_pos_byte = BEG_BYTE;
7652 }
7653 else if (BUFFERP (dst_object))
7654 {
7655 code_conversion_save (0, 0);
7656 coding->dst_object = dst_object;
7657 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7658 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7659 coding->dst_multibyte
7660 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7661 }
7662 else
7663 {
7664 code_conversion_save (0, 0);
7665 coding->dst_object = Qnil;
7666 /* Most callers presume this will return a multibyte result, and they
7667 won't use `binary' or `raw-text' anyway, so let's not worry about
7668 CODING_FOR_UNIBYTE. */
7669 coding->dst_multibyte = 1;
7670 }
7671
7672 decode_coding (coding);
7673
7674 if (BUFFERP (coding->dst_object))
7675 set_buffer_internal (XBUFFER (coding->dst_object));
7676
7677 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7678 {
7679 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7680 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7681 Lisp_Object val;
7682
7683 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7684 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7685 old_deactivate_mark);
7686 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7687 make_number (coding->produced_char));
7688 UNGCPRO;
7689 CHECK_NATNUM (val);
7690 coding->produced_char += Z - prev_Z;
7691 coding->produced += Z_BYTE - prev_Z_BYTE;
7692 }
7693
7694 if (EQ (dst_object, Qt))
7695 {
7696 coding->dst_object = Fbuffer_string ();
7697 }
7698 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7699 {
7700 set_buffer_internal (XBUFFER (coding->dst_object));
7701 if (dst_bytes < coding->produced)
7702 {
7703 destination = xrealloc (destination, coding->produced);
7704 if (! destination)
7705 {
7706 record_conversion_result (coding,
7707 CODING_RESULT_INSUFFICIENT_MEM);
7708 unbind_to (count, Qnil);
7709 return;
7710 }
7711 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7712 move_gap_both (BEGV, BEGV_BYTE);
7713 memcpy (destination, BEGV_ADDR, coding->produced);
7714 coding->destination = destination;
7715 }
7716 }
7717
7718 if (saved_pt >= 0)
7719 {
7720 /* This is the case of:
7721 (BUFFERP (src_object) && EQ (src_object, dst_object))
7722 As we have moved PT while replacing the original buffer
7723 contents, we must recover it now. */
7724 set_buffer_internal (XBUFFER (src_object));
7725 current_buffer->text->inhibit_shrinking = 0;
7726 if (saved_pt < from)
7727 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7728 else if (saved_pt < from + chars)
7729 TEMP_SET_PT_BOTH (from, from_byte);
7730 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7731 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7732 saved_pt_byte + (coding->produced - bytes));
7733 else
7734 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7735 saved_pt_byte + (coding->produced - bytes));
7736
7737 if (need_marker_adjustment)
7738 {
7739 struct Lisp_Marker *tail;
7740
7741 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7742 if (tail->need_adjustment)
7743 {
7744 tail->need_adjustment = 0;
7745 if (tail->insertion_type)
7746 {
7747 tail->bytepos = from_byte;
7748 tail->charpos = from;
7749 }
7750 else
7751 {
7752 tail->bytepos = from_byte + coding->produced;
7753 tail->charpos
7754 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7755 ? tail->bytepos : from + coding->produced_char);
7756 }
7757 }
7758 }
7759 }
7760
7761 Vdeactivate_mark = old_deactivate_mark;
7762 unbind_to (count, coding->dst_object);
7763 }
7764
7765
7766 void
7767 encode_coding_object (struct coding_system *coding,
7768 Lisp_Object src_object,
7769 ptrdiff_t from, ptrdiff_t from_byte,
7770 ptrdiff_t to, ptrdiff_t to_byte,
7771 Lisp_Object dst_object)
7772 {
7773 ptrdiff_t count = SPECPDL_INDEX ();
7774 ptrdiff_t chars = to - from;
7775 ptrdiff_t bytes = to_byte - from_byte;
7776 Lisp_Object attrs;
7777 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7778 bool need_marker_adjustment = 0;
7779 bool kill_src_buffer = 0;
7780 Lisp_Object old_deactivate_mark;
7781
7782 old_deactivate_mark = Vdeactivate_mark;
7783
7784 coding->src_object = src_object;
7785 coding->src_chars = chars;
7786 coding->src_bytes = bytes;
7787 coding->src_multibyte = chars < bytes;
7788
7789 attrs = CODING_ID_ATTRS (coding->id);
7790
7791 if (EQ (src_object, dst_object))
7792 {
7793 struct Lisp_Marker *tail;
7794
7795 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7796 {
7797 tail->need_adjustment
7798 = tail->charpos == (tail->insertion_type ? from : to);
7799 need_marker_adjustment |= tail->need_adjustment;
7800 }
7801 }
7802
7803 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7804 {
7805 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7806 set_buffer_internal (XBUFFER (coding->src_object));
7807 if (STRINGP (src_object))
7808 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7809 else if (BUFFERP (src_object))
7810 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7811 else
7812 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7813
7814 if (EQ (src_object, dst_object))
7815 {
7816 set_buffer_internal (XBUFFER (src_object));
7817 saved_pt = PT, saved_pt_byte = PT_BYTE;
7818 del_range_both (from, from_byte, to, to_byte, 1);
7819 set_buffer_internal (XBUFFER (coding->src_object));
7820 }
7821
7822 {
7823 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7824
7825 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7826 old_deactivate_mark);
7827 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7828 make_number (BEG), make_number (Z));
7829 UNGCPRO;
7830 }
7831 if (XBUFFER (coding->src_object) != current_buffer)
7832 kill_src_buffer = 1;
7833 coding->src_object = Fcurrent_buffer ();
7834 if (BEG != GPT)
7835 move_gap_both (BEG, BEG_BYTE);
7836 coding->src_chars = Z - BEG;
7837 coding->src_bytes = Z_BYTE - BEG_BYTE;
7838 coding->src_pos = BEG;
7839 coding->src_pos_byte = BEG_BYTE;
7840 coding->src_multibyte = Z < Z_BYTE;
7841 }
7842 else if (STRINGP (src_object))
7843 {
7844 code_conversion_save (0, 0);
7845 coding->src_pos = from;
7846 coding->src_pos_byte = from_byte;
7847 }
7848 else if (BUFFERP (src_object))
7849 {
7850 code_conversion_save (0, 0);
7851 set_buffer_internal (XBUFFER (src_object));
7852 if (EQ (src_object, dst_object))
7853 {
7854 saved_pt = PT, saved_pt_byte = PT_BYTE;
7855 coding->src_object = del_range_1 (from, to, 1, 1);
7856 coding->src_pos = 0;
7857 coding->src_pos_byte = 0;
7858 }
7859 else
7860 {
7861 if (from < GPT && to >= GPT)
7862 move_gap_both (from, from_byte);
7863 coding->src_pos = from;
7864 coding->src_pos_byte = from_byte;
7865 }
7866 }
7867 else
7868 code_conversion_save (0, 0);
7869
7870 if (BUFFERP (dst_object))
7871 {
7872 coding->dst_object = dst_object;
7873 if (EQ (src_object, dst_object))
7874 {
7875 coding->dst_pos = from;
7876 coding->dst_pos_byte = from_byte;
7877 }
7878 else
7879 {
7880 struct buffer *current = current_buffer;
7881
7882 set_buffer_temp (XBUFFER (dst_object));
7883 coding->dst_pos = PT;
7884 coding->dst_pos_byte = PT_BYTE;
7885 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7886 set_buffer_temp (current);
7887 }
7888 coding->dst_multibyte
7889 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7890 }
7891 else if (EQ (dst_object, Qt))
7892 {
7893 ptrdiff_t dst_bytes = max (1, coding->src_chars);
7894 coding->dst_object = Qnil;
7895 coding->destination = xmalloc (dst_bytes);
7896 coding->dst_bytes = dst_bytes;
7897 coding->dst_multibyte = 0;
7898 }
7899 else
7900 {
7901 coding->dst_object = Qnil;
7902 coding->dst_multibyte = 0;
7903 }
7904
7905 encode_coding (coding);
7906
7907 if (EQ (dst_object, Qt))
7908 {
7909 if (BUFFERP (coding->dst_object))
7910 coding->dst_object = Fbuffer_string ();
7911 else
7912 {
7913 coding->dst_object
7914 = make_unibyte_string ((char *) coding->destination,
7915 coding->produced);
7916 xfree (coding->destination);
7917 }
7918 }
7919
7920 if (saved_pt >= 0)
7921 {
7922 /* This is the case of:
7923 (BUFFERP (src_object) && EQ (src_object, dst_object))
7924 As we have moved PT while replacing the original buffer
7925 contents, we must recover it now. */
7926 set_buffer_internal (XBUFFER (src_object));
7927 if (saved_pt < from)
7928 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7929 else if (saved_pt < from + chars)
7930 TEMP_SET_PT_BOTH (from, from_byte);
7931 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7932 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7933 saved_pt_byte + (coding->produced - bytes));
7934 else
7935 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7936 saved_pt_byte + (coding->produced - bytes));
7937
7938 if (need_marker_adjustment)
7939 {
7940 struct Lisp_Marker *tail;
7941
7942 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7943 if (tail->need_adjustment)
7944 {
7945 tail->need_adjustment = 0;
7946 if (tail->insertion_type)
7947 {
7948 tail->bytepos = from_byte;
7949 tail->charpos = from;
7950 }
7951 else
7952 {
7953 tail->bytepos = from_byte + coding->produced;
7954 tail->charpos
7955 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7956 ? tail->bytepos : from + coding->produced_char);
7957 }
7958 }
7959 }
7960 }
7961
7962 if (kill_src_buffer)
7963 Fkill_buffer (coding->src_object);
7964
7965 Vdeactivate_mark = old_deactivate_mark;
7966 unbind_to (count, Qnil);
7967 }
7968
7969
7970 Lisp_Object
7971 preferred_coding_system (void)
7972 {
7973 int id = coding_categories[coding_priorities[0]].id;
7974
7975 return CODING_ID_NAME (id);
7976 }
7977
7978 #if defined (WINDOWSNT) || defined (CYGWIN)
7979
7980 Lisp_Object
7981 from_unicode (Lisp_Object str)
7982 {
7983 CHECK_STRING (str);
7984 if (!STRING_MULTIBYTE (str) &&
7985 SBYTES (str) & 1)
7986 {
7987 str = Fsubstring (str, make_number (0), make_number (-1));
7988 }
7989
7990 return code_convert_string_norecord (str, Qutf_16le, 0);
7991 }
7992
7993 wchar_t *
7994 to_unicode (Lisp_Object str, Lisp_Object *buf)
7995 {
7996 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7997 /* We need to make a another copy (in addition to the one made by
7998 code_convert_string_norecord) to ensure that the final string is
7999 _doubly_ zero terminated --- that is, that the string is
8000 terminated by two zero bytes and one utf-16le null character.
8001 Because strings are already terminated with a single zero byte,
8002 we just add one additional zero. */
8003 str = make_uninit_string (SBYTES (*buf) + 1);
8004 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8005 SDATA (str) [SBYTES (*buf)] = '\0';
8006 *buf = str;
8007 return WCSDATA (*buf);
8008 }
8009
8010 #endif /* WINDOWSNT || CYGWIN */
8011
8012 \f
8013 #ifdef emacs
8014 /*** 8. Emacs Lisp library functions ***/
8015
8016 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8017 doc: /* Return t if OBJECT is nil or a coding-system.
8018 See the documentation of `define-coding-system' for information
8019 about coding-system objects. */)
8020 (Lisp_Object object)
8021 {
8022 if (NILP (object)
8023 || CODING_SYSTEM_ID (object) >= 0)
8024 return Qt;
8025 if (! SYMBOLP (object)
8026 || NILP (Fget (object, Qcoding_system_define_form)))
8027 return Qnil;
8028 return Qt;
8029 }
8030
8031 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8032 Sread_non_nil_coding_system, 1, 1, 0,
8033 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8034 (Lisp_Object prompt)
8035 {
8036 Lisp_Object val;
8037 do
8038 {
8039 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8040 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8041 }
8042 while (SCHARS (val) == 0);
8043 return (Fintern (val, Qnil));
8044 }
8045
8046 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8047 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8048 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8049 Ignores case when completing coding systems (all Emacs coding systems
8050 are lower-case). */)
8051 (Lisp_Object prompt, Lisp_Object default_coding_system)
8052 {
8053 Lisp_Object val;
8054 ptrdiff_t count = SPECPDL_INDEX ();
8055
8056 if (SYMBOLP (default_coding_system))
8057 default_coding_system = SYMBOL_NAME (default_coding_system);
8058 specbind (Qcompletion_ignore_case, Qt);
8059 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8060 Qt, Qnil, Qcoding_system_history,
8061 default_coding_system, Qnil);
8062 unbind_to (count, Qnil);
8063 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8064 }
8065
8066 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8067 1, 1, 0,
8068 doc: /* Check validity of CODING-SYSTEM.
8069 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8070 It is valid if it is nil or a symbol defined as a coding system by the
8071 function `define-coding-system'. */)
8072 (Lisp_Object coding_system)
8073 {
8074 Lisp_Object define_form;
8075
8076 define_form = Fget (coding_system, Qcoding_system_define_form);
8077 if (! NILP (define_form))
8078 {
8079 Fput (coding_system, Qcoding_system_define_form, Qnil);
8080 safe_eval (define_form);
8081 }
8082 if (!NILP (Fcoding_system_p (coding_system)))
8083 return coding_system;
8084 xsignal1 (Qcoding_system_error, coding_system);
8085 }
8086
8087 \f
8088 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8089 HIGHEST, return the coding system of the highest
8090 priority among the detected coding systems. Otherwise return a
8091 list of detected coding systems sorted by their priorities. If
8092 MULTIBYTEP, it is assumed that the bytes are in correct
8093 multibyte form but contains only ASCII and eight-bit chars.
8094 Otherwise, the bytes are raw bytes.
8095
8096 CODING-SYSTEM controls the detection as below:
8097
8098 If it is nil, detect both text-format and eol-format. If the
8099 text-format part of CODING-SYSTEM is already specified
8100 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8101 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8102 detect only text-format. */
8103
8104 Lisp_Object
8105 detect_coding_system (const unsigned char *src,
8106 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8107 bool highest, bool multibytep,
8108 Lisp_Object coding_system)
8109 {
8110 const unsigned char *src_end = src + src_bytes;
8111 Lisp_Object attrs, eol_type;
8112 Lisp_Object val = Qnil;
8113 struct coding_system coding;
8114 ptrdiff_t id;
8115 struct coding_detection_info detect_info;
8116 enum coding_category base_category;
8117 bool null_byte_found = 0, eight_bit_found = 0;
8118
8119 if (NILP (coding_system))
8120 coding_system = Qundecided;
8121 setup_coding_system (coding_system, &coding);
8122 attrs = CODING_ID_ATTRS (coding.id);
8123 eol_type = CODING_ID_EOL_TYPE (coding.id);
8124 coding_system = CODING_ATTR_BASE_NAME (attrs);
8125
8126 coding.source = src;
8127 coding.src_chars = src_chars;
8128 coding.src_bytes = src_bytes;
8129 coding.src_multibyte = multibytep;
8130 coding.consumed = 0;
8131 coding.mode |= CODING_MODE_LAST_BLOCK;
8132 coding.head_ascii = 0;
8133
8134 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8135
8136 /* At first, detect text-format if necessary. */
8137 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8138 if (base_category == coding_category_undecided)
8139 {
8140 enum coding_category category IF_LINT (= 0);
8141 struct coding_system *this IF_LINT (= NULL);
8142 int c, i;
8143
8144 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8145 for (; src < src_end; src++)
8146 {
8147 c = *src;
8148 if (c & 0x80)
8149 {
8150 eight_bit_found = 1;
8151 if (null_byte_found)
8152 break;
8153 }
8154 else if (c < 0x20)
8155 {
8156 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8157 && ! inhibit_iso_escape_detection
8158 && ! detect_info.checked)
8159 {
8160 if (detect_coding_iso_2022 (&coding, &detect_info))
8161 {
8162 /* We have scanned the whole data. */
8163 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8164 {
8165 /* We didn't find an 8-bit code. We may
8166 have found a null-byte, but it's very
8167 rare that a binary file confirm to
8168 ISO-2022. */
8169 src = src_end;
8170 coding.head_ascii = src - coding.source;
8171 }
8172 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8173 break;
8174 }
8175 }
8176 else if (! c && !inhibit_null_byte_detection)
8177 {
8178 null_byte_found = 1;
8179 if (eight_bit_found)
8180 break;
8181 }
8182 if (! eight_bit_found)
8183 coding.head_ascii++;
8184 }
8185 else if (! eight_bit_found)
8186 coding.head_ascii++;
8187 }
8188
8189 if (null_byte_found || eight_bit_found
8190 || coding.head_ascii < coding.src_bytes
8191 || detect_info.found)
8192 {
8193 if (coding.head_ascii == coding.src_bytes)
8194 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8195 for (i = 0; i < coding_category_raw_text; i++)
8196 {
8197 category = coding_priorities[i];
8198 this = coding_categories + category;
8199 if (detect_info.found & (1 << category))
8200 break;
8201 }
8202 else
8203 {
8204 if (null_byte_found)
8205 {
8206 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8207 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8208 }
8209 for (i = 0; i < coding_category_raw_text; i++)
8210 {
8211 category = coding_priorities[i];
8212 this = coding_categories + category;
8213
8214 if (this->id < 0)
8215 {
8216 /* No coding system of this category is defined. */
8217 detect_info.rejected |= (1 << category);
8218 }
8219 else if (category >= coding_category_raw_text)
8220 continue;
8221 else if (detect_info.checked & (1 << category))
8222 {
8223 if (highest
8224 && (detect_info.found & (1 << category)))
8225 break;
8226 }
8227 else if ((*(this->detector)) (&coding, &detect_info)
8228 && highest
8229 && (detect_info.found & (1 << category)))
8230 {
8231 if (category == coding_category_utf_16_auto)
8232 {
8233 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8234 category = coding_category_utf_16_le;
8235 else
8236 category = coding_category_utf_16_be;
8237 }
8238 break;
8239 }
8240 }
8241 }
8242 }
8243
8244 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8245 || null_byte_found)
8246 {
8247 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8248 id = CODING_SYSTEM_ID (Qno_conversion);
8249 val = Fcons (make_number (id), Qnil);
8250 }
8251 else if (! detect_info.rejected && ! detect_info.found)
8252 {
8253 detect_info.found = CATEGORY_MASK_ANY;
8254 id = coding_categories[coding_category_undecided].id;
8255 val = Fcons (make_number (id), Qnil);
8256 }
8257 else if (highest)
8258 {
8259 if (detect_info.found)
8260 {
8261 detect_info.found = 1 << category;
8262 val = Fcons (make_number (this->id), Qnil);
8263 }
8264 else
8265 for (i = 0; i < coding_category_raw_text; i++)
8266 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8267 {
8268 detect_info.found = 1 << coding_priorities[i];
8269 id = coding_categories[coding_priorities[i]].id;
8270 val = Fcons (make_number (id), Qnil);
8271 break;
8272 }
8273 }
8274 else
8275 {
8276 int mask = detect_info.rejected | detect_info.found;
8277 int found = 0;
8278
8279 for (i = coding_category_raw_text - 1; i >= 0; i--)
8280 {
8281 category = coding_priorities[i];
8282 if (! (mask & (1 << category)))
8283 {
8284 found |= 1 << category;
8285 id = coding_categories[category].id;
8286 if (id >= 0)
8287 val = Fcons (make_number (id), val);
8288 }
8289 }
8290 for (i = coding_category_raw_text - 1; i >= 0; i--)
8291 {
8292 category = coding_priorities[i];
8293 if (detect_info.found & (1 << category))
8294 {
8295 id = coding_categories[category].id;
8296 val = Fcons (make_number (id), val);
8297 }
8298 }
8299 detect_info.found |= found;
8300 }
8301 }
8302 else if (base_category == coding_category_utf_8_auto)
8303 {
8304 if (detect_coding_utf_8 (&coding, &detect_info))
8305 {
8306 struct coding_system *this;
8307
8308 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8309 this = coding_categories + coding_category_utf_8_sig;
8310 else
8311 this = coding_categories + coding_category_utf_8_nosig;
8312 val = Fcons (make_number (this->id), Qnil);
8313 }
8314 }
8315 else if (base_category == coding_category_utf_16_auto)
8316 {
8317 if (detect_coding_utf_16 (&coding, &detect_info))
8318 {
8319 struct coding_system *this;
8320
8321 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8322 this = coding_categories + coding_category_utf_16_le;
8323 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8324 this = coding_categories + coding_category_utf_16_be;
8325 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8326 this = coding_categories + coding_category_utf_16_be_nosig;
8327 else
8328 this = coding_categories + coding_category_utf_16_le_nosig;
8329 val = Fcons (make_number (this->id), Qnil);
8330 }
8331 }
8332 else
8333 {
8334 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8335 val = Fcons (make_number (coding.id), Qnil);
8336 }
8337
8338 /* Then, detect eol-format if necessary. */
8339 {
8340 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8341 Lisp_Object tail;
8342
8343 if (VECTORP (eol_type))
8344 {
8345 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8346 {
8347 if (null_byte_found)
8348 normal_eol = EOL_SEEN_LF;
8349 else
8350 normal_eol = detect_eol (coding.source, src_bytes,
8351 coding_category_raw_text);
8352 }
8353 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8354 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8355 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8356 coding_category_utf_16_be);
8357 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8358 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8359 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8360 coding_category_utf_16_le);
8361 }
8362 else
8363 {
8364 if (EQ (eol_type, Qunix))
8365 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8366 else if (EQ (eol_type, Qdos))
8367 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8368 else
8369 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8370 }
8371
8372 for (tail = val; CONSP (tail); tail = XCDR (tail))
8373 {
8374 enum coding_category category;
8375 int this_eol;
8376
8377 id = XINT (XCAR (tail));
8378 attrs = CODING_ID_ATTRS (id);
8379 category = XINT (CODING_ATTR_CATEGORY (attrs));
8380 eol_type = CODING_ID_EOL_TYPE (id);
8381 if (VECTORP (eol_type))
8382 {
8383 if (category == coding_category_utf_16_be
8384 || category == coding_category_utf_16_be_nosig)
8385 this_eol = utf_16_be_eol;
8386 else if (category == coding_category_utf_16_le
8387 || category == coding_category_utf_16_le_nosig)
8388 this_eol = utf_16_le_eol;
8389 else
8390 this_eol = normal_eol;
8391
8392 if (this_eol == EOL_SEEN_LF)
8393 XSETCAR (tail, AREF (eol_type, 0));
8394 else if (this_eol == EOL_SEEN_CRLF)
8395 XSETCAR (tail, AREF (eol_type, 1));
8396 else if (this_eol == EOL_SEEN_CR)
8397 XSETCAR (tail, AREF (eol_type, 2));
8398 else
8399 XSETCAR (tail, CODING_ID_NAME (id));
8400 }
8401 else
8402 XSETCAR (tail, CODING_ID_NAME (id));
8403 }
8404 }
8405
8406 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8407 }
8408
8409
8410 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8411 2, 3, 0,
8412 doc: /* Detect coding system of the text in the region between START and END.
8413 Return a list of possible coding systems ordered by priority.
8414 The coding systems to try and their priorities follows what
8415 the function `coding-system-priority-list' (which see) returns.
8416
8417 If only ASCII characters are found (except for such ISO-2022 control
8418 characters as ESC), it returns a list of single element `undecided'
8419 or its subsidiary coding system according to a detected end-of-line
8420 format.
8421
8422 If optional argument HIGHEST is non-nil, return the coding system of
8423 highest priority. */)
8424 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8425 {
8426 ptrdiff_t from, to;
8427 ptrdiff_t from_byte, to_byte;
8428
8429 validate_region (&start, &end);
8430 from = XINT (start), to = XINT (end);
8431 from_byte = CHAR_TO_BYTE (from);
8432 to_byte = CHAR_TO_BYTE (to);
8433
8434 if (from < GPT && to >= GPT)
8435 move_gap_both (to, to_byte);
8436
8437 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8438 to - from, to_byte - from_byte,
8439 !NILP (highest),
8440 !NILP (BVAR (current_buffer
8441 , enable_multibyte_characters)),
8442 Qnil);
8443 }
8444
8445 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8446 1, 2, 0,
8447 doc: /* Detect coding system of the text in STRING.
8448 Return a list of possible coding systems ordered by priority.
8449 The coding systems to try and their priorities follows what
8450 the function `coding-system-priority-list' (which see) returns.
8451
8452 If only ASCII characters are found (except for such ISO-2022 control
8453 characters as ESC), it returns a list of single element `undecided'
8454 or its subsidiary coding system according to a detected end-of-line
8455 format.
8456
8457 If optional argument HIGHEST is non-nil, return the coding system of
8458 highest priority. */)
8459 (Lisp_Object string, Lisp_Object highest)
8460 {
8461 CHECK_STRING (string);
8462
8463 return detect_coding_system (SDATA (string),
8464 SCHARS (string), SBYTES (string),
8465 !NILP (highest), STRING_MULTIBYTE (string),
8466 Qnil);
8467 }
8468
8469
8470 static bool
8471 char_encodable_p (int c, Lisp_Object attrs)
8472 {
8473 Lisp_Object tail;
8474 struct charset *charset;
8475 Lisp_Object translation_table;
8476
8477 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8478 if (! NILP (translation_table))
8479 c = translate_char (translation_table, c);
8480 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8481 CONSP (tail); tail = XCDR (tail))
8482 {
8483 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8484 if (CHAR_CHARSET_P (c, charset))
8485 break;
8486 }
8487 return (! NILP (tail));
8488 }
8489
8490
8491 /* Return a list of coding systems that safely encode the text between
8492 START and END. If EXCLUDE is non-nil, it is a list of coding
8493 systems not to check. The returned list doesn't contain any such
8494 coding systems. In any case, if the text contains only ASCII or is
8495 unibyte, return t. */
8496
8497 DEFUN ("find-coding-systems-region-internal",
8498 Ffind_coding_systems_region_internal,
8499 Sfind_coding_systems_region_internal, 2, 3, 0,
8500 doc: /* Internal use only. */)
8501 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8502 {
8503 Lisp_Object coding_attrs_list, safe_codings;
8504 ptrdiff_t start_byte, end_byte;
8505 const unsigned char *p, *pbeg, *pend;
8506 int c;
8507 Lisp_Object tail, elt, work_table;
8508
8509 if (STRINGP (start))
8510 {
8511 if (!STRING_MULTIBYTE (start)
8512 || SCHARS (start) == SBYTES (start))
8513 return Qt;
8514 start_byte = 0;
8515 end_byte = SBYTES (start);
8516 }
8517 else
8518 {
8519 CHECK_NUMBER_COERCE_MARKER (start);
8520 CHECK_NUMBER_COERCE_MARKER (end);
8521 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8522 args_out_of_range (start, end);
8523 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8524 return Qt;
8525 start_byte = CHAR_TO_BYTE (XINT (start));
8526 end_byte = CHAR_TO_BYTE (XINT (end));
8527 if (XINT (end) - XINT (start) == end_byte - start_byte)
8528 return Qt;
8529
8530 if (XINT (start) < GPT && XINT (end) > GPT)
8531 {
8532 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8533 move_gap_both (XINT (start), start_byte);
8534 else
8535 move_gap_both (XINT (end), end_byte);
8536 }
8537 }
8538
8539 coding_attrs_list = Qnil;
8540 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8541 if (NILP (exclude)
8542 || NILP (Fmemq (XCAR (tail), exclude)))
8543 {
8544 Lisp_Object attrs;
8545
8546 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8547 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8548 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8549 {
8550 ASET (attrs, coding_attr_trans_tbl,
8551 get_translation_table (attrs, 1, NULL));
8552 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8553 }
8554 }
8555
8556 if (STRINGP (start))
8557 p = pbeg = SDATA (start);
8558 else
8559 p = pbeg = BYTE_POS_ADDR (start_byte);
8560 pend = p + (end_byte - start_byte);
8561
8562 while (p < pend && ASCII_BYTE_P (*p)) p++;
8563 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8564
8565 work_table = Fmake_char_table (Qnil, Qnil);
8566 while (p < pend)
8567 {
8568 if (ASCII_BYTE_P (*p))
8569 p++;
8570 else
8571 {
8572 c = STRING_CHAR_ADVANCE (p);
8573 if (!NILP (char_table_ref (work_table, c)))
8574 /* This character was already checked. Ignore it. */
8575 continue;
8576
8577 charset_map_loaded = 0;
8578 for (tail = coding_attrs_list; CONSP (tail);)
8579 {
8580 elt = XCAR (tail);
8581 if (NILP (elt))
8582 tail = XCDR (tail);
8583 else if (char_encodable_p (c, elt))
8584 tail = XCDR (tail);
8585 else if (CONSP (XCDR (tail)))
8586 {
8587 XSETCAR (tail, XCAR (XCDR (tail)));
8588 XSETCDR (tail, XCDR (XCDR (tail)));
8589 }
8590 else
8591 {
8592 XSETCAR (tail, Qnil);
8593 tail = XCDR (tail);
8594 }
8595 }
8596 if (charset_map_loaded)
8597 {
8598 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8599
8600 if (STRINGP (start))
8601 pbeg = SDATA (start);
8602 else
8603 pbeg = BYTE_POS_ADDR (start_byte);
8604 p = pbeg + p_offset;
8605 pend = pbeg + pend_offset;
8606 }
8607 char_table_set (work_table, c, Qt);
8608 }
8609 }
8610
8611 safe_codings = list2 (Qraw_text, Qno_conversion);
8612 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8613 if (! NILP (XCAR (tail)))
8614 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8615
8616 return safe_codings;
8617 }
8618
8619
8620 DEFUN ("unencodable-char-position", Funencodable_char_position,
8621 Sunencodable_char_position, 3, 5, 0,
8622 doc: /*
8623 Return position of first un-encodable character in a region.
8624 START and END specify the region and CODING-SYSTEM specifies the
8625 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8626
8627 If optional 4th argument COUNT is non-nil, it specifies at most how
8628 many un-encodable characters to search. In this case, the value is a
8629 list of positions.
8630
8631 If optional 5th argument STRING is non-nil, it is a string to search
8632 for un-encodable characters. In that case, START and END are indexes
8633 to the string. */)
8634 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8635 {
8636 EMACS_INT n;
8637 struct coding_system coding;
8638 Lisp_Object attrs, charset_list, translation_table;
8639 Lisp_Object positions;
8640 ptrdiff_t from, to;
8641 const unsigned char *p, *stop, *pend;
8642 bool ascii_compatible;
8643
8644 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8645 attrs = CODING_ID_ATTRS (coding.id);
8646 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8647 return Qnil;
8648 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8649 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8650 translation_table = get_translation_table (attrs, 1, NULL);
8651
8652 if (NILP (string))
8653 {
8654 validate_region (&start, &end);
8655 from = XINT (start);
8656 to = XINT (end);
8657 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8658 || (ascii_compatible
8659 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8660 return Qnil;
8661 p = CHAR_POS_ADDR (from);
8662 pend = CHAR_POS_ADDR (to);
8663 if (from < GPT && to >= GPT)
8664 stop = GPT_ADDR;
8665 else
8666 stop = pend;
8667 }
8668 else
8669 {
8670 CHECK_STRING (string);
8671 CHECK_NATNUM (start);
8672 CHECK_NATNUM (end);
8673 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8674 args_out_of_range_3 (string, start, end);
8675 from = XINT (start);
8676 to = XINT (end);
8677 if (! STRING_MULTIBYTE (string))
8678 return Qnil;
8679 p = SDATA (string) + string_char_to_byte (string, from);
8680 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8681 if (ascii_compatible && (to - from) == (pend - p))
8682 return Qnil;
8683 }
8684
8685 if (NILP (count))
8686 n = 1;
8687 else
8688 {
8689 CHECK_NATNUM (count);
8690 n = XINT (count);
8691 }
8692
8693 positions = Qnil;
8694 charset_map_loaded = 0;
8695 while (1)
8696 {
8697 int c;
8698
8699 if (ascii_compatible)
8700 while (p < stop && ASCII_BYTE_P (*p))
8701 p++, from++;
8702 if (p >= stop)
8703 {
8704 if (p >= pend)
8705 break;
8706 stop = pend;
8707 p = GAP_END_ADDR;
8708 }
8709
8710 c = STRING_CHAR_ADVANCE (p);
8711 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8712 && ! char_charset (translate_char (translation_table, c),
8713 charset_list, NULL))
8714 {
8715 positions = Fcons (make_number (from), positions);
8716 n--;
8717 if (n == 0)
8718 break;
8719 }
8720
8721 from++;
8722 if (charset_map_loaded && NILP (string))
8723 {
8724 p = CHAR_POS_ADDR (from);
8725 pend = CHAR_POS_ADDR (to);
8726 if (from < GPT && to >= GPT)
8727 stop = GPT_ADDR;
8728 else
8729 stop = pend;
8730 charset_map_loaded = 0;
8731 }
8732 }
8733
8734 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8735 }
8736
8737
8738 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8739 Scheck_coding_systems_region, 3, 3, 0,
8740 doc: /* Check if the region is encodable by coding systems.
8741
8742 START and END are buffer positions specifying the region.
8743 CODING-SYSTEM-LIST is a list of coding systems to check.
8744
8745 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8746 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8747 whole region, POS0, POS1, ... are buffer positions where non-encodable
8748 characters are found.
8749
8750 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8751 value is nil.
8752
8753 START may be a string. In that case, check if the string is
8754 encodable, and the value contains indices to the string instead of
8755 buffer positions. END is ignored.
8756
8757 If the current buffer (or START if it is a string) is unibyte, the value
8758 is nil. */)
8759 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8760 {
8761 Lisp_Object list;
8762 ptrdiff_t start_byte, end_byte;
8763 ptrdiff_t pos;
8764 const unsigned char *p, *pbeg, *pend;
8765 int c;
8766 Lisp_Object tail, elt, attrs;
8767
8768 if (STRINGP (start))
8769 {
8770 if (!STRING_MULTIBYTE (start)
8771 || SCHARS (start) == SBYTES (start))
8772 return Qnil;
8773 start_byte = 0;
8774 end_byte = SBYTES (start);
8775 pos = 0;
8776 }
8777 else
8778 {
8779 CHECK_NUMBER_COERCE_MARKER (start);
8780 CHECK_NUMBER_COERCE_MARKER (end);
8781 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8782 args_out_of_range (start, end);
8783 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8784 return Qnil;
8785 start_byte = CHAR_TO_BYTE (XINT (start));
8786 end_byte = CHAR_TO_BYTE (XINT (end));
8787 if (XINT (end) - XINT (start) == end_byte - start_byte)
8788 return Qnil;
8789
8790 if (XINT (start) < GPT && XINT (end) > GPT)
8791 {
8792 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8793 move_gap_both (XINT (start), start_byte);
8794 else
8795 move_gap_both (XINT (end), end_byte);
8796 }
8797 pos = XINT (start);
8798 }
8799
8800 list = Qnil;
8801 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8802 {
8803 elt = XCAR (tail);
8804 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8805 ASET (attrs, coding_attr_trans_tbl,
8806 get_translation_table (attrs, 1, NULL));
8807 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8808 }
8809
8810 if (STRINGP (start))
8811 p = pbeg = SDATA (start);
8812 else
8813 p = pbeg = BYTE_POS_ADDR (start_byte);
8814 pend = p + (end_byte - start_byte);
8815
8816 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8817 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8818
8819 while (p < pend)
8820 {
8821 if (ASCII_BYTE_P (*p))
8822 p++;
8823 else
8824 {
8825 c = STRING_CHAR_ADVANCE (p);
8826
8827 charset_map_loaded = 0;
8828 for (tail = list; CONSP (tail); tail = XCDR (tail))
8829 {
8830 elt = XCDR (XCAR (tail));
8831 if (! char_encodable_p (c, XCAR (elt)))
8832 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8833 }
8834 if (charset_map_loaded)
8835 {
8836 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8837
8838 if (STRINGP (start))
8839 pbeg = SDATA (start);
8840 else
8841 pbeg = BYTE_POS_ADDR (start_byte);
8842 p = pbeg + p_offset;
8843 pend = pbeg + pend_offset;
8844 }
8845 }
8846 pos++;
8847 }
8848
8849 tail = list;
8850 list = Qnil;
8851 for (; CONSP (tail); tail = XCDR (tail))
8852 {
8853 elt = XCAR (tail);
8854 if (CONSP (XCDR (XCDR (elt))))
8855 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8856 list);
8857 }
8858
8859 return list;
8860 }
8861
8862
8863 static Lisp_Object
8864 code_convert_region (Lisp_Object start, Lisp_Object end,
8865 Lisp_Object coding_system, Lisp_Object dst_object,
8866 bool encodep, bool norecord)
8867 {
8868 struct coding_system coding;
8869 ptrdiff_t from, from_byte, to, to_byte;
8870 Lisp_Object src_object;
8871
8872 if (NILP (coding_system))
8873 coding_system = Qno_conversion;
8874 else
8875 CHECK_CODING_SYSTEM (coding_system);
8876 src_object = Fcurrent_buffer ();
8877 if (NILP (dst_object))
8878 dst_object = src_object;
8879 else if (! EQ (dst_object, Qt))
8880 CHECK_BUFFER (dst_object);
8881
8882 validate_region (&start, &end);
8883 from = XFASTINT (start);
8884 from_byte = CHAR_TO_BYTE (from);
8885 to = XFASTINT (end);
8886 to_byte = CHAR_TO_BYTE (to);
8887
8888 setup_coding_system (coding_system, &coding);
8889 coding.mode |= CODING_MODE_LAST_BLOCK;
8890
8891 if (encodep)
8892 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8893 dst_object);
8894 else
8895 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8896 dst_object);
8897 if (! norecord)
8898 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8899
8900 return (BUFFERP (dst_object)
8901 ? make_number (coding.produced_char)
8902 : coding.dst_object);
8903 }
8904
8905
8906 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8907 3, 4, "r\nzCoding system: ",
8908 doc: /* Decode the current region from the specified coding system.
8909 When called from a program, takes four arguments:
8910 START, END, CODING-SYSTEM, and DESTINATION.
8911 START and END are buffer positions.
8912
8913 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8914 If nil, the region between START and END is replaced by the decoded text.
8915 If buffer, the decoded text is inserted in that buffer after point (point
8916 does not move).
8917 In those cases, the length of the decoded text is returned.
8918 If DESTINATION is t, the decoded text is returned.
8919
8920 This function sets `last-coding-system-used' to the precise coding system
8921 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8922 not fully specified.) */)
8923 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8924 {
8925 return code_convert_region (start, end, coding_system, destination, 0, 0);
8926 }
8927
8928 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8929 3, 4, "r\nzCoding system: ",
8930 doc: /* Encode the current region by specified coding system.
8931 When called from a program, takes four arguments:
8932 START, END, CODING-SYSTEM and DESTINATION.
8933 START and END are buffer positions.
8934
8935 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8936 If nil, the region between START and END is replace by the encoded text.
8937 If buffer, the encoded text is inserted in that buffer after point (point
8938 does not move).
8939 In those cases, the length of the encoded text is returned.
8940 If DESTINATION is t, the encoded text is returned.
8941
8942 This function sets `last-coding-system-used' to the precise coding system
8943 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8944 not fully specified.) */)
8945 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8946 {
8947 return code_convert_region (start, end, coding_system, destination, 1, 0);
8948 }
8949
8950 Lisp_Object
8951 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8952 Lisp_Object dst_object, bool encodep, bool nocopy,
8953 bool norecord)
8954 {
8955 struct coding_system coding;
8956 ptrdiff_t chars, bytes;
8957
8958 CHECK_STRING (string);
8959 if (NILP (coding_system))
8960 {
8961 if (! norecord)
8962 Vlast_coding_system_used = Qno_conversion;
8963 if (NILP (dst_object))
8964 return (nocopy ? Fcopy_sequence (string) : string);
8965 }
8966
8967 if (NILP (coding_system))
8968 coding_system = Qno_conversion;
8969 else
8970 CHECK_CODING_SYSTEM (coding_system);
8971 if (NILP (dst_object))
8972 dst_object = Qt;
8973 else if (! EQ (dst_object, Qt))
8974 CHECK_BUFFER (dst_object);
8975
8976 setup_coding_system (coding_system, &coding);
8977 coding.mode |= CODING_MODE_LAST_BLOCK;
8978 chars = SCHARS (string);
8979 bytes = SBYTES (string);
8980 if (encodep)
8981 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8982 else
8983 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8984 if (! norecord)
8985 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8986
8987 return (BUFFERP (dst_object)
8988 ? make_number (coding.produced_char)
8989 : coding.dst_object);
8990 }
8991
8992
8993 /* Encode or decode STRING according to CODING_SYSTEM.
8994 Do not set Vlast_coding_system_used.
8995
8996 This function is called only from macros DECODE_FILE and
8997 ENCODE_FILE, thus we ignore character composition. */
8998
8999 Lisp_Object
9000 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9001 bool encodep)
9002 {
9003 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9004 }
9005
9006
9007 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9008 2, 4, 0,
9009 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9010
9011 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9012 if the decoding operation is trivial.
9013
9014 Optional fourth arg BUFFER non-nil means that the decoded text is
9015 inserted in that buffer after point (point does not move). In this
9016 case, the return value is the length of the decoded text.
9017
9018 This function sets `last-coding-system-used' to the precise coding system
9019 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9020 not fully specified.) */)
9021 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9022 {
9023 return code_convert_string (string, coding_system, buffer,
9024 0, ! NILP (nocopy), 0);
9025 }
9026
9027 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9028 2, 4, 0,
9029 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9030
9031 Optional third arg NOCOPY non-nil means it is OK to return STRING
9032 itself if the encoding operation is trivial.
9033
9034 Optional fourth arg BUFFER non-nil means that the encoded text is
9035 inserted in that buffer after point (point does not move). In this
9036 case, the return value is the length of the encoded text.
9037
9038 This function sets `last-coding-system-used' to the precise coding system
9039 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9040 not fully specified.) */)
9041 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9042 {
9043 return code_convert_string (string, coding_system, buffer,
9044 1, ! NILP (nocopy), 0);
9045 }
9046
9047 \f
9048 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9049 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9050 Return the corresponding character. */)
9051 (Lisp_Object code)
9052 {
9053 Lisp_Object spec, attrs, val;
9054 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9055 EMACS_INT ch;
9056 int c;
9057
9058 CHECK_NATNUM (code);
9059 ch = XFASTINT (code);
9060 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9061 attrs = AREF (spec, 0);
9062
9063 if (ASCII_BYTE_P (ch)
9064 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9065 return code;
9066
9067 val = CODING_ATTR_CHARSET_LIST (attrs);
9068 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9069 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9070 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9071
9072 if (ch <= 0x7F)
9073 {
9074 c = ch;
9075 charset = charset_roman;
9076 }
9077 else if (ch >= 0xA0 && ch < 0xDF)
9078 {
9079 c = ch - 0x80;
9080 charset = charset_kana;
9081 }
9082 else
9083 {
9084 EMACS_INT c1 = ch >> 8;
9085 int c2 = ch & 0xFF;
9086
9087 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9088 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9089 error ("Invalid code: %"pI"d", ch);
9090 c = ch;
9091 SJIS_TO_JIS (c);
9092 charset = charset_kanji;
9093 }
9094 c = DECODE_CHAR (charset, c);
9095 if (c < 0)
9096 error ("Invalid code: %"pI"d", ch);
9097 return make_number (c);
9098 }
9099
9100
9101 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9102 doc: /* Encode a Japanese character CH to shift_jis encoding.
9103 Return the corresponding code in SJIS. */)
9104 (Lisp_Object ch)
9105 {
9106 Lisp_Object spec, attrs, charset_list;
9107 int c;
9108 struct charset *charset;
9109 unsigned code;
9110
9111 CHECK_CHARACTER (ch);
9112 c = XFASTINT (ch);
9113 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9114 attrs = AREF (spec, 0);
9115
9116 if (ASCII_CHAR_P (c)
9117 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9118 return ch;
9119
9120 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9121 charset = char_charset (c, charset_list, &code);
9122 if (code == CHARSET_INVALID_CODE (charset))
9123 error ("Can't encode by shift_jis encoding: %c", c);
9124 JIS_TO_SJIS (code);
9125
9126 return make_number (code);
9127 }
9128
9129 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9130 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9131 Return the corresponding character. */)
9132 (Lisp_Object code)
9133 {
9134 Lisp_Object spec, attrs, val;
9135 struct charset *charset_roman, *charset_big5, *charset;
9136 EMACS_INT ch;
9137 int c;
9138
9139 CHECK_NATNUM (code);
9140 ch = XFASTINT (code);
9141 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9142 attrs = AREF (spec, 0);
9143
9144 if (ASCII_BYTE_P (ch)
9145 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9146 return code;
9147
9148 val = CODING_ATTR_CHARSET_LIST (attrs);
9149 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9150 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9151
9152 if (ch <= 0x7F)
9153 {
9154 c = ch;
9155 charset = charset_roman;
9156 }
9157 else
9158 {
9159 EMACS_INT b1 = ch >> 8;
9160 int b2 = ch & 0x7F;
9161 if (b1 < 0xA1 || b1 > 0xFE
9162 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9163 error ("Invalid code: %"pI"d", ch);
9164 c = ch;
9165 charset = charset_big5;
9166 }
9167 c = DECODE_CHAR (charset, c);
9168 if (c < 0)
9169 error ("Invalid code: %"pI"d", ch);
9170 return make_number (c);
9171 }
9172
9173 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9174 doc: /* Encode the Big5 character CH to BIG5 coding system.
9175 Return the corresponding character code in Big5. */)
9176 (Lisp_Object ch)
9177 {
9178 Lisp_Object spec, attrs, charset_list;
9179 struct charset *charset;
9180 int c;
9181 unsigned code;
9182
9183 CHECK_CHARACTER (ch);
9184 c = XFASTINT (ch);
9185 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9186 attrs = AREF (spec, 0);
9187 if (ASCII_CHAR_P (c)
9188 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9189 return ch;
9190
9191 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9192 charset = char_charset (c, charset_list, &code);
9193 if (code == CHARSET_INVALID_CODE (charset))
9194 error ("Can't encode by Big5 encoding: %c", c);
9195
9196 return make_number (code);
9197 }
9198
9199 \f
9200 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9201 Sset_terminal_coding_system_internal, 1, 2, 0,
9202 doc: /* Internal use only. */)
9203 (Lisp_Object coding_system, Lisp_Object terminal)
9204 {
9205 struct terminal *term = get_terminal (terminal, 1);
9206 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9207 CHECK_SYMBOL (coding_system);
9208 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9209 /* We had better not send unsafe characters to terminal. */
9210 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9211 /* Character composition should be disabled. */
9212 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9213 terminal_coding->src_multibyte = 1;
9214 terminal_coding->dst_multibyte = 0;
9215 tset_charset_list
9216 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9217 ? coding_charset_list (terminal_coding)
9218 : Fcons (make_number (charset_ascii), Qnil)));
9219 return Qnil;
9220 }
9221
9222 DEFUN ("set-safe-terminal-coding-system-internal",
9223 Fset_safe_terminal_coding_system_internal,
9224 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9225 doc: /* Internal use only. */)
9226 (Lisp_Object coding_system)
9227 {
9228 CHECK_SYMBOL (coding_system);
9229 setup_coding_system (Fcheck_coding_system (coding_system),
9230 &safe_terminal_coding);
9231 /* Character composition should be disabled. */
9232 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9233 safe_terminal_coding.src_multibyte = 1;
9234 safe_terminal_coding.dst_multibyte = 0;
9235 return Qnil;
9236 }
9237
9238 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9239 Sterminal_coding_system, 0, 1, 0,
9240 doc: /* Return coding system specified for terminal output on the given terminal.
9241 TERMINAL may be a terminal object, a frame, or nil for the selected
9242 frame's terminal device. */)
9243 (Lisp_Object terminal)
9244 {
9245 struct coding_system *terminal_coding
9246 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9247 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9248
9249 /* For backward compatibility, return nil if it is `undecided'. */
9250 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9251 }
9252
9253 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9254 Sset_keyboard_coding_system_internal, 1, 2, 0,
9255 doc: /* Internal use only. */)
9256 (Lisp_Object coding_system, Lisp_Object terminal)
9257 {
9258 struct terminal *t = get_terminal (terminal, 1);
9259 CHECK_SYMBOL (coding_system);
9260 if (NILP (coding_system))
9261 coding_system = Qno_conversion;
9262 else
9263 Fcheck_coding_system (coding_system);
9264 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9265 /* Character composition should be disabled. */
9266 TERMINAL_KEYBOARD_CODING (t)->common_flags
9267 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9268 return Qnil;
9269 }
9270
9271 DEFUN ("keyboard-coding-system",
9272 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9273 doc: /* Return coding system specified for decoding keyboard input. */)
9274 (Lisp_Object terminal)
9275 {
9276 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9277 (get_terminal (terminal, 1))->id);
9278 }
9279
9280 \f
9281 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9282 Sfind_operation_coding_system, 1, MANY, 0,
9283 doc: /* Choose a coding system for an operation based on the target name.
9284 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9285 DECODING-SYSTEM is the coding system to use for decoding
9286 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9287 for encoding (in case OPERATION does encoding).
9288
9289 The first argument OPERATION specifies an I/O primitive:
9290 For file I/O, `insert-file-contents' or `write-region'.
9291 For process I/O, `call-process', `call-process-region', or `start-process'.
9292 For network I/O, `open-network-stream'.
9293
9294 The remaining arguments should be the same arguments that were passed
9295 to the primitive. Depending on which primitive, one of those arguments
9296 is selected as the TARGET. For example, if OPERATION does file I/O,
9297 whichever argument specifies the file name is TARGET.
9298
9299 TARGET has a meaning which depends on OPERATION:
9300 For file I/O, TARGET is a file name (except for the special case below).
9301 For process I/O, TARGET is a process name.
9302 For network I/O, TARGET is a service name or a port number.
9303
9304 This function looks up what is specified for TARGET in
9305 `file-coding-system-alist', `process-coding-system-alist',
9306 or `network-coding-system-alist' depending on OPERATION.
9307 They may specify a coding system, a cons of coding systems,
9308 or a function symbol to call.
9309 In the last case, we call the function with one argument,
9310 which is a list of all the arguments given to this function.
9311 If the function can't decide a coding system, it can return
9312 `undecided' so that the normal code-detection is performed.
9313
9314 If OPERATION is `insert-file-contents', the argument corresponding to
9315 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9316 file name to look up, and BUFFER is a buffer that contains the file's
9317 contents (not yet decoded). If `file-coding-system-alist' specifies a
9318 function to call for FILENAME, that function should examine the
9319 contents of BUFFER instead of reading the file.
9320
9321 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9322 (ptrdiff_t nargs, Lisp_Object *args)
9323 {
9324 Lisp_Object operation, target_idx, target, val;
9325 register Lisp_Object chain;
9326
9327 if (nargs < 2)
9328 error ("Too few arguments");
9329 operation = args[0];
9330 if (!SYMBOLP (operation)
9331 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9332 error ("Invalid first argument");
9333 if (nargs <= 1 + XFASTINT (target_idx))
9334 error ("Too few arguments for operation `%s'",
9335 SDATA (SYMBOL_NAME (operation)));
9336 target = args[XFASTINT (target_idx) + 1];
9337 if (!(STRINGP (target)
9338 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9339 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9340 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9341 error ("Invalid argument %"pI"d of operation `%s'",
9342 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9343 if (CONSP (target))
9344 target = XCAR (target);
9345
9346 chain = ((EQ (operation, Qinsert_file_contents)
9347 || EQ (operation, Qwrite_region))
9348 ? Vfile_coding_system_alist
9349 : (EQ (operation, Qopen_network_stream)
9350 ? Vnetwork_coding_system_alist
9351 : Vprocess_coding_system_alist));
9352 if (NILP (chain))
9353 return Qnil;
9354
9355 for (; CONSP (chain); chain = XCDR (chain))
9356 {
9357 Lisp_Object elt;
9358
9359 elt = XCAR (chain);
9360 if (CONSP (elt)
9361 && ((STRINGP (target)
9362 && STRINGP (XCAR (elt))
9363 && fast_string_match (XCAR (elt), target) >= 0)
9364 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9365 {
9366 val = XCDR (elt);
9367 /* Here, if VAL is both a valid coding system and a valid
9368 function symbol, we return VAL as a coding system. */
9369 if (CONSP (val))
9370 return val;
9371 if (! SYMBOLP (val))
9372 return Qnil;
9373 if (! NILP (Fcoding_system_p (val)))
9374 return Fcons (val, val);
9375 if (! NILP (Ffboundp (val)))
9376 {
9377 /* We use call1 rather than safe_call1
9378 so as to get bug reports about functions called here
9379 which don't handle the current interface. */
9380 val = call1 (val, Flist (nargs, args));
9381 if (CONSP (val))
9382 return val;
9383 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9384 return Fcons (val, val);
9385 }
9386 return Qnil;
9387 }
9388 }
9389 return Qnil;
9390 }
9391
9392 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9393 Sset_coding_system_priority, 0, MANY, 0,
9394 doc: /* Assign higher priority to the coding systems given as arguments.
9395 If multiple coding systems belong to the same category,
9396 all but the first one are ignored.
9397
9398 usage: (set-coding-system-priority &rest coding-systems) */)
9399 (ptrdiff_t nargs, Lisp_Object *args)
9400 {
9401 ptrdiff_t i, j;
9402 bool changed[coding_category_max];
9403 enum coding_category priorities[coding_category_max];
9404
9405 memset (changed, 0, sizeof changed);
9406
9407 for (i = j = 0; i < nargs; i++)
9408 {
9409 enum coding_category category;
9410 Lisp_Object spec, attrs;
9411
9412 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9413 attrs = AREF (spec, 0);
9414 category = XINT (CODING_ATTR_CATEGORY (attrs));
9415 if (changed[category])
9416 /* Ignore this coding system because a coding system of the
9417 same category already had a higher priority. */
9418 continue;
9419 changed[category] = 1;
9420 priorities[j++] = category;
9421 if (coding_categories[category].id >= 0
9422 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9423 setup_coding_system (args[i], &coding_categories[category]);
9424 Fset (AREF (Vcoding_category_table, category), args[i]);
9425 }
9426
9427 /* Now we have decided top J priorities. Reflect the order of the
9428 original priorities to the remaining priorities. */
9429
9430 for (i = j, j = 0; i < coding_category_max; i++, j++)
9431 {
9432 while (j < coding_category_max
9433 && changed[coding_priorities[j]])
9434 j++;
9435 if (j == coding_category_max)
9436 emacs_abort ();
9437 priorities[i] = coding_priorities[j];
9438 }
9439
9440 memcpy (coding_priorities, priorities, sizeof priorities);
9441
9442 /* Update `coding-category-list'. */
9443 Vcoding_category_list = Qnil;
9444 for (i = coding_category_max; i-- > 0; )
9445 Vcoding_category_list
9446 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9447 Vcoding_category_list);
9448
9449 return Qnil;
9450 }
9451
9452 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9453 Scoding_system_priority_list, 0, 1, 0,
9454 doc: /* Return a list of coding systems ordered by their priorities.
9455 The list contains a subset of coding systems; i.e. coding systems
9456 assigned to each coding category (see `coding-category-list').
9457
9458 HIGHESTP non-nil means just return the highest priority one. */)
9459 (Lisp_Object highestp)
9460 {
9461 int i;
9462 Lisp_Object val;
9463
9464 for (i = 0, val = Qnil; i < coding_category_max; i++)
9465 {
9466 enum coding_category category = coding_priorities[i];
9467 int id = coding_categories[category].id;
9468 Lisp_Object attrs;
9469
9470 if (id < 0)
9471 continue;
9472 attrs = CODING_ID_ATTRS (id);
9473 if (! NILP (highestp))
9474 return CODING_ATTR_BASE_NAME (attrs);
9475 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9476 }
9477 return Fnreverse (val);
9478 }
9479
9480 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9481
9482 static Lisp_Object
9483 make_subsidiaries (Lisp_Object base)
9484 {
9485 Lisp_Object subsidiaries;
9486 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9487 char *buf = alloca (base_name_len + 6);
9488 int i;
9489
9490 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9491 subsidiaries = Fmake_vector (make_number (3), Qnil);
9492 for (i = 0; i < 3; i++)
9493 {
9494 strcpy (buf + base_name_len, suffixes[i]);
9495 ASET (subsidiaries, i, intern (buf));
9496 }
9497 return subsidiaries;
9498 }
9499
9500
9501 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9502 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9503 doc: /* For internal use only.
9504 usage: (define-coding-system-internal ...) */)
9505 (ptrdiff_t nargs, Lisp_Object *args)
9506 {
9507 Lisp_Object name;
9508 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9509 Lisp_Object attrs; /* Vector of attributes. */
9510 Lisp_Object eol_type;
9511 Lisp_Object aliases;
9512 Lisp_Object coding_type, charset_list, safe_charsets;
9513 enum coding_category category;
9514 Lisp_Object tail, val;
9515 int max_charset_id = 0;
9516 int i;
9517
9518 if (nargs < coding_arg_max)
9519 goto short_args;
9520
9521 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9522
9523 name = args[coding_arg_name];
9524 CHECK_SYMBOL (name);
9525 ASET (attrs, coding_attr_base_name, name);
9526
9527 val = args[coding_arg_mnemonic];
9528 if (! STRINGP (val))
9529 CHECK_CHARACTER (val);
9530 ASET (attrs, coding_attr_mnemonic, val);
9531
9532 coding_type = args[coding_arg_coding_type];
9533 CHECK_SYMBOL (coding_type);
9534 ASET (attrs, coding_attr_type, coding_type);
9535
9536 charset_list = args[coding_arg_charset_list];
9537 if (SYMBOLP (charset_list))
9538 {
9539 if (EQ (charset_list, Qiso_2022))
9540 {
9541 if (! EQ (coding_type, Qiso_2022))
9542 error ("Invalid charset-list");
9543 charset_list = Viso_2022_charset_list;
9544 }
9545 else if (EQ (charset_list, Qemacs_mule))
9546 {
9547 if (! EQ (coding_type, Qemacs_mule))
9548 error ("Invalid charset-list");
9549 charset_list = Vemacs_mule_charset_list;
9550 }
9551 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9552 {
9553 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9554 error ("Invalid charset-list");
9555 if (max_charset_id < XFASTINT (XCAR (tail)))
9556 max_charset_id = XFASTINT (XCAR (tail));
9557 }
9558 }
9559 else
9560 {
9561 charset_list = Fcopy_sequence (charset_list);
9562 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9563 {
9564 struct charset *charset;
9565
9566 val = XCAR (tail);
9567 CHECK_CHARSET_GET_CHARSET (val, charset);
9568 if (EQ (coding_type, Qiso_2022)
9569 ? CHARSET_ISO_FINAL (charset) < 0
9570 : EQ (coding_type, Qemacs_mule)
9571 ? CHARSET_EMACS_MULE_ID (charset) < 0
9572 : 0)
9573 error ("Can't handle charset `%s'",
9574 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9575
9576 XSETCAR (tail, make_number (charset->id));
9577 if (max_charset_id < charset->id)
9578 max_charset_id = charset->id;
9579 }
9580 }
9581 ASET (attrs, coding_attr_charset_list, charset_list);
9582
9583 safe_charsets = make_uninit_string (max_charset_id + 1);
9584 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9585 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9586 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9587 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9588
9589 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9590
9591 val = args[coding_arg_decode_translation_table];
9592 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9593 CHECK_SYMBOL (val);
9594 ASET (attrs, coding_attr_decode_tbl, val);
9595
9596 val = args[coding_arg_encode_translation_table];
9597 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9598 CHECK_SYMBOL (val);
9599 ASET (attrs, coding_attr_encode_tbl, val);
9600
9601 val = args[coding_arg_post_read_conversion];
9602 CHECK_SYMBOL (val);
9603 ASET (attrs, coding_attr_post_read, val);
9604
9605 val = args[coding_arg_pre_write_conversion];
9606 CHECK_SYMBOL (val);
9607 ASET (attrs, coding_attr_pre_write, val);
9608
9609 val = args[coding_arg_default_char];
9610 if (NILP (val))
9611 ASET (attrs, coding_attr_default_char, make_number (' '));
9612 else
9613 {
9614 CHECK_CHARACTER (val);
9615 ASET (attrs, coding_attr_default_char, val);
9616 }
9617
9618 val = args[coding_arg_for_unibyte];
9619 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9620
9621 val = args[coding_arg_plist];
9622 CHECK_LIST (val);
9623 ASET (attrs, coding_attr_plist, val);
9624
9625 if (EQ (coding_type, Qcharset))
9626 {
9627 /* Generate a lisp vector of 256 elements. Each element is nil,
9628 integer, or a list of charset IDs.
9629
9630 If Nth element is nil, the byte code N is invalid in this
9631 coding system.
9632
9633 If Nth element is a number NUM, N is the first byte of a
9634 charset whose ID is NUM.
9635
9636 If Nth element is a list of charset IDs, N is the first byte
9637 of one of them. The list is sorted by dimensions of the
9638 charsets. A charset of smaller dimension comes first. */
9639 val = Fmake_vector (make_number (256), Qnil);
9640
9641 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9642 {
9643 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9644 int dim = CHARSET_DIMENSION (charset);
9645 int idx = (dim - 1) * 4;
9646
9647 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9648 ASET (attrs, coding_attr_ascii_compat, Qt);
9649
9650 for (i = charset->code_space[idx];
9651 i <= charset->code_space[idx + 1]; i++)
9652 {
9653 Lisp_Object tmp, tmp2;
9654 int dim2;
9655
9656 tmp = AREF (val, i);
9657 if (NILP (tmp))
9658 tmp = XCAR (tail);
9659 else if (NUMBERP (tmp))
9660 {
9661 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9662 if (dim < dim2)
9663 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9664 else
9665 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9666 }
9667 else
9668 {
9669 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9670 {
9671 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9672 if (dim < dim2)
9673 break;
9674 }
9675 if (NILP (tmp2))
9676 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9677 else
9678 {
9679 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9680 XSETCAR (tmp2, XCAR (tail));
9681 }
9682 }
9683 ASET (val, i, tmp);
9684 }
9685 }
9686 ASET (attrs, coding_attr_charset_valids, val);
9687 category = coding_category_charset;
9688 }
9689 else if (EQ (coding_type, Qccl))
9690 {
9691 Lisp_Object valids;
9692
9693 if (nargs < coding_arg_ccl_max)
9694 goto short_args;
9695
9696 val = args[coding_arg_ccl_decoder];
9697 CHECK_CCL_PROGRAM (val);
9698 if (VECTORP (val))
9699 val = Fcopy_sequence (val);
9700 ASET (attrs, coding_attr_ccl_decoder, val);
9701
9702 val = args[coding_arg_ccl_encoder];
9703 CHECK_CCL_PROGRAM (val);
9704 if (VECTORP (val))
9705 val = Fcopy_sequence (val);
9706 ASET (attrs, coding_attr_ccl_encoder, val);
9707
9708 val = args[coding_arg_ccl_valids];
9709 valids = Fmake_string (make_number (256), make_number (0));
9710 for (tail = val; CONSP (tail); tail = XCDR (tail))
9711 {
9712 int from, to;
9713
9714 val = XCAR (tail);
9715 if (INTEGERP (val))
9716 {
9717 if (! (0 <= XINT (val) && XINT (val) <= 255))
9718 args_out_of_range_3 (val, make_number (0), make_number (255));
9719 from = to = XINT (val);
9720 }
9721 else
9722 {
9723 CHECK_CONS (val);
9724 CHECK_NATNUM_CAR (val);
9725 CHECK_NUMBER_CDR (val);
9726 if (XINT (XCAR (val)) > 255)
9727 args_out_of_range_3 (XCAR (val),
9728 make_number (0), make_number (255));
9729 from = XINT (XCAR (val));
9730 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9731 args_out_of_range_3 (XCDR (val),
9732 XCAR (val), make_number (255));
9733 to = XINT (XCDR (val));
9734 }
9735 for (i = from; i <= to; i++)
9736 SSET (valids, i, 1);
9737 }
9738 ASET (attrs, coding_attr_ccl_valids, valids);
9739
9740 category = coding_category_ccl;
9741 }
9742 else if (EQ (coding_type, Qutf_16))
9743 {
9744 Lisp_Object bom, endian;
9745
9746 ASET (attrs, coding_attr_ascii_compat, Qnil);
9747
9748 if (nargs < coding_arg_utf16_max)
9749 goto short_args;
9750
9751 bom = args[coding_arg_utf16_bom];
9752 if (! NILP (bom) && ! EQ (bom, Qt))
9753 {
9754 CHECK_CONS (bom);
9755 val = XCAR (bom);
9756 CHECK_CODING_SYSTEM (val);
9757 val = XCDR (bom);
9758 CHECK_CODING_SYSTEM (val);
9759 }
9760 ASET (attrs, coding_attr_utf_bom, bom);
9761
9762 endian = args[coding_arg_utf16_endian];
9763 CHECK_SYMBOL (endian);
9764 if (NILP (endian))
9765 endian = Qbig;
9766 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9767 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9768 ASET (attrs, coding_attr_utf_16_endian, endian);
9769
9770 category = (CONSP (bom)
9771 ? coding_category_utf_16_auto
9772 : NILP (bom)
9773 ? (EQ (endian, Qbig)
9774 ? coding_category_utf_16_be_nosig
9775 : coding_category_utf_16_le_nosig)
9776 : (EQ (endian, Qbig)
9777 ? coding_category_utf_16_be
9778 : coding_category_utf_16_le));
9779 }
9780 else if (EQ (coding_type, Qiso_2022))
9781 {
9782 Lisp_Object initial, reg_usage, request, flags;
9783
9784 if (nargs < coding_arg_iso2022_max)
9785 goto short_args;
9786
9787 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9788 CHECK_VECTOR (initial);
9789 for (i = 0; i < 4; i++)
9790 {
9791 val = Faref (initial, make_number (i));
9792 if (! NILP (val))
9793 {
9794 struct charset *charset;
9795
9796 CHECK_CHARSET_GET_CHARSET (val, charset);
9797 ASET (initial, i, make_number (CHARSET_ID (charset)));
9798 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9799 ASET (attrs, coding_attr_ascii_compat, Qt);
9800 }
9801 else
9802 ASET (initial, i, make_number (-1));
9803 }
9804
9805 reg_usage = args[coding_arg_iso2022_reg_usage];
9806 CHECK_CONS (reg_usage);
9807 CHECK_NUMBER_CAR (reg_usage);
9808 CHECK_NUMBER_CDR (reg_usage);
9809
9810 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9811 for (tail = request; CONSP (tail); tail = XCDR (tail))
9812 {
9813 int id;
9814 Lisp_Object tmp1;
9815
9816 val = XCAR (tail);
9817 CHECK_CONS (val);
9818 tmp1 = XCAR (val);
9819 CHECK_CHARSET_GET_ID (tmp1, id);
9820 CHECK_NATNUM_CDR (val);
9821 if (XINT (XCDR (val)) >= 4)
9822 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9823 XSETCAR (val, make_number (id));
9824 }
9825
9826 flags = args[coding_arg_iso2022_flags];
9827 CHECK_NATNUM (flags);
9828 i = XINT (flags) & INT_MAX;
9829 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9830 i |= CODING_ISO_FLAG_FULL_SUPPORT;
9831 flags = make_number (i);
9832
9833 ASET (attrs, coding_attr_iso_initial, initial);
9834 ASET (attrs, coding_attr_iso_usage, reg_usage);
9835 ASET (attrs, coding_attr_iso_request, request);
9836 ASET (attrs, coding_attr_iso_flags, flags);
9837 setup_iso_safe_charsets (attrs);
9838
9839 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9840 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9841 | CODING_ISO_FLAG_SINGLE_SHIFT))
9842 ? coding_category_iso_7_else
9843 : EQ (args[coding_arg_charset_list], Qiso_2022)
9844 ? coding_category_iso_7
9845 : coding_category_iso_7_tight);
9846 else
9847 {
9848 int id = XINT (AREF (initial, 1));
9849
9850 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9851 || EQ (args[coding_arg_charset_list], Qiso_2022)
9852 || id < 0)
9853 ? coding_category_iso_8_else
9854 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9855 ? coding_category_iso_8_1
9856 : coding_category_iso_8_2);
9857 }
9858 if (category != coding_category_iso_8_1
9859 && category != coding_category_iso_8_2)
9860 ASET (attrs, coding_attr_ascii_compat, Qnil);
9861 }
9862 else if (EQ (coding_type, Qemacs_mule))
9863 {
9864 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9865 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9866 ASET (attrs, coding_attr_ascii_compat, Qt);
9867 category = coding_category_emacs_mule;
9868 }
9869 else if (EQ (coding_type, Qshift_jis))
9870 {
9871
9872 struct charset *charset;
9873
9874 if (XINT (Flength (charset_list)) != 3
9875 && XINT (Flength (charset_list)) != 4)
9876 error ("There should be three or four charsets");
9877
9878 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9879 if (CHARSET_DIMENSION (charset) != 1)
9880 error ("Dimension of charset %s is not one",
9881 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9882 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9883 ASET (attrs, coding_attr_ascii_compat, Qt);
9884
9885 charset_list = XCDR (charset_list);
9886 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9887 if (CHARSET_DIMENSION (charset) != 1)
9888 error ("Dimension of charset %s is not one",
9889 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9890
9891 charset_list = XCDR (charset_list);
9892 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9893 if (CHARSET_DIMENSION (charset) != 2)
9894 error ("Dimension of charset %s is not two",
9895 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9896
9897 charset_list = XCDR (charset_list);
9898 if (! NILP (charset_list))
9899 {
9900 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9901 if (CHARSET_DIMENSION (charset) != 2)
9902 error ("Dimension of charset %s is not two",
9903 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9904 }
9905
9906 category = coding_category_sjis;
9907 Vsjis_coding_system = name;
9908 }
9909 else if (EQ (coding_type, Qbig5))
9910 {
9911 struct charset *charset;
9912
9913 if (XINT (Flength (charset_list)) != 2)
9914 error ("There should be just two charsets");
9915
9916 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9917 if (CHARSET_DIMENSION (charset) != 1)
9918 error ("Dimension of charset %s is not one",
9919 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9920 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9921 ASET (attrs, coding_attr_ascii_compat, Qt);
9922
9923 charset_list = XCDR (charset_list);
9924 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9925 if (CHARSET_DIMENSION (charset) != 2)
9926 error ("Dimension of charset %s is not two",
9927 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9928
9929 category = coding_category_big5;
9930 Vbig5_coding_system = name;
9931 }
9932 else if (EQ (coding_type, Qraw_text))
9933 {
9934 category = coding_category_raw_text;
9935 ASET (attrs, coding_attr_ascii_compat, Qt);
9936 }
9937 else if (EQ (coding_type, Qutf_8))
9938 {
9939 Lisp_Object bom;
9940
9941 if (nargs < coding_arg_utf8_max)
9942 goto short_args;
9943
9944 bom = args[coding_arg_utf8_bom];
9945 if (! NILP (bom) && ! EQ (bom, Qt))
9946 {
9947 CHECK_CONS (bom);
9948 val = XCAR (bom);
9949 CHECK_CODING_SYSTEM (val);
9950 val = XCDR (bom);
9951 CHECK_CODING_SYSTEM (val);
9952 }
9953 ASET (attrs, coding_attr_utf_bom, bom);
9954 if (NILP (bom))
9955 ASET (attrs, coding_attr_ascii_compat, Qt);
9956
9957 category = (CONSP (bom) ? coding_category_utf_8_auto
9958 : NILP (bom) ? coding_category_utf_8_nosig
9959 : coding_category_utf_8_sig);
9960 }
9961 else if (EQ (coding_type, Qundecided))
9962 category = coding_category_undecided;
9963 else
9964 error ("Invalid coding system type: %s",
9965 SDATA (SYMBOL_NAME (coding_type)));
9966
9967 ASET (attrs, coding_attr_category, make_number (category));
9968 ASET (attrs, coding_attr_plist,
9969 Fcons (QCcategory,
9970 Fcons (AREF (Vcoding_category_table, category),
9971 CODING_ATTR_PLIST (attrs))));
9972 ASET (attrs, coding_attr_plist,
9973 Fcons (QCascii_compatible_p,
9974 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9975 CODING_ATTR_PLIST (attrs))));
9976
9977 eol_type = args[coding_arg_eol_type];
9978 if (! NILP (eol_type)
9979 && ! EQ (eol_type, Qunix)
9980 && ! EQ (eol_type, Qdos)
9981 && ! EQ (eol_type, Qmac))
9982 error ("Invalid eol-type");
9983
9984 aliases = Fcons (name, Qnil);
9985
9986 if (NILP (eol_type))
9987 {
9988 eol_type = make_subsidiaries (name);
9989 for (i = 0; i < 3; i++)
9990 {
9991 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9992
9993 this_name = AREF (eol_type, i);
9994 this_aliases = Fcons (this_name, Qnil);
9995 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9996 this_spec = Fmake_vector (make_number (3), attrs);
9997 ASET (this_spec, 1, this_aliases);
9998 ASET (this_spec, 2, this_eol_type);
9999 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10000 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10001 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10002 if (NILP (val))
10003 Vcoding_system_alist
10004 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10005 Vcoding_system_alist);
10006 }
10007 }
10008
10009 spec_vec = Fmake_vector (make_number (3), attrs);
10010 ASET (spec_vec, 1, aliases);
10011 ASET (spec_vec, 2, eol_type);
10012
10013 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10014 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10015 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10016 if (NILP (val))
10017 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10018 Vcoding_system_alist);
10019
10020 {
10021 int id = coding_categories[category].id;
10022
10023 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10024 setup_coding_system (name, &coding_categories[category]);
10025 }
10026
10027 return Qnil;
10028
10029 short_args:
10030 return Fsignal (Qwrong_number_of_arguments,
10031 Fcons (intern ("define-coding-system-internal"),
10032 make_number (nargs)));
10033 }
10034
10035
10036 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10037 3, 3, 0,
10038 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10039 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10040 {
10041 Lisp_Object spec, attrs;
10042
10043 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10044 attrs = AREF (spec, 0);
10045 if (EQ (prop, QCmnemonic))
10046 {
10047 if (! STRINGP (val))
10048 CHECK_CHARACTER (val);
10049 ASET (attrs, coding_attr_mnemonic, val);
10050 }
10051 else if (EQ (prop, QCdefault_char))
10052 {
10053 if (NILP (val))
10054 val = make_number (' ');
10055 else
10056 CHECK_CHARACTER (val);
10057 ASET (attrs, coding_attr_default_char, val);
10058 }
10059 else if (EQ (prop, QCdecode_translation_table))
10060 {
10061 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10062 CHECK_SYMBOL (val);
10063 ASET (attrs, coding_attr_decode_tbl, val);
10064 }
10065 else if (EQ (prop, QCencode_translation_table))
10066 {
10067 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10068 CHECK_SYMBOL (val);
10069 ASET (attrs, coding_attr_encode_tbl, val);
10070 }
10071 else if (EQ (prop, QCpost_read_conversion))
10072 {
10073 CHECK_SYMBOL (val);
10074 ASET (attrs, coding_attr_post_read, val);
10075 }
10076 else if (EQ (prop, QCpre_write_conversion))
10077 {
10078 CHECK_SYMBOL (val);
10079 ASET (attrs, coding_attr_pre_write, val);
10080 }
10081 else if (EQ (prop, QCascii_compatible_p))
10082 {
10083 ASET (attrs, coding_attr_ascii_compat, val);
10084 }
10085
10086 ASET (attrs, coding_attr_plist,
10087 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10088 return val;
10089 }
10090
10091
10092 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10093 Sdefine_coding_system_alias, 2, 2, 0,
10094 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10095 (Lisp_Object alias, Lisp_Object coding_system)
10096 {
10097 Lisp_Object spec, aliases, eol_type, val;
10098
10099 CHECK_SYMBOL (alias);
10100 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10101 aliases = AREF (spec, 1);
10102 /* ALIASES should be a list of length more than zero, and the first
10103 element is a base coding system. Append ALIAS at the tail of the
10104 list. */
10105 while (!NILP (XCDR (aliases)))
10106 aliases = XCDR (aliases);
10107 XSETCDR (aliases, Fcons (alias, Qnil));
10108
10109 eol_type = AREF (spec, 2);
10110 if (VECTORP (eol_type))
10111 {
10112 Lisp_Object subsidiaries;
10113 int i;
10114
10115 subsidiaries = make_subsidiaries (alias);
10116 for (i = 0; i < 3; i++)
10117 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10118 AREF (eol_type, i));
10119 }
10120
10121 Fputhash (alias, spec, Vcoding_system_hash_table);
10122 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10123 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10124 if (NILP (val))
10125 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10126 Vcoding_system_alist);
10127
10128 return Qnil;
10129 }
10130
10131 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10132 1, 1, 0,
10133 doc: /* Return the base of CODING-SYSTEM.
10134 Any alias or subsidiary coding system is not a base coding system. */)
10135 (Lisp_Object coding_system)
10136 {
10137 Lisp_Object spec, attrs;
10138
10139 if (NILP (coding_system))
10140 return (Qno_conversion);
10141 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10142 attrs = AREF (spec, 0);
10143 return CODING_ATTR_BASE_NAME (attrs);
10144 }
10145
10146 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10147 1, 1, 0,
10148 doc: "Return the property list of CODING-SYSTEM.")
10149 (Lisp_Object coding_system)
10150 {
10151 Lisp_Object spec, attrs;
10152
10153 if (NILP (coding_system))
10154 coding_system = Qno_conversion;
10155 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10156 attrs = AREF (spec, 0);
10157 return CODING_ATTR_PLIST (attrs);
10158 }
10159
10160
10161 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10162 1, 1, 0,
10163 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10164 (Lisp_Object coding_system)
10165 {
10166 Lisp_Object spec;
10167
10168 if (NILP (coding_system))
10169 coding_system = Qno_conversion;
10170 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10171 return AREF (spec, 1);
10172 }
10173
10174 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10175 Scoding_system_eol_type, 1, 1, 0,
10176 doc: /* Return eol-type of CODING-SYSTEM.
10177 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10178
10179 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10180 and CR respectively.
10181
10182 A vector value indicates that a format of end-of-line should be
10183 detected automatically. Nth element of the vector is the subsidiary
10184 coding system whose eol-type is N. */)
10185 (Lisp_Object coding_system)
10186 {
10187 Lisp_Object spec, eol_type;
10188 int n;
10189
10190 if (NILP (coding_system))
10191 coding_system = Qno_conversion;
10192 if (! CODING_SYSTEM_P (coding_system))
10193 return Qnil;
10194 spec = CODING_SYSTEM_SPEC (coding_system);
10195 eol_type = AREF (spec, 2);
10196 if (VECTORP (eol_type))
10197 return Fcopy_sequence (eol_type);
10198 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10199 return make_number (n);
10200 }
10201
10202 #endif /* emacs */
10203
10204 \f
10205 /*** 9. Post-amble ***/
10206
10207 void
10208 init_coding_once (void)
10209 {
10210 int i;
10211
10212 for (i = 0; i < coding_category_max; i++)
10213 {
10214 coding_categories[i].id = -1;
10215 coding_priorities[i] = i;
10216 }
10217
10218 /* ISO2022 specific initialize routine. */
10219 for (i = 0; i < 0x20; i++)
10220 iso_code_class[i] = ISO_control_0;
10221 for (i = 0x21; i < 0x7F; i++)
10222 iso_code_class[i] = ISO_graphic_plane_0;
10223 for (i = 0x80; i < 0xA0; i++)
10224 iso_code_class[i] = ISO_control_1;
10225 for (i = 0xA1; i < 0xFF; i++)
10226 iso_code_class[i] = ISO_graphic_plane_1;
10227 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10228 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10229 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10230 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10231 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10232 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10233 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10234 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10235 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10236
10237 for (i = 0; i < 256; i++)
10238 {
10239 emacs_mule_bytes[i] = 1;
10240 }
10241 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10242 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10243 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10244 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10245 }
10246
10247 #ifdef emacs
10248
10249 void
10250 syms_of_coding (void)
10251 {
10252 staticpro (&Vcoding_system_hash_table);
10253 {
10254 Lisp_Object args[2];
10255 args[0] = QCtest;
10256 args[1] = Qeq;
10257 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10258 }
10259
10260 staticpro (&Vsjis_coding_system);
10261 Vsjis_coding_system = Qnil;
10262
10263 staticpro (&Vbig5_coding_system);
10264 Vbig5_coding_system = Qnil;
10265
10266 staticpro (&Vcode_conversion_reused_workbuf);
10267 Vcode_conversion_reused_workbuf = Qnil;
10268
10269 staticpro (&Vcode_conversion_workbuf_name);
10270 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10271
10272 reused_workbuf_in_use = 0;
10273
10274 DEFSYM (Qcharset, "charset");
10275 DEFSYM (Qtarget_idx, "target-idx");
10276 DEFSYM (Qcoding_system_history, "coding-system-history");
10277 Fset (Qcoding_system_history, Qnil);
10278
10279 /* Target FILENAME is the first argument. */
10280 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10281 /* Target FILENAME is the third argument. */
10282 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10283
10284 DEFSYM (Qcall_process, "call-process");
10285 /* Target PROGRAM is the first argument. */
10286 Fput (Qcall_process, Qtarget_idx, make_number (0));
10287
10288 DEFSYM (Qcall_process_region, "call-process-region");
10289 /* Target PROGRAM is the third argument. */
10290 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10291
10292 DEFSYM (Qstart_process, "start-process");
10293 /* Target PROGRAM is the third argument. */
10294 Fput (Qstart_process, Qtarget_idx, make_number (2));
10295
10296 DEFSYM (Qopen_network_stream, "open-network-stream");
10297 /* Target SERVICE is the fourth argument. */
10298 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10299
10300 DEFSYM (Qcoding_system, "coding-system");
10301 DEFSYM (Qcoding_aliases, "coding-aliases");
10302
10303 DEFSYM (Qeol_type, "eol-type");
10304 DEFSYM (Qunix, "unix");
10305 DEFSYM (Qdos, "dos");
10306
10307 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10308 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10309 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10310 DEFSYM (Qdefault_char, "default-char");
10311 DEFSYM (Qundecided, "undecided");
10312 DEFSYM (Qno_conversion, "no-conversion");
10313 DEFSYM (Qraw_text, "raw-text");
10314
10315 DEFSYM (Qiso_2022, "iso-2022");
10316
10317 DEFSYM (Qutf_8, "utf-8");
10318 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10319
10320 #if defined (WINDOWSNT) || defined (CYGWIN)
10321 /* No, not utf-16-le: that one has a BOM. */
10322 DEFSYM (Qutf_16le, "utf-16le");
10323 #endif
10324
10325 DEFSYM (Qutf_16, "utf-16");
10326 DEFSYM (Qbig, "big");
10327 DEFSYM (Qlittle, "little");
10328
10329 DEFSYM (Qshift_jis, "shift-jis");
10330 DEFSYM (Qbig5, "big5");
10331
10332 DEFSYM (Qcoding_system_p, "coding-system-p");
10333
10334 DEFSYM (Qcoding_system_error, "coding-system-error");
10335 Fput (Qcoding_system_error, Qerror_conditions,
10336 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10337 Fput (Qcoding_system_error, Qerror_message,
10338 build_pure_c_string ("Invalid coding system"));
10339
10340 /* Intern this now in case it isn't already done.
10341 Setting this variable twice is harmless.
10342 But don't staticpro it here--that is done in alloc.c. */
10343 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10344
10345 DEFSYM (Qtranslation_table, "translation-table");
10346 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10347 DEFSYM (Qtranslation_table_id, "translation-table-id");
10348 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10349 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10350
10351 DEFSYM (Qvalid_codes, "valid-codes");
10352
10353 DEFSYM (Qemacs_mule, "emacs-mule");
10354
10355 DEFSYM (QCcategory, ":category");
10356 DEFSYM (QCmnemonic, ":mnemonic");
10357 DEFSYM (QCdefault_char, ":default-char");
10358 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10359 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10360 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10361 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10362 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10363
10364 Vcoding_category_table
10365 = Fmake_vector (make_number (coding_category_max), Qnil);
10366 staticpro (&Vcoding_category_table);
10367 /* Followings are target of code detection. */
10368 ASET (Vcoding_category_table, coding_category_iso_7,
10369 intern_c_string ("coding-category-iso-7"));
10370 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10371 intern_c_string ("coding-category-iso-7-tight"));
10372 ASET (Vcoding_category_table, coding_category_iso_8_1,
10373 intern_c_string ("coding-category-iso-8-1"));
10374 ASET (Vcoding_category_table, coding_category_iso_8_2,
10375 intern_c_string ("coding-category-iso-8-2"));
10376 ASET (Vcoding_category_table, coding_category_iso_7_else,
10377 intern_c_string ("coding-category-iso-7-else"));
10378 ASET (Vcoding_category_table, coding_category_iso_8_else,
10379 intern_c_string ("coding-category-iso-8-else"));
10380 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10381 intern_c_string ("coding-category-utf-8-auto"));
10382 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10383 intern_c_string ("coding-category-utf-8"));
10384 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10385 intern_c_string ("coding-category-utf-8-sig"));
10386 ASET (Vcoding_category_table, coding_category_utf_16_be,
10387 intern_c_string ("coding-category-utf-16-be"));
10388 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10389 intern_c_string ("coding-category-utf-16-auto"));
10390 ASET (Vcoding_category_table, coding_category_utf_16_le,
10391 intern_c_string ("coding-category-utf-16-le"));
10392 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10393 intern_c_string ("coding-category-utf-16-be-nosig"));
10394 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10395 intern_c_string ("coding-category-utf-16-le-nosig"));
10396 ASET (Vcoding_category_table, coding_category_charset,
10397 intern_c_string ("coding-category-charset"));
10398 ASET (Vcoding_category_table, coding_category_sjis,
10399 intern_c_string ("coding-category-sjis"));
10400 ASET (Vcoding_category_table, coding_category_big5,
10401 intern_c_string ("coding-category-big5"));
10402 ASET (Vcoding_category_table, coding_category_ccl,
10403 intern_c_string ("coding-category-ccl"));
10404 ASET (Vcoding_category_table, coding_category_emacs_mule,
10405 intern_c_string ("coding-category-emacs-mule"));
10406 /* Followings are NOT target of code detection. */
10407 ASET (Vcoding_category_table, coding_category_raw_text,
10408 intern_c_string ("coding-category-raw-text"));
10409 ASET (Vcoding_category_table, coding_category_undecided,
10410 intern_c_string ("coding-category-undecided"));
10411
10412 DEFSYM (Qinsufficient_source, "insufficient-source");
10413 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10414 DEFSYM (Qinvalid_source, "invalid-source");
10415 DEFSYM (Qinterrupted, "interrupted");
10416 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10417 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10418
10419 defsubr (&Scoding_system_p);
10420 defsubr (&Sread_coding_system);
10421 defsubr (&Sread_non_nil_coding_system);
10422 defsubr (&Scheck_coding_system);
10423 defsubr (&Sdetect_coding_region);
10424 defsubr (&Sdetect_coding_string);
10425 defsubr (&Sfind_coding_systems_region_internal);
10426 defsubr (&Sunencodable_char_position);
10427 defsubr (&Scheck_coding_systems_region);
10428 defsubr (&Sdecode_coding_region);
10429 defsubr (&Sencode_coding_region);
10430 defsubr (&Sdecode_coding_string);
10431 defsubr (&Sencode_coding_string);
10432 defsubr (&Sdecode_sjis_char);
10433 defsubr (&Sencode_sjis_char);
10434 defsubr (&Sdecode_big5_char);
10435 defsubr (&Sencode_big5_char);
10436 defsubr (&Sset_terminal_coding_system_internal);
10437 defsubr (&Sset_safe_terminal_coding_system_internal);
10438 defsubr (&Sterminal_coding_system);
10439 defsubr (&Sset_keyboard_coding_system_internal);
10440 defsubr (&Skeyboard_coding_system);
10441 defsubr (&Sfind_operation_coding_system);
10442 defsubr (&Sset_coding_system_priority);
10443 defsubr (&Sdefine_coding_system_internal);
10444 defsubr (&Sdefine_coding_system_alias);
10445 defsubr (&Scoding_system_put);
10446 defsubr (&Scoding_system_base);
10447 defsubr (&Scoding_system_plist);
10448 defsubr (&Scoding_system_aliases);
10449 defsubr (&Scoding_system_eol_type);
10450 defsubr (&Scoding_system_priority_list);
10451
10452 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10453 doc: /* List of coding systems.
10454
10455 Do not alter the value of this variable manually. This variable should be
10456 updated by the functions `define-coding-system' and
10457 `define-coding-system-alias'. */);
10458 Vcoding_system_list = Qnil;
10459
10460 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10461 doc: /* Alist of coding system names.
10462 Each element is one element list of coding system name.
10463 This variable is given to `completing-read' as COLLECTION argument.
10464
10465 Do not alter the value of this variable manually. This variable should be
10466 updated by the functions `make-coding-system' and
10467 `define-coding-system-alias'. */);
10468 Vcoding_system_alist = Qnil;
10469
10470 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10471 doc: /* List of coding-categories (symbols) ordered by priority.
10472
10473 On detecting a coding system, Emacs tries code detection algorithms
10474 associated with each coding-category one by one in this order. When
10475 one algorithm agrees with a byte sequence of source text, the coding
10476 system bound to the corresponding coding-category is selected.
10477
10478 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10479 {
10480 int i;
10481
10482 Vcoding_category_list = Qnil;
10483 for (i = coding_category_max - 1; i >= 0; i--)
10484 Vcoding_category_list
10485 = Fcons (AREF (Vcoding_category_table, i),
10486 Vcoding_category_list);
10487 }
10488
10489 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10490 doc: /* Specify the coding system for read operations.
10491 It is useful to bind this variable with `let', but do not set it globally.
10492 If the value is a coding system, it is used for decoding on read operation.
10493 If not, an appropriate element is used from one of the coding system alists.
10494 There are three such tables: `file-coding-system-alist',
10495 `process-coding-system-alist', and `network-coding-system-alist'. */);
10496 Vcoding_system_for_read = Qnil;
10497
10498 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10499 doc: /* Specify the coding system for write operations.
10500 Programs bind this variable with `let', but you should not set it globally.
10501 If the value is a coding system, it is used for encoding of output,
10502 when writing it to a file and when sending it to a file or subprocess.
10503
10504 If this does not specify a coding system, an appropriate element
10505 is used from one of the coding system alists.
10506 There are three such tables: `file-coding-system-alist',
10507 `process-coding-system-alist', and `network-coding-system-alist'.
10508 For output to files, if the above procedure does not specify a coding system,
10509 the value of `buffer-file-coding-system' is used. */);
10510 Vcoding_system_for_write = Qnil;
10511
10512 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10513 doc: /*
10514 Coding system used in the latest file or process I/O. */);
10515 Vlast_coding_system_used = Qnil;
10516
10517 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10518 doc: /*
10519 Error status of the last code conversion.
10520
10521 When an error was detected in the last code conversion, this variable
10522 is set to one of the following symbols.
10523 `insufficient-source'
10524 `inconsistent-eol'
10525 `invalid-source'
10526 `interrupted'
10527 `insufficient-memory'
10528 When no error was detected, the value doesn't change. So, to check
10529 the error status of a code conversion by this variable, you must
10530 explicitly set this variable to nil before performing code
10531 conversion. */);
10532 Vlast_code_conversion_error = Qnil;
10533
10534 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10535 doc: /*
10536 *Non-nil means always inhibit code conversion of end-of-line format.
10537 See info node `Coding Systems' and info node `Text and Binary' concerning
10538 such conversion. */);
10539 inhibit_eol_conversion = 0;
10540
10541 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10542 doc: /*
10543 Non-nil means process buffer inherits coding system of process output.
10544 Bind it to t if the process output is to be treated as if it were a file
10545 read from some filesystem. */);
10546 inherit_process_coding_system = 0;
10547
10548 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10549 doc: /*
10550 Alist to decide a coding system to use for a file I/O operation.
10551 The format is ((PATTERN . VAL) ...),
10552 where PATTERN is a regular expression matching a file name,
10553 VAL is a coding system, a cons of coding systems, or a function symbol.
10554 If VAL is a coding system, it is used for both decoding and encoding
10555 the file contents.
10556 If VAL is a cons of coding systems, the car part is used for decoding,
10557 and the cdr part is used for encoding.
10558 If VAL is a function symbol, the function must return a coding system
10559 or a cons of coding systems which are used as above. The function is
10560 called with an argument that is a list of the arguments with which
10561 `find-operation-coding-system' was called. If the function can't decide
10562 a coding system, it can return `undecided' so that the normal
10563 code-detection is performed.
10564
10565 See also the function `find-operation-coding-system'
10566 and the variable `auto-coding-alist'. */);
10567 Vfile_coding_system_alist = Qnil;
10568
10569 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10570 doc: /*
10571 Alist to decide a coding system to use for a process I/O operation.
10572 The format is ((PATTERN . VAL) ...),
10573 where PATTERN is a regular expression matching a program name,
10574 VAL is a coding system, a cons of coding systems, or a function symbol.
10575 If VAL is a coding system, it is used for both decoding what received
10576 from the program and encoding what sent to the program.
10577 If VAL is a cons of coding systems, the car part is used for decoding,
10578 and the cdr part is used for encoding.
10579 If VAL is a function symbol, the function must return a coding system
10580 or a cons of coding systems which are used as above.
10581
10582 See also the function `find-operation-coding-system'. */);
10583 Vprocess_coding_system_alist = Qnil;
10584
10585 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10586 doc: /*
10587 Alist to decide a coding system to use for a network I/O operation.
10588 The format is ((PATTERN . VAL) ...),
10589 where PATTERN is a regular expression matching a network service name
10590 or is a port number to connect to,
10591 VAL is a coding system, a cons of coding systems, or a function symbol.
10592 If VAL is a coding system, it is used for both decoding what received
10593 from the network stream and encoding what sent to the network stream.
10594 If VAL is a cons of coding systems, the car part is used for decoding,
10595 and the cdr part is used for encoding.
10596 If VAL is a function symbol, the function must return a coding system
10597 or a cons of coding systems which are used as above.
10598
10599 See also the function `find-operation-coding-system'. */);
10600 Vnetwork_coding_system_alist = Qnil;
10601
10602 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10603 doc: /* Coding system to use with system messages.
10604 Also used for decoding keyboard input on X Window system. */);
10605 Vlocale_coding_system = Qnil;
10606
10607 /* The eol mnemonics are reset in startup.el system-dependently. */
10608 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10609 doc: /*
10610 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10611 eol_mnemonic_unix = build_pure_c_string (":");
10612
10613 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10614 doc: /*
10615 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10616 eol_mnemonic_dos = build_pure_c_string ("\\");
10617
10618 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10619 doc: /*
10620 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10621 eol_mnemonic_mac = build_pure_c_string ("/");
10622
10623 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10624 doc: /*
10625 *String displayed in mode line when end-of-line format is not yet determined. */);
10626 eol_mnemonic_undecided = build_pure_c_string (":");
10627
10628 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10629 doc: /*
10630 *Non-nil enables character translation while encoding and decoding. */);
10631 Venable_character_translation = Qt;
10632
10633 DEFVAR_LISP ("standard-translation-table-for-decode",
10634 Vstandard_translation_table_for_decode,
10635 doc: /* Table for translating characters while decoding. */);
10636 Vstandard_translation_table_for_decode = Qnil;
10637
10638 DEFVAR_LISP ("standard-translation-table-for-encode",
10639 Vstandard_translation_table_for_encode,
10640 doc: /* Table for translating characters while encoding. */);
10641 Vstandard_translation_table_for_encode = Qnil;
10642
10643 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10644 doc: /* Alist of charsets vs revision numbers.
10645 While encoding, if a charset (car part of an element) is found,
10646 designate it with the escape sequence identifying revision (cdr part
10647 of the element). */);
10648 Vcharset_revision_table = Qnil;
10649
10650 DEFVAR_LISP ("default-process-coding-system",
10651 Vdefault_process_coding_system,
10652 doc: /* Cons of coding systems used for process I/O by default.
10653 The car part is used for decoding a process output,
10654 the cdr part is used for encoding a text to be sent to a process. */);
10655 Vdefault_process_coding_system = Qnil;
10656
10657 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10658 doc: /*
10659 Table of extra Latin codes in the range 128..159 (inclusive).
10660 This is a vector of length 256.
10661 If Nth element is non-nil, the existence of code N in a file
10662 \(or output of subprocess) doesn't prevent it to be detected as
10663 a coding system of ISO 2022 variant which has a flag
10664 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10665 or reading output of a subprocess.
10666 Only 128th through 159th elements have a meaning. */);
10667 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10668
10669 DEFVAR_LISP ("select-safe-coding-system-function",
10670 Vselect_safe_coding_system_function,
10671 doc: /*
10672 Function to call to select safe coding system for encoding a text.
10673
10674 If set, this function is called to force a user to select a proper
10675 coding system which can encode the text in the case that a default
10676 coding system used in each operation can't encode the text. The
10677 function should take care that the buffer is not modified while
10678 the coding system is being selected.
10679
10680 The default value is `select-safe-coding-system' (which see). */);
10681 Vselect_safe_coding_system_function = Qnil;
10682
10683 DEFVAR_BOOL ("coding-system-require-warning",
10684 coding_system_require_warning,
10685 doc: /* Internal use only.
10686 If non-nil, on writing a file, `select-safe-coding-system-function' is
10687 called even if `coding-system-for-write' is non-nil. The command
10688 `universal-coding-system-argument' binds this variable to t temporarily. */);
10689 coding_system_require_warning = 0;
10690
10691
10692 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10693 inhibit_iso_escape_detection,
10694 doc: /*
10695 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10696
10697 When Emacs reads text, it tries to detect how the text is encoded.
10698 This code detection is sensitive to escape sequences. If Emacs sees
10699 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10700 of the ISO2022 encodings, and decodes text by the corresponding coding
10701 system (e.g. `iso-2022-7bit').
10702
10703 However, there may be a case that you want to read escape sequences in
10704 a file as is. In such a case, you can set this variable to non-nil.
10705 Then the code detection will ignore any escape sequences, and no text is
10706 detected as encoded in some ISO-2022 encoding. The result is that all
10707 escape sequences become visible in a buffer.
10708
10709 The default value is nil, and it is strongly recommended not to change
10710 it. That is because many Emacs Lisp source files that contain
10711 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10712 in Emacs's distribution, and they won't be decoded correctly on
10713 reading if you suppress escape sequence detection.
10714
10715 The other way to read escape sequences in a file without decoding is
10716 to explicitly specify some coding system that doesn't use ISO-2022
10717 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10718 inhibit_iso_escape_detection = 0;
10719
10720 DEFVAR_BOOL ("inhibit-null-byte-detection",
10721 inhibit_null_byte_detection,
10722 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10723 By default, Emacs treats it as binary data, and does not attempt to
10724 decode it. The effect is as if you specified `no-conversion' for
10725 reading that text.
10726
10727 Set this to non-nil when a regular text happens to include null bytes.
10728 Examples are Index nodes of Info files and null-byte delimited output
10729 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10730 decode text as usual. */);
10731 inhibit_null_byte_detection = 0;
10732
10733 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10734 doc: /* Char table for translating self-inserting characters.
10735 This is applied to the result of input methods, not their input.
10736 See also `keyboard-translate-table'.
10737
10738 Use of this variable for character code unification was rendered
10739 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10740 internal character representation. */);
10741 Vtranslation_table_for_input = Qnil;
10742
10743 {
10744 Lisp_Object args[coding_arg_max];
10745 Lisp_Object plist[16];
10746 int i;
10747
10748 for (i = 0; i < coding_arg_max; i++)
10749 args[i] = Qnil;
10750
10751 plist[0] = intern_c_string (":name");
10752 plist[1] = args[coding_arg_name] = Qno_conversion;
10753 plist[2] = intern_c_string (":mnemonic");
10754 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10755 plist[4] = intern_c_string (":coding-type");
10756 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10757 plist[6] = intern_c_string (":ascii-compatible-p");
10758 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10759 plist[8] = intern_c_string (":default-char");
10760 plist[9] = args[coding_arg_default_char] = make_number (0);
10761 plist[10] = intern_c_string (":for-unibyte");
10762 plist[11] = args[coding_arg_for_unibyte] = Qt;
10763 plist[12] = intern_c_string (":docstring");
10764 plist[13] = build_pure_c_string ("Do no conversion.\n\
10765 \n\
10766 When you visit a file with this coding, the file is read into a\n\
10767 unibyte buffer as is, thus each byte of a file is treated as a\n\
10768 character.");
10769 plist[14] = intern_c_string (":eol-type");
10770 plist[15] = args[coding_arg_eol_type] = Qunix;
10771 args[coding_arg_plist] = Flist (16, plist);
10772 Fdefine_coding_system_internal (coding_arg_max, args);
10773
10774 plist[1] = args[coding_arg_name] = Qundecided;
10775 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10776 plist[5] = args[coding_arg_coding_type] = Qundecided;
10777 /* This is already set.
10778 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10779 plist[8] = intern_c_string (":charset-list");
10780 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10781 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10782 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10783 plist[15] = args[coding_arg_eol_type] = Qnil;
10784 args[coding_arg_plist] = Flist (16, plist);
10785 Fdefine_coding_system_internal (coding_arg_max, args);
10786 }
10787
10788 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10789
10790 {
10791 int i;
10792
10793 for (i = 0; i < coding_category_max; i++)
10794 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10795 }
10796 #if defined (DOS_NT)
10797 system_eol_type = Qdos;
10798 #else
10799 system_eol_type = Qunix;
10800 #endif
10801 staticpro (&system_eol_type);
10802 }
10803
10804 char *
10805 emacs_strerror (int error_number)
10806 {
10807 char *str;
10808
10809 synchronize_system_messages_locale ();
10810 str = strerror (error_number);
10811
10812 if (! NILP (Vlocale_coding_system))
10813 {
10814 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10815 Vlocale_coding_system,
10816 0);
10817 str = SSDATA (dec);
10818 }
10819
10820 return str;
10821 }
10822
10823 #endif /* emacs */