* lisp.h (make_gap_1): New prototype.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
10
11 This file is part of GNU Emacs.
12
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
17
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return true if the byte sequence conforms to XXX.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static bool
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
158 {
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 bool multibytep = coding->src_multibyte;
162 ptrdiff_t consumed_chars = 0;
163 int found = 0;
164 ...;
165
166 while (1)
167 {
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
171
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
176 }
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
180
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
185 }
186 #endif
187
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
194
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
199
200 Below is the template of these functions. */
201
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
205 {
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 bool multibytep = coding->src_multibyte;
216
217 while (1)
218 {
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
225 }
226
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
239 }
240 #endif
241
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
248
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
253
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
257
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
262 {
263 bool multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 ptrdiff_t produced_chars = 0;
270
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
272 {
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
275 }
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
280 }
281 #endif
282
283 \f
284 /*** 1. Preamble ***/
285
286 #include <config.h>
287 #include <stdio.h>
288
289 #include "lisp.h"
290 #include "character.h"
291 #include "buffer.h"
292 #include "charset.h"
293 #include "ccl.h"
294 #include "composite.h"
295 #include "coding.h"
296 #include "window.h"
297 #include "frame.h"
298 #include "termhooks.h"
299
300 Lisp_Object Vcoding_system_hash_table;
301
302 static Lisp_Object Qcoding_system, Qeol_type;
303 static Lisp_Object Qcoding_aliases;
304 Lisp_Object Qunix, Qdos;
305 static Lisp_Object Qmac;
306 Lisp_Object Qbuffer_file_coding_system;
307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 static Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qutf_8;
311 static Lisp_Object Qiso_2022;
312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
313 static Lisp_Object Qbig, Qlittle;
314 static Lisp_Object Qcoding_system_history;
315 static Lisp_Object Qvalid_codes;
316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
319 static Lisp_Object QCascii_compatible_p;
320
321 Lisp_Object Qcall_process, Qcall_process_region;
322 Lisp_Object Qstart_process, Qopen_network_stream;
323 static Lisp_Object Qtarget_idx;
324
325 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
326 static Lisp_Object Qinterrupted, Qinsufficient_memory;
327
328 /* If a symbol has this property, evaluate the value to define the
329 symbol as a coding system. */
330 static Lisp_Object Qcoding_system_define_form;
331
332 /* Format of end-of-line decided by system. This is Qunix on
333 Unix and Mac, Qdos on DOS/Windows.
334 This has an effect only for external encoding (i.e. for output to
335 file and process), not for in-buffer or Lisp string encoding. */
336 static Lisp_Object system_eol_type;
337
338 #ifdef emacs
339
340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
341
342 /* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344 Lisp_Object Qemacs_mule, Qraw_text;
345 Lisp_Object Qutf_8_emacs;
346
347 #if defined (WINDOWSNT) || defined (CYGWIN)
348 static Lisp_Object Qutf_16le;
349 #endif
350
351 /* Coding-systems are handed between Emacs Lisp programs and C internal
352 routines by the following three variables. */
353 /* Coding system to be used to encode text for terminal display when
354 terminal coding system is nil. */
355 struct coding_system safe_terminal_coding;
356
357 #endif /* emacs */
358
359 Lisp_Object Qtranslation_table;
360 Lisp_Object Qtranslation_table_id;
361 static Lisp_Object Qtranslation_table_for_decode;
362 static Lisp_Object Qtranslation_table_for_encode;
363
364 /* Two special coding systems. */
365 static Lisp_Object Vsjis_coding_system;
366 static Lisp_Object Vbig5_coding_system;
367
368 /* ISO2022 section */
369
370 #define CODING_ISO_INITIAL(coding, reg) \
371 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
372 coding_attr_iso_initial), \
373 reg)))
374
375
376 #define CODING_ISO_REQUEST(coding, charset_id) \
377 (((charset_id) <= (coding)->max_charset_id \
378 ? ((coding)->safe_charsets[charset_id] != 255 \
379 ? (coding)->safe_charsets[charset_id] \
380 : -1) \
381 : -1))
382
383
384 #define CODING_ISO_FLAGS(coding) \
385 ((coding)->spec.iso_2022.flags)
386 #define CODING_ISO_DESIGNATION(coding, reg) \
387 ((coding)->spec.iso_2022.current_designation[reg])
388 #define CODING_ISO_INVOCATION(coding, plane) \
389 ((coding)->spec.iso_2022.current_invocation[plane])
390 #define CODING_ISO_SINGLE_SHIFTING(coding) \
391 ((coding)->spec.iso_2022.single_shifting)
392 #define CODING_ISO_BOL(coding) \
393 ((coding)->spec.iso_2022.bol)
394 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
395 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
396 #define CODING_ISO_CMP_STATUS(coding) \
397 (&(coding)->spec.iso_2022.cmp_status)
398 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
399 ((coding)->spec.iso_2022.ctext_extended_segment_len)
400 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
401 ((coding)->spec.iso_2022.embedded_utf_8)
402
403 /* Control characters of ISO2022. */
404 /* code */ /* function */
405 #define ISO_CODE_SO 0x0E /* shift-out */
406 #define ISO_CODE_SI 0x0F /* shift-in */
407 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
408 #define ISO_CODE_ESC 0x1B /* escape */
409 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
410 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
411 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
412
413 /* All code (1-byte) of ISO2022 is classified into one of the
414 followings. */
415 enum iso_code_class_type
416 {
417 ISO_control_0, /* Control codes in the range
418 0x00..0x1F and 0x7F, except for the
419 following 5 codes. */
420 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
421 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
422 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
423 ISO_escape, /* ISO_CODE_ESC (0x1B) */
424 ISO_control_1, /* Control codes in the range
425 0x80..0x9F, except for the
426 following 3 codes. */
427 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
428 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
429 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
430 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
431 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
432 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
433 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
434 };
435
436 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
437 `iso-flags' attribute of an iso2022 coding system. */
438
439 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
440 instead of the correct short-form sequence (e.g. ESC $ A). */
441 #define CODING_ISO_FLAG_LONG_FORM 0x0001
442
443 /* If set, reset graphic planes and registers at end-of-line to the
444 initial state. */
445 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
446
447 /* If set, reset graphic planes and registers before any control
448 characters to the initial state. */
449 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
450
451 /* If set, encode by 7-bit environment. */
452 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
453
454 /* If set, use locking-shift function. */
455 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
456
457 /* If set, use single-shift function. Overwrite
458 CODING_ISO_FLAG_LOCKING_SHIFT. */
459 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
460
461 /* If set, use designation escape sequence. */
462 #define CODING_ISO_FLAG_DESIGNATION 0x0040
463
464 /* If set, produce revision number sequence. */
465 #define CODING_ISO_FLAG_REVISION 0x0080
466
467 /* If set, produce ISO6429's direction specifying sequence. */
468 #define CODING_ISO_FLAG_DIRECTION 0x0100
469
470 /* If set, assume designation states are reset at beginning of line on
471 output. */
472 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
473
474 /* If set, designation sequence should be placed at beginning of line
475 on output. */
476 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
477
478 /* If set, do not encode unsafe characters on output. */
479 #define CODING_ISO_FLAG_SAFE 0x0800
480
481 /* If set, extra latin codes (128..159) are accepted as a valid code
482 on input. */
483 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
484
485 #define CODING_ISO_FLAG_COMPOSITION 0x2000
486
487 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
488
489 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
490
491 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
492
493 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
494
495 /* A character to be produced on output if encoding of the original
496 character is prohibited by CODING_ISO_FLAG_SAFE. */
497 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
498
499 /* UTF-8 section */
500 #define CODING_UTF_8_BOM(coding) \
501 ((coding)->spec.utf_8_bom)
502
503 /* UTF-16 section */
504 #define CODING_UTF_16_BOM(coding) \
505 ((coding)->spec.utf_16.bom)
506
507 #define CODING_UTF_16_ENDIAN(coding) \
508 ((coding)->spec.utf_16.endian)
509
510 #define CODING_UTF_16_SURROGATE(coding) \
511 ((coding)->spec.utf_16.surrogate)
512
513
514 /* CCL section */
515 #define CODING_CCL_DECODER(coding) \
516 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
517 #define CODING_CCL_ENCODER(coding) \
518 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
519 #define CODING_CCL_VALIDS(coding) \
520 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
521
522 /* Index for each coding category in `coding_categories' */
523
524 enum coding_category
525 {
526 coding_category_iso_7,
527 coding_category_iso_7_tight,
528 coding_category_iso_8_1,
529 coding_category_iso_8_2,
530 coding_category_iso_7_else,
531 coding_category_iso_8_else,
532 coding_category_utf_8_auto,
533 coding_category_utf_8_nosig,
534 coding_category_utf_8_sig,
535 coding_category_utf_16_auto,
536 coding_category_utf_16_be,
537 coding_category_utf_16_le,
538 coding_category_utf_16_be_nosig,
539 coding_category_utf_16_le_nosig,
540 coding_category_charset,
541 coding_category_sjis,
542 coding_category_big5,
543 coding_category_ccl,
544 coding_category_emacs_mule,
545 /* All above are targets of code detection. */
546 coding_category_raw_text,
547 coding_category_undecided,
548 coding_category_max
549 };
550
551 /* Definitions of flag bits used in detect_coding_XXXX. */
552 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
553 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
554 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
555 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
556 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
557 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
558 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
559 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
560 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
561 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
562 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
563 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
564 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
565 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
566 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
567 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
568 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
569 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
570 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
571 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
572
573 /* This value is returned if detect_coding_mask () find nothing other
574 than ASCII characters. */
575 #define CATEGORY_MASK_ANY \
576 (CATEGORY_MASK_ISO_7 \
577 | CATEGORY_MASK_ISO_7_TIGHT \
578 | CATEGORY_MASK_ISO_8_1 \
579 | CATEGORY_MASK_ISO_8_2 \
580 | CATEGORY_MASK_ISO_7_ELSE \
581 | CATEGORY_MASK_ISO_8_ELSE \
582 | CATEGORY_MASK_UTF_8_AUTO \
583 | CATEGORY_MASK_UTF_8_NOSIG \
584 | CATEGORY_MASK_UTF_8_SIG \
585 | CATEGORY_MASK_UTF_16_AUTO \
586 | CATEGORY_MASK_UTF_16_BE \
587 | CATEGORY_MASK_UTF_16_LE \
588 | CATEGORY_MASK_UTF_16_BE_NOSIG \
589 | CATEGORY_MASK_UTF_16_LE_NOSIG \
590 | CATEGORY_MASK_CHARSET \
591 | CATEGORY_MASK_SJIS \
592 | CATEGORY_MASK_BIG5 \
593 | CATEGORY_MASK_CCL \
594 | CATEGORY_MASK_EMACS_MULE)
595
596
597 #define CATEGORY_MASK_ISO_7BIT \
598 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
599
600 #define CATEGORY_MASK_ISO_8BIT \
601 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
602
603 #define CATEGORY_MASK_ISO_ELSE \
604 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
605
606 #define CATEGORY_MASK_ISO_ESCAPE \
607 (CATEGORY_MASK_ISO_7 \
608 | CATEGORY_MASK_ISO_7_TIGHT \
609 | CATEGORY_MASK_ISO_7_ELSE \
610 | CATEGORY_MASK_ISO_8_ELSE)
611
612 #define CATEGORY_MASK_ISO \
613 ( CATEGORY_MASK_ISO_7BIT \
614 | CATEGORY_MASK_ISO_8BIT \
615 | CATEGORY_MASK_ISO_ELSE)
616
617 #define CATEGORY_MASK_UTF_16 \
618 (CATEGORY_MASK_UTF_16_AUTO \
619 | CATEGORY_MASK_UTF_16_BE \
620 | CATEGORY_MASK_UTF_16_LE \
621 | CATEGORY_MASK_UTF_16_BE_NOSIG \
622 | CATEGORY_MASK_UTF_16_LE_NOSIG)
623
624 #define CATEGORY_MASK_UTF_8 \
625 (CATEGORY_MASK_UTF_8_AUTO \
626 | CATEGORY_MASK_UTF_8_NOSIG \
627 | CATEGORY_MASK_UTF_8_SIG)
628
629 /* Table of coding categories (Lisp symbols). This variable is for
630 internal use only. */
631 static Lisp_Object Vcoding_category_table;
632
633 /* Table of coding-categories ordered by priority. */
634 static enum coding_category coding_priorities[coding_category_max];
635
636 /* Nth element is a coding context for the coding system bound to the
637 Nth coding category. */
638 static struct coding_system coding_categories[coding_category_max];
639
640 /*** Commonly used macros and functions ***/
641
642 #ifndef min
643 #define min(a, b) ((a) < (b) ? (a) : (b))
644 #endif
645 #ifndef max
646 #define max(a, b) ((a) > (b) ? (a) : (b))
647 #endif
648
649 #define CODING_GET_INFO(coding, attrs, charset_list) \
650 do { \
651 (attrs) = CODING_ID_ATTRS ((coding)->id); \
652 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
653 } while (0)
654
655
656 /* Safely get one byte from the source text pointed by SRC which ends
657 at SRC_END, and set C to that byte. If there are not enough bytes
658 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
659 and a multibyte character is found at SRC, set C to the
660 negative value of the character code. The caller should declare
661 and set these variables appropriately in advance:
662 src, src_end, multibytep */
663
664 #define ONE_MORE_BYTE(c) \
665 do { \
666 if (src == src_end) \
667 { \
668 if (src_base < src) \
669 record_conversion_result \
670 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
671 goto no_more_source; \
672 } \
673 c = *src++; \
674 if (multibytep && (c & 0x80)) \
675 { \
676 if ((c & 0xFE) == 0xC0) \
677 c = ((c & 1) << 6) | *src++; \
678 else \
679 { \
680 src--; \
681 c = - string_char (src, &src, NULL); \
682 record_conversion_result \
683 (coding, CODING_RESULT_INVALID_SRC); \
684 } \
685 } \
686 consumed_chars++; \
687 } while (0)
688
689 /* Safely get two bytes from the source text pointed by SRC which ends
690 at SRC_END, and set C1 and C2 to those bytes while skipping the
691 heading multibyte characters. If there are not enough bytes in the
692 source, it jumps to 'no_more_source'. If MULTIBYTEP and
693 a multibyte character is found for C2, set C2 to the negative value
694 of the character code. The caller should declare and set these
695 variables appropriately in advance:
696 src, src_end, multibytep
697 It is intended that this macro is used in detect_coding_utf_16. */
698
699 #define TWO_MORE_BYTES(c1, c2) \
700 do { \
701 do { \
702 if (src == src_end) \
703 goto no_more_source; \
704 c1 = *src++; \
705 if (multibytep && (c1 & 0x80)) \
706 { \
707 if ((c1 & 0xFE) == 0xC0) \
708 c1 = ((c1 & 1) << 6) | *src++; \
709 else \
710 { \
711 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
712 c1 = -1; \
713 } \
714 } \
715 } while (c1 < 0); \
716 if (src == src_end) \
717 goto no_more_source; \
718 c2 = *src++; \
719 if (multibytep && (c2 & 0x80)) \
720 { \
721 if ((c2 & 0xFE) == 0xC0) \
722 c2 = ((c2 & 1) << 6) | *src++; \
723 else \
724 c2 = -1; \
725 } \
726 } while (0)
727
728
729 /* Store a byte C in the place pointed by DST and increment DST to the
730 next free point, and increment PRODUCED_CHARS. The caller should
731 assure that C is 0..127, and declare and set the variable `dst'
732 appropriately in advance.
733 */
734
735
736 #define EMIT_ONE_ASCII_BYTE(c) \
737 do { \
738 produced_chars++; \
739 *dst++ = (c); \
740 } while (0)
741
742
743 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
744
745 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
746 do { \
747 produced_chars += 2; \
748 *dst++ = (c1), *dst++ = (c2); \
749 } while (0)
750
751
752 /* Store a byte C in the place pointed by DST and increment DST to the
753 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
754 store in an appropriate multibyte form. The caller should
755 declare and set the variables `dst' and `multibytep' appropriately
756 in advance. */
757
758 #define EMIT_ONE_BYTE(c) \
759 do { \
760 produced_chars++; \
761 if (multibytep) \
762 { \
763 unsigned ch = (c); \
764 if (ch >= 0x80) \
765 ch = BYTE8_TO_CHAR (ch); \
766 CHAR_STRING_ADVANCE (ch, dst); \
767 } \
768 else \
769 *dst++ = (c); \
770 } while (0)
771
772
773 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
774
775 #define EMIT_TWO_BYTES(c1, c2) \
776 do { \
777 produced_chars += 2; \
778 if (multibytep) \
779 { \
780 unsigned ch; \
781 \
782 ch = (c1); \
783 if (ch >= 0x80) \
784 ch = BYTE8_TO_CHAR (ch); \
785 CHAR_STRING_ADVANCE (ch, dst); \
786 ch = (c2); \
787 if (ch >= 0x80) \
788 ch = BYTE8_TO_CHAR (ch); \
789 CHAR_STRING_ADVANCE (ch, dst); \
790 } \
791 else \
792 { \
793 *dst++ = (c1); \
794 *dst++ = (c2); \
795 } \
796 } while (0)
797
798
799 #define EMIT_THREE_BYTES(c1, c2, c3) \
800 do { \
801 EMIT_ONE_BYTE (c1); \
802 EMIT_TWO_BYTES (c2, c3); \
803 } while (0)
804
805
806 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
807 do { \
808 EMIT_TWO_BYTES (c1, c2); \
809 EMIT_TWO_BYTES (c3, c4); \
810 } while (0)
811
812
813 static void
814 record_conversion_result (struct coding_system *coding,
815 enum coding_result_code result)
816 {
817 coding->result = result;
818 switch (result)
819 {
820 case CODING_RESULT_INSUFFICIENT_SRC:
821 Vlast_code_conversion_error = Qinsufficient_source;
822 break;
823 case CODING_RESULT_INCONSISTENT_EOL:
824 Vlast_code_conversion_error = Qinconsistent_eol;
825 break;
826 case CODING_RESULT_INVALID_SRC:
827 Vlast_code_conversion_error = Qinvalid_source;
828 break;
829 case CODING_RESULT_INTERRUPT:
830 Vlast_code_conversion_error = Qinterrupted;
831 break;
832 case CODING_RESULT_INSUFFICIENT_MEM:
833 Vlast_code_conversion_error = Qinsufficient_memory;
834 break;
835 case CODING_RESULT_INSUFFICIENT_DST:
836 /* Don't record this error in Vlast_code_conversion_error
837 because it happens just temporarily and is resolved when the
838 whole conversion is finished. */
839 break;
840 case CODING_RESULT_SUCCESS:
841 break;
842 default:
843 Vlast_code_conversion_error = intern ("Unknown error");
844 }
845 }
846
847 /* These wrapper macros are used to preserve validity of pointers into
848 buffer text across calls to decode_char, encode_char, etc, which
849 could cause relocation of buffers if it loads a charset map,
850 because loading a charset map allocates large structures. */
851
852 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
853 do { \
854 ptrdiff_t offset; \
855 \
856 charset_map_loaded = 0; \
857 c = DECODE_CHAR (charset, code); \
858 if (charset_map_loaded \
859 && (offset = coding_change_source (coding))) \
860 { \
861 src += offset; \
862 src_base += offset; \
863 src_end += offset; \
864 } \
865 } while (0)
866
867 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
868 do { \
869 ptrdiff_t offset; \
870 \
871 charset_map_loaded = 0; \
872 code = ENCODE_CHAR (charset, c); \
873 if (charset_map_loaded \
874 && (offset = coding_change_destination (coding))) \
875 { \
876 dst += offset; \
877 dst_end += offset; \
878 } \
879 } while (0)
880
881 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
882 do { \
883 ptrdiff_t offset; \
884 \
885 charset_map_loaded = 0; \
886 charset = char_charset (c, charset_list, code_return); \
887 if (charset_map_loaded \
888 && (offset = coding_change_destination (coding))) \
889 { \
890 dst += offset; \
891 dst_end += offset; \
892 } \
893 } while (0)
894
895 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
896 do { \
897 ptrdiff_t offset; \
898 \
899 charset_map_loaded = 0; \
900 result = CHAR_CHARSET_P (c, charset); \
901 if (charset_map_loaded \
902 && (offset = coding_change_destination (coding))) \
903 { \
904 dst += offset; \
905 dst_end += offset; \
906 } \
907 } while (0)
908
909
910 /* If there are at least BYTES length of room at dst, allocate memory
911 for coding->destination and update dst and dst_end. We don't have
912 to take care of coding->source which will be relocated. It is
913 handled by calling coding_set_source in encode_coding. */
914
915 #define ASSURE_DESTINATION(bytes) \
916 do { \
917 if (dst + (bytes) >= dst_end) \
918 { \
919 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
920 \
921 dst = alloc_destination (coding, more_bytes, dst); \
922 dst_end = coding->destination + coding->dst_bytes; \
923 } \
924 } while (0)
925
926
927 /* Store multibyte form of the character C in P, and advance P to the
928 end of the multibyte form. This used to be like CHAR_STRING_ADVANCE
929 without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
930 MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE. */
931
932 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
933
934 /* Return the character code of character whose multibyte form is at
935 P, and advance P to the end of the multibyte form. This used to be
936 like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
937 nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR. */
938
939 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
940
941 /* Set coding->source from coding->src_object. */
942
943 static void
944 coding_set_source (struct coding_system *coding)
945 {
946 if (BUFFERP (coding->src_object))
947 {
948 struct buffer *buf = XBUFFER (coding->src_object);
949
950 if (coding->src_pos < 0)
951 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
952 else
953 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
954 }
955 else if (STRINGP (coding->src_object))
956 {
957 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
958 }
959 else
960 {
961 /* Otherwise, the source is C string and is never relocated
962 automatically. Thus we don't have to update anything. */
963 }
964 }
965
966
967 /* Set coding->source from coding->src_object, and return how many
968 bytes coding->source was changed. */
969
970 static ptrdiff_t
971 coding_change_source (struct coding_system *coding)
972 {
973 const unsigned char *orig = coding->source;
974 coding_set_source (coding);
975 return coding->source - orig;
976 }
977
978
979 /* Set coding->destination from coding->dst_object. */
980
981 static void
982 coding_set_destination (struct coding_system *coding)
983 {
984 if (BUFFERP (coding->dst_object))
985 {
986 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
987 {
988 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
989 coding->dst_bytes = (GAP_END_ADDR
990 - (coding->src_bytes - coding->consumed)
991 - coding->destination);
992 }
993 else
994 {
995 /* We are sure that coding->dst_pos_byte is before the gap
996 of the buffer. */
997 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
998 + coding->dst_pos_byte - BEG_BYTE);
999 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1000 - coding->destination);
1001 }
1002 }
1003 else
1004 {
1005 /* Otherwise, the destination is C string and is never relocated
1006 automatically. Thus we don't have to update anything. */
1007 }
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object, and return how
1012 many bytes coding->destination was changed. */
1013
1014 static ptrdiff_t
1015 coding_change_destination (struct coding_system *coding)
1016 {
1017 const unsigned char *orig = coding->destination;
1018 coding_set_destination (coding);
1019 return coding->destination - orig;
1020 }
1021
1022
1023 static void
1024 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1025 {
1026 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1027 string_overflow ();
1028 coding->destination = xrealloc (coding->destination,
1029 coding->dst_bytes + bytes);
1030 coding->dst_bytes += bytes;
1031 }
1032
1033 static void
1034 coding_alloc_by_making_gap (struct coding_system *coding,
1035 ptrdiff_t gap_head_used, ptrdiff_t bytes)
1036 {
1037 if (EQ (coding->src_object, coding->dst_object))
1038 {
1039 /* The gap may contain the produced data at the head and not-yet
1040 consumed data at the tail. To preserve those data, we at
1041 first make the gap size to zero, then increase the gap
1042 size. */
1043 ptrdiff_t add = GAP_SIZE;
1044
1045 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1046 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1047 make_gap (bytes);
1048 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1049 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1050 }
1051 else
1052 make_gap_1 (XBUFFER (coding->dst_object), bytes);
1053 }
1054
1055
1056 static unsigned char *
1057 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1058 unsigned char *dst)
1059 {
1060 ptrdiff_t offset = dst - coding->destination;
1061
1062 if (BUFFERP (coding->dst_object))
1063 {
1064 struct buffer *buf = XBUFFER (coding->dst_object);
1065
1066 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1067 }
1068 else
1069 coding_alloc_by_realloc (coding, nbytes);
1070 coding_set_destination (coding);
1071 dst = coding->destination + offset;
1072 return dst;
1073 }
1074
1075 /** Macros for annotations. */
1076
1077 /* An annotation data is stored in the array coding->charbuf in this
1078 format:
1079 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1080 LENGTH is the number of elements in the annotation.
1081 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1082 NCHARS is the number of characters in the text annotated.
1083
1084 The format of the following elements depend on ANNOTATION_MASK.
1085
1086 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1087 follows:
1088 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1089
1090 NBYTES is the number of bytes specified in the header part of
1091 old-style emacs-mule encoding, or 0 for the other kind of
1092 composition.
1093
1094 METHOD is one of enum composition_method.
1095
1096 Optional COMPOSITION-COMPONENTS are characters and composition
1097 rules.
1098
1099 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1100 follows.
1101
1102 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1103 recover from an invalid annotation, and should be skipped by
1104 produce_annotation. */
1105
1106 /* Maximum length of the header of annotation data. */
1107 #define MAX_ANNOTATION_LENGTH 5
1108
1109 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1110 do { \
1111 *(buf)++ = -(len); \
1112 *(buf)++ = (mask); \
1113 *(buf)++ = (nchars); \
1114 coding->annotated = 1; \
1115 } while (0);
1116
1117 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1118 do { \
1119 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1120 *buf++ = nbytes; \
1121 *buf++ = method; \
1122 } while (0)
1123
1124
1125 #define ADD_CHARSET_DATA(buf, nchars, id) \
1126 do { \
1127 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1128 *buf++ = id; \
1129 } while (0)
1130
1131 \f
1132 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1133
1134
1135
1136 \f
1137 /*** 3. UTF-8 ***/
1138
1139 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1140 Return true if a text is encoded in UTF-8. */
1141
1142 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1143 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1144 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1145 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1146 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1147 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1148
1149 #define UTF_8_BOM_1 0xEF
1150 #define UTF_8_BOM_2 0xBB
1151 #define UTF_8_BOM_3 0xBF
1152
1153 static bool
1154 detect_coding_utf_8 (struct coding_system *coding,
1155 struct coding_detection_info *detect_info)
1156 {
1157 const unsigned char *src = coding->source, *src_base;
1158 const unsigned char *src_end = coding->source + coding->src_bytes;
1159 bool multibytep = coding->src_multibyte;
1160 ptrdiff_t consumed_chars = 0;
1161 bool bom_found = 0;
1162 bool found = 0;
1163
1164 detect_info->checked |= CATEGORY_MASK_UTF_8;
1165 /* A coding system of this category is always ASCII compatible. */
1166 src += coding->head_ascii;
1167
1168 while (1)
1169 {
1170 int c, c1, c2, c3, c4;
1171
1172 src_base = src;
1173 ONE_MORE_BYTE (c);
1174 if (c < 0 || UTF_8_1_OCTET_P (c))
1175 continue;
1176 ONE_MORE_BYTE (c1);
1177 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1178 break;
1179 if (UTF_8_2_OCTET_LEADING_P (c))
1180 {
1181 found = 1;
1182 continue;
1183 }
1184 ONE_MORE_BYTE (c2);
1185 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1186 break;
1187 if (UTF_8_3_OCTET_LEADING_P (c))
1188 {
1189 found = 1;
1190 if (src_base == coding->source
1191 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1192 bom_found = 1;
1193 continue;
1194 }
1195 ONE_MORE_BYTE (c3);
1196 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1197 break;
1198 if (UTF_8_4_OCTET_LEADING_P (c))
1199 {
1200 found = 1;
1201 continue;
1202 }
1203 ONE_MORE_BYTE (c4);
1204 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1205 break;
1206 if (UTF_8_5_OCTET_LEADING_P (c))
1207 {
1208 found = 1;
1209 continue;
1210 }
1211 break;
1212 }
1213 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1214 return 0;
1215
1216 no_more_source:
1217 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1218 {
1219 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1220 return 0;
1221 }
1222 if (bom_found)
1223 {
1224 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1225 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1226 }
1227 else
1228 {
1229 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1230 if (found)
1231 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1232 }
1233 return 1;
1234 }
1235
1236
1237 static void
1238 decode_coding_utf_8 (struct coding_system *coding)
1239 {
1240 const unsigned char *src = coding->source + coding->consumed;
1241 const unsigned char *src_end = coding->source + coding->src_bytes;
1242 const unsigned char *src_base;
1243 int *charbuf = coding->charbuf + coding->charbuf_used;
1244 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1245 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1246 bool multibytep = coding->src_multibyte;
1247 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1248 bool eol_dos
1249 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1250 int byte_after_cr = -1;
1251
1252 if (bom != utf_without_bom)
1253 {
1254 int c1, c2, c3;
1255
1256 src_base = src;
1257 ONE_MORE_BYTE (c1);
1258 if (! UTF_8_3_OCTET_LEADING_P (c1))
1259 src = src_base;
1260 else
1261 {
1262 ONE_MORE_BYTE (c2);
1263 if (! UTF_8_EXTRA_OCTET_P (c2))
1264 src = src_base;
1265 else
1266 {
1267 ONE_MORE_BYTE (c3);
1268 if (! UTF_8_EXTRA_OCTET_P (c3))
1269 src = src_base;
1270 else
1271 {
1272 if ((c1 != UTF_8_BOM_1)
1273 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1274 src = src_base;
1275 else
1276 CODING_UTF_8_BOM (coding) = utf_without_bom;
1277 }
1278 }
1279 }
1280 }
1281 CODING_UTF_8_BOM (coding) = utf_without_bom;
1282
1283 while (1)
1284 {
1285 int c, c1, c2, c3, c4, c5;
1286
1287 src_base = src;
1288 consumed_chars_base = consumed_chars;
1289
1290 if (charbuf >= charbuf_end)
1291 {
1292 if (byte_after_cr >= 0)
1293 src_base--;
1294 break;
1295 }
1296
1297 if (byte_after_cr >= 0)
1298 c1 = byte_after_cr, byte_after_cr = -1;
1299 else
1300 ONE_MORE_BYTE (c1);
1301 if (c1 < 0)
1302 {
1303 c = - c1;
1304 }
1305 else if (UTF_8_1_OCTET_P (c1))
1306 {
1307 if (eol_dos && c1 == '\r')
1308 ONE_MORE_BYTE (byte_after_cr);
1309 c = c1;
1310 }
1311 else
1312 {
1313 ONE_MORE_BYTE (c2);
1314 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1315 goto invalid_code;
1316 if (UTF_8_2_OCTET_LEADING_P (c1))
1317 {
1318 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1319 /* Reject overlong sequences here and below. Encoders
1320 producing them are incorrect, they can be misleading,
1321 and they mess up read/write invariance. */
1322 if (c < 128)
1323 goto invalid_code;
1324 }
1325 else
1326 {
1327 ONE_MORE_BYTE (c3);
1328 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1329 goto invalid_code;
1330 if (UTF_8_3_OCTET_LEADING_P (c1))
1331 {
1332 c = (((c1 & 0xF) << 12)
1333 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1334 if (c < 0x800
1335 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1336 goto invalid_code;
1337 }
1338 else
1339 {
1340 ONE_MORE_BYTE (c4);
1341 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1342 goto invalid_code;
1343 if (UTF_8_4_OCTET_LEADING_P (c1))
1344 {
1345 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1346 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1347 if (c < 0x10000)
1348 goto invalid_code;
1349 }
1350 else
1351 {
1352 ONE_MORE_BYTE (c5);
1353 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1354 goto invalid_code;
1355 if (UTF_8_5_OCTET_LEADING_P (c1))
1356 {
1357 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1358 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1359 | (c5 & 0x3F));
1360 if ((c > MAX_CHAR) || (c < 0x200000))
1361 goto invalid_code;
1362 }
1363 else
1364 goto invalid_code;
1365 }
1366 }
1367 }
1368 }
1369
1370 *charbuf++ = c;
1371 continue;
1372
1373 invalid_code:
1374 src = src_base;
1375 consumed_chars = consumed_chars_base;
1376 ONE_MORE_BYTE (c);
1377 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1378 coding->errors++;
1379 }
1380
1381 no_more_source:
1382 coding->consumed_char += consumed_chars_base;
1383 coding->consumed = src_base - coding->source;
1384 coding->charbuf_used = charbuf - coding->charbuf;
1385 }
1386
1387
1388 static bool
1389 encode_coding_utf_8 (struct coding_system *coding)
1390 {
1391 bool multibytep = coding->dst_multibyte;
1392 int *charbuf = coding->charbuf;
1393 int *charbuf_end = charbuf + coding->charbuf_used;
1394 unsigned char *dst = coding->destination + coding->produced;
1395 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1396 ptrdiff_t produced_chars = 0;
1397 int c;
1398
1399 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1400 {
1401 ASSURE_DESTINATION (3);
1402 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1403 CODING_UTF_8_BOM (coding) = utf_without_bom;
1404 }
1405
1406 if (multibytep)
1407 {
1408 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1409
1410 while (charbuf < charbuf_end)
1411 {
1412 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1413
1414 ASSURE_DESTINATION (safe_room);
1415 c = *charbuf++;
1416 if (CHAR_BYTE8_P (c))
1417 {
1418 c = CHAR_TO_BYTE8 (c);
1419 EMIT_ONE_BYTE (c);
1420 }
1421 else
1422 {
1423 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1424 for (p = str; p < pend; p++)
1425 EMIT_ONE_BYTE (*p);
1426 }
1427 }
1428 }
1429 else
1430 {
1431 int safe_room = MAX_MULTIBYTE_LENGTH;
1432
1433 while (charbuf < charbuf_end)
1434 {
1435 ASSURE_DESTINATION (safe_room);
1436 c = *charbuf++;
1437 if (CHAR_BYTE8_P (c))
1438 *dst++ = CHAR_TO_BYTE8 (c);
1439 else
1440 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1441 produced_chars++;
1442 }
1443 }
1444 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1445 coding->produced_char += produced_chars;
1446 coding->produced = dst - coding->destination;
1447 return 0;
1448 }
1449
1450
1451 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1452 Return true if a text is encoded in one of UTF-16 based coding systems. */
1453
1454 #define UTF_16_HIGH_SURROGATE_P(val) \
1455 (((val) & 0xFC00) == 0xD800)
1456
1457 #define UTF_16_LOW_SURROGATE_P(val) \
1458 (((val) & 0xFC00) == 0xDC00)
1459
1460
1461 static bool
1462 detect_coding_utf_16 (struct coding_system *coding,
1463 struct coding_detection_info *detect_info)
1464 {
1465 const unsigned char *src = coding->source;
1466 const unsigned char *src_end = coding->source + coding->src_bytes;
1467 bool multibytep = coding->src_multibyte;
1468 int c1, c2;
1469
1470 detect_info->checked |= CATEGORY_MASK_UTF_16;
1471 if (coding->mode & CODING_MODE_LAST_BLOCK
1472 && (coding->src_chars & 1))
1473 {
1474 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1475 return 0;
1476 }
1477
1478 TWO_MORE_BYTES (c1, c2);
1479 if ((c1 == 0xFF) && (c2 == 0xFE))
1480 {
1481 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1482 | CATEGORY_MASK_UTF_16_AUTO);
1483 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1484 | CATEGORY_MASK_UTF_16_BE_NOSIG
1485 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1486 }
1487 else if ((c1 == 0xFE) && (c2 == 0xFF))
1488 {
1489 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1490 | CATEGORY_MASK_UTF_16_AUTO);
1491 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1492 | CATEGORY_MASK_UTF_16_BE_NOSIG
1493 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1494 }
1495 else if (c2 < 0)
1496 {
1497 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1498 return 0;
1499 }
1500 else
1501 {
1502 /* We check the dispersion of Eth and Oth bytes where E is even and
1503 O is odd. If both are high, we assume binary data.*/
1504 unsigned char e[256], o[256];
1505 unsigned e_num = 1, o_num = 1;
1506
1507 memset (e, 0, 256);
1508 memset (o, 0, 256);
1509 e[c1] = 1;
1510 o[c2] = 1;
1511
1512 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1513 |CATEGORY_MASK_UTF_16_BE
1514 | CATEGORY_MASK_UTF_16_LE);
1515
1516 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1517 != CATEGORY_MASK_UTF_16)
1518 {
1519 TWO_MORE_BYTES (c1, c2);
1520 if (c2 < 0)
1521 break;
1522 if (! e[c1])
1523 {
1524 e[c1] = 1;
1525 e_num++;
1526 if (e_num >= 128)
1527 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1528 }
1529 if (! o[c2])
1530 {
1531 o[c2] = 1;
1532 o_num++;
1533 if (o_num >= 128)
1534 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1535 }
1536 }
1537 return 0;
1538 }
1539
1540 no_more_source:
1541 return 1;
1542 }
1543
1544 static void
1545 decode_coding_utf_16 (struct coding_system *coding)
1546 {
1547 const unsigned char *src = coding->source + coding->consumed;
1548 const unsigned char *src_end = coding->source + coding->src_bytes;
1549 const unsigned char *src_base;
1550 int *charbuf = coding->charbuf + coding->charbuf_used;
1551 /* We may produces at most 3 chars in one loop. */
1552 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1553 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1554 bool multibytep = coding->src_multibyte;
1555 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1556 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1557 int surrogate = CODING_UTF_16_SURROGATE (coding);
1558 bool eol_dos
1559 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1560 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1561
1562 if (bom == utf_with_bom)
1563 {
1564 int c, c1, c2;
1565
1566 src_base = src;
1567 ONE_MORE_BYTE (c1);
1568 ONE_MORE_BYTE (c2);
1569 c = (c1 << 8) | c2;
1570
1571 if (endian == utf_16_big_endian
1572 ? c != 0xFEFF : c != 0xFFFE)
1573 {
1574 /* The first two bytes are not BOM. Treat them as bytes
1575 for a normal character. */
1576 src = src_base;
1577 coding->errors++;
1578 }
1579 CODING_UTF_16_BOM (coding) = utf_without_bom;
1580 }
1581 else if (bom == utf_detect_bom)
1582 {
1583 /* We have already tried to detect BOM and failed in
1584 detect_coding. */
1585 CODING_UTF_16_BOM (coding) = utf_without_bom;
1586 }
1587
1588 while (1)
1589 {
1590 int c, c1, c2;
1591
1592 src_base = src;
1593 consumed_chars_base = consumed_chars;
1594
1595 if (charbuf >= charbuf_end)
1596 {
1597 if (byte_after_cr1 >= 0)
1598 src_base -= 2;
1599 break;
1600 }
1601
1602 if (byte_after_cr1 >= 0)
1603 c1 = byte_after_cr1, byte_after_cr1 = -1;
1604 else
1605 ONE_MORE_BYTE (c1);
1606 if (c1 < 0)
1607 {
1608 *charbuf++ = -c1;
1609 continue;
1610 }
1611 if (byte_after_cr2 >= 0)
1612 c2 = byte_after_cr2, byte_after_cr2 = -1;
1613 else
1614 ONE_MORE_BYTE (c2);
1615 if (c2 < 0)
1616 {
1617 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1618 *charbuf++ = -c2;
1619 continue;
1620 }
1621 c = (endian == utf_16_big_endian
1622 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1623
1624 if (surrogate)
1625 {
1626 if (! UTF_16_LOW_SURROGATE_P (c))
1627 {
1628 if (endian == utf_16_big_endian)
1629 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1630 else
1631 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1632 *charbuf++ = c1;
1633 *charbuf++ = c2;
1634 coding->errors++;
1635 if (UTF_16_HIGH_SURROGATE_P (c))
1636 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1637 else
1638 *charbuf++ = c;
1639 }
1640 else
1641 {
1642 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1643 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1644 *charbuf++ = 0x10000 + c;
1645 }
1646 }
1647 else
1648 {
1649 if (UTF_16_HIGH_SURROGATE_P (c))
1650 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1651 else
1652 {
1653 if (eol_dos && c == '\r')
1654 {
1655 ONE_MORE_BYTE (byte_after_cr1);
1656 ONE_MORE_BYTE (byte_after_cr2);
1657 }
1658 *charbuf++ = c;
1659 }
1660 }
1661 }
1662
1663 no_more_source:
1664 coding->consumed_char += consumed_chars_base;
1665 coding->consumed = src_base - coding->source;
1666 coding->charbuf_used = charbuf - coding->charbuf;
1667 }
1668
1669 static bool
1670 encode_coding_utf_16 (struct coding_system *coding)
1671 {
1672 bool multibytep = coding->dst_multibyte;
1673 int *charbuf = coding->charbuf;
1674 int *charbuf_end = charbuf + coding->charbuf_used;
1675 unsigned char *dst = coding->destination + coding->produced;
1676 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1677 int safe_room = 8;
1678 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1679 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1680 ptrdiff_t produced_chars = 0;
1681 int c;
1682
1683 if (bom != utf_without_bom)
1684 {
1685 ASSURE_DESTINATION (safe_room);
1686 if (big_endian)
1687 EMIT_TWO_BYTES (0xFE, 0xFF);
1688 else
1689 EMIT_TWO_BYTES (0xFF, 0xFE);
1690 CODING_UTF_16_BOM (coding) = utf_without_bom;
1691 }
1692
1693 while (charbuf < charbuf_end)
1694 {
1695 ASSURE_DESTINATION (safe_room);
1696 c = *charbuf++;
1697 if (c > MAX_UNICODE_CHAR)
1698 c = coding->default_char;
1699
1700 if (c < 0x10000)
1701 {
1702 if (big_endian)
1703 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1704 else
1705 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1706 }
1707 else
1708 {
1709 int c1, c2;
1710
1711 c -= 0x10000;
1712 c1 = (c >> 10) + 0xD800;
1713 c2 = (c & 0x3FF) + 0xDC00;
1714 if (big_endian)
1715 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1716 else
1717 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1718 }
1719 }
1720 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1721 coding->produced = dst - coding->destination;
1722 coding->produced_char += produced_chars;
1723 return 0;
1724 }
1725
1726 \f
1727 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1728
1729 /* Emacs' internal format for representation of multiple character
1730 sets is a kind of multi-byte encoding, i.e. characters are
1731 represented by variable-length sequences of one-byte codes.
1732
1733 ASCII characters and control characters (e.g. `tab', `newline') are
1734 represented by one-byte sequences which are their ASCII codes, in
1735 the range 0x00 through 0x7F.
1736
1737 8-bit characters of the range 0x80..0x9F are represented by
1738 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1739 code + 0x20).
1740
1741 8-bit characters of the range 0xA0..0xFF are represented by
1742 one-byte sequences which are their 8-bit code.
1743
1744 The other characters are represented by a sequence of `base
1745 leading-code', optional `extended leading-code', and one or two
1746 `position-code's. The length of the sequence is determined by the
1747 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1748 whereas extended leading-code and position-code take the range 0xA0
1749 through 0xFF. See `charset.h' for more details about leading-code
1750 and position-code.
1751
1752 --- CODE RANGE of Emacs' internal format ---
1753 character set range
1754 ------------- -----
1755 ascii 0x00..0x7F
1756 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1757 eight-bit-graphic 0xA0..0xBF
1758 ELSE 0x81..0x9D + [0xA0..0xFF]+
1759 ---------------------------------------------
1760
1761 As this is the internal character representation, the format is
1762 usually not used externally (i.e. in a file or in a data sent to a
1763 process). But, it is possible to have a text externally in this
1764 format (i.e. by encoding by the coding system `emacs-mule').
1765
1766 In that case, a sequence of one-byte codes has a slightly different
1767 form.
1768
1769 At first, all characters in eight-bit-control are represented by
1770 one-byte sequences which are their 8-bit code.
1771
1772 Next, character composition data are represented by the byte
1773 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1774 where,
1775 METHOD is 0xF2 plus one of composition method (enum
1776 composition_method),
1777
1778 BYTES is 0xA0 plus a byte length of this composition data,
1779
1780 CHARS is 0xA0 plus a number of characters composed by this
1781 data,
1782
1783 COMPONENTs are characters of multibyte form or composition
1784 rules encoded by two-byte of ASCII codes.
1785
1786 In addition, for backward compatibility, the following formats are
1787 also recognized as composition data on decoding.
1788
1789 0x80 MSEQ ...
1790 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1791
1792 Here,
1793 MSEQ is a multibyte form but in these special format:
1794 ASCII: 0xA0 ASCII_CODE+0x80,
1795 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1796 RULE is a one byte code of the range 0xA0..0xF0 that
1797 represents a composition rule.
1798 */
1799
1800 char emacs_mule_bytes[256];
1801
1802
1803 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1804 Return true if a text is encoded in 'emacs-mule'. */
1805
1806 static bool
1807 detect_coding_emacs_mule (struct coding_system *coding,
1808 struct coding_detection_info *detect_info)
1809 {
1810 const unsigned char *src = coding->source, *src_base;
1811 const unsigned char *src_end = coding->source + coding->src_bytes;
1812 bool multibytep = coding->src_multibyte;
1813 ptrdiff_t consumed_chars = 0;
1814 int c;
1815 int found = 0;
1816
1817 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1818 /* A coding system of this category is always ASCII compatible. */
1819 src += coding->head_ascii;
1820
1821 while (1)
1822 {
1823 src_base = src;
1824 ONE_MORE_BYTE (c);
1825 if (c < 0)
1826 continue;
1827 if (c == 0x80)
1828 {
1829 /* Perhaps the start of composite character. We simply skip
1830 it because analyzing it is too heavy for detecting. But,
1831 at least, we check that the composite character
1832 constitutes of more than 4 bytes. */
1833 const unsigned char *src_start;
1834
1835 repeat:
1836 src_start = src;
1837 do
1838 {
1839 ONE_MORE_BYTE (c);
1840 }
1841 while (c >= 0xA0);
1842
1843 if (src - src_start <= 4)
1844 break;
1845 found = CATEGORY_MASK_EMACS_MULE;
1846 if (c == 0x80)
1847 goto repeat;
1848 }
1849
1850 if (c < 0x80)
1851 {
1852 if (c < 0x20
1853 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1854 break;
1855 }
1856 else
1857 {
1858 int more_bytes = emacs_mule_bytes[c] - 1;
1859
1860 while (more_bytes > 0)
1861 {
1862 ONE_MORE_BYTE (c);
1863 if (c < 0xA0)
1864 {
1865 src--; /* Unread the last byte. */
1866 break;
1867 }
1868 more_bytes--;
1869 }
1870 if (more_bytes != 0)
1871 break;
1872 found = CATEGORY_MASK_EMACS_MULE;
1873 }
1874 }
1875 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1876 return 0;
1877
1878 no_more_source:
1879 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1880 {
1881 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1882 return 0;
1883 }
1884 detect_info->found |= found;
1885 return 1;
1886 }
1887
1888
1889 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1890 character. If CMP_STATUS indicates that we must expect MSEQ or
1891 RULE described above, decode it and return the negative value of
1892 the decoded character or rule. If an invalid byte is found, return
1893 -1. If SRC is too short, return -2. */
1894
1895 static int
1896 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1897 int *nbytes, int *nchars, int *id,
1898 struct composition_status *cmp_status)
1899 {
1900 const unsigned char *src_end = coding->source + coding->src_bytes;
1901 const unsigned char *src_base = src;
1902 bool multibytep = coding->src_multibyte;
1903 int charset_ID;
1904 unsigned code;
1905 int c;
1906 int consumed_chars = 0;
1907 bool mseq_found = 0;
1908
1909 ONE_MORE_BYTE (c);
1910 if (c < 0)
1911 {
1912 c = -c;
1913 charset_ID = emacs_mule_charset[0];
1914 }
1915 else
1916 {
1917 if (c >= 0xA0)
1918 {
1919 if (cmp_status->state != COMPOSING_NO
1920 && cmp_status->old_form)
1921 {
1922 if (cmp_status->state == COMPOSING_CHAR)
1923 {
1924 if (c == 0xA0)
1925 {
1926 ONE_MORE_BYTE (c);
1927 c -= 0x80;
1928 if (c < 0)
1929 goto invalid_code;
1930 }
1931 else
1932 c -= 0x20;
1933 mseq_found = 1;
1934 }
1935 else
1936 {
1937 *nbytes = src - src_base;
1938 *nchars = consumed_chars;
1939 return -c;
1940 }
1941 }
1942 else
1943 goto invalid_code;
1944 }
1945
1946 switch (emacs_mule_bytes[c])
1947 {
1948 case 2:
1949 if ((charset_ID = emacs_mule_charset[c]) < 0)
1950 goto invalid_code;
1951 ONE_MORE_BYTE (c);
1952 if (c < 0xA0)
1953 goto invalid_code;
1954 code = c & 0x7F;
1955 break;
1956
1957 case 3:
1958 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1959 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1960 {
1961 ONE_MORE_BYTE (c);
1962 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1963 goto invalid_code;
1964 ONE_MORE_BYTE (c);
1965 if (c < 0xA0)
1966 goto invalid_code;
1967 code = c & 0x7F;
1968 }
1969 else
1970 {
1971 if ((charset_ID = emacs_mule_charset[c]) < 0)
1972 goto invalid_code;
1973 ONE_MORE_BYTE (c);
1974 if (c < 0xA0)
1975 goto invalid_code;
1976 code = (c & 0x7F) << 8;
1977 ONE_MORE_BYTE (c);
1978 if (c < 0xA0)
1979 goto invalid_code;
1980 code |= c & 0x7F;
1981 }
1982 break;
1983
1984 case 4:
1985 ONE_MORE_BYTE (c);
1986 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1987 goto invalid_code;
1988 ONE_MORE_BYTE (c);
1989 if (c < 0xA0)
1990 goto invalid_code;
1991 code = (c & 0x7F) << 8;
1992 ONE_MORE_BYTE (c);
1993 if (c < 0xA0)
1994 goto invalid_code;
1995 code |= c & 0x7F;
1996 break;
1997
1998 case 1:
1999 code = c;
2000 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2001 break;
2002
2003 default:
2004 emacs_abort ();
2005 }
2006 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2007 CHARSET_FROM_ID (charset_ID), code, c);
2008 if (c < 0)
2009 goto invalid_code;
2010 }
2011 *nbytes = src - src_base;
2012 *nchars = consumed_chars;
2013 if (id)
2014 *id = charset_ID;
2015 return (mseq_found ? -c : c);
2016
2017 no_more_source:
2018 return -2;
2019
2020 invalid_code:
2021 return -1;
2022 }
2023
2024
2025 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2026
2027 /* Handle these composition sequence ('|': the end of header elements,
2028 BYTES and CHARS >= 0xA0):
2029
2030 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2031 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2032 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2033
2034 and these old form:
2035
2036 (4) relative composition: 0x80 | MSEQ ... MSEQ
2037 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2038
2039 When the starter 0x80 and the following header elements are found,
2040 this annotation header is produced.
2041
2042 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2043
2044 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2045 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2046
2047 Then, upon reading the following elements, these codes are produced
2048 until the composition end is found:
2049
2050 (1) CHAR ... CHAR
2051 (2) ALT ... ALT CHAR ... CHAR
2052 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2053 (4) CHAR ... CHAR
2054 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2055
2056 When the composition end is found, LENGTH and NCHARS in the
2057 annotation header is updated as below:
2058
2059 (1) LENGTH: unchanged, NCHARS: unchanged
2060 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2061 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2062 (4) LENGTH: unchanged, NCHARS: number of CHARs
2063 (5) LENGTH: unchanged, NCHARS: number of CHARs
2064
2065 If an error is found while composing, the annotation header is
2066 changed to the original composition header (plus filler -1s) as
2067 below:
2068
2069 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2070 (5) [ 0x80 0xFF -1 -1- -1 ]
2071
2072 and the sequence [ -2 DECODED-RULE ] is changed to the original
2073 byte sequence as below:
2074 o the original byte sequence is B: [ B -1 ]
2075 o the original byte sequence is B1 B2: [ B1 B2 ]
2076
2077 Most of the routines are implemented by macros because many
2078 variables and labels in the caller decode_coding_emacs_mule must be
2079 accessible, and they are usually called just once (thus doesn't
2080 increase the size of compiled object). */
2081
2082 /* Decode a composition rule represented by C as a component of
2083 composition sequence of Emacs 20 style. Set RULE to the decoded
2084 rule. */
2085
2086 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2087 do { \
2088 int gref, nref; \
2089 \
2090 c -= 0xA0; \
2091 if (c < 0 || c >= 81) \
2092 goto invalid_code; \
2093 gref = c / 9, nref = c % 9; \
2094 if (gref == 4) gref = 10; \
2095 if (nref == 4) nref = 10; \
2096 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2097 } while (0)
2098
2099
2100 /* Decode a composition rule represented by C and the following byte
2101 at SRC as a component of composition sequence of Emacs 21 style.
2102 Set RULE to the decoded rule. */
2103
2104 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2105 do { \
2106 int gref, nref; \
2107 \
2108 gref = c - 0x20; \
2109 if (gref < 0 || gref >= 81) \
2110 goto invalid_code; \
2111 ONE_MORE_BYTE (c); \
2112 nref = c - 0x20; \
2113 if (nref < 0 || nref >= 81) \
2114 goto invalid_code; \
2115 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2116 } while (0)
2117
2118
2119 /* Start of Emacs 21 style format. The first three bytes at SRC are
2120 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2121 byte length of this composition information, CHARS is the number of
2122 characters composed by this composition. */
2123
2124 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2125 do { \
2126 enum composition_method method = c - 0xF2; \
2127 int nbytes, nchars; \
2128 \
2129 ONE_MORE_BYTE (c); \
2130 if (c < 0) \
2131 goto invalid_code; \
2132 nbytes = c - 0xA0; \
2133 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2134 goto invalid_code; \
2135 ONE_MORE_BYTE (c); \
2136 nchars = c - 0xA0; \
2137 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2138 goto invalid_code; \
2139 cmp_status->old_form = 0; \
2140 cmp_status->method = method; \
2141 if (method == COMPOSITION_RELATIVE) \
2142 cmp_status->state = COMPOSING_CHAR; \
2143 else \
2144 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2145 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2146 cmp_status->nchars = nchars; \
2147 cmp_status->ncomps = nbytes - 4; \
2148 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2149 } while (0)
2150
2151
2152 /* Start of Emacs 20 style format for relative composition. */
2153
2154 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2155 do { \
2156 cmp_status->old_form = 1; \
2157 cmp_status->method = COMPOSITION_RELATIVE; \
2158 cmp_status->state = COMPOSING_CHAR; \
2159 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2160 cmp_status->nchars = cmp_status->ncomps = 0; \
2161 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2162 } while (0)
2163
2164
2165 /* Start of Emacs 20 style format for rule-base composition. */
2166
2167 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2168 do { \
2169 cmp_status->old_form = 1; \
2170 cmp_status->method = COMPOSITION_WITH_RULE; \
2171 cmp_status->state = COMPOSING_CHAR; \
2172 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2173 cmp_status->nchars = cmp_status->ncomps = 0; \
2174 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2175 } while (0)
2176
2177
2178 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2179 do { \
2180 const unsigned char *current_src = src; \
2181 \
2182 ONE_MORE_BYTE (c); \
2183 if (c < 0) \
2184 goto invalid_code; \
2185 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2186 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2187 DECODE_EMACS_MULE_21_COMPOSITION (); \
2188 else if (c < 0xA0) \
2189 goto invalid_code; \
2190 else if (c < 0xC0) \
2191 { \
2192 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2193 /* Re-read C as a composition component. */ \
2194 src = current_src; \
2195 } \
2196 else if (c == 0xFF) \
2197 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2198 else \
2199 goto invalid_code; \
2200 } while (0)
2201
2202 #define EMACS_MULE_COMPOSITION_END() \
2203 do { \
2204 int idx = - cmp_status->length; \
2205 \
2206 if (cmp_status->old_form) \
2207 charbuf[idx + 2] = cmp_status->nchars; \
2208 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2209 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2210 cmp_status->state = COMPOSING_NO; \
2211 } while (0)
2212
2213
2214 static int
2215 emacs_mule_finish_composition (int *charbuf,
2216 struct composition_status *cmp_status)
2217 {
2218 int idx = - cmp_status->length;
2219 int new_chars;
2220
2221 if (cmp_status->old_form && cmp_status->nchars > 0)
2222 {
2223 charbuf[idx + 2] = cmp_status->nchars;
2224 new_chars = 0;
2225 if (cmp_status->method == COMPOSITION_WITH_RULE
2226 && cmp_status->state == COMPOSING_CHAR)
2227 {
2228 /* The last rule was invalid. */
2229 int rule = charbuf[-1] + 0xA0;
2230
2231 charbuf[-2] = BYTE8_TO_CHAR (rule);
2232 charbuf[-1] = -1;
2233 new_chars = 1;
2234 }
2235 }
2236 else
2237 {
2238 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2239
2240 if (cmp_status->method == COMPOSITION_WITH_RULE)
2241 {
2242 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2243 charbuf[idx++] = -3;
2244 charbuf[idx++] = 0;
2245 new_chars = 1;
2246 }
2247 else
2248 {
2249 int nchars = charbuf[idx + 1] + 0xA0;
2250 int nbytes = charbuf[idx + 2] + 0xA0;
2251
2252 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2253 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2254 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2255 charbuf[idx++] = -1;
2256 new_chars = 4;
2257 }
2258 }
2259 cmp_status->state = COMPOSING_NO;
2260 return new_chars;
2261 }
2262
2263 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2264 do { \
2265 if (cmp_status->state != COMPOSING_NO) \
2266 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2267 } while (0)
2268
2269
2270 static void
2271 decode_coding_emacs_mule (struct coding_system *coding)
2272 {
2273 const unsigned char *src = coding->source + coding->consumed;
2274 const unsigned char *src_end = coding->source + coding->src_bytes;
2275 const unsigned char *src_base;
2276 int *charbuf = coding->charbuf + coding->charbuf_used;
2277 /* We may produce two annotations (charset and composition) in one
2278 loop and one more charset annotation at the end. */
2279 int *charbuf_end
2280 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2281 /* We can produce up to 2 characters in a loop. */
2282 - 1;
2283 ptrdiff_t consumed_chars = 0, consumed_chars_base;
2284 bool multibytep = coding->src_multibyte;
2285 ptrdiff_t char_offset = coding->produced_char;
2286 ptrdiff_t last_offset = char_offset;
2287 int last_id = charset_ascii;
2288 bool eol_dos
2289 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2290 int byte_after_cr = -1;
2291 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2292
2293 if (cmp_status->state != COMPOSING_NO)
2294 {
2295 int i;
2296
2297 if (charbuf_end - charbuf < cmp_status->length)
2298 emacs_abort ();
2299 for (i = 0; i < cmp_status->length; i++)
2300 *charbuf++ = cmp_status->carryover[i];
2301 coding->annotated = 1;
2302 }
2303
2304 while (1)
2305 {
2306 int c, id IF_LINT (= 0);
2307
2308 src_base = src;
2309 consumed_chars_base = consumed_chars;
2310
2311 if (charbuf >= charbuf_end)
2312 {
2313 if (byte_after_cr >= 0)
2314 src_base--;
2315 break;
2316 }
2317
2318 if (byte_after_cr >= 0)
2319 c = byte_after_cr, byte_after_cr = -1;
2320 else
2321 ONE_MORE_BYTE (c);
2322
2323 if (c < 0 || c == 0x80)
2324 {
2325 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2326 if (c < 0)
2327 {
2328 *charbuf++ = -c;
2329 char_offset++;
2330 }
2331 else
2332 DECODE_EMACS_MULE_COMPOSITION_START ();
2333 continue;
2334 }
2335
2336 if (c < 0x80)
2337 {
2338 if (eol_dos && c == '\r')
2339 ONE_MORE_BYTE (byte_after_cr);
2340 id = charset_ascii;
2341 if (cmp_status->state != COMPOSING_NO)
2342 {
2343 if (cmp_status->old_form)
2344 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2345 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2346 cmp_status->ncomps--;
2347 }
2348 }
2349 else
2350 {
2351 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2352 /* emacs_mule_char can load a charset map from a file, which
2353 allocates a large structure and might cause buffer text
2354 to be relocated as result. Thus, we need to remember the
2355 original pointer to buffer text, and fix up all related
2356 pointers after the call. */
2357 const unsigned char *orig = coding->source;
2358 ptrdiff_t offset;
2359
2360 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2361 cmp_status);
2362 offset = coding->source - orig;
2363 if (offset)
2364 {
2365 src += offset;
2366 src_base += offset;
2367 src_end += offset;
2368 }
2369 if (c < 0)
2370 {
2371 if (c == -1)
2372 goto invalid_code;
2373 if (c == -2)
2374 break;
2375 }
2376 src = src_base + nbytes;
2377 consumed_chars = consumed_chars_base + nchars;
2378 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2379 cmp_status->ncomps -= nchars;
2380 }
2381
2382 /* Now if C >= 0, we found a normally encoded character, if C <
2383 0, we found an old-style composition component character or
2384 rule. */
2385
2386 if (cmp_status->state == COMPOSING_NO)
2387 {
2388 if (last_id != id)
2389 {
2390 if (last_id != charset_ascii)
2391 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2392 last_id);
2393 last_id = id;
2394 last_offset = char_offset;
2395 }
2396 *charbuf++ = c;
2397 char_offset++;
2398 }
2399 else if (cmp_status->state == COMPOSING_CHAR)
2400 {
2401 if (cmp_status->old_form)
2402 {
2403 if (c >= 0)
2404 {
2405 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2406 *charbuf++ = c;
2407 char_offset++;
2408 }
2409 else
2410 {
2411 *charbuf++ = -c;
2412 cmp_status->nchars++;
2413 cmp_status->length++;
2414 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2415 EMACS_MULE_COMPOSITION_END ();
2416 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2417 cmp_status->state = COMPOSING_RULE;
2418 }
2419 }
2420 else
2421 {
2422 *charbuf++ = c;
2423 cmp_status->length++;
2424 cmp_status->nchars--;
2425 if (cmp_status->nchars == 0)
2426 EMACS_MULE_COMPOSITION_END ();
2427 }
2428 }
2429 else if (cmp_status->state == COMPOSING_RULE)
2430 {
2431 int rule;
2432
2433 if (c >= 0)
2434 {
2435 EMACS_MULE_COMPOSITION_END ();
2436 *charbuf++ = c;
2437 char_offset++;
2438 }
2439 else
2440 {
2441 c = -c;
2442 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2443 if (rule < 0)
2444 goto invalid_code;
2445 *charbuf++ = -2;
2446 *charbuf++ = rule;
2447 cmp_status->length += 2;
2448 cmp_status->state = COMPOSING_CHAR;
2449 }
2450 }
2451 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2452 {
2453 *charbuf++ = c;
2454 cmp_status->length++;
2455 if (cmp_status->ncomps == 0)
2456 cmp_status->state = COMPOSING_CHAR;
2457 else if (cmp_status->ncomps > 0)
2458 {
2459 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2460 cmp_status->state = COMPOSING_COMPONENT_RULE;
2461 }
2462 else
2463 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2464 }
2465 else /* COMPOSING_COMPONENT_RULE */
2466 {
2467 int rule;
2468
2469 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2470 if (rule < 0)
2471 goto invalid_code;
2472 *charbuf++ = -2;
2473 *charbuf++ = rule;
2474 cmp_status->length += 2;
2475 cmp_status->ncomps--;
2476 if (cmp_status->ncomps > 0)
2477 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2478 else
2479 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2480 }
2481 continue;
2482
2483 invalid_code:
2484 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2485 src = src_base;
2486 consumed_chars = consumed_chars_base;
2487 ONE_MORE_BYTE (c);
2488 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2489 char_offset++;
2490 coding->errors++;
2491 }
2492
2493 no_more_source:
2494 if (cmp_status->state != COMPOSING_NO)
2495 {
2496 if (coding->mode & CODING_MODE_LAST_BLOCK)
2497 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2498 else
2499 {
2500 int i;
2501
2502 charbuf -= cmp_status->length;
2503 for (i = 0; i < cmp_status->length; i++)
2504 cmp_status->carryover[i] = charbuf[i];
2505 }
2506 }
2507 if (last_id != charset_ascii)
2508 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2509 coding->consumed_char += consumed_chars_base;
2510 coding->consumed = src_base - coding->source;
2511 coding->charbuf_used = charbuf - coding->charbuf;
2512 }
2513
2514
2515 #define EMACS_MULE_LEADING_CODES(id, codes) \
2516 do { \
2517 if (id < 0xA0) \
2518 codes[0] = id, codes[1] = 0; \
2519 else if (id < 0xE0) \
2520 codes[0] = 0x9A, codes[1] = id; \
2521 else if (id < 0xF0) \
2522 codes[0] = 0x9B, codes[1] = id; \
2523 else if (id < 0xF5) \
2524 codes[0] = 0x9C, codes[1] = id; \
2525 else \
2526 codes[0] = 0x9D, codes[1] = id; \
2527 } while (0);
2528
2529
2530 static bool
2531 encode_coding_emacs_mule (struct coding_system *coding)
2532 {
2533 bool multibytep = coding->dst_multibyte;
2534 int *charbuf = coding->charbuf;
2535 int *charbuf_end = charbuf + coding->charbuf_used;
2536 unsigned char *dst = coding->destination + coding->produced;
2537 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2538 int safe_room = 8;
2539 ptrdiff_t produced_chars = 0;
2540 Lisp_Object attrs, charset_list;
2541 int c;
2542 int preferred_charset_id = -1;
2543
2544 CODING_GET_INFO (coding, attrs, charset_list);
2545 if (! EQ (charset_list, Vemacs_mule_charset_list))
2546 {
2547 charset_list = Vemacs_mule_charset_list;
2548 ASET (attrs, coding_attr_charset_list, charset_list);
2549 }
2550
2551 while (charbuf < charbuf_end)
2552 {
2553 ASSURE_DESTINATION (safe_room);
2554 c = *charbuf++;
2555
2556 if (c < 0)
2557 {
2558 /* Handle an annotation. */
2559 switch (*charbuf)
2560 {
2561 case CODING_ANNOTATE_COMPOSITION_MASK:
2562 /* Not yet implemented. */
2563 break;
2564 case CODING_ANNOTATE_CHARSET_MASK:
2565 preferred_charset_id = charbuf[3];
2566 if (preferred_charset_id >= 0
2567 && NILP (Fmemq (make_number (preferred_charset_id),
2568 charset_list)))
2569 preferred_charset_id = -1;
2570 break;
2571 default:
2572 emacs_abort ();
2573 }
2574 charbuf += -c - 1;
2575 continue;
2576 }
2577
2578 if (ASCII_CHAR_P (c))
2579 EMIT_ONE_ASCII_BYTE (c);
2580 else if (CHAR_BYTE8_P (c))
2581 {
2582 c = CHAR_TO_BYTE8 (c);
2583 EMIT_ONE_BYTE (c);
2584 }
2585 else
2586 {
2587 struct charset *charset;
2588 unsigned code;
2589 int dimension;
2590 int emacs_mule_id;
2591 unsigned char leading_codes[2];
2592
2593 if (preferred_charset_id >= 0)
2594 {
2595 bool result;
2596
2597 charset = CHARSET_FROM_ID (preferred_charset_id);
2598 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2599 if (result)
2600 code = ENCODE_CHAR (charset, c);
2601 else
2602 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2603 &code, charset);
2604 }
2605 else
2606 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2607 &code, charset);
2608 if (! charset)
2609 {
2610 c = coding->default_char;
2611 if (ASCII_CHAR_P (c))
2612 {
2613 EMIT_ONE_ASCII_BYTE (c);
2614 continue;
2615 }
2616 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2617 &code, charset);
2618 }
2619 dimension = CHARSET_DIMENSION (charset);
2620 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2621 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2622 EMIT_ONE_BYTE (leading_codes[0]);
2623 if (leading_codes[1])
2624 EMIT_ONE_BYTE (leading_codes[1]);
2625 if (dimension == 1)
2626 EMIT_ONE_BYTE (code | 0x80);
2627 else
2628 {
2629 code |= 0x8080;
2630 EMIT_ONE_BYTE (code >> 8);
2631 EMIT_ONE_BYTE (code & 0xFF);
2632 }
2633 }
2634 }
2635 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2636 coding->produced_char += produced_chars;
2637 coding->produced = dst - coding->destination;
2638 return 0;
2639 }
2640
2641 \f
2642 /*** 7. ISO2022 handlers ***/
2643
2644 /* The following note describes the coding system ISO2022 briefly.
2645 Since the intention of this note is to help understand the
2646 functions in this file, some parts are NOT ACCURATE or are OVERLY
2647 SIMPLIFIED. For thorough understanding, please refer to the
2648 original document of ISO2022. This is equivalent to the standard
2649 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2650
2651 ISO2022 provides many mechanisms to encode several character sets
2652 in 7-bit and 8-bit environments. For 7-bit environments, all text
2653 is encoded using bytes less than 128. This may make the encoded
2654 text a little bit longer, but the text passes more easily through
2655 several types of gateway, some of which strip off the MSB (Most
2656 Significant Bit).
2657
2658 There are two kinds of character sets: control character sets and
2659 graphic character sets. The former contain control characters such
2660 as `newline' and `escape' to provide control functions (control
2661 functions are also provided by escape sequences). The latter
2662 contain graphic characters such as 'A' and '-'. Emacs recognizes
2663 two control character sets and many graphic character sets.
2664
2665 Graphic character sets are classified into one of the following
2666 four classes, according to the number of bytes (DIMENSION) and
2667 number of characters in one dimension (CHARS) of the set:
2668 - DIMENSION1_CHARS94
2669 - DIMENSION1_CHARS96
2670 - DIMENSION2_CHARS94
2671 - DIMENSION2_CHARS96
2672
2673 In addition, each character set is assigned an identification tag,
2674 unique for each set, called the "final character" (denoted as <F>
2675 hereafter). The <F> of each character set is decided by ECMA(*)
2676 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2677 (0x30..0x3F are for private use only).
2678
2679 Note (*): ECMA = European Computer Manufacturers Association
2680
2681 Here are examples of graphic character sets [NAME(<F>)]:
2682 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2683 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2684 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2685 o DIMENSION2_CHARS96 -- none for the moment
2686
2687 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2688 C0 [0x00..0x1F] -- control character plane 0
2689 GL [0x20..0x7F] -- graphic character plane 0
2690 C1 [0x80..0x9F] -- control character plane 1
2691 GR [0xA0..0xFF] -- graphic character plane 1
2692
2693 A control character set is directly designated and invoked to C0 or
2694 C1 by an escape sequence. The most common case is that:
2695 - ISO646's control character set is designated/invoked to C0, and
2696 - ISO6429's control character set is designated/invoked to C1,
2697 and usually these designations/invocations are omitted in encoded
2698 text. In a 7-bit environment, only C0 can be used, and a control
2699 character for C1 is encoded by an appropriate escape sequence to
2700 fit into the environment. All control characters for C1 are
2701 defined to have corresponding escape sequences.
2702
2703 A graphic character set is at first designated to one of four
2704 graphic registers (G0 through G3), then these graphic registers are
2705 invoked to GL or GR. These designations and invocations can be
2706 done independently. The most common case is that G0 is invoked to
2707 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2708 these invocations and designations are omitted in encoded text.
2709 In a 7-bit environment, only GL can be used.
2710
2711 When a graphic character set of CHARS94 is invoked to GL, codes
2712 0x20 and 0x7F of the GL area work as control characters SPACE and
2713 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2714 be used.
2715
2716 There are two ways of invocation: locking-shift and single-shift.
2717 With locking-shift, the invocation lasts until the next different
2718 invocation, whereas with single-shift, the invocation affects the
2719 following character only and doesn't affect the locking-shift
2720 state. Invocations are done by the following control characters or
2721 escape sequences:
2722
2723 ----------------------------------------------------------------------
2724 abbrev function cntrl escape seq description
2725 ----------------------------------------------------------------------
2726 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2727 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2728 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2729 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2730 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2731 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2732 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2733 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2734 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2735 ----------------------------------------------------------------------
2736 (*) These are not used by any known coding system.
2737
2738 Control characters for these functions are defined by macros
2739 ISO_CODE_XXX in `coding.h'.
2740
2741 Designations are done by the following escape sequences:
2742 ----------------------------------------------------------------------
2743 escape sequence description
2744 ----------------------------------------------------------------------
2745 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2746 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2747 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2748 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2749 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2750 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2751 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2752 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2753 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2754 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2755 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2756 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2757 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2758 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2759 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2760 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2761 ----------------------------------------------------------------------
2762
2763 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2764 of dimension 1, chars 94, and final character <F>, etc...
2765
2766 Note (*): Although these designations are not allowed in ISO2022,
2767 Emacs accepts them on decoding, and produces them on encoding
2768 CHARS96 character sets in a coding system which is characterized as
2769 7-bit environment, non-locking-shift, and non-single-shift.
2770
2771 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2772 '(' must be omitted. We refer to this as "short-form" hereafter.
2773
2774 Now you may notice that there are a lot of ways of encoding the
2775 same multilingual text in ISO2022. Actually, there exist many
2776 coding systems such as Compound Text (used in X11's inter client
2777 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2778 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2779 localized platforms), and all of these are variants of ISO2022.
2780
2781 In addition to the above, Emacs handles two more kinds of escape
2782 sequences: ISO6429's direction specification and Emacs' private
2783 sequence for specifying character composition.
2784
2785 ISO6429's direction specification takes the following form:
2786 o CSI ']' -- end of the current direction
2787 o CSI '0' ']' -- end of the current direction
2788 o CSI '1' ']' -- start of left-to-right text
2789 o CSI '2' ']' -- start of right-to-left text
2790 The control character CSI (0x9B: control sequence introducer) is
2791 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2792
2793 Character composition specification takes the following form:
2794 o ESC '0' -- start relative composition
2795 o ESC '1' -- end composition
2796 o ESC '2' -- start rule-base composition (*)
2797 o ESC '3' -- start relative composition with alternate chars (**)
2798 o ESC '4' -- start rule-base composition with alternate chars (**)
2799 Since these are not standard escape sequences of any ISO standard,
2800 the use of them with these meanings is restricted to Emacs only.
2801
2802 (*) This form is used only in Emacs 20.7 and older versions,
2803 but newer versions can safely decode it.
2804 (**) This form is used only in Emacs 21.1 and newer versions,
2805 and older versions can't decode it.
2806
2807 Here's a list of example usages of these composition escape
2808 sequences (categorized by `enum composition_method').
2809
2810 COMPOSITION_RELATIVE:
2811 ESC 0 CHAR [ CHAR ] ESC 1
2812 COMPOSITION_WITH_RULE:
2813 ESC 2 CHAR [ RULE CHAR ] ESC 1
2814 COMPOSITION_WITH_ALTCHARS:
2815 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2816 COMPOSITION_WITH_RULE_ALTCHARS:
2817 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2818
2819 static enum iso_code_class_type iso_code_class[256];
2820
2821 #define SAFE_CHARSET_P(coding, id) \
2822 ((id) <= (coding)->max_charset_id \
2823 && (coding)->safe_charsets[id] != 255)
2824
2825 static void
2826 setup_iso_safe_charsets (Lisp_Object attrs)
2827 {
2828 Lisp_Object charset_list, safe_charsets;
2829 Lisp_Object request;
2830 Lisp_Object reg_usage;
2831 Lisp_Object tail;
2832 EMACS_INT reg94, reg96;
2833 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2834 int max_charset_id;
2835
2836 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2837 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2838 && ! EQ (charset_list, Viso_2022_charset_list))
2839 {
2840 charset_list = Viso_2022_charset_list;
2841 ASET (attrs, coding_attr_charset_list, charset_list);
2842 ASET (attrs, coding_attr_safe_charsets, Qnil);
2843 }
2844
2845 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2846 return;
2847
2848 max_charset_id = 0;
2849 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2850 {
2851 int id = XINT (XCAR (tail));
2852 if (max_charset_id < id)
2853 max_charset_id = id;
2854 }
2855
2856 safe_charsets = make_uninit_string (max_charset_id + 1);
2857 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2858 request = AREF (attrs, coding_attr_iso_request);
2859 reg_usage = AREF (attrs, coding_attr_iso_usage);
2860 reg94 = XINT (XCAR (reg_usage));
2861 reg96 = XINT (XCDR (reg_usage));
2862
2863 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2864 {
2865 Lisp_Object id;
2866 Lisp_Object reg;
2867 struct charset *charset;
2868
2869 id = XCAR (tail);
2870 charset = CHARSET_FROM_ID (XINT (id));
2871 reg = Fcdr (Fassq (id, request));
2872 if (! NILP (reg))
2873 SSET (safe_charsets, XINT (id), XINT (reg));
2874 else if (charset->iso_chars_96)
2875 {
2876 if (reg96 < 4)
2877 SSET (safe_charsets, XINT (id), reg96);
2878 }
2879 else
2880 {
2881 if (reg94 < 4)
2882 SSET (safe_charsets, XINT (id), reg94);
2883 }
2884 }
2885 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2886 }
2887
2888
2889 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2890 Return true if a text is encoded in one of ISO-2022 based coding
2891 systems. */
2892
2893 static bool
2894 detect_coding_iso_2022 (struct coding_system *coding,
2895 struct coding_detection_info *detect_info)
2896 {
2897 const unsigned char *src = coding->source, *src_base = src;
2898 const unsigned char *src_end = coding->source + coding->src_bytes;
2899 bool multibytep = coding->src_multibyte;
2900 bool single_shifting = 0;
2901 int id;
2902 int c, c1;
2903 ptrdiff_t consumed_chars = 0;
2904 int i;
2905 int rejected = 0;
2906 int found = 0;
2907 int composition_count = -1;
2908
2909 detect_info->checked |= CATEGORY_MASK_ISO;
2910
2911 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2912 {
2913 struct coding_system *this = &(coding_categories[i]);
2914 Lisp_Object attrs, val;
2915
2916 if (this->id < 0)
2917 continue;
2918 attrs = CODING_ID_ATTRS (this->id);
2919 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2920 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2921 setup_iso_safe_charsets (attrs);
2922 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2923 this->max_charset_id = SCHARS (val) - 1;
2924 this->safe_charsets = SDATA (val);
2925 }
2926
2927 /* A coding system of this category is always ASCII compatible. */
2928 src += coding->head_ascii;
2929
2930 while (rejected != CATEGORY_MASK_ISO)
2931 {
2932 src_base = src;
2933 ONE_MORE_BYTE (c);
2934 switch (c)
2935 {
2936 case ISO_CODE_ESC:
2937 if (inhibit_iso_escape_detection)
2938 break;
2939 single_shifting = 0;
2940 ONE_MORE_BYTE (c);
2941 if (c == 'N' || c == 'O')
2942 {
2943 /* ESC <Fe> for SS2 or SS3. */
2944 single_shifting = 1;
2945 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2946 }
2947 else if (c == '1')
2948 {
2949 /* End of composition. */
2950 if (composition_count < 0
2951 || composition_count > MAX_COMPOSITION_COMPONENTS)
2952 /* Invalid */
2953 break;
2954 composition_count = -1;
2955 found |= CATEGORY_MASK_ISO;
2956 }
2957 else if (c >= '0' && c <= '4')
2958 {
2959 /* ESC <Fp> for start/end composition. */
2960 composition_count = 0;
2961 }
2962 else
2963 {
2964 if (c >= '(' && c <= '/')
2965 {
2966 /* Designation sequence for a charset of dimension 1. */
2967 ONE_MORE_BYTE (c1);
2968 if (c1 < ' ' || c1 >= 0x80
2969 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2970 /* Invalid designation sequence. Just ignore. */
2971 break;
2972 }
2973 else if (c == '$')
2974 {
2975 /* Designation sequence for a charset of dimension 2. */
2976 ONE_MORE_BYTE (c);
2977 if (c >= '@' && c <= 'B')
2978 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2979 id = iso_charset_table[1][0][c];
2980 else if (c >= '(' && c <= '/')
2981 {
2982 ONE_MORE_BYTE (c1);
2983 if (c1 < ' ' || c1 >= 0x80
2984 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2985 /* Invalid designation sequence. Just ignore. */
2986 break;
2987 }
2988 else
2989 /* Invalid designation sequence. Just ignore it. */
2990 break;
2991 }
2992 else
2993 {
2994 /* Invalid escape sequence. Just ignore it. */
2995 break;
2996 }
2997
2998 /* We found a valid designation sequence for CHARSET. */
2999 rejected |= CATEGORY_MASK_ISO_8BIT;
3000 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3001 id))
3002 found |= CATEGORY_MASK_ISO_7;
3003 else
3004 rejected |= CATEGORY_MASK_ISO_7;
3005 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3006 id))
3007 found |= CATEGORY_MASK_ISO_7_TIGHT;
3008 else
3009 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3010 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3011 id))
3012 found |= CATEGORY_MASK_ISO_7_ELSE;
3013 else
3014 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3015 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3016 id))
3017 found |= CATEGORY_MASK_ISO_8_ELSE;
3018 else
3019 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3020 }
3021 break;
3022
3023 case ISO_CODE_SO:
3024 case ISO_CODE_SI:
3025 /* Locking shift out/in. */
3026 if (inhibit_iso_escape_detection)
3027 break;
3028 single_shifting = 0;
3029 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3030 break;
3031
3032 case ISO_CODE_CSI:
3033 /* Control sequence introducer. */
3034 single_shifting = 0;
3035 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3036 found |= CATEGORY_MASK_ISO_8_ELSE;
3037 goto check_extra_latin;
3038
3039 case ISO_CODE_SS2:
3040 case ISO_CODE_SS3:
3041 /* Single shift. */
3042 if (inhibit_iso_escape_detection)
3043 break;
3044 single_shifting = 0;
3045 rejected |= CATEGORY_MASK_ISO_7BIT;
3046 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3047 & CODING_ISO_FLAG_SINGLE_SHIFT)
3048 {
3049 found |= CATEGORY_MASK_ISO_8_1;
3050 single_shifting = 1;
3051 }
3052 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3053 & CODING_ISO_FLAG_SINGLE_SHIFT)
3054 {
3055 found |= CATEGORY_MASK_ISO_8_2;
3056 single_shifting = 1;
3057 }
3058 if (single_shifting)
3059 break;
3060 check_extra_latin:
3061 if (! VECTORP (Vlatin_extra_code_table)
3062 || NILP (AREF (Vlatin_extra_code_table, c)))
3063 {
3064 rejected = CATEGORY_MASK_ISO;
3065 break;
3066 }
3067 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3068 & CODING_ISO_FLAG_LATIN_EXTRA)
3069 found |= CATEGORY_MASK_ISO_8_1;
3070 else
3071 rejected |= CATEGORY_MASK_ISO_8_1;
3072 rejected |= CATEGORY_MASK_ISO_8_2;
3073 break;
3074
3075 default:
3076 if (c < 0)
3077 continue;
3078 if (c < 0x80)
3079 {
3080 if (composition_count >= 0)
3081 composition_count++;
3082 single_shifting = 0;
3083 break;
3084 }
3085 if (c >= 0xA0)
3086 {
3087 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3088 found |= CATEGORY_MASK_ISO_8_1;
3089 /* Check the length of succeeding codes of the range
3090 0xA0..0FF. If the byte length is even, we include
3091 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3092 only when we are not single shifting. */
3093 if (! single_shifting
3094 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3095 {
3096 int len = 1;
3097 while (src < src_end)
3098 {
3099 src_base = src;
3100 ONE_MORE_BYTE (c);
3101 if (c < 0xA0)
3102 {
3103 src = src_base;
3104 break;
3105 }
3106 len++;
3107 }
3108
3109 if (len & 1 && src < src_end)
3110 {
3111 rejected |= CATEGORY_MASK_ISO_8_2;
3112 if (composition_count >= 0)
3113 composition_count += len;
3114 }
3115 else
3116 {
3117 found |= CATEGORY_MASK_ISO_8_2;
3118 if (composition_count >= 0)
3119 composition_count += len / 2;
3120 }
3121 }
3122 break;
3123 }
3124 }
3125 }
3126 detect_info->rejected |= CATEGORY_MASK_ISO;
3127 return 0;
3128
3129 no_more_source:
3130 detect_info->rejected |= rejected;
3131 detect_info->found |= (found & ~rejected);
3132 return 1;
3133 }
3134
3135
3136 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3137 escape sequence should be kept. */
3138 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3139 do { \
3140 int id, prev; \
3141 \
3142 if (final < '0' || final >= 128 \
3143 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3144 || !SAFE_CHARSET_P (coding, id)) \
3145 { \
3146 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3147 chars_96 = -1; \
3148 break; \
3149 } \
3150 prev = CODING_ISO_DESIGNATION (coding, reg); \
3151 if (id == charset_jisx0201_roman) \
3152 { \
3153 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3154 id = charset_ascii; \
3155 } \
3156 else if (id == charset_jisx0208_1978) \
3157 { \
3158 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3159 id = charset_jisx0208; \
3160 } \
3161 CODING_ISO_DESIGNATION (coding, reg) = id; \
3162 /* If there was an invalid designation to REG previously, and this \
3163 designation is ASCII to REG, we should keep this designation \
3164 sequence. */ \
3165 if (prev == -2 && id == charset_ascii) \
3166 chars_96 = -1; \
3167 } while (0)
3168
3169
3170 /* Handle these composition sequence (ALT: alternate char):
3171
3172 (1) relative composition: ESC 0 CHAR ... ESC 1
3173 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3174 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3175 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3176
3177 When the start sequence (ESC 0/2/3/4) is found, this annotation
3178 header is produced.
3179
3180 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3181
3182 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3183 produced until the end sequence (ESC 1) is found:
3184
3185 (1) CHAR ... CHAR
3186 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3187 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3188 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3189
3190 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3191 annotation header is updated as below:
3192
3193 (1) LENGTH: unchanged, NCHARS: number of CHARs
3194 (2) LENGTH: unchanged, NCHARS: number of CHARs
3195 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3196 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3197
3198 If an error is found while composing, the annotation header is
3199 changed to:
3200
3201 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3202
3203 and the sequence [ -2 DECODED-RULE ] is changed to the original
3204 byte sequence as below:
3205 o the original byte sequence is B: [ B -1 ]
3206 o the original byte sequence is B1 B2: [ B1 B2 ]
3207 and the sequence [ -1 -1 ] is changed to the original byte
3208 sequence:
3209 [ ESC '0' ]
3210 */
3211
3212 /* Decode a composition rule C1 and maybe one more byte from the
3213 source, and set RULE to the encoded composition rule. If the rule
3214 is invalid, goto invalid_code. */
3215
3216 #define DECODE_COMPOSITION_RULE(rule) \
3217 do { \
3218 rule = c1 - 32; \
3219 if (rule < 0) \
3220 goto invalid_code; \
3221 if (rule < 81) /* old format (before ver.21) */ \
3222 { \
3223 int gref = (rule) / 9; \
3224 int nref = (rule) % 9; \
3225 if (gref == 4) gref = 10; \
3226 if (nref == 4) nref = 10; \
3227 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3228 } \
3229 else /* new format (after ver.21) */ \
3230 { \
3231 int b; \
3232 \
3233 ONE_MORE_BYTE (b); \
3234 if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32)) \
3235 goto invalid_code; \
3236 rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32); \
3237 rule += 0x100; /* Distinguish it from the old format. */ \
3238 } \
3239 } while (0)
3240
3241 #define ENCODE_COMPOSITION_RULE(rule) \
3242 do { \
3243 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3244 \
3245 if (rule < 0x100) /* old format */ \
3246 { \
3247 if (gref == 10) gref = 4; \
3248 if (nref == 10) nref = 4; \
3249 charbuf[idx] = 32 + gref * 9 + nref; \
3250 charbuf[idx + 1] = -1; \
3251 new_chars++; \
3252 } \
3253 else /* new format */ \
3254 { \
3255 charbuf[idx] = 32 + 81 + gref; \
3256 charbuf[idx + 1] = 32 + nref; \
3257 new_chars += 2; \
3258 } \
3259 } while (0)
3260
3261 /* Finish the current composition as invalid. */
3262
3263 static int
3264 finish_composition (int *charbuf, struct composition_status *cmp_status)
3265 {
3266 int idx = - cmp_status->length;
3267 int new_chars;
3268
3269 /* Recover the original ESC sequence */
3270 charbuf[idx++] = ISO_CODE_ESC;
3271 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3272 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3273 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3274 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3275 : '4');
3276 charbuf[idx++] = -2;
3277 charbuf[idx++] = 0;
3278 charbuf[idx++] = -1;
3279 new_chars = cmp_status->nchars;
3280 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3281 for (; idx < 0; idx++)
3282 {
3283 int elt = charbuf[idx];
3284
3285 if (elt == -2)
3286 {
3287 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3288 idx++;
3289 }
3290 else if (elt == -1)
3291 {
3292 charbuf[idx++] = ISO_CODE_ESC;
3293 charbuf[idx] = '0';
3294 new_chars += 2;
3295 }
3296 }
3297 cmp_status->state = COMPOSING_NO;
3298 return new_chars;
3299 }
3300
3301 /* If characters are under composition, finish the composition. */
3302 #define MAYBE_FINISH_COMPOSITION() \
3303 do { \
3304 if (cmp_status->state != COMPOSING_NO) \
3305 char_offset += finish_composition (charbuf, cmp_status); \
3306 } while (0)
3307
3308 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3309
3310 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3311 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3312 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3313 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3314
3315 Produce this annotation sequence now:
3316
3317 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3318 */
3319
3320 #define DECODE_COMPOSITION_START(c1) \
3321 do { \
3322 if (c1 == '0' \
3323 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3324 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3325 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3326 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3327 { \
3328 *charbuf++ = -1; \
3329 *charbuf++= -1; \
3330 cmp_status->state = COMPOSING_CHAR; \
3331 cmp_status->length += 2; \
3332 } \
3333 else \
3334 { \
3335 MAYBE_FINISH_COMPOSITION (); \
3336 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3337 : c1 == '2' ? COMPOSITION_WITH_RULE \
3338 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3339 : COMPOSITION_WITH_RULE_ALTCHARS); \
3340 cmp_status->state \
3341 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3342 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3343 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3344 cmp_status->nchars = cmp_status->ncomps = 0; \
3345 coding->annotated = 1; \
3346 } \
3347 } while (0)
3348
3349
3350 /* Handle composition end sequence ESC 1. */
3351
3352 #define DECODE_COMPOSITION_END() \
3353 do { \
3354 if (cmp_status->nchars == 0 \
3355 || ((cmp_status->state == COMPOSING_CHAR) \
3356 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3357 { \
3358 MAYBE_FINISH_COMPOSITION (); \
3359 goto invalid_code; \
3360 } \
3361 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3362 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3363 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3364 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3365 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3366 char_offset += cmp_status->nchars; \
3367 cmp_status->state = COMPOSING_NO; \
3368 } while (0)
3369
3370 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3371
3372 #define STORE_COMPOSITION_RULE(rule) \
3373 do { \
3374 *charbuf++ = -2; \
3375 *charbuf++ = rule; \
3376 cmp_status->length += 2; \
3377 cmp_status->state--; \
3378 } while (0)
3379
3380 /* Store a composed char or a component char C in charbuf, and update
3381 cmp_status. */
3382
3383 #define STORE_COMPOSITION_CHAR(c) \
3384 do { \
3385 *charbuf++ = (c); \
3386 cmp_status->length++; \
3387 if (cmp_status->state == COMPOSING_CHAR) \
3388 cmp_status->nchars++; \
3389 else \
3390 cmp_status->ncomps++; \
3391 if (cmp_status->method == COMPOSITION_WITH_RULE \
3392 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3393 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3394 cmp_status->state++; \
3395 } while (0)
3396
3397
3398 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3399
3400 static void
3401 decode_coding_iso_2022 (struct coding_system *coding)
3402 {
3403 const unsigned char *src = coding->source + coding->consumed;
3404 const unsigned char *src_end = coding->source + coding->src_bytes;
3405 const unsigned char *src_base;
3406 int *charbuf = coding->charbuf + coding->charbuf_used;
3407 /* We may produce two annotations (charset and composition) in one
3408 loop and one more charset annotation at the end. */
3409 int *charbuf_end
3410 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3411 ptrdiff_t consumed_chars = 0, consumed_chars_base;
3412 bool multibytep = coding->src_multibyte;
3413 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3414 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3415 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3416 int charset_id_2, charset_id_3;
3417 struct charset *charset;
3418 int c;
3419 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3420 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3421 ptrdiff_t char_offset = coding->produced_char;
3422 ptrdiff_t last_offset = char_offset;
3423 int last_id = charset_ascii;
3424 bool eol_dos
3425 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3426 int byte_after_cr = -1;
3427 int i;
3428
3429 setup_iso_safe_charsets (attrs);
3430 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3431
3432 if (cmp_status->state != COMPOSING_NO)
3433 {
3434 if (charbuf_end - charbuf < cmp_status->length)
3435 emacs_abort ();
3436 for (i = 0; i < cmp_status->length; i++)
3437 *charbuf++ = cmp_status->carryover[i];
3438 coding->annotated = 1;
3439 }
3440
3441 while (1)
3442 {
3443 int c1, c2, c3;
3444
3445 src_base = src;
3446 consumed_chars_base = consumed_chars;
3447
3448 if (charbuf >= charbuf_end)
3449 {
3450 if (byte_after_cr >= 0)
3451 src_base--;
3452 break;
3453 }
3454
3455 if (byte_after_cr >= 0)
3456 c1 = byte_after_cr, byte_after_cr = -1;
3457 else
3458 ONE_MORE_BYTE (c1);
3459 if (c1 < 0)
3460 goto invalid_code;
3461
3462 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3463 {
3464 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3465 char_offset++;
3466 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3467 continue;
3468 }
3469
3470 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3471 {
3472 if (c1 == ISO_CODE_ESC)
3473 {
3474 if (src + 1 >= src_end)
3475 goto no_more_source;
3476 *charbuf++ = ISO_CODE_ESC;
3477 char_offset++;
3478 if (src[0] == '%' && src[1] == '@')
3479 {
3480 src += 2;
3481 consumed_chars += 2;
3482 char_offset += 2;
3483 /* We are sure charbuf can contain two more chars. */
3484 *charbuf++ = '%';
3485 *charbuf++ = '@';
3486 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3487 }
3488 }
3489 else
3490 {
3491 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3492 char_offset++;
3493 }
3494 continue;
3495 }
3496
3497 if ((cmp_status->state == COMPOSING_RULE
3498 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3499 && c1 != ISO_CODE_ESC)
3500 {
3501 int rule;
3502
3503 DECODE_COMPOSITION_RULE (rule);
3504 STORE_COMPOSITION_RULE (rule);
3505 continue;
3506 }
3507
3508 /* We produce at most one character. */
3509 switch (iso_code_class [c1])
3510 {
3511 case ISO_0x20_or_0x7F:
3512 if (charset_id_0 < 0
3513 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3514 /* This is SPACE or DEL. */
3515 charset = CHARSET_FROM_ID (charset_ascii);
3516 else
3517 charset = CHARSET_FROM_ID (charset_id_0);
3518 break;
3519
3520 case ISO_graphic_plane_0:
3521 if (charset_id_0 < 0)
3522 charset = CHARSET_FROM_ID (charset_ascii);
3523 else
3524 charset = CHARSET_FROM_ID (charset_id_0);
3525 break;
3526
3527 case ISO_0xA0_or_0xFF:
3528 if (charset_id_1 < 0
3529 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3530 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3531 goto invalid_code;
3532 /* This is a graphic character, we fall down ... */
3533
3534 case ISO_graphic_plane_1:
3535 if (charset_id_1 < 0)
3536 goto invalid_code;
3537 charset = CHARSET_FROM_ID (charset_id_1);
3538 break;
3539
3540 case ISO_control_0:
3541 if (eol_dos && c1 == '\r')
3542 ONE_MORE_BYTE (byte_after_cr);
3543 MAYBE_FINISH_COMPOSITION ();
3544 charset = CHARSET_FROM_ID (charset_ascii);
3545 break;
3546
3547 case ISO_control_1:
3548 goto invalid_code;
3549
3550 case ISO_shift_out:
3551 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3552 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3553 goto invalid_code;
3554 CODING_ISO_INVOCATION (coding, 0) = 1;
3555 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3556 continue;
3557
3558 case ISO_shift_in:
3559 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3560 goto invalid_code;
3561 CODING_ISO_INVOCATION (coding, 0) = 0;
3562 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3563 continue;
3564
3565 case ISO_single_shift_2_7:
3566 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3567 goto invalid_code;
3568 case ISO_single_shift_2:
3569 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3570 goto invalid_code;
3571 /* SS2 is handled as an escape sequence of ESC 'N' */
3572 c1 = 'N';
3573 goto label_escape_sequence;
3574
3575 case ISO_single_shift_3:
3576 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3577 goto invalid_code;
3578 /* SS2 is handled as an escape sequence of ESC 'O' */
3579 c1 = 'O';
3580 goto label_escape_sequence;
3581
3582 case ISO_control_sequence_introducer:
3583 /* CSI is handled as an escape sequence of ESC '[' ... */
3584 c1 = '[';
3585 goto label_escape_sequence;
3586
3587 case ISO_escape:
3588 ONE_MORE_BYTE (c1);
3589 label_escape_sequence:
3590 /* Escape sequences handled here are invocation,
3591 designation, direction specification, and character
3592 composition specification. */
3593 switch (c1)
3594 {
3595 case '&': /* revision of following character set */
3596 ONE_MORE_BYTE (c1);
3597 if (!(c1 >= '@' && c1 <= '~'))
3598 goto invalid_code;
3599 ONE_MORE_BYTE (c1);
3600 if (c1 != ISO_CODE_ESC)
3601 goto invalid_code;
3602 ONE_MORE_BYTE (c1);
3603 goto label_escape_sequence;
3604
3605 case '$': /* designation of 2-byte character set */
3606 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3607 goto invalid_code;
3608 {
3609 int reg, chars96;
3610
3611 ONE_MORE_BYTE (c1);
3612 if (c1 >= '@' && c1 <= 'B')
3613 { /* designation of JISX0208.1978, GB2312.1980,
3614 or JISX0208.1980 */
3615 reg = 0, chars96 = 0;
3616 }
3617 else if (c1 >= 0x28 && c1 <= 0x2B)
3618 { /* designation of DIMENSION2_CHARS94 character set */
3619 reg = c1 - 0x28, chars96 = 0;
3620 ONE_MORE_BYTE (c1);
3621 }
3622 else if (c1 >= 0x2C && c1 <= 0x2F)
3623 { /* designation of DIMENSION2_CHARS96 character set */
3624 reg = c1 - 0x2C, chars96 = 1;
3625 ONE_MORE_BYTE (c1);
3626 }
3627 else
3628 goto invalid_code;
3629 DECODE_DESIGNATION (reg, 2, chars96, c1);
3630 /* We must update these variables now. */
3631 if (reg == 0)
3632 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3633 else if (reg == 1)
3634 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3635 if (chars96 < 0)
3636 goto invalid_code;
3637 }
3638 continue;
3639
3640 case 'n': /* invocation of locking-shift-2 */
3641 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3642 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3643 goto invalid_code;
3644 CODING_ISO_INVOCATION (coding, 0) = 2;
3645 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3646 continue;
3647
3648 case 'o': /* invocation of locking-shift-3 */
3649 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3650 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3651 goto invalid_code;
3652 CODING_ISO_INVOCATION (coding, 0) = 3;
3653 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3654 continue;
3655
3656 case 'N': /* invocation of single-shift-2 */
3657 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3658 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3659 goto invalid_code;
3660 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3661 if (charset_id_2 < 0)
3662 charset = CHARSET_FROM_ID (charset_ascii);
3663 else
3664 charset = CHARSET_FROM_ID (charset_id_2);
3665 ONE_MORE_BYTE (c1);
3666 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3667 goto invalid_code;
3668 break;
3669
3670 case 'O': /* invocation of single-shift-3 */
3671 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3672 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3673 goto invalid_code;
3674 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3675 if (charset_id_3 < 0)
3676 charset = CHARSET_FROM_ID (charset_ascii);
3677 else
3678 charset = CHARSET_FROM_ID (charset_id_3);
3679 ONE_MORE_BYTE (c1);
3680 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3681 goto invalid_code;
3682 break;
3683
3684 case '0': case '2': case '3': case '4': /* start composition */
3685 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3686 goto invalid_code;
3687 if (last_id != charset_ascii)
3688 {
3689 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3690 last_id = charset_ascii;
3691 last_offset = char_offset;
3692 }
3693 DECODE_COMPOSITION_START (c1);
3694 continue;
3695
3696 case '1': /* end composition */
3697 if (cmp_status->state == COMPOSING_NO)
3698 goto invalid_code;
3699 DECODE_COMPOSITION_END ();
3700 continue;
3701
3702 case '[': /* specification of direction */
3703 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3704 goto invalid_code;
3705 /* For the moment, nested direction is not supported.
3706 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3707 left-to-right, and nonzero means right-to-left. */
3708 ONE_MORE_BYTE (c1);
3709 switch (c1)
3710 {
3711 case ']': /* end of the current direction */
3712 coding->mode &= ~CODING_MODE_DIRECTION;
3713
3714 case '0': /* end of the current direction */
3715 case '1': /* start of left-to-right direction */
3716 ONE_MORE_BYTE (c1);
3717 if (c1 == ']')
3718 coding->mode &= ~CODING_MODE_DIRECTION;
3719 else
3720 goto invalid_code;
3721 break;
3722
3723 case '2': /* start of right-to-left direction */
3724 ONE_MORE_BYTE (c1);
3725 if (c1 == ']')
3726 coding->mode |= CODING_MODE_DIRECTION;
3727 else
3728 goto invalid_code;
3729 break;
3730
3731 default:
3732 goto invalid_code;
3733 }
3734 continue;
3735
3736 case '%':
3737 ONE_MORE_BYTE (c1);
3738 if (c1 == '/')
3739 {
3740 /* CTEXT extended segment:
3741 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3742 We keep these bytes as is for the moment.
3743 They may be decoded by post-read-conversion. */
3744 int dim, M, L;
3745 int size;
3746
3747 ONE_MORE_BYTE (dim);
3748 if (dim < '0' || dim > '4')
3749 goto invalid_code;
3750 ONE_MORE_BYTE (M);
3751 if (M < 128)
3752 goto invalid_code;
3753 ONE_MORE_BYTE (L);
3754 if (L < 128)
3755 goto invalid_code;
3756 size = ((M - 128) * 128) + (L - 128);
3757 if (charbuf + 6 > charbuf_end)
3758 goto break_loop;
3759 *charbuf++ = ISO_CODE_ESC;
3760 *charbuf++ = '%';
3761 *charbuf++ = '/';
3762 *charbuf++ = dim;
3763 *charbuf++ = BYTE8_TO_CHAR (M);
3764 *charbuf++ = BYTE8_TO_CHAR (L);
3765 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3766 }
3767 else if (c1 == 'G')
3768 {
3769 /* XFree86 extension for embedding UTF-8 in CTEXT:
3770 ESC % G --UTF-8-BYTES-- ESC % @
3771 We keep these bytes as is for the moment.
3772 They may be decoded by post-read-conversion. */
3773 if (charbuf + 3 > charbuf_end)
3774 goto break_loop;
3775 *charbuf++ = ISO_CODE_ESC;
3776 *charbuf++ = '%';
3777 *charbuf++ = 'G';
3778 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3779 }
3780 else
3781 goto invalid_code;
3782 continue;
3783 break;
3784
3785 default:
3786 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3787 goto invalid_code;
3788 {
3789 int reg, chars96;
3790
3791 if (c1 >= 0x28 && c1 <= 0x2B)
3792 { /* designation of DIMENSION1_CHARS94 character set */
3793 reg = c1 - 0x28, chars96 = 0;
3794 ONE_MORE_BYTE (c1);
3795 }
3796 else if (c1 >= 0x2C && c1 <= 0x2F)
3797 { /* designation of DIMENSION1_CHARS96 character set */
3798 reg = c1 - 0x2C, chars96 = 1;
3799 ONE_MORE_BYTE (c1);
3800 }
3801 else
3802 goto invalid_code;
3803 DECODE_DESIGNATION (reg, 1, chars96, c1);
3804 /* We must update these variables now. */
3805 if (reg == 0)
3806 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3807 else if (reg == 1)
3808 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3809 if (chars96 < 0)
3810 goto invalid_code;
3811 }
3812 continue;
3813 }
3814 break;
3815
3816 default:
3817 emacs_abort ();
3818 }
3819
3820 if (cmp_status->state == COMPOSING_NO
3821 && charset->id != charset_ascii
3822 && last_id != charset->id)
3823 {
3824 if (last_id != charset_ascii)
3825 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3826 last_id = charset->id;
3827 last_offset = char_offset;
3828 }
3829
3830 /* Now we know CHARSET and 1st position code C1 of a character.
3831 Produce a decoded character while getting 2nd and 3rd
3832 position codes C2, C3 if necessary. */
3833 if (CHARSET_DIMENSION (charset) > 1)
3834 {
3835 ONE_MORE_BYTE (c2);
3836 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3837 || ((c1 & 0x80) != (c2 & 0x80)))
3838 /* C2 is not in a valid range. */
3839 goto invalid_code;
3840 if (CHARSET_DIMENSION (charset) == 2)
3841 c1 = (c1 << 8) | c2;
3842 else
3843 {
3844 ONE_MORE_BYTE (c3);
3845 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3846 || ((c1 & 0x80) != (c3 & 0x80)))
3847 /* C3 is not in a valid range. */
3848 goto invalid_code;
3849 c1 = (c1 << 16) | (c2 << 8) | c2;
3850 }
3851 }
3852 c1 &= 0x7F7F7F;
3853 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3854 if (c < 0)
3855 {
3856 MAYBE_FINISH_COMPOSITION ();
3857 for (; src_base < src; src_base++, char_offset++)
3858 {
3859 if (ASCII_BYTE_P (*src_base))
3860 *charbuf++ = *src_base;
3861 else
3862 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3863 }
3864 }
3865 else if (cmp_status->state == COMPOSING_NO)
3866 {
3867 *charbuf++ = c;
3868 char_offset++;
3869 }
3870 else if ((cmp_status->state == COMPOSING_CHAR
3871 ? cmp_status->nchars
3872 : cmp_status->ncomps)
3873 >= MAX_COMPOSITION_COMPONENTS)
3874 {
3875 /* Too long composition. */
3876 MAYBE_FINISH_COMPOSITION ();
3877 *charbuf++ = c;
3878 char_offset++;
3879 }
3880 else
3881 STORE_COMPOSITION_CHAR (c);
3882 continue;
3883
3884 invalid_code:
3885 MAYBE_FINISH_COMPOSITION ();
3886 src = src_base;
3887 consumed_chars = consumed_chars_base;
3888 ONE_MORE_BYTE (c);
3889 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3890 char_offset++;
3891 coding->errors++;
3892 continue;
3893
3894 break_loop:
3895 break;
3896 }
3897
3898 no_more_source:
3899 if (cmp_status->state != COMPOSING_NO)
3900 {
3901 if (coding->mode & CODING_MODE_LAST_BLOCK)
3902 MAYBE_FINISH_COMPOSITION ();
3903 else
3904 {
3905 charbuf -= cmp_status->length;
3906 for (i = 0; i < cmp_status->length; i++)
3907 cmp_status->carryover[i] = charbuf[i];
3908 }
3909 }
3910 else if (last_id != charset_ascii)
3911 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3912 coding->consumed_char += consumed_chars_base;
3913 coding->consumed = src_base - coding->source;
3914 coding->charbuf_used = charbuf - coding->charbuf;
3915 }
3916
3917
3918 /* ISO2022 encoding stuff. */
3919
3920 /*
3921 It is not enough to say just "ISO2022" on encoding, we have to
3922 specify more details. In Emacs, each coding system of ISO2022
3923 variant has the following specifications:
3924 1. Initial designation to G0 thru G3.
3925 2. Allows short-form designation?
3926 3. ASCII should be designated to G0 before control characters?
3927 4. ASCII should be designated to G0 at end of line?
3928 5. 7-bit environment or 8-bit environment?
3929 6. Use locking-shift?
3930 7. Use Single-shift?
3931 And the following two are only for Japanese:
3932 8. Use ASCII in place of JIS0201-1976-Roman?
3933 9. Use JISX0208-1983 in place of JISX0208-1978?
3934 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3935 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3936 details.
3937 */
3938
3939 /* Produce codes (escape sequence) for designating CHARSET to graphic
3940 register REG at DST, and increment DST. If <final-char> of CHARSET is
3941 '@', 'A', or 'B' and the coding system CODING allows, produce
3942 designation sequence of short-form. */
3943
3944 #define ENCODE_DESIGNATION(charset, reg, coding) \
3945 do { \
3946 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3947 const char *intermediate_char_94 = "()*+"; \
3948 const char *intermediate_char_96 = ",-./"; \
3949 int revision = -1; \
3950 \
3951 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3952 revision = CHARSET_ISO_REVISION (charset); \
3953 \
3954 if (revision >= 0) \
3955 { \
3956 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3957 EMIT_ONE_BYTE ('@' + revision); \
3958 } \
3959 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3960 if (CHARSET_DIMENSION (charset) == 1) \
3961 { \
3962 int b; \
3963 if (! CHARSET_ISO_CHARS_96 (charset)) \
3964 b = intermediate_char_94[reg]; \
3965 else \
3966 b = intermediate_char_96[reg]; \
3967 EMIT_ONE_ASCII_BYTE (b); \
3968 } \
3969 else \
3970 { \
3971 EMIT_ONE_ASCII_BYTE ('$'); \
3972 if (! CHARSET_ISO_CHARS_96 (charset)) \
3973 { \
3974 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3975 || reg != 0 \
3976 || final_char < '@' || final_char > 'B') \
3977 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3978 } \
3979 else \
3980 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3981 } \
3982 EMIT_ONE_ASCII_BYTE (final_char); \
3983 \
3984 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3985 } while (0)
3986
3987
3988 /* The following two macros produce codes (control character or escape
3989 sequence) for ISO2022 single-shift functions (single-shift-2 and
3990 single-shift-3). */
3991
3992 #define ENCODE_SINGLE_SHIFT_2 \
3993 do { \
3994 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3995 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3996 else \
3997 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3998 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3999 } while (0)
4000
4001
4002 #define ENCODE_SINGLE_SHIFT_3 \
4003 do { \
4004 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4005 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4006 else \
4007 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4008 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4009 } while (0)
4010
4011
4012 /* The following four macros produce codes (control character or
4013 escape sequence) for ISO2022 locking-shift functions (shift-in,
4014 shift-out, locking-shift-2, and locking-shift-3). */
4015
4016 #define ENCODE_SHIFT_IN \
4017 do { \
4018 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4019 CODING_ISO_INVOCATION (coding, 0) = 0; \
4020 } while (0)
4021
4022
4023 #define ENCODE_SHIFT_OUT \
4024 do { \
4025 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4026 CODING_ISO_INVOCATION (coding, 0) = 1; \
4027 } while (0)
4028
4029
4030 #define ENCODE_LOCKING_SHIFT_2 \
4031 do { \
4032 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4033 CODING_ISO_INVOCATION (coding, 0) = 2; \
4034 } while (0)
4035
4036
4037 #define ENCODE_LOCKING_SHIFT_3 \
4038 do { \
4039 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4040 CODING_ISO_INVOCATION (coding, 0) = 3; \
4041 } while (0)
4042
4043
4044 /* Produce codes for a DIMENSION1 character whose character set is
4045 CHARSET and whose position-code is C1. Designation and invocation
4046 sequences are also produced in advance if necessary. */
4047
4048 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4049 do { \
4050 int id = CHARSET_ID (charset); \
4051 \
4052 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4053 && id == charset_ascii) \
4054 { \
4055 id = charset_jisx0201_roman; \
4056 charset = CHARSET_FROM_ID (id); \
4057 } \
4058 \
4059 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4060 { \
4061 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4062 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4063 else \
4064 EMIT_ONE_BYTE (c1 | 0x80); \
4065 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4066 break; \
4067 } \
4068 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4069 { \
4070 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4071 break; \
4072 } \
4073 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4074 { \
4075 EMIT_ONE_BYTE (c1 | 0x80); \
4076 break; \
4077 } \
4078 else \
4079 /* Since CHARSET is not yet invoked to any graphic planes, we \
4080 must invoke it, or, at first, designate it to some graphic \
4081 register. Then repeat the loop to actually produce the \
4082 character. */ \
4083 dst = encode_invocation_designation (charset, coding, dst, \
4084 &produced_chars); \
4085 } while (1)
4086
4087
4088 /* Produce codes for a DIMENSION2 character whose character set is
4089 CHARSET and whose position-codes are C1 and C2. Designation and
4090 invocation codes are also produced in advance if necessary. */
4091
4092 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4093 do { \
4094 int id = CHARSET_ID (charset); \
4095 \
4096 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4097 && id == charset_jisx0208) \
4098 { \
4099 id = charset_jisx0208_1978; \
4100 charset = CHARSET_FROM_ID (id); \
4101 } \
4102 \
4103 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4104 { \
4105 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4106 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4107 else \
4108 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4109 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4110 break; \
4111 } \
4112 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4113 { \
4114 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4115 break; \
4116 } \
4117 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4118 { \
4119 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4120 break; \
4121 } \
4122 else \
4123 /* Since CHARSET is not yet invoked to any graphic planes, we \
4124 must invoke it, or, at first, designate it to some graphic \
4125 register. Then repeat the loop to actually produce the \
4126 character. */ \
4127 dst = encode_invocation_designation (charset, coding, dst, \
4128 &produced_chars); \
4129 } while (1)
4130
4131
4132 #define ENCODE_ISO_CHARACTER(charset, c) \
4133 do { \
4134 unsigned code; \
4135 CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code); \
4136 \
4137 if (CHARSET_DIMENSION (charset) == 1) \
4138 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4139 else \
4140 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4141 } while (0)
4142
4143
4144 /* Produce designation and invocation codes at a place pointed by DST
4145 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4146 Return new DST. */
4147
4148 static unsigned char *
4149 encode_invocation_designation (struct charset *charset,
4150 struct coding_system *coding,
4151 unsigned char *dst, ptrdiff_t *p_nchars)
4152 {
4153 bool multibytep = coding->dst_multibyte;
4154 ptrdiff_t produced_chars = *p_nchars;
4155 int reg; /* graphic register number */
4156 int id = CHARSET_ID (charset);
4157
4158 /* At first, check designations. */
4159 for (reg = 0; reg < 4; reg++)
4160 if (id == CODING_ISO_DESIGNATION (coding, reg))
4161 break;
4162
4163 if (reg >= 4)
4164 {
4165 /* CHARSET is not yet designated to any graphic registers. */
4166 /* At first check the requested designation. */
4167 reg = CODING_ISO_REQUEST (coding, id);
4168 if (reg < 0)
4169 /* Since CHARSET requests no special designation, designate it
4170 to graphic register 0. */
4171 reg = 0;
4172
4173 ENCODE_DESIGNATION (charset, reg, coding);
4174 }
4175
4176 if (CODING_ISO_INVOCATION (coding, 0) != reg
4177 && CODING_ISO_INVOCATION (coding, 1) != reg)
4178 {
4179 /* Since the graphic register REG is not invoked to any graphic
4180 planes, invoke it to graphic plane 0. */
4181 switch (reg)
4182 {
4183 case 0: /* graphic register 0 */
4184 ENCODE_SHIFT_IN;
4185 break;
4186
4187 case 1: /* graphic register 1 */
4188 ENCODE_SHIFT_OUT;
4189 break;
4190
4191 case 2: /* graphic register 2 */
4192 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4193 ENCODE_SINGLE_SHIFT_2;
4194 else
4195 ENCODE_LOCKING_SHIFT_2;
4196 break;
4197
4198 case 3: /* graphic register 3 */
4199 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4200 ENCODE_SINGLE_SHIFT_3;
4201 else
4202 ENCODE_LOCKING_SHIFT_3;
4203 break;
4204 }
4205 }
4206
4207 *p_nchars = produced_chars;
4208 return dst;
4209 }
4210
4211
4212 /* Produce codes for designation and invocation to reset the graphic
4213 planes and registers to initial state. */
4214 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4215 do { \
4216 int reg; \
4217 struct charset *charset; \
4218 \
4219 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4220 ENCODE_SHIFT_IN; \
4221 for (reg = 0; reg < 4; reg++) \
4222 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4223 && (CODING_ISO_DESIGNATION (coding, reg) \
4224 != CODING_ISO_INITIAL (coding, reg))) \
4225 { \
4226 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4227 ENCODE_DESIGNATION (charset, reg, coding); \
4228 } \
4229 } while (0)
4230
4231
4232 /* Produce designation sequences of charsets in the line started from
4233 CHARBUF to a place pointed by DST, and return the number of
4234 produced bytes. DST should not directly point a buffer text area
4235 which may be relocated by char_charset call.
4236
4237 If the current block ends before any end-of-line, we may fail to
4238 find all the necessary designations. */
4239
4240 static ptrdiff_t
4241 encode_designation_at_bol (struct coding_system *coding,
4242 int *charbuf, int *charbuf_end,
4243 unsigned char *dst)
4244 {
4245 unsigned char *orig = dst;
4246 struct charset *charset;
4247 /* Table of charsets to be designated to each graphic register. */
4248 int r[4];
4249 int c, found = 0, reg;
4250 ptrdiff_t produced_chars = 0;
4251 bool multibytep = coding->dst_multibyte;
4252 Lisp_Object attrs;
4253 Lisp_Object charset_list;
4254
4255 attrs = CODING_ID_ATTRS (coding->id);
4256 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4257 if (EQ (charset_list, Qiso_2022))
4258 charset_list = Viso_2022_charset_list;
4259
4260 for (reg = 0; reg < 4; reg++)
4261 r[reg] = -1;
4262
4263 while (charbuf < charbuf_end && found < 4)
4264 {
4265 int id;
4266
4267 c = *charbuf++;
4268 if (c == '\n')
4269 break;
4270 charset = char_charset (c, charset_list, NULL);
4271 id = CHARSET_ID (charset);
4272 reg = CODING_ISO_REQUEST (coding, id);
4273 if (reg >= 0 && r[reg] < 0)
4274 {
4275 found++;
4276 r[reg] = id;
4277 }
4278 }
4279
4280 if (found)
4281 {
4282 for (reg = 0; reg < 4; reg++)
4283 if (r[reg] >= 0
4284 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4285 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4286 }
4287
4288 return dst - orig;
4289 }
4290
4291 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4292
4293 static bool
4294 encode_coding_iso_2022 (struct coding_system *coding)
4295 {
4296 bool multibytep = coding->dst_multibyte;
4297 int *charbuf = coding->charbuf;
4298 int *charbuf_end = charbuf + coding->charbuf_used;
4299 unsigned char *dst = coding->destination + coding->produced;
4300 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4301 int safe_room = 16;
4302 bool bol_designation
4303 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4304 && CODING_ISO_BOL (coding));
4305 ptrdiff_t produced_chars = 0;
4306 Lisp_Object attrs, eol_type, charset_list;
4307 bool ascii_compatible;
4308 int c;
4309 int preferred_charset_id = -1;
4310
4311 CODING_GET_INFO (coding, attrs, charset_list);
4312 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4313 if (VECTORP (eol_type))
4314 eol_type = Qunix;
4315
4316 setup_iso_safe_charsets (attrs);
4317 /* Charset list may have been changed. */
4318 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4319 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4320
4321 ascii_compatible
4322 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4323 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4324 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4325
4326 while (charbuf < charbuf_end)
4327 {
4328 ASSURE_DESTINATION (safe_room);
4329
4330 if (bol_designation)
4331 {
4332 /* We have to produce designation sequences if any now. */
4333 unsigned char desig_buf[16];
4334 int nbytes;
4335 ptrdiff_t offset;
4336
4337 charset_map_loaded = 0;
4338 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4339 desig_buf);
4340 if (charset_map_loaded
4341 && (offset = coding_change_destination (coding)))
4342 {
4343 dst += offset;
4344 dst_end += offset;
4345 }
4346 memcpy (dst, desig_buf, nbytes);
4347 dst += nbytes;
4348 /* We are sure that designation sequences are all ASCII bytes. */
4349 produced_chars += nbytes;
4350 bol_designation = 0;
4351 ASSURE_DESTINATION (safe_room);
4352 }
4353
4354 c = *charbuf++;
4355
4356 if (c < 0)
4357 {
4358 /* Handle an annotation. */
4359 switch (*charbuf)
4360 {
4361 case CODING_ANNOTATE_COMPOSITION_MASK:
4362 /* Not yet implemented. */
4363 break;
4364 case CODING_ANNOTATE_CHARSET_MASK:
4365 preferred_charset_id = charbuf[2];
4366 if (preferred_charset_id >= 0
4367 && NILP (Fmemq (make_number (preferred_charset_id),
4368 charset_list)))
4369 preferred_charset_id = -1;
4370 break;
4371 default:
4372 emacs_abort ();
4373 }
4374 charbuf += -c - 1;
4375 continue;
4376 }
4377
4378 /* Now encode the character C. */
4379 if (c < 0x20 || c == 0x7F)
4380 {
4381 if (c == '\n'
4382 || (c == '\r' && EQ (eol_type, Qmac)))
4383 {
4384 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4385 ENCODE_RESET_PLANE_AND_REGISTER ();
4386 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4387 {
4388 int i;
4389
4390 for (i = 0; i < 4; i++)
4391 CODING_ISO_DESIGNATION (coding, i)
4392 = CODING_ISO_INITIAL (coding, i);
4393 }
4394 bol_designation = ((CODING_ISO_FLAGS (coding)
4395 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4396 != 0);
4397 }
4398 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4399 ENCODE_RESET_PLANE_AND_REGISTER ();
4400 EMIT_ONE_ASCII_BYTE (c);
4401 }
4402 else if (ASCII_CHAR_P (c))
4403 {
4404 if (ascii_compatible)
4405 EMIT_ONE_ASCII_BYTE (c);
4406 else
4407 {
4408 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4409 ENCODE_ISO_CHARACTER (charset, c);
4410 }
4411 }
4412 else if (CHAR_BYTE8_P (c))
4413 {
4414 c = CHAR_TO_BYTE8 (c);
4415 EMIT_ONE_BYTE (c);
4416 }
4417 else
4418 {
4419 struct charset *charset;
4420
4421 if (preferred_charset_id >= 0)
4422 {
4423 bool result;
4424
4425 charset = CHARSET_FROM_ID (preferred_charset_id);
4426 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4427 if (! result)
4428 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4429 NULL, charset);
4430 }
4431 else
4432 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4433 NULL, charset);
4434 if (!charset)
4435 {
4436 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4437 {
4438 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4439 charset = CHARSET_FROM_ID (charset_ascii);
4440 }
4441 else
4442 {
4443 c = coding->default_char;
4444 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4445 charset_list, NULL, charset);
4446 }
4447 }
4448 ENCODE_ISO_CHARACTER (charset, c);
4449 }
4450 }
4451
4452 if (coding->mode & CODING_MODE_LAST_BLOCK
4453 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4454 {
4455 ASSURE_DESTINATION (safe_room);
4456 ENCODE_RESET_PLANE_AND_REGISTER ();
4457 }
4458 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4459 CODING_ISO_BOL (coding) = bol_designation;
4460 coding->produced_char += produced_chars;
4461 coding->produced = dst - coding->destination;
4462 return 0;
4463 }
4464
4465 \f
4466 /*** 8,9. SJIS and BIG5 handlers ***/
4467
4468 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4469 quite widely. So, for the moment, Emacs supports them in the bare
4470 C code. But, in the future, they may be supported only by CCL. */
4471
4472 /* SJIS is a coding system encoding three character sets: ASCII, right
4473 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4474 as is. A character of charset katakana-jisx0201 is encoded by
4475 "position-code + 0x80". A character of charset japanese-jisx0208
4476 is encoded in 2-byte but two position-codes are divided and shifted
4477 so that it fit in the range below.
4478
4479 --- CODE RANGE of SJIS ---
4480 (character set) (range)
4481 ASCII 0x00 .. 0x7F
4482 KATAKANA-JISX0201 0xA0 .. 0xDF
4483 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4484 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4485 -------------------------------
4486
4487 */
4488
4489 /* BIG5 is a coding system encoding two character sets: ASCII and
4490 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4491 character set and is encoded in two-byte.
4492
4493 --- CODE RANGE of BIG5 ---
4494 (character set) (range)
4495 ASCII 0x00 .. 0x7F
4496 Big5 (1st byte) 0xA1 .. 0xFE
4497 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4498 --------------------------
4499
4500 */
4501
4502 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4503 Return true if a text is encoded in SJIS. */
4504
4505 static bool
4506 detect_coding_sjis (struct coding_system *coding,
4507 struct coding_detection_info *detect_info)
4508 {
4509 const unsigned char *src = coding->source, *src_base;
4510 const unsigned char *src_end = coding->source + coding->src_bytes;
4511 bool multibytep = coding->src_multibyte;
4512 ptrdiff_t consumed_chars = 0;
4513 int found = 0;
4514 int c;
4515 Lisp_Object attrs, charset_list;
4516 int max_first_byte_of_2_byte_code;
4517
4518 CODING_GET_INFO (coding, attrs, charset_list);
4519 max_first_byte_of_2_byte_code
4520 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4521
4522 detect_info->checked |= CATEGORY_MASK_SJIS;
4523 /* A coding system of this category is always ASCII compatible. */
4524 src += coding->head_ascii;
4525
4526 while (1)
4527 {
4528 src_base = src;
4529 ONE_MORE_BYTE (c);
4530 if (c < 0x80)
4531 continue;
4532 if ((c >= 0x81 && c <= 0x9F)
4533 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4534 {
4535 ONE_MORE_BYTE (c);
4536 if (c < 0x40 || c == 0x7F || c > 0xFC)
4537 break;
4538 found = CATEGORY_MASK_SJIS;
4539 }
4540 else if (c >= 0xA0 && c < 0xE0)
4541 found = CATEGORY_MASK_SJIS;
4542 else
4543 break;
4544 }
4545 detect_info->rejected |= CATEGORY_MASK_SJIS;
4546 return 0;
4547
4548 no_more_source:
4549 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4550 {
4551 detect_info->rejected |= CATEGORY_MASK_SJIS;
4552 return 0;
4553 }
4554 detect_info->found |= found;
4555 return 1;
4556 }
4557
4558 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4559 Return true if a text is encoded in BIG5. */
4560
4561 static bool
4562 detect_coding_big5 (struct coding_system *coding,
4563 struct coding_detection_info *detect_info)
4564 {
4565 const unsigned char *src = coding->source, *src_base;
4566 const unsigned char *src_end = coding->source + coding->src_bytes;
4567 bool multibytep = coding->src_multibyte;
4568 ptrdiff_t consumed_chars = 0;
4569 int found = 0;
4570 int c;
4571
4572 detect_info->checked |= CATEGORY_MASK_BIG5;
4573 /* A coding system of this category is always ASCII compatible. */
4574 src += coding->head_ascii;
4575
4576 while (1)
4577 {
4578 src_base = src;
4579 ONE_MORE_BYTE (c);
4580 if (c < 0x80)
4581 continue;
4582 if (c >= 0xA1)
4583 {
4584 ONE_MORE_BYTE (c);
4585 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4586 return 0;
4587 found = CATEGORY_MASK_BIG5;
4588 }
4589 else
4590 break;
4591 }
4592 detect_info->rejected |= CATEGORY_MASK_BIG5;
4593 return 0;
4594
4595 no_more_source:
4596 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4597 {
4598 detect_info->rejected |= CATEGORY_MASK_BIG5;
4599 return 0;
4600 }
4601 detect_info->found |= found;
4602 return 1;
4603 }
4604
4605 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4606
4607 static void
4608 decode_coding_sjis (struct coding_system *coding)
4609 {
4610 const unsigned char *src = coding->source + coding->consumed;
4611 const unsigned char *src_end = coding->source + coding->src_bytes;
4612 const unsigned char *src_base;
4613 int *charbuf = coding->charbuf + coding->charbuf_used;
4614 /* We may produce one charset annotation in one loop and one more at
4615 the end. */
4616 int *charbuf_end
4617 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4618 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4619 bool multibytep = coding->src_multibyte;
4620 struct charset *charset_roman, *charset_kanji, *charset_kana;
4621 struct charset *charset_kanji2;
4622 Lisp_Object attrs, charset_list, val;
4623 ptrdiff_t char_offset = coding->produced_char;
4624 ptrdiff_t last_offset = char_offset;
4625 int last_id = charset_ascii;
4626 bool eol_dos
4627 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4628 int byte_after_cr = -1;
4629
4630 CODING_GET_INFO (coding, attrs, charset_list);
4631
4632 val = charset_list;
4633 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4634 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4635 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4636 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4637
4638 while (1)
4639 {
4640 int c, c1;
4641 struct charset *charset;
4642
4643 src_base = src;
4644 consumed_chars_base = consumed_chars;
4645
4646 if (charbuf >= charbuf_end)
4647 {
4648 if (byte_after_cr >= 0)
4649 src_base--;
4650 break;
4651 }
4652
4653 if (byte_after_cr >= 0)
4654 c = byte_after_cr, byte_after_cr = -1;
4655 else
4656 ONE_MORE_BYTE (c);
4657 if (c < 0)
4658 goto invalid_code;
4659 if (c < 0x80)
4660 {
4661 if (eol_dos && c == '\r')
4662 ONE_MORE_BYTE (byte_after_cr);
4663 charset = charset_roman;
4664 }
4665 else if (c == 0x80 || c == 0xA0)
4666 goto invalid_code;
4667 else if (c >= 0xA1 && c <= 0xDF)
4668 {
4669 /* SJIS -> JISX0201-Kana */
4670 c &= 0x7F;
4671 charset = charset_kana;
4672 }
4673 else if (c <= 0xEF)
4674 {
4675 /* SJIS -> JISX0208 */
4676 ONE_MORE_BYTE (c1);
4677 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4678 goto invalid_code;
4679 c = (c << 8) | c1;
4680 SJIS_TO_JIS (c);
4681 charset = charset_kanji;
4682 }
4683 else if (c <= 0xFC && charset_kanji2)
4684 {
4685 /* SJIS -> JISX0213-2 */
4686 ONE_MORE_BYTE (c1);
4687 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4688 goto invalid_code;
4689 c = (c << 8) | c1;
4690 SJIS_TO_JIS2 (c);
4691 charset = charset_kanji2;
4692 }
4693 else
4694 goto invalid_code;
4695 if (charset->id != charset_ascii
4696 && last_id != charset->id)
4697 {
4698 if (last_id != charset_ascii)
4699 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4700 last_id = charset->id;
4701 last_offset = char_offset;
4702 }
4703 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4704 *charbuf++ = c;
4705 char_offset++;
4706 continue;
4707
4708 invalid_code:
4709 src = src_base;
4710 consumed_chars = consumed_chars_base;
4711 ONE_MORE_BYTE (c);
4712 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4713 char_offset++;
4714 coding->errors++;
4715 }
4716
4717 no_more_source:
4718 if (last_id != charset_ascii)
4719 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4720 coding->consumed_char += consumed_chars_base;
4721 coding->consumed = src_base - coding->source;
4722 coding->charbuf_used = charbuf - coding->charbuf;
4723 }
4724
4725 static void
4726 decode_coding_big5 (struct coding_system *coding)
4727 {
4728 const unsigned char *src = coding->source + coding->consumed;
4729 const unsigned char *src_end = coding->source + coding->src_bytes;
4730 const unsigned char *src_base;
4731 int *charbuf = coding->charbuf + coding->charbuf_used;
4732 /* We may produce one charset annotation in one loop and one more at
4733 the end. */
4734 int *charbuf_end
4735 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4736 ptrdiff_t consumed_chars = 0, consumed_chars_base;
4737 bool multibytep = coding->src_multibyte;
4738 struct charset *charset_roman, *charset_big5;
4739 Lisp_Object attrs, charset_list, val;
4740 ptrdiff_t char_offset = coding->produced_char;
4741 ptrdiff_t last_offset = char_offset;
4742 int last_id = charset_ascii;
4743 bool eol_dos
4744 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4745 int byte_after_cr = -1;
4746
4747 CODING_GET_INFO (coding, attrs, charset_list);
4748 val = charset_list;
4749 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4750 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4751
4752 while (1)
4753 {
4754 int c, c1;
4755 struct charset *charset;
4756
4757 src_base = src;
4758 consumed_chars_base = consumed_chars;
4759
4760 if (charbuf >= charbuf_end)
4761 {
4762 if (byte_after_cr >= 0)
4763 src_base--;
4764 break;
4765 }
4766
4767 if (byte_after_cr >= 0)
4768 c = byte_after_cr, byte_after_cr = -1;
4769 else
4770 ONE_MORE_BYTE (c);
4771
4772 if (c < 0)
4773 goto invalid_code;
4774 if (c < 0x80)
4775 {
4776 if (eol_dos && c == '\r')
4777 ONE_MORE_BYTE (byte_after_cr);
4778 charset = charset_roman;
4779 }
4780 else
4781 {
4782 /* BIG5 -> Big5 */
4783 if (c < 0xA1 || c > 0xFE)
4784 goto invalid_code;
4785 ONE_MORE_BYTE (c1);
4786 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4787 goto invalid_code;
4788 c = c << 8 | c1;
4789 charset = charset_big5;
4790 }
4791 if (charset->id != charset_ascii
4792 && last_id != charset->id)
4793 {
4794 if (last_id != charset_ascii)
4795 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4796 last_id = charset->id;
4797 last_offset = char_offset;
4798 }
4799 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4800 *charbuf++ = c;
4801 char_offset++;
4802 continue;
4803
4804 invalid_code:
4805 src = src_base;
4806 consumed_chars = consumed_chars_base;
4807 ONE_MORE_BYTE (c);
4808 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4809 char_offset++;
4810 coding->errors++;
4811 }
4812
4813 no_more_source:
4814 if (last_id != charset_ascii)
4815 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4816 coding->consumed_char += consumed_chars_base;
4817 coding->consumed = src_base - coding->source;
4818 coding->charbuf_used = charbuf - coding->charbuf;
4819 }
4820
4821 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4822 This function can encode charsets `ascii', `katakana-jisx0201',
4823 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4824 are sure that all these charsets are registered as official charset
4825 (i.e. do not have extended leading-codes). Characters of other
4826 charsets are produced without any encoding. */
4827
4828 static bool
4829 encode_coding_sjis (struct coding_system *coding)
4830 {
4831 bool multibytep = coding->dst_multibyte;
4832 int *charbuf = coding->charbuf;
4833 int *charbuf_end = charbuf + coding->charbuf_used;
4834 unsigned char *dst = coding->destination + coding->produced;
4835 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4836 int safe_room = 4;
4837 ptrdiff_t produced_chars = 0;
4838 Lisp_Object attrs, charset_list, val;
4839 bool ascii_compatible;
4840 struct charset *charset_kanji, *charset_kana;
4841 struct charset *charset_kanji2;
4842 int c;
4843
4844 CODING_GET_INFO (coding, attrs, charset_list);
4845 val = XCDR (charset_list);
4846 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4847 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4848 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4849
4850 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4851
4852 while (charbuf < charbuf_end)
4853 {
4854 ASSURE_DESTINATION (safe_room);
4855 c = *charbuf++;
4856 /* Now encode the character C. */
4857 if (ASCII_CHAR_P (c) && ascii_compatible)
4858 EMIT_ONE_ASCII_BYTE (c);
4859 else if (CHAR_BYTE8_P (c))
4860 {
4861 c = CHAR_TO_BYTE8 (c);
4862 EMIT_ONE_BYTE (c);
4863 }
4864 else
4865 {
4866 unsigned code;
4867 struct charset *charset;
4868 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4869 &code, charset);
4870
4871 if (!charset)
4872 {
4873 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4874 {
4875 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4876 charset = CHARSET_FROM_ID (charset_ascii);
4877 }
4878 else
4879 {
4880 c = coding->default_char;
4881 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4882 charset_list, &code, charset);
4883 }
4884 }
4885 if (code == CHARSET_INVALID_CODE (charset))
4886 emacs_abort ();
4887 if (charset == charset_kanji)
4888 {
4889 int c1, c2;
4890 JIS_TO_SJIS (code);
4891 c1 = code >> 8, c2 = code & 0xFF;
4892 EMIT_TWO_BYTES (c1, c2);
4893 }
4894 else if (charset == charset_kana)
4895 EMIT_ONE_BYTE (code | 0x80);
4896 else if (charset_kanji2 && charset == charset_kanji2)
4897 {
4898 int c1, c2;
4899
4900 c1 = code >> 8;
4901 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4902 || c1 == 0x28
4903 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4904 {
4905 JIS_TO_SJIS2 (code);
4906 c1 = code >> 8, c2 = code & 0xFF;
4907 EMIT_TWO_BYTES (c1, c2);
4908 }
4909 else
4910 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4911 }
4912 else
4913 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4914 }
4915 }
4916 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4917 coding->produced_char += produced_chars;
4918 coding->produced = dst - coding->destination;
4919 return 0;
4920 }
4921
4922 static bool
4923 encode_coding_big5 (struct coding_system *coding)
4924 {
4925 bool multibytep = coding->dst_multibyte;
4926 int *charbuf = coding->charbuf;
4927 int *charbuf_end = charbuf + coding->charbuf_used;
4928 unsigned char *dst = coding->destination + coding->produced;
4929 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4930 int safe_room = 4;
4931 ptrdiff_t produced_chars = 0;
4932 Lisp_Object attrs, charset_list, val;
4933 bool ascii_compatible;
4934 struct charset *charset_big5;
4935 int c;
4936
4937 CODING_GET_INFO (coding, attrs, charset_list);
4938 val = XCDR (charset_list);
4939 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4940 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4941
4942 while (charbuf < charbuf_end)
4943 {
4944 ASSURE_DESTINATION (safe_room);
4945 c = *charbuf++;
4946 /* Now encode the character C. */
4947 if (ASCII_CHAR_P (c) && ascii_compatible)
4948 EMIT_ONE_ASCII_BYTE (c);
4949 else if (CHAR_BYTE8_P (c))
4950 {
4951 c = CHAR_TO_BYTE8 (c);
4952 EMIT_ONE_BYTE (c);
4953 }
4954 else
4955 {
4956 unsigned code;
4957 struct charset *charset;
4958 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4959 &code, charset);
4960
4961 if (! charset)
4962 {
4963 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4964 {
4965 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4966 charset = CHARSET_FROM_ID (charset_ascii);
4967 }
4968 else
4969 {
4970 c = coding->default_char;
4971 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4972 charset_list, &code, charset);
4973 }
4974 }
4975 if (code == CHARSET_INVALID_CODE (charset))
4976 emacs_abort ();
4977 if (charset == charset_big5)
4978 {
4979 int c1, c2;
4980
4981 c1 = code >> 8, c2 = code & 0xFF;
4982 EMIT_TWO_BYTES (c1, c2);
4983 }
4984 else
4985 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4986 }
4987 }
4988 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4989 coding->produced_char += produced_chars;
4990 coding->produced = dst - coding->destination;
4991 return 0;
4992 }
4993
4994 \f
4995 /*** 10. CCL handlers ***/
4996
4997 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4998 Return true if a text is encoded in a coding system of which
4999 encoder/decoder are written in CCL program. */
5000
5001 static bool
5002 detect_coding_ccl (struct coding_system *coding,
5003 struct coding_detection_info *detect_info)
5004 {
5005 const unsigned char *src = coding->source, *src_base;
5006 const unsigned char *src_end = coding->source + coding->src_bytes;
5007 bool multibytep = coding->src_multibyte;
5008 ptrdiff_t consumed_chars = 0;
5009 int found = 0;
5010 unsigned char *valids;
5011 ptrdiff_t head_ascii = coding->head_ascii;
5012 Lisp_Object attrs;
5013
5014 detect_info->checked |= CATEGORY_MASK_CCL;
5015
5016 coding = &coding_categories[coding_category_ccl];
5017 valids = CODING_CCL_VALIDS (coding);
5018 attrs = CODING_ID_ATTRS (coding->id);
5019 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5020 src += head_ascii;
5021
5022 while (1)
5023 {
5024 int c;
5025
5026 src_base = src;
5027 ONE_MORE_BYTE (c);
5028 if (c < 0 || ! valids[c])
5029 break;
5030 if ((valids[c] > 1))
5031 found = CATEGORY_MASK_CCL;
5032 }
5033 detect_info->rejected |= CATEGORY_MASK_CCL;
5034 return 0;
5035
5036 no_more_source:
5037 detect_info->found |= found;
5038 return 1;
5039 }
5040
5041 static void
5042 decode_coding_ccl (struct coding_system *coding)
5043 {
5044 const unsigned char *src = coding->source + coding->consumed;
5045 const unsigned char *src_end = coding->source + coding->src_bytes;
5046 int *charbuf = coding->charbuf + coding->charbuf_used;
5047 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5048 ptrdiff_t consumed_chars = 0;
5049 bool multibytep = coding->src_multibyte;
5050 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5051 int source_charbuf[1024];
5052 int source_byteidx[1025];
5053 Lisp_Object attrs, charset_list;
5054
5055 CODING_GET_INFO (coding, attrs, charset_list);
5056
5057 while (1)
5058 {
5059 const unsigned char *p = src;
5060 ptrdiff_t offset;
5061 int i = 0;
5062
5063 if (multibytep)
5064 {
5065 while (i < 1024 && p < src_end)
5066 {
5067 source_byteidx[i] = p - src;
5068 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5069 }
5070 source_byteidx[i] = p - src;
5071 }
5072 else
5073 while (i < 1024 && p < src_end)
5074 source_charbuf[i++] = *p++;
5075
5076 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5077 ccl->last_block = 1;
5078 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5079 charset_map_loaded = 0;
5080 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5081 charset_list);
5082 if (charset_map_loaded
5083 && (offset = coding_change_source (coding)))
5084 {
5085 p += offset;
5086 src += offset;
5087 src_end += offset;
5088 }
5089 charbuf += ccl->produced;
5090 if (multibytep)
5091 src += source_byteidx[ccl->consumed];
5092 else
5093 src += ccl->consumed;
5094 consumed_chars += ccl->consumed;
5095 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5096 break;
5097 }
5098
5099 switch (ccl->status)
5100 {
5101 case CCL_STAT_SUSPEND_BY_SRC:
5102 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5103 break;
5104 case CCL_STAT_SUSPEND_BY_DST:
5105 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5106 break;
5107 case CCL_STAT_QUIT:
5108 case CCL_STAT_INVALID_CMD:
5109 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5110 break;
5111 default:
5112 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5113 break;
5114 }
5115 coding->consumed_char += consumed_chars;
5116 coding->consumed = src - coding->source;
5117 coding->charbuf_used = charbuf - coding->charbuf;
5118 }
5119
5120 static bool
5121 encode_coding_ccl (struct coding_system *coding)
5122 {
5123 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5124 bool multibytep = coding->dst_multibyte;
5125 int *charbuf = coding->charbuf;
5126 int *charbuf_end = charbuf + coding->charbuf_used;
5127 unsigned char *dst = coding->destination + coding->produced;
5128 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5129 int destination_charbuf[1024];
5130 ptrdiff_t produced_chars = 0;
5131 int i;
5132 Lisp_Object attrs, charset_list;
5133
5134 CODING_GET_INFO (coding, attrs, charset_list);
5135 if (coding->consumed_char == coding->src_chars
5136 && coding->mode & CODING_MODE_LAST_BLOCK)
5137 ccl->last_block = 1;
5138
5139 do
5140 {
5141 ptrdiff_t offset;
5142
5143 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5144 charset_map_loaded = 0;
5145 ccl_driver (ccl, charbuf, destination_charbuf,
5146 charbuf_end - charbuf, 1024, charset_list);
5147 if (charset_map_loaded
5148 && (offset = coding_change_destination (coding)))
5149 dst += offset;
5150 if (multibytep)
5151 {
5152 ASSURE_DESTINATION (ccl->produced * 2);
5153 for (i = 0; i < ccl->produced; i++)
5154 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5155 }
5156 else
5157 {
5158 ASSURE_DESTINATION (ccl->produced);
5159 for (i = 0; i < ccl->produced; i++)
5160 *dst++ = destination_charbuf[i] & 0xFF;
5161 produced_chars += ccl->produced;
5162 }
5163 charbuf += ccl->consumed;
5164 if (ccl->status == CCL_STAT_QUIT
5165 || ccl->status == CCL_STAT_INVALID_CMD)
5166 break;
5167 }
5168 while (charbuf < charbuf_end);
5169
5170 switch (ccl->status)
5171 {
5172 case CCL_STAT_SUSPEND_BY_SRC:
5173 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5174 break;
5175 case CCL_STAT_SUSPEND_BY_DST:
5176 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5177 break;
5178 case CCL_STAT_QUIT:
5179 case CCL_STAT_INVALID_CMD:
5180 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5181 break;
5182 default:
5183 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5184 break;
5185 }
5186
5187 coding->produced_char += produced_chars;
5188 coding->produced = dst - coding->destination;
5189 return 0;
5190 }
5191
5192 \f
5193 /*** 10, 11. no-conversion handlers ***/
5194
5195 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5196
5197 static void
5198 decode_coding_raw_text (struct coding_system *coding)
5199 {
5200 bool eol_dos
5201 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5202
5203 coding->chars_at_source = 1;
5204 coding->consumed_char = coding->src_chars;
5205 coding->consumed = coding->src_bytes;
5206 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5207 {
5208 coding->consumed_char--;
5209 coding->consumed--;
5210 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5211 }
5212 else
5213 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5214 }
5215
5216 static bool
5217 encode_coding_raw_text (struct coding_system *coding)
5218 {
5219 bool multibytep = coding->dst_multibyte;
5220 int *charbuf = coding->charbuf;
5221 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5222 unsigned char *dst = coding->destination + coding->produced;
5223 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5224 ptrdiff_t produced_chars = 0;
5225 int c;
5226
5227 if (multibytep)
5228 {
5229 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5230
5231 if (coding->src_multibyte)
5232 while (charbuf < charbuf_end)
5233 {
5234 ASSURE_DESTINATION (safe_room);
5235 c = *charbuf++;
5236 if (ASCII_CHAR_P (c))
5237 EMIT_ONE_ASCII_BYTE (c);
5238 else if (CHAR_BYTE8_P (c))
5239 {
5240 c = CHAR_TO_BYTE8 (c);
5241 EMIT_ONE_BYTE (c);
5242 }
5243 else
5244 {
5245 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5246
5247 CHAR_STRING_ADVANCE (c, p1);
5248 do
5249 {
5250 EMIT_ONE_BYTE (*p0);
5251 p0++;
5252 }
5253 while (p0 < p1);
5254 }
5255 }
5256 else
5257 while (charbuf < charbuf_end)
5258 {
5259 ASSURE_DESTINATION (safe_room);
5260 c = *charbuf++;
5261 EMIT_ONE_BYTE (c);
5262 }
5263 }
5264 else
5265 {
5266 if (coding->src_multibyte)
5267 {
5268 int safe_room = MAX_MULTIBYTE_LENGTH;
5269
5270 while (charbuf < charbuf_end)
5271 {
5272 ASSURE_DESTINATION (safe_room);
5273 c = *charbuf++;
5274 if (ASCII_CHAR_P (c))
5275 *dst++ = c;
5276 else if (CHAR_BYTE8_P (c))
5277 *dst++ = CHAR_TO_BYTE8 (c);
5278 else
5279 CHAR_STRING_ADVANCE (c, dst);
5280 }
5281 }
5282 else
5283 {
5284 ASSURE_DESTINATION (charbuf_end - charbuf);
5285 while (charbuf < charbuf_end && dst < dst_end)
5286 *dst++ = *charbuf++;
5287 }
5288 produced_chars = dst - (coding->destination + coding->produced);
5289 }
5290 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5291 coding->produced_char += produced_chars;
5292 coding->produced = dst - coding->destination;
5293 return 0;
5294 }
5295
5296 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5297 Return true if a text is encoded in a charset-based coding system. */
5298
5299 static bool
5300 detect_coding_charset (struct coding_system *coding,
5301 struct coding_detection_info *detect_info)
5302 {
5303 const unsigned char *src = coding->source, *src_base;
5304 const unsigned char *src_end = coding->source + coding->src_bytes;
5305 bool multibytep = coding->src_multibyte;
5306 ptrdiff_t consumed_chars = 0;
5307 Lisp_Object attrs, valids, name;
5308 int found = 0;
5309 ptrdiff_t head_ascii = coding->head_ascii;
5310 bool check_latin_extra = 0;
5311
5312 detect_info->checked |= CATEGORY_MASK_CHARSET;
5313
5314 coding = &coding_categories[coding_category_charset];
5315 attrs = CODING_ID_ATTRS (coding->id);
5316 valids = AREF (attrs, coding_attr_charset_valids);
5317 name = CODING_ID_NAME (coding->id);
5318 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5319 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5320 || strncmp (SSDATA (SYMBOL_NAME (name)),
5321 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5322 check_latin_extra = 1;
5323
5324 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5325 src += head_ascii;
5326
5327 while (1)
5328 {
5329 int c;
5330 Lisp_Object val;
5331 struct charset *charset;
5332 int dim, idx;
5333
5334 src_base = src;
5335 ONE_MORE_BYTE (c);
5336 if (c < 0)
5337 continue;
5338 val = AREF (valids, c);
5339 if (NILP (val))
5340 break;
5341 if (c >= 0x80)
5342 {
5343 if (c < 0xA0
5344 && check_latin_extra
5345 && (!VECTORP (Vlatin_extra_code_table)
5346 || NILP (AREF (Vlatin_extra_code_table, c))))
5347 break;
5348 found = CATEGORY_MASK_CHARSET;
5349 }
5350 if (INTEGERP (val))
5351 {
5352 charset = CHARSET_FROM_ID (XFASTINT (val));
5353 dim = CHARSET_DIMENSION (charset);
5354 for (idx = 1; idx < dim; idx++)
5355 {
5356 if (src == src_end)
5357 goto too_short;
5358 ONE_MORE_BYTE (c);
5359 if (c < charset->code_space[(dim - 1 - idx) * 4]
5360 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5361 break;
5362 }
5363 if (idx < dim)
5364 break;
5365 }
5366 else
5367 {
5368 idx = 1;
5369 for (; CONSP (val); val = XCDR (val))
5370 {
5371 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5372 dim = CHARSET_DIMENSION (charset);
5373 while (idx < dim)
5374 {
5375 if (src == src_end)
5376 goto too_short;
5377 ONE_MORE_BYTE (c);
5378 if (c < charset->code_space[(dim - 1 - idx) * 4]
5379 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5380 break;
5381 idx++;
5382 }
5383 if (idx == dim)
5384 {
5385 val = Qnil;
5386 break;
5387 }
5388 }
5389 if (CONSP (val))
5390 break;
5391 }
5392 }
5393 too_short:
5394 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5395 return 0;
5396
5397 no_more_source:
5398 detect_info->found |= found;
5399 return 1;
5400 }
5401
5402 static void
5403 decode_coding_charset (struct coding_system *coding)
5404 {
5405 const unsigned char *src = coding->source + coding->consumed;
5406 const unsigned char *src_end = coding->source + coding->src_bytes;
5407 const unsigned char *src_base;
5408 int *charbuf = coding->charbuf + coding->charbuf_used;
5409 /* We may produce one charset annotation in one loop and one more at
5410 the end. */
5411 int *charbuf_end
5412 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5413 ptrdiff_t consumed_chars = 0, consumed_chars_base;
5414 bool multibytep = coding->src_multibyte;
5415 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5416 Lisp_Object valids;
5417 ptrdiff_t char_offset = coding->produced_char;
5418 ptrdiff_t last_offset = char_offset;
5419 int last_id = charset_ascii;
5420 bool eol_dos
5421 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5422 int byte_after_cr = -1;
5423
5424 valids = AREF (attrs, coding_attr_charset_valids);
5425
5426 while (1)
5427 {
5428 int c;
5429 Lisp_Object val;
5430 struct charset *charset;
5431 int dim;
5432 int len = 1;
5433 unsigned code;
5434
5435 src_base = src;
5436 consumed_chars_base = consumed_chars;
5437
5438 if (charbuf >= charbuf_end)
5439 {
5440 if (byte_after_cr >= 0)
5441 src_base--;
5442 break;
5443 }
5444
5445 if (byte_after_cr >= 0)
5446 {
5447 c = byte_after_cr;
5448 byte_after_cr = -1;
5449 }
5450 else
5451 {
5452 ONE_MORE_BYTE (c);
5453 if (eol_dos && c == '\r')
5454 ONE_MORE_BYTE (byte_after_cr);
5455 }
5456 if (c < 0)
5457 goto invalid_code;
5458 code = c;
5459
5460 val = AREF (valids, c);
5461 if (! INTEGERP (val) && ! CONSP (val))
5462 goto invalid_code;
5463 if (INTEGERP (val))
5464 {
5465 charset = CHARSET_FROM_ID (XFASTINT (val));
5466 dim = CHARSET_DIMENSION (charset);
5467 while (len < dim)
5468 {
5469 ONE_MORE_BYTE (c);
5470 code = (code << 8) | c;
5471 len++;
5472 }
5473 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5474 charset, code, c);
5475 }
5476 else
5477 {
5478 /* VAL is a list of charset IDs. It is assured that the
5479 list is sorted by charset dimensions (smaller one
5480 comes first). */
5481 while (CONSP (val))
5482 {
5483 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5484 dim = CHARSET_DIMENSION (charset);
5485 while (len < dim)
5486 {
5487 ONE_MORE_BYTE (c);
5488 code = (code << 8) | c;
5489 len++;
5490 }
5491 CODING_DECODE_CHAR (coding, src, src_base,
5492 src_end, charset, code, c);
5493 if (c >= 0)
5494 break;
5495 val = XCDR (val);
5496 }
5497 }
5498 if (c < 0)
5499 goto invalid_code;
5500 if (charset->id != charset_ascii
5501 && last_id != charset->id)
5502 {
5503 if (last_id != charset_ascii)
5504 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5505 last_id = charset->id;
5506 last_offset = char_offset;
5507 }
5508
5509 *charbuf++ = c;
5510 char_offset++;
5511 continue;
5512
5513 invalid_code:
5514 src = src_base;
5515 consumed_chars = consumed_chars_base;
5516 ONE_MORE_BYTE (c);
5517 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5518 char_offset++;
5519 coding->errors++;
5520 }
5521
5522 no_more_source:
5523 if (last_id != charset_ascii)
5524 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5525 coding->consumed_char += consumed_chars_base;
5526 coding->consumed = src_base - coding->source;
5527 coding->charbuf_used = charbuf - coding->charbuf;
5528 }
5529
5530 static bool
5531 encode_coding_charset (struct coding_system *coding)
5532 {
5533 bool multibytep = coding->dst_multibyte;
5534 int *charbuf = coding->charbuf;
5535 int *charbuf_end = charbuf + coding->charbuf_used;
5536 unsigned char *dst = coding->destination + coding->produced;
5537 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5538 int safe_room = MAX_MULTIBYTE_LENGTH;
5539 ptrdiff_t produced_chars = 0;
5540 Lisp_Object attrs, charset_list;
5541 bool ascii_compatible;
5542 int c;
5543
5544 CODING_GET_INFO (coding, attrs, charset_list);
5545 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5546
5547 while (charbuf < charbuf_end)
5548 {
5549 struct charset *charset;
5550 unsigned code;
5551
5552 ASSURE_DESTINATION (safe_room);
5553 c = *charbuf++;
5554 if (ascii_compatible && ASCII_CHAR_P (c))
5555 EMIT_ONE_ASCII_BYTE (c);
5556 else if (CHAR_BYTE8_P (c))
5557 {
5558 c = CHAR_TO_BYTE8 (c);
5559 EMIT_ONE_BYTE (c);
5560 }
5561 else
5562 {
5563 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5564 &code, charset);
5565
5566 if (charset)
5567 {
5568 if (CHARSET_DIMENSION (charset) == 1)
5569 EMIT_ONE_BYTE (code);
5570 else if (CHARSET_DIMENSION (charset) == 2)
5571 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5572 else if (CHARSET_DIMENSION (charset) == 3)
5573 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5574 else
5575 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5576 (code >> 8) & 0xFF, code & 0xFF);
5577 }
5578 else
5579 {
5580 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5581 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5582 else
5583 c = coding->default_char;
5584 EMIT_ONE_BYTE (c);
5585 }
5586 }
5587 }
5588
5589 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5590 coding->produced_char += produced_chars;
5591 coding->produced = dst - coding->destination;
5592 return 0;
5593 }
5594
5595 \f
5596 /*** 7. C library functions ***/
5597
5598 /* Setup coding context CODING from information about CODING_SYSTEM.
5599 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5600 CODING_SYSTEM is invalid, signal an error. */
5601
5602 void
5603 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5604 {
5605 Lisp_Object attrs;
5606 Lisp_Object eol_type;
5607 Lisp_Object coding_type;
5608 Lisp_Object val;
5609
5610 if (NILP (coding_system))
5611 coding_system = Qundecided;
5612
5613 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5614
5615 attrs = CODING_ID_ATTRS (coding->id);
5616 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5617
5618 coding->mode = 0;
5619 coding->head_ascii = -1;
5620 if (VECTORP (eol_type))
5621 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5622 | CODING_REQUIRE_DETECTION_MASK);
5623 else if (! EQ (eol_type, Qunix))
5624 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5625 | CODING_REQUIRE_ENCODING_MASK);
5626 else
5627 coding->common_flags = 0;
5628 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5629 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5630 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5631 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5632 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5633 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5634
5635 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5636 coding->max_charset_id = SCHARS (val) - 1;
5637 coding->safe_charsets = SDATA (val);
5638 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5639 coding->carryover_bytes = 0;
5640
5641 coding_type = CODING_ATTR_TYPE (attrs);
5642 if (EQ (coding_type, Qundecided))
5643 {
5644 coding->detector = NULL;
5645 coding->decoder = decode_coding_raw_text;
5646 coding->encoder = encode_coding_raw_text;
5647 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5648 }
5649 else if (EQ (coding_type, Qiso_2022))
5650 {
5651 int i;
5652 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5653
5654 /* Invoke graphic register 0 to plane 0. */
5655 CODING_ISO_INVOCATION (coding, 0) = 0;
5656 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5657 CODING_ISO_INVOCATION (coding, 1)
5658 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5659 /* Setup the initial status of designation. */
5660 for (i = 0; i < 4; i++)
5661 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5662 /* Not single shifting initially. */
5663 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5664 /* Beginning of buffer should also be regarded as bol. */
5665 CODING_ISO_BOL (coding) = 1;
5666 coding->detector = detect_coding_iso_2022;
5667 coding->decoder = decode_coding_iso_2022;
5668 coding->encoder = encode_coding_iso_2022;
5669 if (flags & CODING_ISO_FLAG_SAFE)
5670 coding->mode |= CODING_MODE_SAFE_ENCODING;
5671 coding->common_flags
5672 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5673 | CODING_REQUIRE_FLUSHING_MASK);
5674 if (flags & CODING_ISO_FLAG_COMPOSITION)
5675 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5676 if (flags & CODING_ISO_FLAG_DESIGNATION)
5677 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5678 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5679 {
5680 setup_iso_safe_charsets (attrs);
5681 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5682 coding->max_charset_id = SCHARS (val) - 1;
5683 coding->safe_charsets = SDATA (val);
5684 }
5685 CODING_ISO_FLAGS (coding) = flags;
5686 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5687 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5688 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5689 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5690 }
5691 else if (EQ (coding_type, Qcharset))
5692 {
5693 coding->detector = detect_coding_charset;
5694 coding->decoder = decode_coding_charset;
5695 coding->encoder = encode_coding_charset;
5696 coding->common_flags
5697 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5698 }
5699 else if (EQ (coding_type, Qutf_8))
5700 {
5701 val = AREF (attrs, coding_attr_utf_bom);
5702 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5703 : EQ (val, Qt) ? utf_with_bom
5704 : utf_without_bom);
5705 coding->detector = detect_coding_utf_8;
5706 coding->decoder = decode_coding_utf_8;
5707 coding->encoder = encode_coding_utf_8;
5708 coding->common_flags
5709 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5710 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5711 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5712 }
5713 else if (EQ (coding_type, Qutf_16))
5714 {
5715 val = AREF (attrs, coding_attr_utf_bom);
5716 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5717 : EQ (val, Qt) ? utf_with_bom
5718 : utf_without_bom);
5719 val = AREF (attrs, coding_attr_utf_16_endian);
5720 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5721 : utf_16_little_endian);
5722 CODING_UTF_16_SURROGATE (coding) = 0;
5723 coding->detector = detect_coding_utf_16;
5724 coding->decoder = decode_coding_utf_16;
5725 coding->encoder = encode_coding_utf_16;
5726 coding->common_flags
5727 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5728 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5729 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5730 }
5731 else if (EQ (coding_type, Qccl))
5732 {
5733 coding->detector = detect_coding_ccl;
5734 coding->decoder = decode_coding_ccl;
5735 coding->encoder = encode_coding_ccl;
5736 coding->common_flags
5737 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5738 | CODING_REQUIRE_FLUSHING_MASK);
5739 }
5740 else if (EQ (coding_type, Qemacs_mule))
5741 {
5742 coding->detector = detect_coding_emacs_mule;
5743 coding->decoder = decode_coding_emacs_mule;
5744 coding->encoder = encode_coding_emacs_mule;
5745 coding->common_flags
5746 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5747 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5748 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5749 {
5750 Lisp_Object tail, safe_charsets;
5751 int max_charset_id = 0;
5752
5753 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5754 tail = XCDR (tail))
5755 if (max_charset_id < XFASTINT (XCAR (tail)))
5756 max_charset_id = XFASTINT (XCAR (tail));
5757 safe_charsets = make_uninit_string (max_charset_id + 1);
5758 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5759 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5760 tail = XCDR (tail))
5761 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5762 coding->max_charset_id = max_charset_id;
5763 coding->safe_charsets = SDATA (safe_charsets);
5764 }
5765 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5766 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5767 }
5768 else if (EQ (coding_type, Qshift_jis))
5769 {
5770 coding->detector = detect_coding_sjis;
5771 coding->decoder = decode_coding_sjis;
5772 coding->encoder = encode_coding_sjis;
5773 coding->common_flags
5774 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5775 }
5776 else if (EQ (coding_type, Qbig5))
5777 {
5778 coding->detector = detect_coding_big5;
5779 coding->decoder = decode_coding_big5;
5780 coding->encoder = encode_coding_big5;
5781 coding->common_flags
5782 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5783 }
5784 else /* EQ (coding_type, Qraw_text) */
5785 {
5786 coding->detector = NULL;
5787 coding->decoder = decode_coding_raw_text;
5788 coding->encoder = encode_coding_raw_text;
5789 if (! EQ (eol_type, Qunix))
5790 {
5791 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5792 if (! VECTORP (eol_type))
5793 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5794 }
5795
5796 }
5797
5798 return;
5799 }
5800
5801 /* Return a list of charsets supported by CODING. */
5802
5803 Lisp_Object
5804 coding_charset_list (struct coding_system *coding)
5805 {
5806 Lisp_Object attrs, charset_list;
5807
5808 CODING_GET_INFO (coding, attrs, charset_list);
5809 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5810 {
5811 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5812
5813 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5814 charset_list = Viso_2022_charset_list;
5815 }
5816 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5817 {
5818 charset_list = Vemacs_mule_charset_list;
5819 }
5820 return charset_list;
5821 }
5822
5823
5824 /* Return a list of charsets supported by CODING-SYSTEM. */
5825
5826 Lisp_Object
5827 coding_system_charset_list (Lisp_Object coding_system)
5828 {
5829 ptrdiff_t id;
5830 Lisp_Object attrs, charset_list;
5831
5832 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5833 attrs = CODING_ID_ATTRS (id);
5834
5835 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5836 {
5837 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5838
5839 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5840 charset_list = Viso_2022_charset_list;
5841 else
5842 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5843 }
5844 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5845 {
5846 charset_list = Vemacs_mule_charset_list;
5847 }
5848 else
5849 {
5850 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5851 }
5852 return charset_list;
5853 }
5854
5855
5856 /* Return raw-text or one of its subsidiaries that has the same
5857 eol_type as CODING-SYSTEM. */
5858
5859 Lisp_Object
5860 raw_text_coding_system (Lisp_Object coding_system)
5861 {
5862 Lisp_Object spec, attrs;
5863 Lisp_Object eol_type, raw_text_eol_type;
5864
5865 if (NILP (coding_system))
5866 return Qraw_text;
5867 spec = CODING_SYSTEM_SPEC (coding_system);
5868 attrs = AREF (spec, 0);
5869
5870 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5871 return coding_system;
5872
5873 eol_type = AREF (spec, 2);
5874 if (VECTORP (eol_type))
5875 return Qraw_text;
5876 spec = CODING_SYSTEM_SPEC (Qraw_text);
5877 raw_text_eol_type = AREF (spec, 2);
5878 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5879 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5880 : AREF (raw_text_eol_type, 2));
5881 }
5882
5883
5884 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5885 the subsidiary that has the same eol-spec as PARENT (if it is not
5886 nil and specifies end-of-line format) or the system's setting
5887 (system_eol_type). */
5888
5889 Lisp_Object
5890 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5891 {
5892 Lisp_Object spec, eol_type;
5893
5894 if (NILP (coding_system))
5895 coding_system = Qraw_text;
5896 spec = CODING_SYSTEM_SPEC (coding_system);
5897 eol_type = AREF (spec, 2);
5898 if (VECTORP (eol_type))
5899 {
5900 Lisp_Object parent_eol_type;
5901
5902 if (! NILP (parent))
5903 {
5904 Lisp_Object parent_spec;
5905
5906 parent_spec = CODING_SYSTEM_SPEC (parent);
5907 parent_eol_type = AREF (parent_spec, 2);
5908 if (VECTORP (parent_eol_type))
5909 parent_eol_type = system_eol_type;
5910 }
5911 else
5912 parent_eol_type = system_eol_type;
5913 if (EQ (parent_eol_type, Qunix))
5914 coding_system = AREF (eol_type, 0);
5915 else if (EQ (parent_eol_type, Qdos))
5916 coding_system = AREF (eol_type, 1);
5917 else if (EQ (parent_eol_type, Qmac))
5918 coding_system = AREF (eol_type, 2);
5919 }
5920 return coding_system;
5921 }
5922
5923
5924 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5925 decided for writing to a process. If not, complement them, and
5926 return a new coding system. */
5927
5928 Lisp_Object
5929 complement_process_encoding_system (Lisp_Object coding_system)
5930 {
5931 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5932 Lisp_Object spec, attrs;
5933 int i;
5934
5935 for (i = 0; i < 3; i++)
5936 {
5937 if (i == 1)
5938 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5939 else if (i == 2)
5940 coding_system = preferred_coding_system ();
5941 spec = CODING_SYSTEM_SPEC (coding_system);
5942 if (NILP (spec))
5943 continue;
5944 attrs = AREF (spec, 0);
5945 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5946 coding_base = CODING_ATTR_BASE_NAME (attrs);
5947 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5948 eol_base = coding_system;
5949 if (! NILP (coding_base) && ! NILP (eol_base))
5950 break;
5951 }
5952
5953 if (i > 0)
5954 /* The original CODING_SYSTEM didn't specify text-conversion or
5955 eol-conversion. Be sure that we return a fully complemented
5956 coding system. */
5957 coding_system = coding_inherit_eol_type (coding_base, eol_base);
5958 return coding_system;
5959 }
5960
5961
5962 /* Emacs has a mechanism to automatically detect a coding system if it
5963 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5964 it's impossible to distinguish some coding systems accurately
5965 because they use the same range of codes. So, at first, coding
5966 systems are categorized into 7, those are:
5967
5968 o coding-category-emacs-mule
5969
5970 The category for a coding system which has the same code range
5971 as Emacs' internal format. Assigned the coding-system (Lisp
5972 symbol) `emacs-mule' by default.
5973
5974 o coding-category-sjis
5975
5976 The category for a coding system which has the same code range
5977 as SJIS. Assigned the coding-system (Lisp
5978 symbol) `japanese-shift-jis' by default.
5979
5980 o coding-category-iso-7
5981
5982 The category for a coding system which has the same code range
5983 as ISO2022 of 7-bit environment. This doesn't use any locking
5984 shift and single shift functions. This can encode/decode all
5985 charsets. Assigned the coding-system (Lisp symbol)
5986 `iso-2022-7bit' by default.
5987
5988 o coding-category-iso-7-tight
5989
5990 Same as coding-category-iso-7 except that this can
5991 encode/decode only the specified charsets.
5992
5993 o coding-category-iso-8-1
5994
5995 The category for a coding system which has the same code range
5996 as ISO2022 of 8-bit environment and graphic plane 1 used only
5997 for DIMENSION1 charset. This doesn't use any locking shift
5998 and single shift functions. Assigned the coding-system (Lisp
5999 symbol) `iso-latin-1' by default.
6000
6001 o coding-category-iso-8-2
6002
6003 The category for a coding system which has the same code range
6004 as ISO2022 of 8-bit environment and graphic plane 1 used only
6005 for DIMENSION2 charset. This doesn't use any locking shift
6006 and single shift functions. Assigned the coding-system (Lisp
6007 symbol) `japanese-iso-8bit' by default.
6008
6009 o coding-category-iso-7-else
6010
6011 The category for a coding system which has the same code range
6012 as ISO2022 of 7-bit environment but uses locking shift or
6013 single shift functions. Assigned the coding-system (Lisp
6014 symbol) `iso-2022-7bit-lock' by default.
6015
6016 o coding-category-iso-8-else
6017
6018 The category for a coding system which has the same code range
6019 as ISO2022 of 8-bit environment but uses locking shift or
6020 single shift functions. Assigned the coding-system (Lisp
6021 symbol) `iso-2022-8bit-ss2' by default.
6022
6023 o coding-category-big5
6024
6025 The category for a coding system which has the same code range
6026 as BIG5. Assigned the coding-system (Lisp symbol)
6027 `cn-big5' by default.
6028
6029 o coding-category-utf-8
6030
6031 The category for a coding system which has the same code range
6032 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6033 symbol) `utf-8' by default.
6034
6035 o coding-category-utf-16-be
6036
6037 The category for a coding system in which a text has an
6038 Unicode signature (cf. Unicode Standard) in the order of BIG
6039 endian at the head. Assigned the coding-system (Lisp symbol)
6040 `utf-16-be' by default.
6041
6042 o coding-category-utf-16-le
6043
6044 The category for a coding system in which a text has an
6045 Unicode signature (cf. Unicode Standard) in the order of
6046 LITTLE endian at the head. Assigned the coding-system (Lisp
6047 symbol) `utf-16-le' by default.
6048
6049 o coding-category-ccl
6050
6051 The category for a coding system of which encoder/decoder is
6052 written in CCL programs. The default value is nil, i.e., no
6053 coding system is assigned.
6054
6055 o coding-category-binary
6056
6057 The category for a coding system not categorized in any of the
6058 above. Assigned the coding-system (Lisp symbol)
6059 `no-conversion' by default.
6060
6061 Each of them is a Lisp symbol and the value is an actual
6062 `coding-system's (this is also a Lisp symbol) assigned by a user.
6063 What Emacs does actually is to detect a category of coding system.
6064 Then, it uses a `coding-system' assigned to it. If Emacs can't
6065 decide only one possible category, it selects a category of the
6066 highest priority. Priorities of categories are also specified by a
6067 user in a Lisp variable `coding-category-list'.
6068
6069 */
6070
6071 #define EOL_SEEN_NONE 0
6072 #define EOL_SEEN_LF 1
6073 #define EOL_SEEN_CR 2
6074 #define EOL_SEEN_CRLF 4
6075
6076 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6077 SOURCE is encoded. If CATEGORY is one of
6078 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6079 two-byte, else they are encoded by one-byte.
6080
6081 Return one of EOL_SEEN_XXX. */
6082
6083 #define MAX_EOL_CHECK_COUNT 3
6084
6085 static int
6086 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6087 enum coding_category category)
6088 {
6089 const unsigned char *src = source, *src_end = src + src_bytes;
6090 unsigned char c;
6091 int total = 0;
6092 int eol_seen = EOL_SEEN_NONE;
6093
6094 if ((1 << category) & CATEGORY_MASK_UTF_16)
6095 {
6096 bool msb = category == (coding_category_utf_16_le
6097 | coding_category_utf_16_le_nosig);
6098 bool lsb = !msb;
6099
6100 while (src + 1 < src_end)
6101 {
6102 c = src[lsb];
6103 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6104 {
6105 int this_eol;
6106
6107 if (c == '\n')
6108 this_eol = EOL_SEEN_LF;
6109 else if (src + 3 >= src_end
6110 || src[msb + 2] != 0
6111 || src[lsb + 2] != '\n')
6112 this_eol = EOL_SEEN_CR;
6113 else
6114 {
6115 this_eol = EOL_SEEN_CRLF;
6116 src += 2;
6117 }
6118
6119 if (eol_seen == EOL_SEEN_NONE)
6120 /* This is the first end-of-line. */
6121 eol_seen = this_eol;
6122 else if (eol_seen != this_eol)
6123 {
6124 /* The found type is different from what found before.
6125 Allow for stray ^M characters in DOS EOL files. */
6126 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6127 || (eol_seen == EOL_SEEN_CRLF
6128 && this_eol == EOL_SEEN_CR))
6129 eol_seen = EOL_SEEN_CRLF;
6130 else
6131 {
6132 eol_seen = EOL_SEEN_LF;
6133 break;
6134 }
6135 }
6136 if (++total == MAX_EOL_CHECK_COUNT)
6137 break;
6138 }
6139 src += 2;
6140 }
6141 }
6142 else
6143 while (src < src_end)
6144 {
6145 c = *src++;
6146 if (c == '\n' || c == '\r')
6147 {
6148 int this_eol;
6149
6150 if (c == '\n')
6151 this_eol = EOL_SEEN_LF;
6152 else if (src >= src_end || *src != '\n')
6153 this_eol = EOL_SEEN_CR;
6154 else
6155 this_eol = EOL_SEEN_CRLF, src++;
6156
6157 if (eol_seen == EOL_SEEN_NONE)
6158 /* This is the first end-of-line. */
6159 eol_seen = this_eol;
6160 else if (eol_seen != this_eol)
6161 {
6162 /* The found type is different from what found before.
6163 Allow for stray ^M characters in DOS EOL files. */
6164 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6165 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6166 eol_seen = EOL_SEEN_CRLF;
6167 else
6168 {
6169 eol_seen = EOL_SEEN_LF;
6170 break;
6171 }
6172 }
6173 if (++total == MAX_EOL_CHECK_COUNT)
6174 break;
6175 }
6176 }
6177 return eol_seen;
6178 }
6179
6180
6181 static Lisp_Object
6182 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6183 {
6184 Lisp_Object eol_type;
6185
6186 eol_type = CODING_ID_EOL_TYPE (coding->id);
6187 if (eol_seen & EOL_SEEN_LF)
6188 {
6189 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6190 eol_type = Qunix;
6191 }
6192 else if (eol_seen & EOL_SEEN_CRLF)
6193 {
6194 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6195 eol_type = Qdos;
6196 }
6197 else if (eol_seen & EOL_SEEN_CR)
6198 {
6199 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6200 eol_type = Qmac;
6201 }
6202 return eol_type;
6203 }
6204
6205 /* Detect how a text specified in CODING is encoded. If a coding
6206 system is detected, update fields of CODING by the detected coding
6207 system. */
6208
6209 static void
6210 detect_coding (struct coding_system *coding)
6211 {
6212 const unsigned char *src, *src_end;
6213 unsigned int saved_mode = coding->mode;
6214
6215 coding->consumed = coding->consumed_char = 0;
6216 coding->produced = coding->produced_char = 0;
6217 coding_set_source (coding);
6218
6219 src_end = coding->source + coding->src_bytes;
6220 coding->head_ascii = 0;
6221
6222 /* If we have not yet decided the text encoding type, detect it
6223 now. */
6224 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6225 {
6226 int c, i;
6227 struct coding_detection_info detect_info;
6228 bool null_byte_found = 0, eight_bit_found = 0;
6229
6230 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6231 for (src = coding->source; src < src_end; src++)
6232 {
6233 c = *src;
6234 if (c & 0x80)
6235 {
6236 eight_bit_found = 1;
6237 if (null_byte_found)
6238 break;
6239 }
6240 else if (c < 0x20)
6241 {
6242 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6243 && ! inhibit_iso_escape_detection
6244 && ! detect_info.checked)
6245 {
6246 if (detect_coding_iso_2022 (coding, &detect_info))
6247 {
6248 /* We have scanned the whole data. */
6249 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6250 {
6251 /* We didn't find an 8-bit code. We may
6252 have found a null-byte, but it's very
6253 rare that a binary file conforms to
6254 ISO-2022. */
6255 src = src_end;
6256 coding->head_ascii = src - coding->source;
6257 }
6258 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6259 break;
6260 }
6261 }
6262 else if (! c && !inhibit_null_byte_detection)
6263 {
6264 null_byte_found = 1;
6265 if (eight_bit_found)
6266 break;
6267 }
6268 if (! eight_bit_found)
6269 coding->head_ascii++;
6270 }
6271 else if (! eight_bit_found)
6272 coding->head_ascii++;
6273 }
6274
6275 if (null_byte_found || eight_bit_found
6276 || coding->head_ascii < coding->src_bytes
6277 || detect_info.found)
6278 {
6279 enum coding_category category;
6280 struct coding_system *this;
6281
6282 if (coding->head_ascii == coding->src_bytes)
6283 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6284 for (i = 0; i < coding_category_raw_text; i++)
6285 {
6286 category = coding_priorities[i];
6287 this = coding_categories + category;
6288 if (detect_info.found & (1 << category))
6289 break;
6290 }
6291 else
6292 {
6293 if (null_byte_found)
6294 {
6295 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6296 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6297 }
6298 for (i = 0; i < coding_category_raw_text; i++)
6299 {
6300 category = coding_priorities[i];
6301 this = coding_categories + category;
6302 /* Some of this->detector (e.g. detect_coding_sjis)
6303 require this information. */
6304 coding->id = this->id;
6305 if (this->id < 0)
6306 {
6307 /* No coding system of this category is defined. */
6308 detect_info.rejected |= (1 << category);
6309 }
6310 else if (category >= coding_category_raw_text)
6311 continue;
6312 else if (detect_info.checked & (1 << category))
6313 {
6314 if (detect_info.found & (1 << category))
6315 break;
6316 }
6317 else if ((*(this->detector)) (coding, &detect_info)
6318 && detect_info.found & (1 << category))
6319 {
6320 if (category == coding_category_utf_16_auto)
6321 {
6322 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6323 category = coding_category_utf_16_le;
6324 else
6325 category = coding_category_utf_16_be;
6326 }
6327 break;
6328 }
6329 }
6330 }
6331
6332 if (i < coding_category_raw_text)
6333 setup_coding_system (CODING_ID_NAME (this->id), coding);
6334 else if (null_byte_found)
6335 setup_coding_system (Qno_conversion, coding);
6336 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6337 == CATEGORY_MASK_ANY)
6338 setup_coding_system (Qraw_text, coding);
6339 else if (detect_info.rejected)
6340 for (i = 0; i < coding_category_raw_text; i++)
6341 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6342 {
6343 this = coding_categories + coding_priorities[i];
6344 setup_coding_system (CODING_ID_NAME (this->id), coding);
6345 break;
6346 }
6347 }
6348 }
6349 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6350 == coding_category_utf_8_auto)
6351 {
6352 Lisp_Object coding_systems;
6353 struct coding_detection_info detect_info;
6354
6355 coding_systems
6356 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6357 detect_info.found = detect_info.rejected = 0;
6358 coding->head_ascii = 0;
6359 if (CONSP (coding_systems)
6360 && detect_coding_utf_8 (coding, &detect_info))
6361 {
6362 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6363 setup_coding_system (XCAR (coding_systems), coding);
6364 else
6365 setup_coding_system (XCDR (coding_systems), coding);
6366 }
6367 }
6368 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6369 == coding_category_utf_16_auto)
6370 {
6371 Lisp_Object coding_systems;
6372 struct coding_detection_info detect_info;
6373
6374 coding_systems
6375 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6376 detect_info.found = detect_info.rejected = 0;
6377 coding->head_ascii = 0;
6378 if (CONSP (coding_systems)
6379 && detect_coding_utf_16 (coding, &detect_info))
6380 {
6381 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6382 setup_coding_system (XCAR (coding_systems), coding);
6383 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6384 setup_coding_system (XCDR (coding_systems), coding);
6385 }
6386 }
6387 coding->mode = saved_mode;
6388 }
6389
6390
6391 static void
6392 decode_eol (struct coding_system *coding)
6393 {
6394 Lisp_Object eol_type;
6395 unsigned char *p, *pbeg, *pend;
6396
6397 eol_type = CODING_ID_EOL_TYPE (coding->id);
6398 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6399 return;
6400
6401 if (NILP (coding->dst_object))
6402 pbeg = coding->destination;
6403 else
6404 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6405 pend = pbeg + coding->produced;
6406
6407 if (VECTORP (eol_type))
6408 {
6409 int eol_seen = EOL_SEEN_NONE;
6410
6411 for (p = pbeg; p < pend; p++)
6412 {
6413 if (*p == '\n')
6414 eol_seen |= EOL_SEEN_LF;
6415 else if (*p == '\r')
6416 {
6417 if (p + 1 < pend && *(p + 1) == '\n')
6418 {
6419 eol_seen |= EOL_SEEN_CRLF;
6420 p++;
6421 }
6422 else
6423 eol_seen |= EOL_SEEN_CR;
6424 }
6425 }
6426 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6427 if ((eol_seen & EOL_SEEN_CRLF) != 0
6428 && (eol_seen & EOL_SEEN_CR) != 0
6429 && (eol_seen & EOL_SEEN_LF) == 0)
6430 eol_seen = EOL_SEEN_CRLF;
6431 else if (eol_seen != EOL_SEEN_NONE
6432 && eol_seen != EOL_SEEN_LF
6433 && eol_seen != EOL_SEEN_CRLF
6434 && eol_seen != EOL_SEEN_CR)
6435 eol_seen = EOL_SEEN_LF;
6436 if (eol_seen != EOL_SEEN_NONE)
6437 eol_type = adjust_coding_eol_type (coding, eol_seen);
6438 }
6439
6440 if (EQ (eol_type, Qmac))
6441 {
6442 for (p = pbeg; p < pend; p++)
6443 if (*p == '\r')
6444 *p = '\n';
6445 }
6446 else if (EQ (eol_type, Qdos))
6447 {
6448 ptrdiff_t n = 0;
6449
6450 if (NILP (coding->dst_object))
6451 {
6452 /* Start deleting '\r' from the tail to minimize the memory
6453 movement. */
6454 for (p = pend - 2; p >= pbeg; p--)
6455 if (*p == '\r')
6456 {
6457 memmove (p, p + 1, pend-- - p - 1);
6458 n++;
6459 }
6460 }
6461 else
6462 {
6463 ptrdiff_t pos_byte = coding->dst_pos_byte;
6464 ptrdiff_t pos = coding->dst_pos;
6465 ptrdiff_t pos_end = pos + coding->produced_char - 1;
6466
6467 while (pos < pos_end)
6468 {
6469 p = BYTE_POS_ADDR (pos_byte);
6470 if (*p == '\r' && p[1] == '\n')
6471 {
6472 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6473 n++;
6474 pos_end--;
6475 }
6476 pos++;
6477 if (coding->dst_multibyte)
6478 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6479 else
6480 pos_byte++;
6481 }
6482 }
6483 coding->produced -= n;
6484 coding->produced_char -= n;
6485 }
6486 }
6487
6488
6489 /* Return a translation table (or list of them) from coding system
6490 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6491 not ENCODEP). */
6492
6493 static Lisp_Object
6494 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6495 {
6496 Lisp_Object standard, translation_table;
6497 Lisp_Object val;
6498
6499 if (NILP (Venable_character_translation))
6500 {
6501 if (max_lookup)
6502 *max_lookup = 0;
6503 return Qnil;
6504 }
6505 if (encodep)
6506 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6507 standard = Vstandard_translation_table_for_encode;
6508 else
6509 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6510 standard = Vstandard_translation_table_for_decode;
6511 if (NILP (translation_table))
6512 translation_table = standard;
6513 else
6514 {
6515 if (SYMBOLP (translation_table))
6516 translation_table = Fget (translation_table, Qtranslation_table);
6517 else if (CONSP (translation_table))
6518 {
6519 translation_table = Fcopy_sequence (translation_table);
6520 for (val = translation_table; CONSP (val); val = XCDR (val))
6521 if (SYMBOLP (XCAR (val)))
6522 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6523 }
6524 if (CHAR_TABLE_P (standard))
6525 {
6526 if (CONSP (translation_table))
6527 translation_table = nconc2 (translation_table,
6528 Fcons (standard, Qnil));
6529 else
6530 translation_table = Fcons (translation_table,
6531 Fcons (standard, Qnil));
6532 }
6533 }
6534
6535 if (max_lookup)
6536 {
6537 *max_lookup = 1;
6538 if (CHAR_TABLE_P (translation_table)
6539 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6540 {
6541 val = XCHAR_TABLE (translation_table)->extras[1];
6542 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6543 *max_lookup = XFASTINT (val);
6544 }
6545 else if (CONSP (translation_table))
6546 {
6547 Lisp_Object tail;
6548
6549 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6550 if (CHAR_TABLE_P (XCAR (tail))
6551 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6552 {
6553 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6554 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6555 *max_lookup = XFASTINT (tailval);
6556 }
6557 }
6558 }
6559 return translation_table;
6560 }
6561
6562 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6563 do { \
6564 trans = Qnil; \
6565 if (CHAR_TABLE_P (table)) \
6566 { \
6567 trans = CHAR_TABLE_REF (table, c); \
6568 if (CHARACTERP (trans)) \
6569 c = XFASTINT (trans), trans = Qnil; \
6570 } \
6571 else if (CONSP (table)) \
6572 { \
6573 Lisp_Object tail; \
6574 \
6575 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6576 if (CHAR_TABLE_P (XCAR (tail))) \
6577 { \
6578 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6579 if (CHARACTERP (trans)) \
6580 c = XFASTINT (trans), trans = Qnil; \
6581 else if (! NILP (trans)) \
6582 break; \
6583 } \
6584 } \
6585 } while (0)
6586
6587
6588 /* Return a translation of character(s) at BUF according to TRANS.
6589 TRANS is TO-CHAR or ((FROM . TO) ...) where
6590 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6591 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6592 translation is found, and Qnil if not found..
6593 If BUF is too short to lookup characters in FROM, return Qt. */
6594
6595 static Lisp_Object
6596 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6597 {
6598
6599 if (INTEGERP (trans))
6600 return trans;
6601 for (; CONSP (trans); trans = XCDR (trans))
6602 {
6603 Lisp_Object val = XCAR (trans);
6604 Lisp_Object from = XCAR (val);
6605 ptrdiff_t len = ASIZE (from);
6606 ptrdiff_t i;
6607
6608 for (i = 0; i < len; i++)
6609 {
6610 if (buf + i == buf_end)
6611 return Qt;
6612 if (XINT (AREF (from, i)) != buf[i])
6613 break;
6614 }
6615 if (i == len)
6616 return val;
6617 }
6618 return Qnil;
6619 }
6620
6621
6622 static int
6623 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6624 bool last_block)
6625 {
6626 unsigned char *dst = coding->destination + coding->produced;
6627 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6628 ptrdiff_t produced;
6629 ptrdiff_t produced_chars = 0;
6630 int carryover = 0;
6631
6632 if (! coding->chars_at_source)
6633 {
6634 /* Source characters are in coding->charbuf. */
6635 int *buf = coding->charbuf;
6636 int *buf_end = buf + coding->charbuf_used;
6637
6638 if (EQ (coding->src_object, coding->dst_object))
6639 {
6640 coding_set_source (coding);
6641 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6642 }
6643
6644 while (buf < buf_end)
6645 {
6646 int c = *buf;
6647 ptrdiff_t i;
6648
6649 if (c >= 0)
6650 {
6651 ptrdiff_t from_nchars = 1, to_nchars = 1;
6652 Lisp_Object trans = Qnil;
6653
6654 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6655 if (! NILP (trans))
6656 {
6657 trans = get_translation (trans, buf, buf_end);
6658 if (INTEGERP (trans))
6659 c = XINT (trans);
6660 else if (CONSP (trans))
6661 {
6662 from_nchars = ASIZE (XCAR (trans));
6663 trans = XCDR (trans);
6664 if (INTEGERP (trans))
6665 c = XINT (trans);
6666 else
6667 {
6668 to_nchars = ASIZE (trans);
6669 c = XINT (AREF (trans, 0));
6670 }
6671 }
6672 else if (EQ (trans, Qt) && ! last_block)
6673 break;
6674 }
6675
6676 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6677 {
6678 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6679 / MAX_MULTIBYTE_LENGTH)
6680 < to_nchars)
6681 memory_full (SIZE_MAX);
6682 dst = alloc_destination (coding,
6683 buf_end - buf
6684 + MAX_MULTIBYTE_LENGTH * to_nchars,
6685 dst);
6686 if (EQ (coding->src_object, coding->dst_object))
6687 {
6688 coding_set_source (coding);
6689 dst_end = (((unsigned char *) coding->source)
6690 + coding->consumed);
6691 }
6692 else
6693 dst_end = coding->destination + coding->dst_bytes;
6694 }
6695
6696 for (i = 0; i < to_nchars; i++)
6697 {
6698 if (i > 0)
6699 c = XINT (AREF (trans, i));
6700 if (coding->dst_multibyte
6701 || ! CHAR_BYTE8_P (c))
6702 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6703 else
6704 *dst++ = CHAR_TO_BYTE8 (c);
6705 }
6706 produced_chars += to_nchars;
6707 buf += from_nchars;
6708 }
6709 else
6710 /* This is an annotation datum. (-C) is the length. */
6711 buf += -c;
6712 }
6713 carryover = buf_end - buf;
6714 }
6715 else
6716 {
6717 /* Source characters are at coding->source. */
6718 const unsigned char *src = coding->source;
6719 const unsigned char *src_end = src + coding->consumed;
6720
6721 if (EQ (coding->dst_object, coding->src_object))
6722 dst_end = (unsigned char *) src;
6723 if (coding->src_multibyte != coding->dst_multibyte)
6724 {
6725 if (coding->src_multibyte)
6726 {
6727 bool multibytep = 1;
6728 ptrdiff_t consumed_chars = 0;
6729
6730 while (1)
6731 {
6732 const unsigned char *src_base = src;
6733 int c;
6734
6735 ONE_MORE_BYTE (c);
6736 if (dst == dst_end)
6737 {
6738 if (EQ (coding->src_object, coding->dst_object))
6739 dst_end = (unsigned char *) src;
6740 if (dst == dst_end)
6741 {
6742 ptrdiff_t offset = src - coding->source;
6743
6744 dst = alloc_destination (coding, src_end - src + 1,
6745 dst);
6746 dst_end = coding->destination + coding->dst_bytes;
6747 coding_set_source (coding);
6748 src = coding->source + offset;
6749 src_end = coding->source + coding->consumed;
6750 if (EQ (coding->src_object, coding->dst_object))
6751 dst_end = (unsigned char *) src;
6752 }
6753 }
6754 *dst++ = c;
6755 produced_chars++;
6756 }
6757 no_more_source:
6758 ;
6759 }
6760 else
6761 while (src < src_end)
6762 {
6763 bool multibytep = 1;
6764 int c = *src++;
6765
6766 if (dst >= dst_end - 1)
6767 {
6768 if (EQ (coding->src_object, coding->dst_object))
6769 dst_end = (unsigned char *) src;
6770 if (dst >= dst_end - 1)
6771 {
6772 ptrdiff_t offset = src - coding->source;
6773 ptrdiff_t more_bytes;
6774
6775 if (EQ (coding->src_object, coding->dst_object))
6776 more_bytes = ((src_end - src) / 2) + 2;
6777 else
6778 more_bytes = src_end - src + 2;
6779 dst = alloc_destination (coding, more_bytes, dst);
6780 dst_end = coding->destination + coding->dst_bytes;
6781 coding_set_source (coding);
6782 src = coding->source + offset;
6783 src_end = coding->source + coding->consumed;
6784 if (EQ (coding->src_object, coding->dst_object))
6785 dst_end = (unsigned char *) src;
6786 }
6787 }
6788 EMIT_ONE_BYTE (c);
6789 }
6790 }
6791 else
6792 {
6793 if (!EQ (coding->src_object, coding->dst_object))
6794 {
6795 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6796
6797 if (require > 0)
6798 {
6799 ptrdiff_t offset = src - coding->source;
6800
6801 dst = alloc_destination (coding, require, dst);
6802 coding_set_source (coding);
6803 src = coding->source + offset;
6804 src_end = coding->source + coding->consumed;
6805 }
6806 }
6807 produced_chars = coding->consumed_char;
6808 while (src < src_end)
6809 *dst++ = *src++;
6810 }
6811 }
6812
6813 produced = dst - (coding->destination + coding->produced);
6814 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6815 insert_from_gap (produced_chars, produced);
6816 coding->produced += produced;
6817 coding->produced_char += produced_chars;
6818 return carryover;
6819 }
6820
6821 /* Compose text in CODING->object according to the annotation data at
6822 CHARBUF. CHARBUF is an array:
6823 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6824 */
6825
6826 static void
6827 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6828 {
6829 int len;
6830 ptrdiff_t to;
6831 enum composition_method method;
6832 Lisp_Object components;
6833
6834 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6835 to = pos + charbuf[2];
6836 method = (enum composition_method) (charbuf[4]);
6837
6838 if (method == COMPOSITION_RELATIVE)
6839 components = Qnil;
6840 else
6841 {
6842 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6843 int i, j;
6844
6845 if (method == COMPOSITION_WITH_RULE)
6846 len = charbuf[2] * 3 - 2;
6847 charbuf += MAX_ANNOTATION_LENGTH;
6848 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6849 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6850 {
6851 if (charbuf[i] >= 0)
6852 args[j] = make_number (charbuf[i]);
6853 else
6854 {
6855 i++;
6856 args[j] = make_number (charbuf[i] % 0x100);
6857 }
6858 }
6859 components = (i == j ? Fstring (j, args) : Fvector (j, args));
6860 }
6861 compose_text (pos, to, components, Qnil, coding->dst_object);
6862 }
6863
6864
6865 /* Put `charset' property on text in CODING->object according to
6866 the annotation data at CHARBUF. CHARBUF is an array:
6867 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6868 */
6869
6870 static void
6871 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6872 {
6873 ptrdiff_t from = pos - charbuf[2];
6874 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6875
6876 Fput_text_property (make_number (from), make_number (pos),
6877 Qcharset, CHARSET_NAME (charset),
6878 coding->dst_object);
6879 }
6880
6881
6882 #define CHARBUF_SIZE 0x4000
6883
6884 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6885 do { \
6886 int size = CHARBUF_SIZE; \
6887 \
6888 coding->charbuf = NULL; \
6889 while (size > 1024) \
6890 { \
6891 coding->charbuf = alloca (sizeof (int) * size); \
6892 if (coding->charbuf) \
6893 break; \
6894 size >>= 1; \
6895 } \
6896 if (! coding->charbuf) \
6897 { \
6898 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6899 return; \
6900 } \
6901 coding->charbuf_size = size; \
6902 } while (0)
6903
6904
6905 static void
6906 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6907 {
6908 int *charbuf = coding->charbuf;
6909 int *charbuf_end = charbuf + coding->charbuf_used;
6910
6911 if (NILP (coding->dst_object))
6912 return;
6913
6914 while (charbuf < charbuf_end)
6915 {
6916 if (*charbuf >= 0)
6917 pos++, charbuf++;
6918 else
6919 {
6920 int len = -*charbuf;
6921
6922 if (len > 2)
6923 switch (charbuf[1])
6924 {
6925 case CODING_ANNOTATE_COMPOSITION_MASK:
6926 produce_composition (coding, charbuf, pos);
6927 break;
6928 case CODING_ANNOTATE_CHARSET_MASK:
6929 produce_charset (coding, charbuf, pos);
6930 break;
6931 }
6932 charbuf += len;
6933 }
6934 }
6935 }
6936
6937 /* Decode the data at CODING->src_object into CODING->dst_object.
6938 CODING->src_object is a buffer, a string, or nil.
6939 CODING->dst_object is a buffer.
6940
6941 If CODING->src_object is a buffer, it must be the current buffer.
6942 In this case, if CODING->src_pos is positive, it is a position of
6943 the source text in the buffer, otherwise, the source text is in the
6944 gap area of the buffer, and CODING->src_pos specifies the offset of
6945 the text from GPT (which must be the same as PT). If this is the
6946 same buffer as CODING->dst_object, CODING->src_pos must be
6947 negative.
6948
6949 If CODING->src_object is a string, CODING->src_pos is an index to
6950 that string.
6951
6952 If CODING->src_object is nil, CODING->source must already point to
6953 the non-relocatable memory area. In this case, CODING->src_pos is
6954 an offset from CODING->source.
6955
6956 The decoded data is inserted at the current point of the buffer
6957 CODING->dst_object.
6958 */
6959
6960 static void
6961 decode_coding (struct coding_system *coding)
6962 {
6963 Lisp_Object attrs;
6964 Lisp_Object undo_list;
6965 Lisp_Object translation_table;
6966 struct ccl_spec cclspec;
6967 int carryover;
6968 int i;
6969
6970 if (BUFFERP (coding->src_object)
6971 && coding->src_pos > 0
6972 && coding->src_pos < GPT
6973 && coding->src_pos + coding->src_chars > GPT)
6974 move_gap_both (coding->src_pos, coding->src_pos_byte);
6975
6976 undo_list = Qt;
6977 if (BUFFERP (coding->dst_object))
6978 {
6979 set_buffer_internal (XBUFFER (coding->dst_object));
6980 if (GPT != PT)
6981 move_gap_both (PT, PT_BYTE);
6982
6983 /* We must disable undo_list in order to record the whole insert
6984 transaction via record_insert at the end. But doing so also
6985 disables the recording of the first change to the undo_list.
6986 Therefore we check for first change here and record it via
6987 record_first_change if needed. */
6988 if (MODIFF <= SAVE_MODIFF)
6989 record_first_change ();
6990
6991 undo_list = BVAR (current_buffer, undo_list);
6992 bset_undo_list (current_buffer, Qt);
6993 }
6994
6995 coding->consumed = coding->consumed_char = 0;
6996 coding->produced = coding->produced_char = 0;
6997 coding->chars_at_source = 0;
6998 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6999 coding->errors = 0;
7000
7001 ALLOC_CONVERSION_WORK_AREA (coding);
7002
7003 attrs = CODING_ID_ATTRS (coding->id);
7004 translation_table = get_translation_table (attrs, 0, NULL);
7005
7006 carryover = 0;
7007 if (coding->decoder == decode_coding_ccl)
7008 {
7009 coding->spec.ccl = &cclspec;
7010 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7011 }
7012 do
7013 {
7014 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7015
7016 coding_set_source (coding);
7017 coding->annotated = 0;
7018 coding->charbuf_used = carryover;
7019 (*(coding->decoder)) (coding);
7020 coding_set_destination (coding);
7021 carryover = produce_chars (coding, translation_table, 0);
7022 if (coding->annotated)
7023 produce_annotation (coding, pos);
7024 for (i = 0; i < carryover; i++)
7025 coding->charbuf[i]
7026 = coding->charbuf[coding->charbuf_used - carryover + i];
7027 }
7028 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7029 || (coding->consumed < coding->src_bytes
7030 && (coding->result == CODING_RESULT_SUCCESS
7031 || coding->result == CODING_RESULT_INVALID_SRC)));
7032
7033 if (carryover > 0)
7034 {
7035 coding_set_destination (coding);
7036 coding->charbuf_used = carryover;
7037 produce_chars (coding, translation_table, 1);
7038 }
7039
7040 coding->carryover_bytes = 0;
7041 if (coding->consumed < coding->src_bytes)
7042 {
7043 int nbytes = coding->src_bytes - coding->consumed;
7044 const unsigned char *src;
7045
7046 coding_set_source (coding);
7047 coding_set_destination (coding);
7048 src = coding->source + coding->consumed;
7049
7050 if (coding->mode & CODING_MODE_LAST_BLOCK)
7051 {
7052 /* Flush out unprocessed data as binary chars. We are sure
7053 that the number of data is less than the size of
7054 coding->charbuf. */
7055 coding->charbuf_used = 0;
7056 coding->chars_at_source = 0;
7057
7058 while (nbytes-- > 0)
7059 {
7060 int c = *src++;
7061
7062 if (c & 0x80)
7063 c = BYTE8_TO_CHAR (c);
7064 coding->charbuf[coding->charbuf_used++] = c;
7065 }
7066 produce_chars (coding, Qnil, 1);
7067 }
7068 else
7069 {
7070 /* Record unprocessed bytes in coding->carryover. We are
7071 sure that the number of data is less than the size of
7072 coding->carryover. */
7073 unsigned char *p = coding->carryover;
7074
7075 if (nbytes > sizeof coding->carryover)
7076 nbytes = sizeof coding->carryover;
7077 coding->carryover_bytes = nbytes;
7078 while (nbytes-- > 0)
7079 *p++ = *src++;
7080 }
7081 coding->consumed = coding->src_bytes;
7082 }
7083
7084 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7085 && !inhibit_eol_conversion)
7086 decode_eol (coding);
7087 if (BUFFERP (coding->dst_object))
7088 {
7089 bset_undo_list (current_buffer, undo_list);
7090 record_insert (coding->dst_pos, coding->produced_char);
7091 }
7092 }
7093
7094
7095 /* Extract an annotation datum from a composition starting at POS and
7096 ending before LIMIT of CODING->src_object (buffer or string), store
7097 the data in BUF, set *STOP to a starting position of the next
7098 composition (if any) or to LIMIT, and return the address of the
7099 next element of BUF.
7100
7101 If such an annotation is not found, set *STOP to a starting
7102 position of a composition after POS (if any) or to LIMIT, and
7103 return BUF. */
7104
7105 static int *
7106 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7107 struct coding_system *coding, int *buf,
7108 ptrdiff_t *stop)
7109 {
7110 ptrdiff_t start, end;
7111 Lisp_Object prop;
7112
7113 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7114 || end > limit)
7115 *stop = limit;
7116 else if (start > pos)
7117 *stop = start;
7118 else
7119 {
7120 if (start == pos)
7121 {
7122 /* We found a composition. Store the corresponding
7123 annotation data in BUF. */
7124 int *head = buf;
7125 enum composition_method method = COMPOSITION_METHOD (prop);
7126 int nchars = COMPOSITION_LENGTH (prop);
7127
7128 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7129 if (method != COMPOSITION_RELATIVE)
7130 {
7131 Lisp_Object components;
7132 ptrdiff_t i, len, i_byte;
7133
7134 components = COMPOSITION_COMPONENTS (prop);
7135 if (VECTORP (components))
7136 {
7137 len = ASIZE (components);
7138 for (i = 0; i < len; i++)
7139 *buf++ = XINT (AREF (components, i));
7140 }
7141 else if (STRINGP (components))
7142 {
7143 len = SCHARS (components);
7144 i = i_byte = 0;
7145 while (i < len)
7146 {
7147 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7148 buf++;
7149 }
7150 }
7151 else if (INTEGERP (components))
7152 {
7153 len = 1;
7154 *buf++ = XINT (components);
7155 }
7156 else if (CONSP (components))
7157 {
7158 for (len = 0; CONSP (components);
7159 len++, components = XCDR (components))
7160 *buf++ = XINT (XCAR (components));
7161 }
7162 else
7163 emacs_abort ();
7164 *head -= len;
7165 }
7166 }
7167
7168 if (find_composition (end, limit, &start, &end, &prop,
7169 coding->src_object)
7170 && end <= limit)
7171 *stop = start;
7172 else
7173 *stop = limit;
7174 }
7175 return buf;
7176 }
7177
7178
7179 /* Extract an annotation datum from a text property `charset' at POS of
7180 CODING->src_object (buffer of string), store the data in BUF, set
7181 *STOP to the position where the value of `charset' property changes
7182 (limiting by LIMIT), and return the address of the next element of
7183 BUF.
7184
7185 If the property value is nil, set *STOP to the position where the
7186 property value is non-nil (limiting by LIMIT), and return BUF. */
7187
7188 static int *
7189 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7190 struct coding_system *coding, int *buf,
7191 ptrdiff_t *stop)
7192 {
7193 Lisp_Object val, next;
7194 int id;
7195
7196 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7197 if (! NILP (val) && CHARSETP (val))
7198 id = XINT (CHARSET_SYMBOL_ID (val));
7199 else
7200 id = -1;
7201 ADD_CHARSET_DATA (buf, 0, id);
7202 next = Fnext_single_property_change (make_number (pos), Qcharset,
7203 coding->src_object,
7204 make_number (limit));
7205 *stop = XINT (next);
7206 return buf;
7207 }
7208
7209
7210 static void
7211 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7212 int max_lookup)
7213 {
7214 int *buf = coding->charbuf;
7215 int *buf_end = coding->charbuf + coding->charbuf_size;
7216 const unsigned char *src = coding->source + coding->consumed;
7217 const unsigned char *src_end = coding->source + coding->src_bytes;
7218 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7219 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7220 bool multibytep = coding->src_multibyte;
7221 Lisp_Object eol_type;
7222 int c;
7223 ptrdiff_t stop, stop_composition, stop_charset;
7224 int *lookup_buf = NULL;
7225
7226 if (! NILP (translation_table))
7227 lookup_buf = alloca (sizeof (int) * max_lookup);
7228
7229 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7230 if (VECTORP (eol_type))
7231 eol_type = Qunix;
7232
7233 /* Note: composition handling is not yet implemented. */
7234 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7235
7236 if (NILP (coding->src_object))
7237 stop = stop_composition = stop_charset = end_pos;
7238 else
7239 {
7240 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7241 stop = stop_composition = pos;
7242 else
7243 stop = stop_composition = end_pos;
7244 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7245 stop = stop_charset = pos;
7246 else
7247 stop_charset = end_pos;
7248 }
7249
7250 /* Compensate for CRLF and conversion. */
7251 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7252 while (buf < buf_end)
7253 {
7254 Lisp_Object trans;
7255
7256 if (pos == stop)
7257 {
7258 if (pos == end_pos)
7259 break;
7260 if (pos == stop_composition)
7261 buf = handle_composition_annotation (pos, end_pos, coding,
7262 buf, &stop_composition);
7263 if (pos == stop_charset)
7264 buf = handle_charset_annotation (pos, end_pos, coding,
7265 buf, &stop_charset);
7266 stop = (stop_composition < stop_charset
7267 ? stop_composition : stop_charset);
7268 }
7269
7270 if (! multibytep)
7271 {
7272 int bytes;
7273
7274 if (coding->encoder == encode_coding_raw_text
7275 || coding->encoder == encode_coding_ccl)
7276 c = *src++, pos++;
7277 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7278 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7279 else
7280 c = BYTE8_TO_CHAR (*src), src++, pos++;
7281 }
7282 else
7283 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7284 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7285 c = '\n';
7286 if (! EQ (eol_type, Qunix))
7287 {
7288 if (c == '\n')
7289 {
7290 if (EQ (eol_type, Qdos))
7291 *buf++ = '\r';
7292 else
7293 c = '\r';
7294 }
7295 }
7296
7297 trans = Qnil;
7298 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7299 if (NILP (trans))
7300 *buf++ = c;
7301 else
7302 {
7303 ptrdiff_t from_nchars = 1, to_nchars = 1;
7304 int *lookup_buf_end;
7305 const unsigned char *p = src;
7306 int i;
7307
7308 lookup_buf[0] = c;
7309 for (i = 1; i < max_lookup && p < src_end; i++)
7310 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7311 lookup_buf_end = lookup_buf + i;
7312 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7313 if (INTEGERP (trans))
7314 c = XINT (trans);
7315 else if (CONSP (trans))
7316 {
7317 from_nchars = ASIZE (XCAR (trans));
7318 trans = XCDR (trans);
7319 if (INTEGERP (trans))
7320 c = XINT (trans);
7321 else
7322 {
7323 to_nchars = ASIZE (trans);
7324 if (buf_end - buf < to_nchars)
7325 break;
7326 c = XINT (AREF (trans, 0));
7327 }
7328 }
7329 else
7330 break;
7331 *buf++ = c;
7332 for (i = 1; i < to_nchars; i++)
7333 *buf++ = XINT (AREF (trans, i));
7334 for (i = 1; i < from_nchars; i++, pos++)
7335 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7336 }
7337 }
7338
7339 coding->consumed = src - coding->source;
7340 coding->consumed_char = pos - coding->src_pos;
7341 coding->charbuf_used = buf - coding->charbuf;
7342 coding->chars_at_source = 0;
7343 }
7344
7345
7346 /* Encode the text at CODING->src_object into CODING->dst_object.
7347 CODING->src_object is a buffer or a string.
7348 CODING->dst_object is a buffer or nil.
7349
7350 If CODING->src_object is a buffer, it must be the current buffer.
7351 In this case, if CODING->src_pos is positive, it is a position of
7352 the source text in the buffer, otherwise. the source text is in the
7353 gap area of the buffer, and coding->src_pos specifies the offset of
7354 the text from GPT (which must be the same as PT). If this is the
7355 same buffer as CODING->dst_object, CODING->src_pos must be
7356 negative and CODING should not have `pre-write-conversion'.
7357
7358 If CODING->src_object is a string, CODING should not have
7359 `pre-write-conversion'.
7360
7361 If CODING->dst_object is a buffer, the encoded data is inserted at
7362 the current point of that buffer.
7363
7364 If CODING->dst_object is nil, the encoded data is placed at the
7365 memory area specified by CODING->destination. */
7366
7367 static void
7368 encode_coding (struct coding_system *coding)
7369 {
7370 Lisp_Object attrs;
7371 Lisp_Object translation_table;
7372 int max_lookup;
7373 struct ccl_spec cclspec;
7374
7375 attrs = CODING_ID_ATTRS (coding->id);
7376 if (coding->encoder == encode_coding_raw_text)
7377 translation_table = Qnil, max_lookup = 0;
7378 else
7379 translation_table = get_translation_table (attrs, 1, &max_lookup);
7380
7381 if (BUFFERP (coding->dst_object))
7382 {
7383 set_buffer_internal (XBUFFER (coding->dst_object));
7384 coding->dst_multibyte
7385 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7386 }
7387
7388 coding->consumed = coding->consumed_char = 0;
7389 coding->produced = coding->produced_char = 0;
7390 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7391 coding->errors = 0;
7392
7393 ALLOC_CONVERSION_WORK_AREA (coding);
7394
7395 if (coding->encoder == encode_coding_ccl)
7396 {
7397 coding->spec.ccl = &cclspec;
7398 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7399 }
7400 do {
7401 coding_set_source (coding);
7402 consume_chars (coding, translation_table, max_lookup);
7403 coding_set_destination (coding);
7404 (*(coding->encoder)) (coding);
7405 } while (coding->consumed_char < coding->src_chars);
7406
7407 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7408 insert_from_gap (coding->produced_char, coding->produced);
7409 }
7410
7411
7412 /* Name (or base name) of work buffer for code conversion. */
7413 static Lisp_Object Vcode_conversion_workbuf_name;
7414
7415 /* A working buffer used by the top level conversion. Once it is
7416 created, it is never destroyed. It has the name
7417 Vcode_conversion_workbuf_name. The other working buffers are
7418 destroyed after the use is finished, and their names are modified
7419 versions of Vcode_conversion_workbuf_name. */
7420 static Lisp_Object Vcode_conversion_reused_workbuf;
7421
7422 /* True iff Vcode_conversion_reused_workbuf is already in use. */
7423 static bool reused_workbuf_in_use;
7424
7425
7426 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7427 multibyteness of returning buffer. */
7428
7429 static Lisp_Object
7430 make_conversion_work_buffer (bool multibyte)
7431 {
7432 Lisp_Object name, workbuf;
7433 struct buffer *current;
7434
7435 if (reused_workbuf_in_use)
7436 {
7437 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7438 workbuf = Fget_buffer_create (name);
7439 }
7440 else
7441 {
7442 reused_workbuf_in_use = 1;
7443 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7444 Vcode_conversion_reused_workbuf
7445 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7446 workbuf = Vcode_conversion_reused_workbuf;
7447 }
7448 current = current_buffer;
7449 set_buffer_internal (XBUFFER (workbuf));
7450 /* We can't allow modification hooks to run in the work buffer. For
7451 instance, directory_files_internal assumes that file decoding
7452 doesn't compile new regexps. */
7453 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7454 Ferase_buffer ();
7455 bset_undo_list (current_buffer, Qt);
7456 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7457 set_buffer_internal (current);
7458 return workbuf;
7459 }
7460
7461
7462 static Lisp_Object
7463 code_conversion_restore (Lisp_Object arg)
7464 {
7465 Lisp_Object current, workbuf;
7466 struct gcpro gcpro1;
7467
7468 GCPRO1 (arg);
7469 current = XCAR (arg);
7470 workbuf = XCDR (arg);
7471 if (! NILP (workbuf))
7472 {
7473 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7474 reused_workbuf_in_use = 0;
7475 else
7476 Fkill_buffer (workbuf);
7477 }
7478 set_buffer_internal (XBUFFER (current));
7479 UNGCPRO;
7480 return Qnil;
7481 }
7482
7483 Lisp_Object
7484 code_conversion_save (bool with_work_buf, bool multibyte)
7485 {
7486 Lisp_Object workbuf = Qnil;
7487
7488 if (with_work_buf)
7489 workbuf = make_conversion_work_buffer (multibyte);
7490 record_unwind_protect (code_conversion_restore,
7491 Fcons (Fcurrent_buffer (), workbuf));
7492 return workbuf;
7493 }
7494
7495 void
7496 decode_coding_gap (struct coding_system *coding,
7497 ptrdiff_t chars, ptrdiff_t bytes)
7498 {
7499 ptrdiff_t count = SPECPDL_INDEX ();
7500 Lisp_Object attrs;
7501
7502 code_conversion_save (0, 0);
7503
7504 coding->src_object = Fcurrent_buffer ();
7505 coding->src_chars = chars;
7506 coding->src_bytes = bytes;
7507 coding->src_pos = -chars;
7508 coding->src_pos_byte = -bytes;
7509 coding->src_multibyte = chars < bytes;
7510 coding->dst_object = coding->src_object;
7511 coding->dst_pos = PT;
7512 coding->dst_pos_byte = PT_BYTE;
7513 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7514
7515 if (CODING_REQUIRE_DETECTION (coding))
7516 detect_coding (coding);
7517
7518 coding->mode |= CODING_MODE_LAST_BLOCK;
7519 current_buffer->text->inhibit_shrinking = 1;
7520 decode_coding (coding);
7521 current_buffer->text->inhibit_shrinking = 0;
7522
7523 attrs = CODING_ID_ATTRS (coding->id);
7524 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7525 {
7526 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7527 Lisp_Object val;
7528
7529 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7530 val = call1 (CODING_ATTR_POST_READ (attrs),
7531 make_number (coding->produced_char));
7532 CHECK_NATNUM (val);
7533 coding->produced_char += Z - prev_Z;
7534 coding->produced += Z_BYTE - prev_Z_BYTE;
7535 }
7536
7537 unbind_to (count, Qnil);
7538 }
7539
7540
7541 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7542 SRC_OBJECT into DST_OBJECT by coding context CODING.
7543
7544 SRC_OBJECT is a buffer, a string, or Qnil.
7545
7546 If it is a buffer, the text is at point of the buffer. FROM and TO
7547 are positions in the buffer.
7548
7549 If it is a string, the text is at the beginning of the string.
7550 FROM and TO are indices to the string.
7551
7552 If it is nil, the text is at coding->source. FROM and TO are
7553 indices to coding->source.
7554
7555 DST_OBJECT is a buffer, Qt, or Qnil.
7556
7557 If it is a buffer, the decoded text is inserted at point of the
7558 buffer. If the buffer is the same as SRC_OBJECT, the source text
7559 is deleted.
7560
7561 If it is Qt, a string is made from the decoded text, and
7562 set in CODING->dst_object.
7563
7564 If it is Qnil, the decoded text is stored at CODING->destination.
7565 The caller must allocate CODING->dst_bytes bytes at
7566 CODING->destination by xmalloc. If the decoded text is longer than
7567 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7568 */
7569
7570 void
7571 decode_coding_object (struct coding_system *coding,
7572 Lisp_Object src_object,
7573 ptrdiff_t from, ptrdiff_t from_byte,
7574 ptrdiff_t to, ptrdiff_t to_byte,
7575 Lisp_Object dst_object)
7576 {
7577 ptrdiff_t count = SPECPDL_INDEX ();
7578 unsigned char *destination IF_LINT (= NULL);
7579 ptrdiff_t dst_bytes IF_LINT (= 0);
7580 ptrdiff_t chars = to - from;
7581 ptrdiff_t bytes = to_byte - from_byte;
7582 Lisp_Object attrs;
7583 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7584 bool need_marker_adjustment = 0;
7585 Lisp_Object old_deactivate_mark;
7586
7587 old_deactivate_mark = Vdeactivate_mark;
7588
7589 if (NILP (dst_object))
7590 {
7591 destination = coding->destination;
7592 dst_bytes = coding->dst_bytes;
7593 }
7594
7595 coding->src_object = src_object;
7596 coding->src_chars = chars;
7597 coding->src_bytes = bytes;
7598 coding->src_multibyte = chars < bytes;
7599
7600 if (STRINGP (src_object))
7601 {
7602 coding->src_pos = from;
7603 coding->src_pos_byte = from_byte;
7604 }
7605 else if (BUFFERP (src_object))
7606 {
7607 set_buffer_internal (XBUFFER (src_object));
7608 if (from != GPT)
7609 move_gap_both (from, from_byte);
7610 if (EQ (src_object, dst_object))
7611 {
7612 struct Lisp_Marker *tail;
7613
7614 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7615 {
7616 tail->need_adjustment
7617 = tail->charpos == (tail->insertion_type ? from : to);
7618 need_marker_adjustment |= tail->need_adjustment;
7619 }
7620 saved_pt = PT, saved_pt_byte = PT_BYTE;
7621 TEMP_SET_PT_BOTH (from, from_byte);
7622 current_buffer->text->inhibit_shrinking = 1;
7623 del_range_both (from, from_byte, to, to_byte, 1);
7624 coding->src_pos = -chars;
7625 coding->src_pos_byte = -bytes;
7626 }
7627 else
7628 {
7629 coding->src_pos = from;
7630 coding->src_pos_byte = from_byte;
7631 }
7632 }
7633
7634 if (CODING_REQUIRE_DETECTION (coding))
7635 detect_coding (coding);
7636 attrs = CODING_ID_ATTRS (coding->id);
7637
7638 if (EQ (dst_object, Qt)
7639 || (! NILP (CODING_ATTR_POST_READ (attrs))
7640 && NILP (dst_object)))
7641 {
7642 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7643 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7644 coding->dst_pos = BEG;
7645 coding->dst_pos_byte = BEG_BYTE;
7646 }
7647 else if (BUFFERP (dst_object))
7648 {
7649 code_conversion_save (0, 0);
7650 coding->dst_object = dst_object;
7651 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7652 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7653 coding->dst_multibyte
7654 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7655 }
7656 else
7657 {
7658 code_conversion_save (0, 0);
7659 coding->dst_object = Qnil;
7660 /* Most callers presume this will return a multibyte result, and they
7661 won't use `binary' or `raw-text' anyway, so let's not worry about
7662 CODING_FOR_UNIBYTE. */
7663 coding->dst_multibyte = 1;
7664 }
7665
7666 decode_coding (coding);
7667
7668 if (BUFFERP (coding->dst_object))
7669 set_buffer_internal (XBUFFER (coding->dst_object));
7670
7671 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7672 {
7673 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7674 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7675 Lisp_Object val;
7676
7677 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7678 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7679 old_deactivate_mark);
7680 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7681 make_number (coding->produced_char));
7682 UNGCPRO;
7683 CHECK_NATNUM (val);
7684 coding->produced_char += Z - prev_Z;
7685 coding->produced += Z_BYTE - prev_Z_BYTE;
7686 }
7687
7688 if (EQ (dst_object, Qt))
7689 {
7690 coding->dst_object = Fbuffer_string ();
7691 }
7692 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7693 {
7694 set_buffer_internal (XBUFFER (coding->dst_object));
7695 if (dst_bytes < coding->produced)
7696 {
7697 destination = xrealloc (destination, coding->produced);
7698 if (! destination)
7699 {
7700 record_conversion_result (coding,
7701 CODING_RESULT_INSUFFICIENT_MEM);
7702 unbind_to (count, Qnil);
7703 return;
7704 }
7705 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7706 move_gap_both (BEGV, BEGV_BYTE);
7707 memcpy (destination, BEGV_ADDR, coding->produced);
7708 coding->destination = destination;
7709 }
7710 }
7711
7712 if (saved_pt >= 0)
7713 {
7714 /* This is the case of:
7715 (BUFFERP (src_object) && EQ (src_object, dst_object))
7716 As we have moved PT while replacing the original buffer
7717 contents, we must recover it now. */
7718 set_buffer_internal (XBUFFER (src_object));
7719 current_buffer->text->inhibit_shrinking = 0;
7720 if (saved_pt < from)
7721 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7722 else if (saved_pt < from + chars)
7723 TEMP_SET_PT_BOTH (from, from_byte);
7724 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7725 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7726 saved_pt_byte + (coding->produced - bytes));
7727 else
7728 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7729 saved_pt_byte + (coding->produced - bytes));
7730
7731 if (need_marker_adjustment)
7732 {
7733 struct Lisp_Marker *tail;
7734
7735 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7736 if (tail->need_adjustment)
7737 {
7738 tail->need_adjustment = 0;
7739 if (tail->insertion_type)
7740 {
7741 tail->bytepos = from_byte;
7742 tail->charpos = from;
7743 }
7744 else
7745 {
7746 tail->bytepos = from_byte + coding->produced;
7747 tail->charpos
7748 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7749 ? tail->bytepos : from + coding->produced_char);
7750 }
7751 }
7752 }
7753 }
7754
7755 Vdeactivate_mark = old_deactivate_mark;
7756 unbind_to (count, coding->dst_object);
7757 }
7758
7759
7760 void
7761 encode_coding_object (struct coding_system *coding,
7762 Lisp_Object src_object,
7763 ptrdiff_t from, ptrdiff_t from_byte,
7764 ptrdiff_t to, ptrdiff_t to_byte,
7765 Lisp_Object dst_object)
7766 {
7767 ptrdiff_t count = SPECPDL_INDEX ();
7768 ptrdiff_t chars = to - from;
7769 ptrdiff_t bytes = to_byte - from_byte;
7770 Lisp_Object attrs;
7771 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7772 bool need_marker_adjustment = 0;
7773 bool kill_src_buffer = 0;
7774 Lisp_Object old_deactivate_mark;
7775
7776 old_deactivate_mark = Vdeactivate_mark;
7777
7778 coding->src_object = src_object;
7779 coding->src_chars = chars;
7780 coding->src_bytes = bytes;
7781 coding->src_multibyte = chars < bytes;
7782
7783 attrs = CODING_ID_ATTRS (coding->id);
7784
7785 if (EQ (src_object, dst_object))
7786 {
7787 struct Lisp_Marker *tail;
7788
7789 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7790 {
7791 tail->need_adjustment
7792 = tail->charpos == (tail->insertion_type ? from : to);
7793 need_marker_adjustment |= tail->need_adjustment;
7794 }
7795 }
7796
7797 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7798 {
7799 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7800 set_buffer_internal (XBUFFER (coding->src_object));
7801 if (STRINGP (src_object))
7802 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7803 else if (BUFFERP (src_object))
7804 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7805 else
7806 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7807
7808 if (EQ (src_object, dst_object))
7809 {
7810 set_buffer_internal (XBUFFER (src_object));
7811 saved_pt = PT, saved_pt_byte = PT_BYTE;
7812 del_range_both (from, from_byte, to, to_byte, 1);
7813 set_buffer_internal (XBUFFER (coding->src_object));
7814 }
7815
7816 {
7817 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7818
7819 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7820 old_deactivate_mark);
7821 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7822 make_number (BEG), make_number (Z));
7823 UNGCPRO;
7824 }
7825 if (XBUFFER (coding->src_object) != current_buffer)
7826 kill_src_buffer = 1;
7827 coding->src_object = Fcurrent_buffer ();
7828 if (BEG != GPT)
7829 move_gap_both (BEG, BEG_BYTE);
7830 coding->src_chars = Z - BEG;
7831 coding->src_bytes = Z_BYTE - BEG_BYTE;
7832 coding->src_pos = BEG;
7833 coding->src_pos_byte = BEG_BYTE;
7834 coding->src_multibyte = Z < Z_BYTE;
7835 }
7836 else if (STRINGP (src_object))
7837 {
7838 code_conversion_save (0, 0);
7839 coding->src_pos = from;
7840 coding->src_pos_byte = from_byte;
7841 }
7842 else if (BUFFERP (src_object))
7843 {
7844 code_conversion_save (0, 0);
7845 set_buffer_internal (XBUFFER (src_object));
7846 if (EQ (src_object, dst_object))
7847 {
7848 saved_pt = PT, saved_pt_byte = PT_BYTE;
7849 coding->src_object = del_range_1 (from, to, 1, 1);
7850 coding->src_pos = 0;
7851 coding->src_pos_byte = 0;
7852 }
7853 else
7854 {
7855 if (from < GPT && to >= GPT)
7856 move_gap_both (from, from_byte);
7857 coding->src_pos = from;
7858 coding->src_pos_byte = from_byte;
7859 }
7860 }
7861 else
7862 code_conversion_save (0, 0);
7863
7864 if (BUFFERP (dst_object))
7865 {
7866 coding->dst_object = dst_object;
7867 if (EQ (src_object, dst_object))
7868 {
7869 coding->dst_pos = from;
7870 coding->dst_pos_byte = from_byte;
7871 }
7872 else
7873 {
7874 struct buffer *current = current_buffer;
7875
7876 set_buffer_temp (XBUFFER (dst_object));
7877 coding->dst_pos = PT;
7878 coding->dst_pos_byte = PT_BYTE;
7879 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7880 set_buffer_temp (current);
7881 }
7882 coding->dst_multibyte
7883 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7884 }
7885 else if (EQ (dst_object, Qt))
7886 {
7887 ptrdiff_t dst_bytes = max (1, coding->src_chars);
7888 coding->dst_object = Qnil;
7889 coding->destination = xmalloc (dst_bytes);
7890 coding->dst_bytes = dst_bytes;
7891 coding->dst_multibyte = 0;
7892 }
7893 else
7894 {
7895 coding->dst_object = Qnil;
7896 coding->dst_multibyte = 0;
7897 }
7898
7899 encode_coding (coding);
7900
7901 if (EQ (dst_object, Qt))
7902 {
7903 if (BUFFERP (coding->dst_object))
7904 coding->dst_object = Fbuffer_string ();
7905 else
7906 {
7907 coding->dst_object
7908 = make_unibyte_string ((char *) coding->destination,
7909 coding->produced);
7910 xfree (coding->destination);
7911 }
7912 }
7913
7914 if (saved_pt >= 0)
7915 {
7916 /* This is the case of:
7917 (BUFFERP (src_object) && EQ (src_object, dst_object))
7918 As we have moved PT while replacing the original buffer
7919 contents, we must recover it now. */
7920 set_buffer_internal (XBUFFER (src_object));
7921 if (saved_pt < from)
7922 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7923 else if (saved_pt < from + chars)
7924 TEMP_SET_PT_BOTH (from, from_byte);
7925 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7926 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7927 saved_pt_byte + (coding->produced - bytes));
7928 else
7929 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7930 saved_pt_byte + (coding->produced - bytes));
7931
7932 if (need_marker_adjustment)
7933 {
7934 struct Lisp_Marker *tail;
7935
7936 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7937 if (tail->need_adjustment)
7938 {
7939 tail->need_adjustment = 0;
7940 if (tail->insertion_type)
7941 {
7942 tail->bytepos = from_byte;
7943 tail->charpos = from;
7944 }
7945 else
7946 {
7947 tail->bytepos = from_byte + coding->produced;
7948 tail->charpos
7949 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7950 ? tail->bytepos : from + coding->produced_char);
7951 }
7952 }
7953 }
7954 }
7955
7956 if (kill_src_buffer)
7957 Fkill_buffer (coding->src_object);
7958
7959 Vdeactivate_mark = old_deactivate_mark;
7960 unbind_to (count, Qnil);
7961 }
7962
7963
7964 Lisp_Object
7965 preferred_coding_system (void)
7966 {
7967 int id = coding_categories[coding_priorities[0]].id;
7968
7969 return CODING_ID_NAME (id);
7970 }
7971
7972 #if defined (WINDOWSNT) || defined (CYGWIN)
7973
7974 Lisp_Object
7975 from_unicode (Lisp_Object str)
7976 {
7977 CHECK_STRING (str);
7978 if (!STRING_MULTIBYTE (str) &&
7979 SBYTES (str) & 1)
7980 {
7981 str = Fsubstring (str, make_number (0), make_number (-1));
7982 }
7983
7984 return code_convert_string_norecord (str, Qutf_16le, 0);
7985 }
7986
7987 wchar_t *
7988 to_unicode (Lisp_Object str, Lisp_Object *buf)
7989 {
7990 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7991 /* We need to make a another copy (in addition to the one made by
7992 code_convert_string_norecord) to ensure that the final string is
7993 _doubly_ zero terminated --- that is, that the string is
7994 terminated by two zero bytes and one utf-16le null character.
7995 Because strings are already terminated with a single zero byte,
7996 we just add one additional zero. */
7997 str = make_uninit_string (SBYTES (*buf) + 1);
7998 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
7999 SDATA (str) [SBYTES (*buf)] = '\0';
8000 *buf = str;
8001 return WCSDATA (*buf);
8002 }
8003
8004 #endif /* WINDOWSNT || CYGWIN */
8005
8006 \f
8007 #ifdef emacs
8008 /*** 8. Emacs Lisp library functions ***/
8009
8010 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8011 doc: /* Return t if OBJECT is nil or a coding-system.
8012 See the documentation of `define-coding-system' for information
8013 about coding-system objects. */)
8014 (Lisp_Object object)
8015 {
8016 if (NILP (object)
8017 || CODING_SYSTEM_ID (object) >= 0)
8018 return Qt;
8019 if (! SYMBOLP (object)
8020 || NILP (Fget (object, Qcoding_system_define_form)))
8021 return Qnil;
8022 return Qt;
8023 }
8024
8025 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8026 Sread_non_nil_coding_system, 1, 1, 0,
8027 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8028 (Lisp_Object prompt)
8029 {
8030 Lisp_Object val;
8031 do
8032 {
8033 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8034 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8035 }
8036 while (SCHARS (val) == 0);
8037 return (Fintern (val, Qnil));
8038 }
8039
8040 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8041 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8042 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8043 Ignores case when completing coding systems (all Emacs coding systems
8044 are lower-case). */)
8045 (Lisp_Object prompt, Lisp_Object default_coding_system)
8046 {
8047 Lisp_Object val;
8048 ptrdiff_t count = SPECPDL_INDEX ();
8049
8050 if (SYMBOLP (default_coding_system))
8051 default_coding_system = SYMBOL_NAME (default_coding_system);
8052 specbind (Qcompletion_ignore_case, Qt);
8053 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8054 Qt, Qnil, Qcoding_system_history,
8055 default_coding_system, Qnil);
8056 unbind_to (count, Qnil);
8057 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8058 }
8059
8060 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8061 1, 1, 0,
8062 doc: /* Check validity of CODING-SYSTEM.
8063 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8064 It is valid if it is nil or a symbol defined as a coding system by the
8065 function `define-coding-system'. */)
8066 (Lisp_Object coding_system)
8067 {
8068 Lisp_Object define_form;
8069
8070 define_form = Fget (coding_system, Qcoding_system_define_form);
8071 if (! NILP (define_form))
8072 {
8073 Fput (coding_system, Qcoding_system_define_form, Qnil);
8074 safe_eval (define_form);
8075 }
8076 if (!NILP (Fcoding_system_p (coding_system)))
8077 return coding_system;
8078 xsignal1 (Qcoding_system_error, coding_system);
8079 }
8080
8081 \f
8082 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8083 HIGHEST, return the coding system of the highest
8084 priority among the detected coding systems. Otherwise return a
8085 list of detected coding systems sorted by their priorities. If
8086 MULTIBYTEP, it is assumed that the bytes are in correct
8087 multibyte form but contains only ASCII and eight-bit chars.
8088 Otherwise, the bytes are raw bytes.
8089
8090 CODING-SYSTEM controls the detection as below:
8091
8092 If it is nil, detect both text-format and eol-format. If the
8093 text-format part of CODING-SYSTEM is already specified
8094 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8095 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8096 detect only text-format. */
8097
8098 Lisp_Object
8099 detect_coding_system (const unsigned char *src,
8100 ptrdiff_t src_chars, ptrdiff_t src_bytes,
8101 bool highest, bool multibytep,
8102 Lisp_Object coding_system)
8103 {
8104 const unsigned char *src_end = src + src_bytes;
8105 Lisp_Object attrs, eol_type;
8106 Lisp_Object val = Qnil;
8107 struct coding_system coding;
8108 ptrdiff_t id;
8109 struct coding_detection_info detect_info;
8110 enum coding_category base_category;
8111 bool null_byte_found = 0, eight_bit_found = 0;
8112
8113 if (NILP (coding_system))
8114 coding_system = Qundecided;
8115 setup_coding_system (coding_system, &coding);
8116 attrs = CODING_ID_ATTRS (coding.id);
8117 eol_type = CODING_ID_EOL_TYPE (coding.id);
8118 coding_system = CODING_ATTR_BASE_NAME (attrs);
8119
8120 coding.source = src;
8121 coding.src_chars = src_chars;
8122 coding.src_bytes = src_bytes;
8123 coding.src_multibyte = multibytep;
8124 coding.consumed = 0;
8125 coding.mode |= CODING_MODE_LAST_BLOCK;
8126 coding.head_ascii = 0;
8127
8128 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8129
8130 /* At first, detect text-format if necessary. */
8131 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8132 if (base_category == coding_category_undecided)
8133 {
8134 enum coding_category category IF_LINT (= 0);
8135 struct coding_system *this IF_LINT (= NULL);
8136 int c, i;
8137
8138 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8139 for (; src < src_end; src++)
8140 {
8141 c = *src;
8142 if (c & 0x80)
8143 {
8144 eight_bit_found = 1;
8145 if (null_byte_found)
8146 break;
8147 }
8148 else if (c < 0x20)
8149 {
8150 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8151 && ! inhibit_iso_escape_detection
8152 && ! detect_info.checked)
8153 {
8154 if (detect_coding_iso_2022 (&coding, &detect_info))
8155 {
8156 /* We have scanned the whole data. */
8157 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8158 {
8159 /* We didn't find an 8-bit code. We may
8160 have found a null-byte, but it's very
8161 rare that a binary file confirm to
8162 ISO-2022. */
8163 src = src_end;
8164 coding.head_ascii = src - coding.source;
8165 }
8166 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8167 break;
8168 }
8169 }
8170 else if (! c && !inhibit_null_byte_detection)
8171 {
8172 null_byte_found = 1;
8173 if (eight_bit_found)
8174 break;
8175 }
8176 if (! eight_bit_found)
8177 coding.head_ascii++;
8178 }
8179 else if (! eight_bit_found)
8180 coding.head_ascii++;
8181 }
8182
8183 if (null_byte_found || eight_bit_found
8184 || coding.head_ascii < coding.src_bytes
8185 || detect_info.found)
8186 {
8187 if (coding.head_ascii == coding.src_bytes)
8188 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8189 for (i = 0; i < coding_category_raw_text; i++)
8190 {
8191 category = coding_priorities[i];
8192 this = coding_categories + category;
8193 if (detect_info.found & (1 << category))
8194 break;
8195 }
8196 else
8197 {
8198 if (null_byte_found)
8199 {
8200 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8201 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8202 }
8203 for (i = 0; i < coding_category_raw_text; i++)
8204 {
8205 category = coding_priorities[i];
8206 this = coding_categories + category;
8207
8208 if (this->id < 0)
8209 {
8210 /* No coding system of this category is defined. */
8211 detect_info.rejected |= (1 << category);
8212 }
8213 else if (category >= coding_category_raw_text)
8214 continue;
8215 else if (detect_info.checked & (1 << category))
8216 {
8217 if (highest
8218 && (detect_info.found & (1 << category)))
8219 break;
8220 }
8221 else if ((*(this->detector)) (&coding, &detect_info)
8222 && highest
8223 && (detect_info.found & (1 << category)))
8224 {
8225 if (category == coding_category_utf_16_auto)
8226 {
8227 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8228 category = coding_category_utf_16_le;
8229 else
8230 category = coding_category_utf_16_be;
8231 }
8232 break;
8233 }
8234 }
8235 }
8236 }
8237
8238 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8239 || null_byte_found)
8240 {
8241 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8242 id = CODING_SYSTEM_ID (Qno_conversion);
8243 val = Fcons (make_number (id), Qnil);
8244 }
8245 else if (! detect_info.rejected && ! detect_info.found)
8246 {
8247 detect_info.found = CATEGORY_MASK_ANY;
8248 id = coding_categories[coding_category_undecided].id;
8249 val = Fcons (make_number (id), Qnil);
8250 }
8251 else if (highest)
8252 {
8253 if (detect_info.found)
8254 {
8255 detect_info.found = 1 << category;
8256 val = Fcons (make_number (this->id), Qnil);
8257 }
8258 else
8259 for (i = 0; i < coding_category_raw_text; i++)
8260 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8261 {
8262 detect_info.found = 1 << coding_priorities[i];
8263 id = coding_categories[coding_priorities[i]].id;
8264 val = Fcons (make_number (id), Qnil);
8265 break;
8266 }
8267 }
8268 else
8269 {
8270 int mask = detect_info.rejected | detect_info.found;
8271 int found = 0;
8272
8273 for (i = coding_category_raw_text - 1; i >= 0; i--)
8274 {
8275 category = coding_priorities[i];
8276 if (! (mask & (1 << category)))
8277 {
8278 found |= 1 << category;
8279 id = coding_categories[category].id;
8280 if (id >= 0)
8281 val = Fcons (make_number (id), val);
8282 }
8283 }
8284 for (i = coding_category_raw_text - 1; i >= 0; i--)
8285 {
8286 category = coding_priorities[i];
8287 if (detect_info.found & (1 << category))
8288 {
8289 id = coding_categories[category].id;
8290 val = Fcons (make_number (id), val);
8291 }
8292 }
8293 detect_info.found |= found;
8294 }
8295 }
8296 else if (base_category == coding_category_utf_8_auto)
8297 {
8298 if (detect_coding_utf_8 (&coding, &detect_info))
8299 {
8300 struct coding_system *this;
8301
8302 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8303 this = coding_categories + coding_category_utf_8_sig;
8304 else
8305 this = coding_categories + coding_category_utf_8_nosig;
8306 val = Fcons (make_number (this->id), Qnil);
8307 }
8308 }
8309 else if (base_category == coding_category_utf_16_auto)
8310 {
8311 if (detect_coding_utf_16 (&coding, &detect_info))
8312 {
8313 struct coding_system *this;
8314
8315 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8316 this = coding_categories + coding_category_utf_16_le;
8317 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8318 this = coding_categories + coding_category_utf_16_be;
8319 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8320 this = coding_categories + coding_category_utf_16_be_nosig;
8321 else
8322 this = coding_categories + coding_category_utf_16_le_nosig;
8323 val = Fcons (make_number (this->id), Qnil);
8324 }
8325 }
8326 else
8327 {
8328 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8329 val = Fcons (make_number (coding.id), Qnil);
8330 }
8331
8332 /* Then, detect eol-format if necessary. */
8333 {
8334 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8335 Lisp_Object tail;
8336
8337 if (VECTORP (eol_type))
8338 {
8339 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8340 {
8341 if (null_byte_found)
8342 normal_eol = EOL_SEEN_LF;
8343 else
8344 normal_eol = detect_eol (coding.source, src_bytes,
8345 coding_category_raw_text);
8346 }
8347 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8348 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8349 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8350 coding_category_utf_16_be);
8351 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8352 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8353 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8354 coding_category_utf_16_le);
8355 }
8356 else
8357 {
8358 if (EQ (eol_type, Qunix))
8359 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8360 else if (EQ (eol_type, Qdos))
8361 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8362 else
8363 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8364 }
8365
8366 for (tail = val; CONSP (tail); tail = XCDR (tail))
8367 {
8368 enum coding_category category;
8369 int this_eol;
8370
8371 id = XINT (XCAR (tail));
8372 attrs = CODING_ID_ATTRS (id);
8373 category = XINT (CODING_ATTR_CATEGORY (attrs));
8374 eol_type = CODING_ID_EOL_TYPE (id);
8375 if (VECTORP (eol_type))
8376 {
8377 if (category == coding_category_utf_16_be
8378 || category == coding_category_utf_16_be_nosig)
8379 this_eol = utf_16_be_eol;
8380 else if (category == coding_category_utf_16_le
8381 || category == coding_category_utf_16_le_nosig)
8382 this_eol = utf_16_le_eol;
8383 else
8384 this_eol = normal_eol;
8385
8386 if (this_eol == EOL_SEEN_LF)
8387 XSETCAR (tail, AREF (eol_type, 0));
8388 else if (this_eol == EOL_SEEN_CRLF)
8389 XSETCAR (tail, AREF (eol_type, 1));
8390 else if (this_eol == EOL_SEEN_CR)
8391 XSETCAR (tail, AREF (eol_type, 2));
8392 else
8393 XSETCAR (tail, CODING_ID_NAME (id));
8394 }
8395 else
8396 XSETCAR (tail, CODING_ID_NAME (id));
8397 }
8398 }
8399
8400 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8401 }
8402
8403
8404 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8405 2, 3, 0,
8406 doc: /* Detect coding system of the text in the region between START and END.
8407 Return a list of possible coding systems ordered by priority.
8408 The coding systems to try and their priorities follows what
8409 the function `coding-system-priority-list' (which see) returns.
8410
8411 If only ASCII characters are found (except for such ISO-2022 control
8412 characters as ESC), it returns a list of single element `undecided'
8413 or its subsidiary coding system according to a detected end-of-line
8414 format.
8415
8416 If optional argument HIGHEST is non-nil, return the coding system of
8417 highest priority. */)
8418 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8419 {
8420 ptrdiff_t from, to;
8421 ptrdiff_t from_byte, to_byte;
8422
8423 validate_region (&start, &end);
8424 from = XINT (start), to = XINT (end);
8425 from_byte = CHAR_TO_BYTE (from);
8426 to_byte = CHAR_TO_BYTE (to);
8427
8428 if (from < GPT && to >= GPT)
8429 move_gap_both (to, to_byte);
8430
8431 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8432 to - from, to_byte - from_byte,
8433 !NILP (highest),
8434 !NILP (BVAR (current_buffer
8435 , enable_multibyte_characters)),
8436 Qnil);
8437 }
8438
8439 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8440 1, 2, 0,
8441 doc: /* Detect coding system of the text in STRING.
8442 Return a list of possible coding systems ordered by priority.
8443 The coding systems to try and their priorities follows what
8444 the function `coding-system-priority-list' (which see) returns.
8445
8446 If only ASCII characters are found (except for such ISO-2022 control
8447 characters as ESC), it returns a list of single element `undecided'
8448 or its subsidiary coding system according to a detected end-of-line
8449 format.
8450
8451 If optional argument HIGHEST is non-nil, return the coding system of
8452 highest priority. */)
8453 (Lisp_Object string, Lisp_Object highest)
8454 {
8455 CHECK_STRING (string);
8456
8457 return detect_coding_system (SDATA (string),
8458 SCHARS (string), SBYTES (string),
8459 !NILP (highest), STRING_MULTIBYTE (string),
8460 Qnil);
8461 }
8462
8463
8464 static bool
8465 char_encodable_p (int c, Lisp_Object attrs)
8466 {
8467 Lisp_Object tail;
8468 struct charset *charset;
8469 Lisp_Object translation_table;
8470
8471 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8472 if (! NILP (translation_table))
8473 c = translate_char (translation_table, c);
8474 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8475 CONSP (tail); tail = XCDR (tail))
8476 {
8477 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8478 if (CHAR_CHARSET_P (c, charset))
8479 break;
8480 }
8481 return (! NILP (tail));
8482 }
8483
8484
8485 /* Return a list of coding systems that safely encode the text between
8486 START and END. If EXCLUDE is non-nil, it is a list of coding
8487 systems not to check. The returned list doesn't contain any such
8488 coding systems. In any case, if the text contains only ASCII or is
8489 unibyte, return t. */
8490
8491 DEFUN ("find-coding-systems-region-internal",
8492 Ffind_coding_systems_region_internal,
8493 Sfind_coding_systems_region_internal, 2, 3, 0,
8494 doc: /* Internal use only. */)
8495 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8496 {
8497 Lisp_Object coding_attrs_list, safe_codings;
8498 ptrdiff_t start_byte, end_byte;
8499 const unsigned char *p, *pbeg, *pend;
8500 int c;
8501 Lisp_Object tail, elt, work_table;
8502
8503 if (STRINGP (start))
8504 {
8505 if (!STRING_MULTIBYTE (start)
8506 || SCHARS (start) == SBYTES (start))
8507 return Qt;
8508 start_byte = 0;
8509 end_byte = SBYTES (start);
8510 }
8511 else
8512 {
8513 CHECK_NUMBER_COERCE_MARKER (start);
8514 CHECK_NUMBER_COERCE_MARKER (end);
8515 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8516 args_out_of_range (start, end);
8517 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8518 return Qt;
8519 start_byte = CHAR_TO_BYTE (XINT (start));
8520 end_byte = CHAR_TO_BYTE (XINT (end));
8521 if (XINT (end) - XINT (start) == end_byte - start_byte)
8522 return Qt;
8523
8524 if (XINT (start) < GPT && XINT (end) > GPT)
8525 {
8526 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8527 move_gap_both (XINT (start), start_byte);
8528 else
8529 move_gap_both (XINT (end), end_byte);
8530 }
8531 }
8532
8533 coding_attrs_list = Qnil;
8534 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8535 if (NILP (exclude)
8536 || NILP (Fmemq (XCAR (tail), exclude)))
8537 {
8538 Lisp_Object attrs;
8539
8540 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8541 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8542 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8543 {
8544 ASET (attrs, coding_attr_trans_tbl,
8545 get_translation_table (attrs, 1, NULL));
8546 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8547 }
8548 }
8549
8550 if (STRINGP (start))
8551 p = pbeg = SDATA (start);
8552 else
8553 p = pbeg = BYTE_POS_ADDR (start_byte);
8554 pend = p + (end_byte - start_byte);
8555
8556 while (p < pend && ASCII_BYTE_P (*p)) p++;
8557 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8558
8559 work_table = Fmake_char_table (Qnil, Qnil);
8560 while (p < pend)
8561 {
8562 if (ASCII_BYTE_P (*p))
8563 p++;
8564 else
8565 {
8566 c = STRING_CHAR_ADVANCE (p);
8567 if (!NILP (char_table_ref (work_table, c)))
8568 /* This character was already checked. Ignore it. */
8569 continue;
8570
8571 charset_map_loaded = 0;
8572 for (tail = coding_attrs_list; CONSP (tail);)
8573 {
8574 elt = XCAR (tail);
8575 if (NILP (elt))
8576 tail = XCDR (tail);
8577 else if (char_encodable_p (c, elt))
8578 tail = XCDR (tail);
8579 else if (CONSP (XCDR (tail)))
8580 {
8581 XSETCAR (tail, XCAR (XCDR (tail)));
8582 XSETCDR (tail, XCDR (XCDR (tail)));
8583 }
8584 else
8585 {
8586 XSETCAR (tail, Qnil);
8587 tail = XCDR (tail);
8588 }
8589 }
8590 if (charset_map_loaded)
8591 {
8592 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8593
8594 if (STRINGP (start))
8595 pbeg = SDATA (start);
8596 else
8597 pbeg = BYTE_POS_ADDR (start_byte);
8598 p = pbeg + p_offset;
8599 pend = pbeg + pend_offset;
8600 }
8601 char_table_set (work_table, c, Qt);
8602 }
8603 }
8604
8605 safe_codings = list2 (Qraw_text, Qno_conversion);
8606 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8607 if (! NILP (XCAR (tail)))
8608 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8609
8610 return safe_codings;
8611 }
8612
8613
8614 DEFUN ("unencodable-char-position", Funencodable_char_position,
8615 Sunencodable_char_position, 3, 5, 0,
8616 doc: /*
8617 Return position of first un-encodable character in a region.
8618 START and END specify the region and CODING-SYSTEM specifies the
8619 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8620
8621 If optional 4th argument COUNT is non-nil, it specifies at most how
8622 many un-encodable characters to search. In this case, the value is a
8623 list of positions.
8624
8625 If optional 5th argument STRING is non-nil, it is a string to search
8626 for un-encodable characters. In that case, START and END are indexes
8627 to the string. */)
8628 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8629 {
8630 EMACS_INT n;
8631 struct coding_system coding;
8632 Lisp_Object attrs, charset_list, translation_table;
8633 Lisp_Object positions;
8634 ptrdiff_t from, to;
8635 const unsigned char *p, *stop, *pend;
8636 bool ascii_compatible;
8637
8638 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8639 attrs = CODING_ID_ATTRS (coding.id);
8640 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8641 return Qnil;
8642 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8643 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8644 translation_table = get_translation_table (attrs, 1, NULL);
8645
8646 if (NILP (string))
8647 {
8648 validate_region (&start, &end);
8649 from = XINT (start);
8650 to = XINT (end);
8651 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8652 || (ascii_compatible
8653 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8654 return Qnil;
8655 p = CHAR_POS_ADDR (from);
8656 pend = CHAR_POS_ADDR (to);
8657 if (from < GPT && to >= GPT)
8658 stop = GPT_ADDR;
8659 else
8660 stop = pend;
8661 }
8662 else
8663 {
8664 CHECK_STRING (string);
8665 CHECK_NATNUM (start);
8666 CHECK_NATNUM (end);
8667 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8668 args_out_of_range_3 (string, start, end);
8669 from = XINT (start);
8670 to = XINT (end);
8671 if (! STRING_MULTIBYTE (string))
8672 return Qnil;
8673 p = SDATA (string) + string_char_to_byte (string, from);
8674 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8675 if (ascii_compatible && (to - from) == (pend - p))
8676 return Qnil;
8677 }
8678
8679 if (NILP (count))
8680 n = 1;
8681 else
8682 {
8683 CHECK_NATNUM (count);
8684 n = XINT (count);
8685 }
8686
8687 positions = Qnil;
8688 charset_map_loaded = 0;
8689 while (1)
8690 {
8691 int c;
8692
8693 if (ascii_compatible)
8694 while (p < stop && ASCII_BYTE_P (*p))
8695 p++, from++;
8696 if (p >= stop)
8697 {
8698 if (p >= pend)
8699 break;
8700 stop = pend;
8701 p = GAP_END_ADDR;
8702 }
8703
8704 c = STRING_CHAR_ADVANCE (p);
8705 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8706 && ! char_charset (translate_char (translation_table, c),
8707 charset_list, NULL))
8708 {
8709 positions = Fcons (make_number (from), positions);
8710 n--;
8711 if (n == 0)
8712 break;
8713 }
8714
8715 from++;
8716 if (charset_map_loaded && NILP (string))
8717 {
8718 p = CHAR_POS_ADDR (from);
8719 pend = CHAR_POS_ADDR (to);
8720 if (from < GPT && to >= GPT)
8721 stop = GPT_ADDR;
8722 else
8723 stop = pend;
8724 charset_map_loaded = 0;
8725 }
8726 }
8727
8728 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8729 }
8730
8731
8732 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8733 Scheck_coding_systems_region, 3, 3, 0,
8734 doc: /* Check if the region is encodable by coding systems.
8735
8736 START and END are buffer positions specifying the region.
8737 CODING-SYSTEM-LIST is a list of coding systems to check.
8738
8739 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8740 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8741 whole region, POS0, POS1, ... are buffer positions where non-encodable
8742 characters are found.
8743
8744 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8745 value is nil.
8746
8747 START may be a string. In that case, check if the string is
8748 encodable, and the value contains indices to the string instead of
8749 buffer positions. END is ignored.
8750
8751 If the current buffer (or START if it is a string) is unibyte, the value
8752 is nil. */)
8753 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8754 {
8755 Lisp_Object list;
8756 ptrdiff_t start_byte, end_byte;
8757 ptrdiff_t pos;
8758 const unsigned char *p, *pbeg, *pend;
8759 int c;
8760 Lisp_Object tail, elt, attrs;
8761
8762 if (STRINGP (start))
8763 {
8764 if (!STRING_MULTIBYTE (start)
8765 || SCHARS (start) == SBYTES (start))
8766 return Qnil;
8767 start_byte = 0;
8768 end_byte = SBYTES (start);
8769 pos = 0;
8770 }
8771 else
8772 {
8773 CHECK_NUMBER_COERCE_MARKER (start);
8774 CHECK_NUMBER_COERCE_MARKER (end);
8775 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8776 args_out_of_range (start, end);
8777 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8778 return Qnil;
8779 start_byte = CHAR_TO_BYTE (XINT (start));
8780 end_byte = CHAR_TO_BYTE (XINT (end));
8781 if (XINT (end) - XINT (start) == end_byte - start_byte)
8782 return Qnil;
8783
8784 if (XINT (start) < GPT && XINT (end) > GPT)
8785 {
8786 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8787 move_gap_both (XINT (start), start_byte);
8788 else
8789 move_gap_both (XINT (end), end_byte);
8790 }
8791 pos = XINT (start);
8792 }
8793
8794 list = Qnil;
8795 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8796 {
8797 elt = XCAR (tail);
8798 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8799 ASET (attrs, coding_attr_trans_tbl,
8800 get_translation_table (attrs, 1, NULL));
8801 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8802 }
8803
8804 if (STRINGP (start))
8805 p = pbeg = SDATA (start);
8806 else
8807 p = pbeg = BYTE_POS_ADDR (start_byte);
8808 pend = p + (end_byte - start_byte);
8809
8810 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8811 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8812
8813 while (p < pend)
8814 {
8815 if (ASCII_BYTE_P (*p))
8816 p++;
8817 else
8818 {
8819 c = STRING_CHAR_ADVANCE (p);
8820
8821 charset_map_loaded = 0;
8822 for (tail = list; CONSP (tail); tail = XCDR (tail))
8823 {
8824 elt = XCDR (XCAR (tail));
8825 if (! char_encodable_p (c, XCAR (elt)))
8826 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8827 }
8828 if (charset_map_loaded)
8829 {
8830 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8831
8832 if (STRINGP (start))
8833 pbeg = SDATA (start);
8834 else
8835 pbeg = BYTE_POS_ADDR (start_byte);
8836 p = pbeg + p_offset;
8837 pend = pbeg + pend_offset;
8838 }
8839 }
8840 pos++;
8841 }
8842
8843 tail = list;
8844 list = Qnil;
8845 for (; CONSP (tail); tail = XCDR (tail))
8846 {
8847 elt = XCAR (tail);
8848 if (CONSP (XCDR (XCDR (elt))))
8849 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8850 list);
8851 }
8852
8853 return list;
8854 }
8855
8856
8857 static Lisp_Object
8858 code_convert_region (Lisp_Object start, Lisp_Object end,
8859 Lisp_Object coding_system, Lisp_Object dst_object,
8860 bool encodep, bool norecord)
8861 {
8862 struct coding_system coding;
8863 ptrdiff_t from, from_byte, to, to_byte;
8864 Lisp_Object src_object;
8865
8866 if (NILP (coding_system))
8867 coding_system = Qno_conversion;
8868 else
8869 CHECK_CODING_SYSTEM (coding_system);
8870 src_object = Fcurrent_buffer ();
8871 if (NILP (dst_object))
8872 dst_object = src_object;
8873 else if (! EQ (dst_object, Qt))
8874 CHECK_BUFFER (dst_object);
8875
8876 validate_region (&start, &end);
8877 from = XFASTINT (start);
8878 from_byte = CHAR_TO_BYTE (from);
8879 to = XFASTINT (end);
8880 to_byte = CHAR_TO_BYTE (to);
8881
8882 setup_coding_system (coding_system, &coding);
8883 coding.mode |= CODING_MODE_LAST_BLOCK;
8884
8885 if (encodep)
8886 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8887 dst_object);
8888 else
8889 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8890 dst_object);
8891 if (! norecord)
8892 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8893
8894 return (BUFFERP (dst_object)
8895 ? make_number (coding.produced_char)
8896 : coding.dst_object);
8897 }
8898
8899
8900 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8901 3, 4, "r\nzCoding system: ",
8902 doc: /* Decode the current region from the specified coding system.
8903 When called from a program, takes four arguments:
8904 START, END, CODING-SYSTEM, and DESTINATION.
8905 START and END are buffer positions.
8906
8907 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8908 If nil, the region between START and END is replaced by the decoded text.
8909 If buffer, the decoded text is inserted in that buffer after point (point
8910 does not move).
8911 In those cases, the length of the decoded text is returned.
8912 If DESTINATION is t, the decoded text is returned.
8913
8914 This function sets `last-coding-system-used' to the precise coding system
8915 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8916 not fully specified.) */)
8917 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8918 {
8919 return code_convert_region (start, end, coding_system, destination, 0, 0);
8920 }
8921
8922 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8923 3, 4, "r\nzCoding system: ",
8924 doc: /* Encode the current region by specified coding system.
8925 When called from a program, takes four arguments:
8926 START, END, CODING-SYSTEM and DESTINATION.
8927 START and END are buffer positions.
8928
8929 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8930 If nil, the region between START and END is replace by the encoded text.
8931 If buffer, the encoded text is inserted in that buffer after point (point
8932 does not move).
8933 In those cases, the length of the encoded text is returned.
8934 If DESTINATION is t, the encoded text is returned.
8935
8936 This function sets `last-coding-system-used' to the precise coding system
8937 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8938 not fully specified.) */)
8939 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8940 {
8941 return code_convert_region (start, end, coding_system, destination, 1, 0);
8942 }
8943
8944 Lisp_Object
8945 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8946 Lisp_Object dst_object, bool encodep, bool nocopy,
8947 bool norecord)
8948 {
8949 struct coding_system coding;
8950 ptrdiff_t chars, bytes;
8951
8952 CHECK_STRING (string);
8953 if (NILP (coding_system))
8954 {
8955 if (! norecord)
8956 Vlast_coding_system_used = Qno_conversion;
8957 if (NILP (dst_object))
8958 return (nocopy ? Fcopy_sequence (string) : string);
8959 }
8960
8961 if (NILP (coding_system))
8962 coding_system = Qno_conversion;
8963 else
8964 CHECK_CODING_SYSTEM (coding_system);
8965 if (NILP (dst_object))
8966 dst_object = Qt;
8967 else if (! EQ (dst_object, Qt))
8968 CHECK_BUFFER (dst_object);
8969
8970 setup_coding_system (coding_system, &coding);
8971 coding.mode |= CODING_MODE_LAST_BLOCK;
8972 chars = SCHARS (string);
8973 bytes = SBYTES (string);
8974 if (encodep)
8975 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8976 else
8977 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8978 if (! norecord)
8979 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8980
8981 return (BUFFERP (dst_object)
8982 ? make_number (coding.produced_char)
8983 : coding.dst_object);
8984 }
8985
8986
8987 /* Encode or decode STRING according to CODING_SYSTEM.
8988 Do not set Vlast_coding_system_used.
8989
8990 This function is called only from macros DECODE_FILE and
8991 ENCODE_FILE, thus we ignore character composition. */
8992
8993 Lisp_Object
8994 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8995 bool encodep)
8996 {
8997 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8998 }
8999
9000
9001 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9002 2, 4, 0,
9003 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9004
9005 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9006 if the decoding operation is trivial.
9007
9008 Optional fourth arg BUFFER non-nil means that the decoded text is
9009 inserted in that buffer after point (point does not move). In this
9010 case, the return value is the length of the decoded text.
9011
9012 This function sets `last-coding-system-used' to the precise coding system
9013 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9014 not fully specified.) */)
9015 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9016 {
9017 return code_convert_string (string, coding_system, buffer,
9018 0, ! NILP (nocopy), 0);
9019 }
9020
9021 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9022 2, 4, 0,
9023 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9024
9025 Optional third arg NOCOPY non-nil means it is OK to return STRING
9026 itself if the encoding operation is trivial.
9027
9028 Optional fourth arg BUFFER non-nil means that the encoded text is
9029 inserted in that buffer after point (point does not move). In this
9030 case, the return value is the length of the encoded text.
9031
9032 This function sets `last-coding-system-used' to the precise coding system
9033 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9034 not fully specified.) */)
9035 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9036 {
9037 return code_convert_string (string, coding_system, buffer,
9038 1, ! NILP (nocopy), 0);
9039 }
9040
9041 \f
9042 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9043 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9044 Return the corresponding character. */)
9045 (Lisp_Object code)
9046 {
9047 Lisp_Object spec, attrs, val;
9048 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9049 EMACS_INT ch;
9050 int c;
9051
9052 CHECK_NATNUM (code);
9053 ch = XFASTINT (code);
9054 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9055 attrs = AREF (spec, 0);
9056
9057 if (ASCII_BYTE_P (ch)
9058 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9059 return code;
9060
9061 val = CODING_ATTR_CHARSET_LIST (attrs);
9062 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9063 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9064 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9065
9066 if (ch <= 0x7F)
9067 {
9068 c = ch;
9069 charset = charset_roman;
9070 }
9071 else if (ch >= 0xA0 && ch < 0xDF)
9072 {
9073 c = ch - 0x80;
9074 charset = charset_kana;
9075 }
9076 else
9077 {
9078 EMACS_INT c1 = ch >> 8;
9079 int c2 = ch & 0xFF;
9080
9081 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9082 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9083 error ("Invalid code: %"pI"d", ch);
9084 c = ch;
9085 SJIS_TO_JIS (c);
9086 charset = charset_kanji;
9087 }
9088 c = DECODE_CHAR (charset, c);
9089 if (c < 0)
9090 error ("Invalid code: %"pI"d", ch);
9091 return make_number (c);
9092 }
9093
9094
9095 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9096 doc: /* Encode a Japanese character CH to shift_jis encoding.
9097 Return the corresponding code in SJIS. */)
9098 (Lisp_Object ch)
9099 {
9100 Lisp_Object spec, attrs, charset_list;
9101 int c;
9102 struct charset *charset;
9103 unsigned code;
9104
9105 CHECK_CHARACTER (ch);
9106 c = XFASTINT (ch);
9107 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9108 attrs = AREF (spec, 0);
9109
9110 if (ASCII_CHAR_P (c)
9111 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9112 return ch;
9113
9114 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9115 charset = char_charset (c, charset_list, &code);
9116 if (code == CHARSET_INVALID_CODE (charset))
9117 error ("Can't encode by shift_jis encoding: %c", c);
9118 JIS_TO_SJIS (code);
9119
9120 return make_number (code);
9121 }
9122
9123 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9124 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9125 Return the corresponding character. */)
9126 (Lisp_Object code)
9127 {
9128 Lisp_Object spec, attrs, val;
9129 struct charset *charset_roman, *charset_big5, *charset;
9130 EMACS_INT ch;
9131 int c;
9132
9133 CHECK_NATNUM (code);
9134 ch = XFASTINT (code);
9135 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9136 attrs = AREF (spec, 0);
9137
9138 if (ASCII_BYTE_P (ch)
9139 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9140 return code;
9141
9142 val = CODING_ATTR_CHARSET_LIST (attrs);
9143 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9144 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9145
9146 if (ch <= 0x7F)
9147 {
9148 c = ch;
9149 charset = charset_roman;
9150 }
9151 else
9152 {
9153 EMACS_INT b1 = ch >> 8;
9154 int b2 = ch & 0x7F;
9155 if (b1 < 0xA1 || b1 > 0xFE
9156 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9157 error ("Invalid code: %"pI"d", ch);
9158 c = ch;
9159 charset = charset_big5;
9160 }
9161 c = DECODE_CHAR (charset, c);
9162 if (c < 0)
9163 error ("Invalid code: %"pI"d", ch);
9164 return make_number (c);
9165 }
9166
9167 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9168 doc: /* Encode the Big5 character CH to BIG5 coding system.
9169 Return the corresponding character code in Big5. */)
9170 (Lisp_Object ch)
9171 {
9172 Lisp_Object spec, attrs, charset_list;
9173 struct charset *charset;
9174 int c;
9175 unsigned code;
9176
9177 CHECK_CHARACTER (ch);
9178 c = XFASTINT (ch);
9179 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9180 attrs = AREF (spec, 0);
9181 if (ASCII_CHAR_P (c)
9182 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9183 return ch;
9184
9185 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9186 charset = char_charset (c, charset_list, &code);
9187 if (code == CHARSET_INVALID_CODE (charset))
9188 error ("Can't encode by Big5 encoding: %c", c);
9189
9190 return make_number (code);
9191 }
9192
9193 \f
9194 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9195 Sset_terminal_coding_system_internal, 1, 2, 0,
9196 doc: /* Internal use only. */)
9197 (Lisp_Object coding_system, Lisp_Object terminal)
9198 {
9199 struct terminal *term = get_terminal (terminal, 1);
9200 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9201 CHECK_SYMBOL (coding_system);
9202 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9203 /* We had better not send unsafe characters to terminal. */
9204 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9205 /* Character composition should be disabled. */
9206 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9207 terminal_coding->src_multibyte = 1;
9208 terminal_coding->dst_multibyte = 0;
9209 tset_charset_list
9210 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9211 ? coding_charset_list (terminal_coding)
9212 : Fcons (make_number (charset_ascii), Qnil)));
9213 return Qnil;
9214 }
9215
9216 DEFUN ("set-safe-terminal-coding-system-internal",
9217 Fset_safe_terminal_coding_system_internal,
9218 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9219 doc: /* Internal use only. */)
9220 (Lisp_Object coding_system)
9221 {
9222 CHECK_SYMBOL (coding_system);
9223 setup_coding_system (Fcheck_coding_system (coding_system),
9224 &safe_terminal_coding);
9225 /* Character composition should be disabled. */
9226 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9227 safe_terminal_coding.src_multibyte = 1;
9228 safe_terminal_coding.dst_multibyte = 0;
9229 return Qnil;
9230 }
9231
9232 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9233 Sterminal_coding_system, 0, 1, 0,
9234 doc: /* Return coding system specified for terminal output on the given terminal.
9235 TERMINAL may be a terminal object, a frame, or nil for the selected
9236 frame's terminal device. */)
9237 (Lisp_Object terminal)
9238 {
9239 struct coding_system *terminal_coding
9240 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9241 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9242
9243 /* For backward compatibility, return nil if it is `undecided'. */
9244 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9245 }
9246
9247 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9248 Sset_keyboard_coding_system_internal, 1, 2, 0,
9249 doc: /* Internal use only. */)
9250 (Lisp_Object coding_system, Lisp_Object terminal)
9251 {
9252 struct terminal *t = get_terminal (terminal, 1);
9253 CHECK_SYMBOL (coding_system);
9254 if (NILP (coding_system))
9255 coding_system = Qno_conversion;
9256 else
9257 Fcheck_coding_system (coding_system);
9258 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9259 /* Character composition should be disabled. */
9260 TERMINAL_KEYBOARD_CODING (t)->common_flags
9261 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9262 return Qnil;
9263 }
9264
9265 DEFUN ("keyboard-coding-system",
9266 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9267 doc: /* Return coding system specified for decoding keyboard input. */)
9268 (Lisp_Object terminal)
9269 {
9270 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9271 (get_terminal (terminal, 1))->id);
9272 }
9273
9274 \f
9275 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9276 Sfind_operation_coding_system, 1, MANY, 0,
9277 doc: /* Choose a coding system for an operation based on the target name.
9278 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9279 DECODING-SYSTEM is the coding system to use for decoding
9280 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9281 for encoding (in case OPERATION does encoding).
9282
9283 The first argument OPERATION specifies an I/O primitive:
9284 For file I/O, `insert-file-contents' or `write-region'.
9285 For process I/O, `call-process', `call-process-region', or `start-process'.
9286 For network I/O, `open-network-stream'.
9287
9288 The remaining arguments should be the same arguments that were passed
9289 to the primitive. Depending on which primitive, one of those arguments
9290 is selected as the TARGET. For example, if OPERATION does file I/O,
9291 whichever argument specifies the file name is TARGET.
9292
9293 TARGET has a meaning which depends on OPERATION:
9294 For file I/O, TARGET is a file name (except for the special case below).
9295 For process I/O, TARGET is a process name.
9296 For network I/O, TARGET is a service name or a port number.
9297
9298 This function looks up what is specified for TARGET in
9299 `file-coding-system-alist', `process-coding-system-alist',
9300 or `network-coding-system-alist' depending on OPERATION.
9301 They may specify a coding system, a cons of coding systems,
9302 or a function symbol to call.
9303 In the last case, we call the function with one argument,
9304 which is a list of all the arguments given to this function.
9305 If the function can't decide a coding system, it can return
9306 `undecided' so that the normal code-detection is performed.
9307
9308 If OPERATION is `insert-file-contents', the argument corresponding to
9309 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9310 file name to look up, and BUFFER is a buffer that contains the file's
9311 contents (not yet decoded). If `file-coding-system-alist' specifies a
9312 function to call for FILENAME, that function should examine the
9313 contents of BUFFER instead of reading the file.
9314
9315 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9316 (ptrdiff_t nargs, Lisp_Object *args)
9317 {
9318 Lisp_Object operation, target_idx, target, val;
9319 register Lisp_Object chain;
9320
9321 if (nargs < 2)
9322 error ("Too few arguments");
9323 operation = args[0];
9324 if (!SYMBOLP (operation)
9325 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9326 error ("Invalid first argument");
9327 if (nargs <= 1 + XFASTINT (target_idx))
9328 error ("Too few arguments for operation `%s'",
9329 SDATA (SYMBOL_NAME (operation)));
9330 target = args[XFASTINT (target_idx) + 1];
9331 if (!(STRINGP (target)
9332 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9333 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9334 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9335 error ("Invalid argument %"pI"d of operation `%s'",
9336 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9337 if (CONSP (target))
9338 target = XCAR (target);
9339
9340 chain = ((EQ (operation, Qinsert_file_contents)
9341 || EQ (operation, Qwrite_region))
9342 ? Vfile_coding_system_alist
9343 : (EQ (operation, Qopen_network_stream)
9344 ? Vnetwork_coding_system_alist
9345 : Vprocess_coding_system_alist));
9346 if (NILP (chain))
9347 return Qnil;
9348
9349 for (; CONSP (chain); chain = XCDR (chain))
9350 {
9351 Lisp_Object elt;
9352
9353 elt = XCAR (chain);
9354 if (CONSP (elt)
9355 && ((STRINGP (target)
9356 && STRINGP (XCAR (elt))
9357 && fast_string_match (XCAR (elt), target) >= 0)
9358 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9359 {
9360 val = XCDR (elt);
9361 /* Here, if VAL is both a valid coding system and a valid
9362 function symbol, we return VAL as a coding system. */
9363 if (CONSP (val))
9364 return val;
9365 if (! SYMBOLP (val))
9366 return Qnil;
9367 if (! NILP (Fcoding_system_p (val)))
9368 return Fcons (val, val);
9369 if (! NILP (Ffboundp (val)))
9370 {
9371 /* We use call1 rather than safe_call1
9372 so as to get bug reports about functions called here
9373 which don't handle the current interface. */
9374 val = call1 (val, Flist (nargs, args));
9375 if (CONSP (val))
9376 return val;
9377 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9378 return Fcons (val, val);
9379 }
9380 return Qnil;
9381 }
9382 }
9383 return Qnil;
9384 }
9385
9386 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9387 Sset_coding_system_priority, 0, MANY, 0,
9388 doc: /* Assign higher priority to the coding systems given as arguments.
9389 If multiple coding systems belong to the same category,
9390 all but the first one are ignored.
9391
9392 usage: (set-coding-system-priority &rest coding-systems) */)
9393 (ptrdiff_t nargs, Lisp_Object *args)
9394 {
9395 ptrdiff_t i, j;
9396 bool changed[coding_category_max];
9397 enum coding_category priorities[coding_category_max];
9398
9399 memset (changed, 0, sizeof changed);
9400
9401 for (i = j = 0; i < nargs; i++)
9402 {
9403 enum coding_category category;
9404 Lisp_Object spec, attrs;
9405
9406 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9407 attrs = AREF (spec, 0);
9408 category = XINT (CODING_ATTR_CATEGORY (attrs));
9409 if (changed[category])
9410 /* Ignore this coding system because a coding system of the
9411 same category already had a higher priority. */
9412 continue;
9413 changed[category] = 1;
9414 priorities[j++] = category;
9415 if (coding_categories[category].id >= 0
9416 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9417 setup_coding_system (args[i], &coding_categories[category]);
9418 Fset (AREF (Vcoding_category_table, category), args[i]);
9419 }
9420
9421 /* Now we have decided top J priorities. Reflect the order of the
9422 original priorities to the remaining priorities. */
9423
9424 for (i = j, j = 0; i < coding_category_max; i++, j++)
9425 {
9426 while (j < coding_category_max
9427 && changed[coding_priorities[j]])
9428 j++;
9429 if (j == coding_category_max)
9430 emacs_abort ();
9431 priorities[i] = coding_priorities[j];
9432 }
9433
9434 memcpy (coding_priorities, priorities, sizeof priorities);
9435
9436 /* Update `coding-category-list'. */
9437 Vcoding_category_list = Qnil;
9438 for (i = coding_category_max; i-- > 0; )
9439 Vcoding_category_list
9440 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9441 Vcoding_category_list);
9442
9443 return Qnil;
9444 }
9445
9446 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9447 Scoding_system_priority_list, 0, 1, 0,
9448 doc: /* Return a list of coding systems ordered by their priorities.
9449 The list contains a subset of coding systems; i.e. coding systems
9450 assigned to each coding category (see `coding-category-list').
9451
9452 HIGHESTP non-nil means just return the highest priority one. */)
9453 (Lisp_Object highestp)
9454 {
9455 int i;
9456 Lisp_Object val;
9457
9458 for (i = 0, val = Qnil; i < coding_category_max; i++)
9459 {
9460 enum coding_category category = coding_priorities[i];
9461 int id = coding_categories[category].id;
9462 Lisp_Object attrs;
9463
9464 if (id < 0)
9465 continue;
9466 attrs = CODING_ID_ATTRS (id);
9467 if (! NILP (highestp))
9468 return CODING_ATTR_BASE_NAME (attrs);
9469 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9470 }
9471 return Fnreverse (val);
9472 }
9473
9474 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9475
9476 static Lisp_Object
9477 make_subsidiaries (Lisp_Object base)
9478 {
9479 Lisp_Object subsidiaries;
9480 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9481 char *buf = alloca (base_name_len + 6);
9482 int i;
9483
9484 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9485 subsidiaries = Fmake_vector (make_number (3), Qnil);
9486 for (i = 0; i < 3; i++)
9487 {
9488 strcpy (buf + base_name_len, suffixes[i]);
9489 ASET (subsidiaries, i, intern (buf));
9490 }
9491 return subsidiaries;
9492 }
9493
9494
9495 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9496 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9497 doc: /* For internal use only.
9498 usage: (define-coding-system-internal ...) */)
9499 (ptrdiff_t nargs, Lisp_Object *args)
9500 {
9501 Lisp_Object name;
9502 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9503 Lisp_Object attrs; /* Vector of attributes. */
9504 Lisp_Object eol_type;
9505 Lisp_Object aliases;
9506 Lisp_Object coding_type, charset_list, safe_charsets;
9507 enum coding_category category;
9508 Lisp_Object tail, val;
9509 int max_charset_id = 0;
9510 int i;
9511
9512 if (nargs < coding_arg_max)
9513 goto short_args;
9514
9515 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9516
9517 name = args[coding_arg_name];
9518 CHECK_SYMBOL (name);
9519 ASET (attrs, coding_attr_base_name, name);
9520
9521 val = args[coding_arg_mnemonic];
9522 if (! STRINGP (val))
9523 CHECK_CHARACTER (val);
9524 ASET (attrs, coding_attr_mnemonic, val);
9525
9526 coding_type = args[coding_arg_coding_type];
9527 CHECK_SYMBOL (coding_type);
9528 ASET (attrs, coding_attr_type, coding_type);
9529
9530 charset_list = args[coding_arg_charset_list];
9531 if (SYMBOLP (charset_list))
9532 {
9533 if (EQ (charset_list, Qiso_2022))
9534 {
9535 if (! EQ (coding_type, Qiso_2022))
9536 error ("Invalid charset-list");
9537 charset_list = Viso_2022_charset_list;
9538 }
9539 else if (EQ (charset_list, Qemacs_mule))
9540 {
9541 if (! EQ (coding_type, Qemacs_mule))
9542 error ("Invalid charset-list");
9543 charset_list = Vemacs_mule_charset_list;
9544 }
9545 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9546 {
9547 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9548 error ("Invalid charset-list");
9549 if (max_charset_id < XFASTINT (XCAR (tail)))
9550 max_charset_id = XFASTINT (XCAR (tail));
9551 }
9552 }
9553 else
9554 {
9555 charset_list = Fcopy_sequence (charset_list);
9556 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9557 {
9558 struct charset *charset;
9559
9560 val = XCAR (tail);
9561 CHECK_CHARSET_GET_CHARSET (val, charset);
9562 if (EQ (coding_type, Qiso_2022)
9563 ? CHARSET_ISO_FINAL (charset) < 0
9564 : EQ (coding_type, Qemacs_mule)
9565 ? CHARSET_EMACS_MULE_ID (charset) < 0
9566 : 0)
9567 error ("Can't handle charset `%s'",
9568 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9569
9570 XSETCAR (tail, make_number (charset->id));
9571 if (max_charset_id < charset->id)
9572 max_charset_id = charset->id;
9573 }
9574 }
9575 ASET (attrs, coding_attr_charset_list, charset_list);
9576
9577 safe_charsets = make_uninit_string (max_charset_id + 1);
9578 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9579 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9580 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9581 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9582
9583 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9584
9585 val = args[coding_arg_decode_translation_table];
9586 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9587 CHECK_SYMBOL (val);
9588 ASET (attrs, coding_attr_decode_tbl, val);
9589
9590 val = args[coding_arg_encode_translation_table];
9591 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9592 CHECK_SYMBOL (val);
9593 ASET (attrs, coding_attr_encode_tbl, val);
9594
9595 val = args[coding_arg_post_read_conversion];
9596 CHECK_SYMBOL (val);
9597 ASET (attrs, coding_attr_post_read, val);
9598
9599 val = args[coding_arg_pre_write_conversion];
9600 CHECK_SYMBOL (val);
9601 ASET (attrs, coding_attr_pre_write, val);
9602
9603 val = args[coding_arg_default_char];
9604 if (NILP (val))
9605 ASET (attrs, coding_attr_default_char, make_number (' '));
9606 else
9607 {
9608 CHECK_CHARACTER (val);
9609 ASET (attrs, coding_attr_default_char, val);
9610 }
9611
9612 val = args[coding_arg_for_unibyte];
9613 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9614
9615 val = args[coding_arg_plist];
9616 CHECK_LIST (val);
9617 ASET (attrs, coding_attr_plist, val);
9618
9619 if (EQ (coding_type, Qcharset))
9620 {
9621 /* Generate a lisp vector of 256 elements. Each element is nil,
9622 integer, or a list of charset IDs.
9623
9624 If Nth element is nil, the byte code N is invalid in this
9625 coding system.
9626
9627 If Nth element is a number NUM, N is the first byte of a
9628 charset whose ID is NUM.
9629
9630 If Nth element is a list of charset IDs, N is the first byte
9631 of one of them. The list is sorted by dimensions of the
9632 charsets. A charset of smaller dimension comes first. */
9633 val = Fmake_vector (make_number (256), Qnil);
9634
9635 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9636 {
9637 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9638 int dim = CHARSET_DIMENSION (charset);
9639 int idx = (dim - 1) * 4;
9640
9641 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9642 ASET (attrs, coding_attr_ascii_compat, Qt);
9643
9644 for (i = charset->code_space[idx];
9645 i <= charset->code_space[idx + 1]; i++)
9646 {
9647 Lisp_Object tmp, tmp2;
9648 int dim2;
9649
9650 tmp = AREF (val, i);
9651 if (NILP (tmp))
9652 tmp = XCAR (tail);
9653 else if (NUMBERP (tmp))
9654 {
9655 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9656 if (dim < dim2)
9657 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9658 else
9659 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9660 }
9661 else
9662 {
9663 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9664 {
9665 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9666 if (dim < dim2)
9667 break;
9668 }
9669 if (NILP (tmp2))
9670 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9671 else
9672 {
9673 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9674 XSETCAR (tmp2, XCAR (tail));
9675 }
9676 }
9677 ASET (val, i, tmp);
9678 }
9679 }
9680 ASET (attrs, coding_attr_charset_valids, val);
9681 category = coding_category_charset;
9682 }
9683 else if (EQ (coding_type, Qccl))
9684 {
9685 Lisp_Object valids;
9686
9687 if (nargs < coding_arg_ccl_max)
9688 goto short_args;
9689
9690 val = args[coding_arg_ccl_decoder];
9691 CHECK_CCL_PROGRAM (val);
9692 if (VECTORP (val))
9693 val = Fcopy_sequence (val);
9694 ASET (attrs, coding_attr_ccl_decoder, val);
9695
9696 val = args[coding_arg_ccl_encoder];
9697 CHECK_CCL_PROGRAM (val);
9698 if (VECTORP (val))
9699 val = Fcopy_sequence (val);
9700 ASET (attrs, coding_attr_ccl_encoder, val);
9701
9702 val = args[coding_arg_ccl_valids];
9703 valids = Fmake_string (make_number (256), make_number (0));
9704 for (tail = val; CONSP (tail); tail = XCDR (tail))
9705 {
9706 int from, to;
9707
9708 val = XCAR (tail);
9709 if (INTEGERP (val))
9710 {
9711 if (! (0 <= XINT (val) && XINT (val) <= 255))
9712 args_out_of_range_3 (val, make_number (0), make_number (255));
9713 from = to = XINT (val);
9714 }
9715 else
9716 {
9717 CHECK_CONS (val);
9718 CHECK_NATNUM_CAR (val);
9719 CHECK_NUMBER_CDR (val);
9720 if (XINT (XCAR (val)) > 255)
9721 args_out_of_range_3 (XCAR (val),
9722 make_number (0), make_number (255));
9723 from = XINT (XCAR (val));
9724 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9725 args_out_of_range_3 (XCDR (val),
9726 XCAR (val), make_number (255));
9727 to = XINT (XCDR (val));
9728 }
9729 for (i = from; i <= to; i++)
9730 SSET (valids, i, 1);
9731 }
9732 ASET (attrs, coding_attr_ccl_valids, valids);
9733
9734 category = coding_category_ccl;
9735 }
9736 else if (EQ (coding_type, Qutf_16))
9737 {
9738 Lisp_Object bom, endian;
9739
9740 ASET (attrs, coding_attr_ascii_compat, Qnil);
9741
9742 if (nargs < coding_arg_utf16_max)
9743 goto short_args;
9744
9745 bom = args[coding_arg_utf16_bom];
9746 if (! NILP (bom) && ! EQ (bom, Qt))
9747 {
9748 CHECK_CONS (bom);
9749 val = XCAR (bom);
9750 CHECK_CODING_SYSTEM (val);
9751 val = XCDR (bom);
9752 CHECK_CODING_SYSTEM (val);
9753 }
9754 ASET (attrs, coding_attr_utf_bom, bom);
9755
9756 endian = args[coding_arg_utf16_endian];
9757 CHECK_SYMBOL (endian);
9758 if (NILP (endian))
9759 endian = Qbig;
9760 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9761 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9762 ASET (attrs, coding_attr_utf_16_endian, endian);
9763
9764 category = (CONSP (bom)
9765 ? coding_category_utf_16_auto
9766 : NILP (bom)
9767 ? (EQ (endian, Qbig)
9768 ? coding_category_utf_16_be_nosig
9769 : coding_category_utf_16_le_nosig)
9770 : (EQ (endian, Qbig)
9771 ? coding_category_utf_16_be
9772 : coding_category_utf_16_le));
9773 }
9774 else if (EQ (coding_type, Qiso_2022))
9775 {
9776 Lisp_Object initial, reg_usage, request, flags;
9777
9778 if (nargs < coding_arg_iso2022_max)
9779 goto short_args;
9780
9781 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9782 CHECK_VECTOR (initial);
9783 for (i = 0; i < 4; i++)
9784 {
9785 val = Faref (initial, make_number (i));
9786 if (! NILP (val))
9787 {
9788 struct charset *charset;
9789
9790 CHECK_CHARSET_GET_CHARSET (val, charset);
9791 ASET (initial, i, make_number (CHARSET_ID (charset)));
9792 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9793 ASET (attrs, coding_attr_ascii_compat, Qt);
9794 }
9795 else
9796 ASET (initial, i, make_number (-1));
9797 }
9798
9799 reg_usage = args[coding_arg_iso2022_reg_usage];
9800 CHECK_CONS (reg_usage);
9801 CHECK_NUMBER_CAR (reg_usage);
9802 CHECK_NUMBER_CDR (reg_usage);
9803
9804 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9805 for (tail = request; CONSP (tail); tail = XCDR (tail))
9806 {
9807 int id;
9808 Lisp_Object tmp1;
9809
9810 val = XCAR (tail);
9811 CHECK_CONS (val);
9812 tmp1 = XCAR (val);
9813 CHECK_CHARSET_GET_ID (tmp1, id);
9814 CHECK_NATNUM_CDR (val);
9815 if (XINT (XCDR (val)) >= 4)
9816 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9817 XSETCAR (val, make_number (id));
9818 }
9819
9820 flags = args[coding_arg_iso2022_flags];
9821 CHECK_NATNUM (flags);
9822 i = XINT (flags) & INT_MAX;
9823 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9824 i |= CODING_ISO_FLAG_FULL_SUPPORT;
9825 flags = make_number (i);
9826
9827 ASET (attrs, coding_attr_iso_initial, initial);
9828 ASET (attrs, coding_attr_iso_usage, reg_usage);
9829 ASET (attrs, coding_attr_iso_request, request);
9830 ASET (attrs, coding_attr_iso_flags, flags);
9831 setup_iso_safe_charsets (attrs);
9832
9833 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9834 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9835 | CODING_ISO_FLAG_SINGLE_SHIFT))
9836 ? coding_category_iso_7_else
9837 : EQ (args[coding_arg_charset_list], Qiso_2022)
9838 ? coding_category_iso_7
9839 : coding_category_iso_7_tight);
9840 else
9841 {
9842 int id = XINT (AREF (initial, 1));
9843
9844 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9845 || EQ (args[coding_arg_charset_list], Qiso_2022)
9846 || id < 0)
9847 ? coding_category_iso_8_else
9848 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9849 ? coding_category_iso_8_1
9850 : coding_category_iso_8_2);
9851 }
9852 if (category != coding_category_iso_8_1
9853 && category != coding_category_iso_8_2)
9854 ASET (attrs, coding_attr_ascii_compat, Qnil);
9855 }
9856 else if (EQ (coding_type, Qemacs_mule))
9857 {
9858 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9859 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9860 ASET (attrs, coding_attr_ascii_compat, Qt);
9861 category = coding_category_emacs_mule;
9862 }
9863 else if (EQ (coding_type, Qshift_jis))
9864 {
9865
9866 struct charset *charset;
9867
9868 if (XINT (Flength (charset_list)) != 3
9869 && XINT (Flength (charset_list)) != 4)
9870 error ("There should be three or four charsets");
9871
9872 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9873 if (CHARSET_DIMENSION (charset) != 1)
9874 error ("Dimension of charset %s is not one",
9875 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9876 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9877 ASET (attrs, coding_attr_ascii_compat, Qt);
9878
9879 charset_list = XCDR (charset_list);
9880 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9881 if (CHARSET_DIMENSION (charset) != 1)
9882 error ("Dimension of charset %s is not one",
9883 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9884
9885 charset_list = XCDR (charset_list);
9886 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9887 if (CHARSET_DIMENSION (charset) != 2)
9888 error ("Dimension of charset %s is not two",
9889 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9890
9891 charset_list = XCDR (charset_list);
9892 if (! NILP (charset_list))
9893 {
9894 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9895 if (CHARSET_DIMENSION (charset) != 2)
9896 error ("Dimension of charset %s is not two",
9897 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9898 }
9899
9900 category = coding_category_sjis;
9901 Vsjis_coding_system = name;
9902 }
9903 else if (EQ (coding_type, Qbig5))
9904 {
9905 struct charset *charset;
9906
9907 if (XINT (Flength (charset_list)) != 2)
9908 error ("There should be just two charsets");
9909
9910 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9911 if (CHARSET_DIMENSION (charset) != 1)
9912 error ("Dimension of charset %s is not one",
9913 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9914 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9915 ASET (attrs, coding_attr_ascii_compat, Qt);
9916
9917 charset_list = XCDR (charset_list);
9918 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9919 if (CHARSET_DIMENSION (charset) != 2)
9920 error ("Dimension of charset %s is not two",
9921 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9922
9923 category = coding_category_big5;
9924 Vbig5_coding_system = name;
9925 }
9926 else if (EQ (coding_type, Qraw_text))
9927 {
9928 category = coding_category_raw_text;
9929 ASET (attrs, coding_attr_ascii_compat, Qt);
9930 }
9931 else if (EQ (coding_type, Qutf_8))
9932 {
9933 Lisp_Object bom;
9934
9935 if (nargs < coding_arg_utf8_max)
9936 goto short_args;
9937
9938 bom = args[coding_arg_utf8_bom];
9939 if (! NILP (bom) && ! EQ (bom, Qt))
9940 {
9941 CHECK_CONS (bom);
9942 val = XCAR (bom);
9943 CHECK_CODING_SYSTEM (val);
9944 val = XCDR (bom);
9945 CHECK_CODING_SYSTEM (val);
9946 }
9947 ASET (attrs, coding_attr_utf_bom, bom);
9948 if (NILP (bom))
9949 ASET (attrs, coding_attr_ascii_compat, Qt);
9950
9951 category = (CONSP (bom) ? coding_category_utf_8_auto
9952 : NILP (bom) ? coding_category_utf_8_nosig
9953 : coding_category_utf_8_sig);
9954 }
9955 else if (EQ (coding_type, Qundecided))
9956 category = coding_category_undecided;
9957 else
9958 error ("Invalid coding system type: %s",
9959 SDATA (SYMBOL_NAME (coding_type)));
9960
9961 ASET (attrs, coding_attr_category, make_number (category));
9962 ASET (attrs, coding_attr_plist,
9963 Fcons (QCcategory,
9964 Fcons (AREF (Vcoding_category_table, category),
9965 CODING_ATTR_PLIST (attrs))));
9966 ASET (attrs, coding_attr_plist,
9967 Fcons (QCascii_compatible_p,
9968 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9969 CODING_ATTR_PLIST (attrs))));
9970
9971 eol_type = args[coding_arg_eol_type];
9972 if (! NILP (eol_type)
9973 && ! EQ (eol_type, Qunix)
9974 && ! EQ (eol_type, Qdos)
9975 && ! EQ (eol_type, Qmac))
9976 error ("Invalid eol-type");
9977
9978 aliases = Fcons (name, Qnil);
9979
9980 if (NILP (eol_type))
9981 {
9982 eol_type = make_subsidiaries (name);
9983 for (i = 0; i < 3; i++)
9984 {
9985 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9986
9987 this_name = AREF (eol_type, i);
9988 this_aliases = Fcons (this_name, Qnil);
9989 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9990 this_spec = Fmake_vector (make_number (3), attrs);
9991 ASET (this_spec, 1, this_aliases);
9992 ASET (this_spec, 2, this_eol_type);
9993 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9994 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9995 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9996 if (NILP (val))
9997 Vcoding_system_alist
9998 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9999 Vcoding_system_alist);
10000 }
10001 }
10002
10003 spec_vec = Fmake_vector (make_number (3), attrs);
10004 ASET (spec_vec, 1, aliases);
10005 ASET (spec_vec, 2, eol_type);
10006
10007 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10008 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10009 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10010 if (NILP (val))
10011 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10012 Vcoding_system_alist);
10013
10014 {
10015 int id = coding_categories[category].id;
10016
10017 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10018 setup_coding_system (name, &coding_categories[category]);
10019 }
10020
10021 return Qnil;
10022
10023 short_args:
10024 return Fsignal (Qwrong_number_of_arguments,
10025 Fcons (intern ("define-coding-system-internal"),
10026 make_number (nargs)));
10027 }
10028
10029
10030 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10031 3, 3, 0,
10032 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10033 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10034 {
10035 Lisp_Object spec, attrs;
10036
10037 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10038 attrs = AREF (spec, 0);
10039 if (EQ (prop, QCmnemonic))
10040 {
10041 if (! STRINGP (val))
10042 CHECK_CHARACTER (val);
10043 ASET (attrs, coding_attr_mnemonic, val);
10044 }
10045 else if (EQ (prop, QCdefault_char))
10046 {
10047 if (NILP (val))
10048 val = make_number (' ');
10049 else
10050 CHECK_CHARACTER (val);
10051 ASET (attrs, coding_attr_default_char, val);
10052 }
10053 else if (EQ (prop, QCdecode_translation_table))
10054 {
10055 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10056 CHECK_SYMBOL (val);
10057 ASET (attrs, coding_attr_decode_tbl, val);
10058 }
10059 else if (EQ (prop, QCencode_translation_table))
10060 {
10061 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10062 CHECK_SYMBOL (val);
10063 ASET (attrs, coding_attr_encode_tbl, val);
10064 }
10065 else if (EQ (prop, QCpost_read_conversion))
10066 {
10067 CHECK_SYMBOL (val);
10068 ASET (attrs, coding_attr_post_read, val);
10069 }
10070 else if (EQ (prop, QCpre_write_conversion))
10071 {
10072 CHECK_SYMBOL (val);
10073 ASET (attrs, coding_attr_pre_write, val);
10074 }
10075 else if (EQ (prop, QCascii_compatible_p))
10076 {
10077 ASET (attrs, coding_attr_ascii_compat, val);
10078 }
10079
10080 ASET (attrs, coding_attr_plist,
10081 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10082 return val;
10083 }
10084
10085
10086 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10087 Sdefine_coding_system_alias, 2, 2, 0,
10088 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10089 (Lisp_Object alias, Lisp_Object coding_system)
10090 {
10091 Lisp_Object spec, aliases, eol_type, val;
10092
10093 CHECK_SYMBOL (alias);
10094 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10095 aliases = AREF (spec, 1);
10096 /* ALIASES should be a list of length more than zero, and the first
10097 element is a base coding system. Append ALIAS at the tail of the
10098 list. */
10099 while (!NILP (XCDR (aliases)))
10100 aliases = XCDR (aliases);
10101 XSETCDR (aliases, Fcons (alias, Qnil));
10102
10103 eol_type = AREF (spec, 2);
10104 if (VECTORP (eol_type))
10105 {
10106 Lisp_Object subsidiaries;
10107 int i;
10108
10109 subsidiaries = make_subsidiaries (alias);
10110 for (i = 0; i < 3; i++)
10111 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10112 AREF (eol_type, i));
10113 }
10114
10115 Fputhash (alias, spec, Vcoding_system_hash_table);
10116 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10117 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10118 if (NILP (val))
10119 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10120 Vcoding_system_alist);
10121
10122 return Qnil;
10123 }
10124
10125 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10126 1, 1, 0,
10127 doc: /* Return the base of CODING-SYSTEM.
10128 Any alias or subsidiary coding system is not a base coding system. */)
10129 (Lisp_Object coding_system)
10130 {
10131 Lisp_Object spec, attrs;
10132
10133 if (NILP (coding_system))
10134 return (Qno_conversion);
10135 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10136 attrs = AREF (spec, 0);
10137 return CODING_ATTR_BASE_NAME (attrs);
10138 }
10139
10140 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10141 1, 1, 0,
10142 doc: "Return the property list of CODING-SYSTEM.")
10143 (Lisp_Object coding_system)
10144 {
10145 Lisp_Object spec, attrs;
10146
10147 if (NILP (coding_system))
10148 coding_system = Qno_conversion;
10149 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10150 attrs = AREF (spec, 0);
10151 return CODING_ATTR_PLIST (attrs);
10152 }
10153
10154
10155 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10156 1, 1, 0,
10157 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10158 (Lisp_Object coding_system)
10159 {
10160 Lisp_Object spec;
10161
10162 if (NILP (coding_system))
10163 coding_system = Qno_conversion;
10164 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10165 return AREF (spec, 1);
10166 }
10167
10168 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10169 Scoding_system_eol_type, 1, 1, 0,
10170 doc: /* Return eol-type of CODING-SYSTEM.
10171 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10172
10173 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10174 and CR respectively.
10175
10176 A vector value indicates that a format of end-of-line should be
10177 detected automatically. Nth element of the vector is the subsidiary
10178 coding system whose eol-type is N. */)
10179 (Lisp_Object coding_system)
10180 {
10181 Lisp_Object spec, eol_type;
10182 int n;
10183
10184 if (NILP (coding_system))
10185 coding_system = Qno_conversion;
10186 if (! CODING_SYSTEM_P (coding_system))
10187 return Qnil;
10188 spec = CODING_SYSTEM_SPEC (coding_system);
10189 eol_type = AREF (spec, 2);
10190 if (VECTORP (eol_type))
10191 return Fcopy_sequence (eol_type);
10192 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10193 return make_number (n);
10194 }
10195
10196 #endif /* emacs */
10197
10198 \f
10199 /*** 9. Post-amble ***/
10200
10201 void
10202 init_coding_once (void)
10203 {
10204 int i;
10205
10206 for (i = 0; i < coding_category_max; i++)
10207 {
10208 coding_categories[i].id = -1;
10209 coding_priorities[i] = i;
10210 }
10211
10212 /* ISO2022 specific initialize routine. */
10213 for (i = 0; i < 0x20; i++)
10214 iso_code_class[i] = ISO_control_0;
10215 for (i = 0x21; i < 0x7F; i++)
10216 iso_code_class[i] = ISO_graphic_plane_0;
10217 for (i = 0x80; i < 0xA0; i++)
10218 iso_code_class[i] = ISO_control_1;
10219 for (i = 0xA1; i < 0xFF; i++)
10220 iso_code_class[i] = ISO_graphic_plane_1;
10221 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10222 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10223 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10224 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10225 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10226 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10227 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10228 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10229 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10230
10231 for (i = 0; i < 256; i++)
10232 {
10233 emacs_mule_bytes[i] = 1;
10234 }
10235 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10236 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10237 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10238 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10239 }
10240
10241 #ifdef emacs
10242
10243 void
10244 syms_of_coding (void)
10245 {
10246 staticpro (&Vcoding_system_hash_table);
10247 {
10248 Lisp_Object args[2];
10249 args[0] = QCtest;
10250 args[1] = Qeq;
10251 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10252 }
10253
10254 staticpro (&Vsjis_coding_system);
10255 Vsjis_coding_system = Qnil;
10256
10257 staticpro (&Vbig5_coding_system);
10258 Vbig5_coding_system = Qnil;
10259
10260 staticpro (&Vcode_conversion_reused_workbuf);
10261 Vcode_conversion_reused_workbuf = Qnil;
10262
10263 staticpro (&Vcode_conversion_workbuf_name);
10264 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10265
10266 reused_workbuf_in_use = 0;
10267
10268 DEFSYM (Qcharset, "charset");
10269 DEFSYM (Qtarget_idx, "target-idx");
10270 DEFSYM (Qcoding_system_history, "coding-system-history");
10271 Fset (Qcoding_system_history, Qnil);
10272
10273 /* Target FILENAME is the first argument. */
10274 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10275 /* Target FILENAME is the third argument. */
10276 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10277
10278 DEFSYM (Qcall_process, "call-process");
10279 /* Target PROGRAM is the first argument. */
10280 Fput (Qcall_process, Qtarget_idx, make_number (0));
10281
10282 DEFSYM (Qcall_process_region, "call-process-region");
10283 /* Target PROGRAM is the third argument. */
10284 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10285
10286 DEFSYM (Qstart_process, "start-process");
10287 /* Target PROGRAM is the third argument. */
10288 Fput (Qstart_process, Qtarget_idx, make_number (2));
10289
10290 DEFSYM (Qopen_network_stream, "open-network-stream");
10291 /* Target SERVICE is the fourth argument. */
10292 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10293
10294 DEFSYM (Qcoding_system, "coding-system");
10295 DEFSYM (Qcoding_aliases, "coding-aliases");
10296
10297 DEFSYM (Qeol_type, "eol-type");
10298 DEFSYM (Qunix, "unix");
10299 DEFSYM (Qdos, "dos");
10300 DEFSYM (Qmac, "mac");
10301
10302 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10303 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10304 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10305 DEFSYM (Qdefault_char, "default-char");
10306 DEFSYM (Qundecided, "undecided");
10307 DEFSYM (Qno_conversion, "no-conversion");
10308 DEFSYM (Qraw_text, "raw-text");
10309
10310 DEFSYM (Qiso_2022, "iso-2022");
10311
10312 DEFSYM (Qutf_8, "utf-8");
10313 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10314
10315 #if defined (WINDOWSNT) || defined (CYGWIN)
10316 /* No, not utf-16-le: that one has a BOM. */
10317 DEFSYM (Qutf_16le, "utf-16le");
10318 #endif
10319
10320 DEFSYM (Qutf_16, "utf-16");
10321 DEFSYM (Qbig, "big");
10322 DEFSYM (Qlittle, "little");
10323
10324 DEFSYM (Qshift_jis, "shift-jis");
10325 DEFSYM (Qbig5, "big5");
10326
10327 DEFSYM (Qcoding_system_p, "coding-system-p");
10328
10329 DEFSYM (Qcoding_system_error, "coding-system-error");
10330 Fput (Qcoding_system_error, Qerror_conditions,
10331 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10332 Fput (Qcoding_system_error, Qerror_message,
10333 build_pure_c_string ("Invalid coding system"));
10334
10335 /* Intern this now in case it isn't already done.
10336 Setting this variable twice is harmless.
10337 But don't staticpro it here--that is done in alloc.c. */
10338 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10339
10340 DEFSYM (Qtranslation_table, "translation-table");
10341 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10342 DEFSYM (Qtranslation_table_id, "translation-table-id");
10343 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10344 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10345
10346 DEFSYM (Qvalid_codes, "valid-codes");
10347
10348 DEFSYM (Qemacs_mule, "emacs-mule");
10349
10350 DEFSYM (QCcategory, ":category");
10351 DEFSYM (QCmnemonic, ":mnemonic");
10352 DEFSYM (QCdefault_char, ":default-char");
10353 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10354 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10355 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10356 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10357 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10358
10359 Vcoding_category_table
10360 = Fmake_vector (make_number (coding_category_max), Qnil);
10361 staticpro (&Vcoding_category_table);
10362 /* Followings are target of code detection. */
10363 ASET (Vcoding_category_table, coding_category_iso_7,
10364 intern_c_string ("coding-category-iso-7"));
10365 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10366 intern_c_string ("coding-category-iso-7-tight"));
10367 ASET (Vcoding_category_table, coding_category_iso_8_1,
10368 intern_c_string ("coding-category-iso-8-1"));
10369 ASET (Vcoding_category_table, coding_category_iso_8_2,
10370 intern_c_string ("coding-category-iso-8-2"));
10371 ASET (Vcoding_category_table, coding_category_iso_7_else,
10372 intern_c_string ("coding-category-iso-7-else"));
10373 ASET (Vcoding_category_table, coding_category_iso_8_else,
10374 intern_c_string ("coding-category-iso-8-else"));
10375 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10376 intern_c_string ("coding-category-utf-8-auto"));
10377 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10378 intern_c_string ("coding-category-utf-8"));
10379 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10380 intern_c_string ("coding-category-utf-8-sig"));
10381 ASET (Vcoding_category_table, coding_category_utf_16_be,
10382 intern_c_string ("coding-category-utf-16-be"));
10383 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10384 intern_c_string ("coding-category-utf-16-auto"));
10385 ASET (Vcoding_category_table, coding_category_utf_16_le,
10386 intern_c_string ("coding-category-utf-16-le"));
10387 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10388 intern_c_string ("coding-category-utf-16-be-nosig"));
10389 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10390 intern_c_string ("coding-category-utf-16-le-nosig"));
10391 ASET (Vcoding_category_table, coding_category_charset,
10392 intern_c_string ("coding-category-charset"));
10393 ASET (Vcoding_category_table, coding_category_sjis,
10394 intern_c_string ("coding-category-sjis"));
10395 ASET (Vcoding_category_table, coding_category_big5,
10396 intern_c_string ("coding-category-big5"));
10397 ASET (Vcoding_category_table, coding_category_ccl,
10398 intern_c_string ("coding-category-ccl"));
10399 ASET (Vcoding_category_table, coding_category_emacs_mule,
10400 intern_c_string ("coding-category-emacs-mule"));
10401 /* Followings are NOT target of code detection. */
10402 ASET (Vcoding_category_table, coding_category_raw_text,
10403 intern_c_string ("coding-category-raw-text"));
10404 ASET (Vcoding_category_table, coding_category_undecided,
10405 intern_c_string ("coding-category-undecided"));
10406
10407 DEFSYM (Qinsufficient_source, "insufficient-source");
10408 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10409 DEFSYM (Qinvalid_source, "invalid-source");
10410 DEFSYM (Qinterrupted, "interrupted");
10411 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10412 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10413
10414 defsubr (&Scoding_system_p);
10415 defsubr (&Sread_coding_system);
10416 defsubr (&Sread_non_nil_coding_system);
10417 defsubr (&Scheck_coding_system);
10418 defsubr (&Sdetect_coding_region);
10419 defsubr (&Sdetect_coding_string);
10420 defsubr (&Sfind_coding_systems_region_internal);
10421 defsubr (&Sunencodable_char_position);
10422 defsubr (&Scheck_coding_systems_region);
10423 defsubr (&Sdecode_coding_region);
10424 defsubr (&Sencode_coding_region);
10425 defsubr (&Sdecode_coding_string);
10426 defsubr (&Sencode_coding_string);
10427 defsubr (&Sdecode_sjis_char);
10428 defsubr (&Sencode_sjis_char);
10429 defsubr (&Sdecode_big5_char);
10430 defsubr (&Sencode_big5_char);
10431 defsubr (&Sset_terminal_coding_system_internal);
10432 defsubr (&Sset_safe_terminal_coding_system_internal);
10433 defsubr (&Sterminal_coding_system);
10434 defsubr (&Sset_keyboard_coding_system_internal);
10435 defsubr (&Skeyboard_coding_system);
10436 defsubr (&Sfind_operation_coding_system);
10437 defsubr (&Sset_coding_system_priority);
10438 defsubr (&Sdefine_coding_system_internal);
10439 defsubr (&Sdefine_coding_system_alias);
10440 defsubr (&Scoding_system_put);
10441 defsubr (&Scoding_system_base);
10442 defsubr (&Scoding_system_plist);
10443 defsubr (&Scoding_system_aliases);
10444 defsubr (&Scoding_system_eol_type);
10445 defsubr (&Scoding_system_priority_list);
10446
10447 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10448 doc: /* List of coding systems.
10449
10450 Do not alter the value of this variable manually. This variable should be
10451 updated by the functions `define-coding-system' and
10452 `define-coding-system-alias'. */);
10453 Vcoding_system_list = Qnil;
10454
10455 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10456 doc: /* Alist of coding system names.
10457 Each element is one element list of coding system name.
10458 This variable is given to `completing-read' as COLLECTION argument.
10459
10460 Do not alter the value of this variable manually. This variable should be
10461 updated by the functions `make-coding-system' and
10462 `define-coding-system-alias'. */);
10463 Vcoding_system_alist = Qnil;
10464
10465 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10466 doc: /* List of coding-categories (symbols) ordered by priority.
10467
10468 On detecting a coding system, Emacs tries code detection algorithms
10469 associated with each coding-category one by one in this order. When
10470 one algorithm agrees with a byte sequence of source text, the coding
10471 system bound to the corresponding coding-category is selected.
10472
10473 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10474 {
10475 int i;
10476
10477 Vcoding_category_list = Qnil;
10478 for (i = coding_category_max - 1; i >= 0; i--)
10479 Vcoding_category_list
10480 = Fcons (AREF (Vcoding_category_table, i),
10481 Vcoding_category_list);
10482 }
10483
10484 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10485 doc: /* Specify the coding system for read operations.
10486 It is useful to bind this variable with `let', but do not set it globally.
10487 If the value is a coding system, it is used for decoding on read operation.
10488 If not, an appropriate element is used from one of the coding system alists.
10489 There are three such tables: `file-coding-system-alist',
10490 `process-coding-system-alist', and `network-coding-system-alist'. */);
10491 Vcoding_system_for_read = Qnil;
10492
10493 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10494 doc: /* Specify the coding system for write operations.
10495 Programs bind this variable with `let', but you should not set it globally.
10496 If the value is a coding system, it is used for encoding of output,
10497 when writing it to a file and when sending it to a file or subprocess.
10498
10499 If this does not specify a coding system, an appropriate element
10500 is used from one of the coding system alists.
10501 There are three such tables: `file-coding-system-alist',
10502 `process-coding-system-alist', and `network-coding-system-alist'.
10503 For output to files, if the above procedure does not specify a coding system,
10504 the value of `buffer-file-coding-system' is used. */);
10505 Vcoding_system_for_write = Qnil;
10506
10507 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10508 doc: /*
10509 Coding system used in the latest file or process I/O. */);
10510 Vlast_coding_system_used = Qnil;
10511
10512 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10513 doc: /*
10514 Error status of the last code conversion.
10515
10516 When an error was detected in the last code conversion, this variable
10517 is set to one of the following symbols.
10518 `insufficient-source'
10519 `inconsistent-eol'
10520 `invalid-source'
10521 `interrupted'
10522 `insufficient-memory'
10523 When no error was detected, the value doesn't change. So, to check
10524 the error status of a code conversion by this variable, you must
10525 explicitly set this variable to nil before performing code
10526 conversion. */);
10527 Vlast_code_conversion_error = Qnil;
10528
10529 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10530 doc: /*
10531 *Non-nil means always inhibit code conversion of end-of-line format.
10532 See info node `Coding Systems' and info node `Text and Binary' concerning
10533 such conversion. */);
10534 inhibit_eol_conversion = 0;
10535
10536 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10537 doc: /*
10538 Non-nil means process buffer inherits coding system of process output.
10539 Bind it to t if the process output is to be treated as if it were a file
10540 read from some filesystem. */);
10541 inherit_process_coding_system = 0;
10542
10543 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10544 doc: /*
10545 Alist to decide a coding system to use for a file I/O operation.
10546 The format is ((PATTERN . VAL) ...),
10547 where PATTERN is a regular expression matching a file name,
10548 VAL is a coding system, a cons of coding systems, or a function symbol.
10549 If VAL is a coding system, it is used for both decoding and encoding
10550 the file contents.
10551 If VAL is a cons of coding systems, the car part is used for decoding,
10552 and the cdr part is used for encoding.
10553 If VAL is a function symbol, the function must return a coding system
10554 or a cons of coding systems which are used as above. The function is
10555 called with an argument that is a list of the arguments with which
10556 `find-operation-coding-system' was called. If the function can't decide
10557 a coding system, it can return `undecided' so that the normal
10558 code-detection is performed.
10559
10560 See also the function `find-operation-coding-system'
10561 and the variable `auto-coding-alist'. */);
10562 Vfile_coding_system_alist = Qnil;
10563
10564 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10565 doc: /*
10566 Alist to decide a coding system to use for a process I/O operation.
10567 The format is ((PATTERN . VAL) ...),
10568 where PATTERN is a regular expression matching a program name,
10569 VAL is a coding system, a cons of coding systems, or a function symbol.
10570 If VAL is a coding system, it is used for both decoding what received
10571 from the program and encoding what sent to the program.
10572 If VAL is a cons of coding systems, the car part is used for decoding,
10573 and the cdr part is used for encoding.
10574 If VAL is a function symbol, the function must return a coding system
10575 or a cons of coding systems which are used as above.
10576
10577 See also the function `find-operation-coding-system'. */);
10578 Vprocess_coding_system_alist = Qnil;
10579
10580 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10581 doc: /*
10582 Alist to decide a coding system to use for a network I/O operation.
10583 The format is ((PATTERN . VAL) ...),
10584 where PATTERN is a regular expression matching a network service name
10585 or is a port number to connect to,
10586 VAL is a coding system, a cons of coding systems, or a function symbol.
10587 If VAL is a coding system, it is used for both decoding what received
10588 from the network stream and encoding what sent to the network stream.
10589 If VAL is a cons of coding systems, the car part is used for decoding,
10590 and the cdr part is used for encoding.
10591 If VAL is a function symbol, the function must return a coding system
10592 or a cons of coding systems which are used as above.
10593
10594 See also the function `find-operation-coding-system'. */);
10595 Vnetwork_coding_system_alist = Qnil;
10596
10597 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10598 doc: /* Coding system to use with system messages.
10599 Also used for decoding keyboard input on X Window system. */);
10600 Vlocale_coding_system = Qnil;
10601
10602 /* The eol mnemonics are reset in startup.el system-dependently. */
10603 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10604 doc: /*
10605 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10606 eol_mnemonic_unix = build_pure_c_string (":");
10607
10608 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10609 doc: /*
10610 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10611 eol_mnemonic_dos = build_pure_c_string ("\\");
10612
10613 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10614 doc: /*
10615 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10616 eol_mnemonic_mac = build_pure_c_string ("/");
10617
10618 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10619 doc: /*
10620 *String displayed in mode line when end-of-line format is not yet determined. */);
10621 eol_mnemonic_undecided = build_pure_c_string (":");
10622
10623 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10624 doc: /*
10625 *Non-nil enables character translation while encoding and decoding. */);
10626 Venable_character_translation = Qt;
10627
10628 DEFVAR_LISP ("standard-translation-table-for-decode",
10629 Vstandard_translation_table_for_decode,
10630 doc: /* Table for translating characters while decoding. */);
10631 Vstandard_translation_table_for_decode = Qnil;
10632
10633 DEFVAR_LISP ("standard-translation-table-for-encode",
10634 Vstandard_translation_table_for_encode,
10635 doc: /* Table for translating characters while encoding. */);
10636 Vstandard_translation_table_for_encode = Qnil;
10637
10638 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10639 doc: /* Alist of charsets vs revision numbers.
10640 While encoding, if a charset (car part of an element) is found,
10641 designate it with the escape sequence identifying revision (cdr part
10642 of the element). */);
10643 Vcharset_revision_table = Qnil;
10644
10645 DEFVAR_LISP ("default-process-coding-system",
10646 Vdefault_process_coding_system,
10647 doc: /* Cons of coding systems used for process I/O by default.
10648 The car part is used for decoding a process output,
10649 the cdr part is used for encoding a text to be sent to a process. */);
10650 Vdefault_process_coding_system = Qnil;
10651
10652 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10653 doc: /*
10654 Table of extra Latin codes in the range 128..159 (inclusive).
10655 This is a vector of length 256.
10656 If Nth element is non-nil, the existence of code N in a file
10657 \(or output of subprocess) doesn't prevent it to be detected as
10658 a coding system of ISO 2022 variant which has a flag
10659 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10660 or reading output of a subprocess.
10661 Only 128th through 159th elements have a meaning. */);
10662 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10663
10664 DEFVAR_LISP ("select-safe-coding-system-function",
10665 Vselect_safe_coding_system_function,
10666 doc: /*
10667 Function to call to select safe coding system for encoding a text.
10668
10669 If set, this function is called to force a user to select a proper
10670 coding system which can encode the text in the case that a default
10671 coding system used in each operation can't encode the text. The
10672 function should take care that the buffer is not modified while
10673 the coding system is being selected.
10674
10675 The default value is `select-safe-coding-system' (which see). */);
10676 Vselect_safe_coding_system_function = Qnil;
10677
10678 DEFVAR_BOOL ("coding-system-require-warning",
10679 coding_system_require_warning,
10680 doc: /* Internal use only.
10681 If non-nil, on writing a file, `select-safe-coding-system-function' is
10682 called even if `coding-system-for-write' is non-nil. The command
10683 `universal-coding-system-argument' binds this variable to t temporarily. */);
10684 coding_system_require_warning = 0;
10685
10686
10687 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10688 inhibit_iso_escape_detection,
10689 doc: /*
10690 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10691
10692 When Emacs reads text, it tries to detect how the text is encoded.
10693 This code detection is sensitive to escape sequences. If Emacs sees
10694 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10695 of the ISO2022 encodings, and decodes text by the corresponding coding
10696 system (e.g. `iso-2022-7bit').
10697
10698 However, there may be a case that you want to read escape sequences in
10699 a file as is. In such a case, you can set this variable to non-nil.
10700 Then the code detection will ignore any escape sequences, and no text is
10701 detected as encoded in some ISO-2022 encoding. The result is that all
10702 escape sequences become visible in a buffer.
10703
10704 The default value is nil, and it is strongly recommended not to change
10705 it. That is because many Emacs Lisp source files that contain
10706 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10707 in Emacs's distribution, and they won't be decoded correctly on
10708 reading if you suppress escape sequence detection.
10709
10710 The other way to read escape sequences in a file without decoding is
10711 to explicitly specify some coding system that doesn't use ISO-2022
10712 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10713 inhibit_iso_escape_detection = 0;
10714
10715 DEFVAR_BOOL ("inhibit-null-byte-detection",
10716 inhibit_null_byte_detection,
10717 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10718 By default, Emacs treats it as binary data, and does not attempt to
10719 decode it. The effect is as if you specified `no-conversion' for
10720 reading that text.
10721
10722 Set this to non-nil when a regular text happens to include null bytes.
10723 Examples are Index nodes of Info files and null-byte delimited output
10724 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10725 decode text as usual. */);
10726 inhibit_null_byte_detection = 0;
10727
10728 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10729 doc: /* Char table for translating self-inserting characters.
10730 This is applied to the result of input methods, not their input.
10731 See also `keyboard-translate-table'.
10732
10733 Use of this variable for character code unification was rendered
10734 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10735 internal character representation. */);
10736 Vtranslation_table_for_input = Qnil;
10737
10738 {
10739 Lisp_Object args[coding_arg_max];
10740 Lisp_Object plist[16];
10741 int i;
10742
10743 for (i = 0; i < coding_arg_max; i++)
10744 args[i] = Qnil;
10745
10746 plist[0] = intern_c_string (":name");
10747 plist[1] = args[coding_arg_name] = Qno_conversion;
10748 plist[2] = intern_c_string (":mnemonic");
10749 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10750 plist[4] = intern_c_string (":coding-type");
10751 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10752 plist[6] = intern_c_string (":ascii-compatible-p");
10753 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10754 plist[8] = intern_c_string (":default-char");
10755 plist[9] = args[coding_arg_default_char] = make_number (0);
10756 plist[10] = intern_c_string (":for-unibyte");
10757 plist[11] = args[coding_arg_for_unibyte] = Qt;
10758 plist[12] = intern_c_string (":docstring");
10759 plist[13] = build_pure_c_string ("Do no conversion.\n\
10760 \n\
10761 When you visit a file with this coding, the file is read into a\n\
10762 unibyte buffer as is, thus each byte of a file is treated as a\n\
10763 character.");
10764 plist[14] = intern_c_string (":eol-type");
10765 plist[15] = args[coding_arg_eol_type] = Qunix;
10766 args[coding_arg_plist] = Flist (16, plist);
10767 Fdefine_coding_system_internal (coding_arg_max, args);
10768
10769 plist[1] = args[coding_arg_name] = Qundecided;
10770 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10771 plist[5] = args[coding_arg_coding_type] = Qundecided;
10772 /* This is already set.
10773 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10774 plist[8] = intern_c_string (":charset-list");
10775 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10776 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10777 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10778 plist[15] = args[coding_arg_eol_type] = Qnil;
10779 args[coding_arg_plist] = Flist (16, plist);
10780 Fdefine_coding_system_internal (coding_arg_max, args);
10781 }
10782
10783 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10784
10785 {
10786 int i;
10787
10788 for (i = 0; i < coding_category_max; i++)
10789 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10790 }
10791 #if defined (DOS_NT)
10792 system_eol_type = Qdos;
10793 #else
10794 system_eol_type = Qunix;
10795 #endif
10796 staticpro (&system_eol_type);
10797 }
10798
10799 char *
10800 emacs_strerror (int error_number)
10801 {
10802 char *str;
10803
10804 synchronize_system_messages_locale ();
10805 str = strerror (error_number);
10806
10807 if (! NILP (Vlocale_coding_system))
10808 {
10809 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10810 Vlocale_coding_system,
10811 0);
10812 str = SSDATA (dec);
10813 }
10814
10815 return str;
10816 }
10817
10818 #endif /* emacs */