* coding.c (coding_set_source, coding_set_destination):
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001-2011 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7 Copyright (C) 2003
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
10
11 This file is part of GNU Emacs.
12
13 GNU Emacs is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
17
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
158 {
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
161 int multibytep = coding->src_multibyte;
162 int consumed_chars = 0;
163 int found = 0;
164 ...;
165
166 while (1)
167 {
168 /* Get one byte from the source. If the source is exhausted, jump
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
171
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
176 }
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0;
180
181 no_more_source:
182 /* The source exhausted successfully. */
183 detect_info->found |= found;
184 return 1;
185 }
186 #endif
187
188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
194
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
199
200 Below is the template of these functions. */
201
202 #if 0
203 static void
204 decode_coding_XXXX (struct coding_system *coding)
205 {
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
211 const unsigned char *src_base;
212 /* A buffer to produce decoded characters. */
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
215 int multibytep = coding->src_multibyte;
216
217 while (1)
218 {
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
225 }
226
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
239 }
240 #endif
241
242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
248
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
253
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce a encoded text until it
256 reaches at the head of not-yet-encoded source text.
257
258 Below is a template of these functions. */
259 #if 0
260 static void
261 encode_coding_XXX (struct coding_system *coding)
262 {
263 int multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
269 int produced_chars = 0;
270
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
272 {
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
275 }
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
280 }
281 #endif
282
283 \f
284 /*** 1. Preamble ***/
285
286 #include <config.h>
287 #include <stdio.h>
288 #include <setjmp.h>
289
290 #include "lisp.h"
291 #include "buffer.h"
292 #include "character.h"
293 #include "charset.h"
294 #include "ccl.h"
295 #include "composite.h"
296 #include "coding.h"
297 #include "window.h"
298 #include "frame.h"
299 #include "termhooks.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 Lisp_Object Qbuffer_file_coding_system;
306 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
307 Lisp_Object Qdefault_char;
308 Lisp_Object Qno_conversion, Qundecided;
309 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
310 Lisp_Object Qbig, Qlittle;
311 Lisp_Object Qcoding_system_history;
312 Lisp_Object Qvalid_codes;
313 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
314 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
315 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
316 Lisp_Object QCascii_compatible_p;
317
318 Lisp_Object Qcall_process, Qcall_process_region;
319 Lisp_Object Qstart_process, Qopen_network_stream;
320 Lisp_Object Qtarget_idx;
321
322 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
323 Lisp_Object Qinterrupted, Qinsufficient_memory;
324
325 /* If a symbol has this property, evaluate the value to define the
326 symbol as a coding system. */
327 static Lisp_Object Qcoding_system_define_form;
328
329 /* Format of end-of-line decided by system. This is Qunix on
330 Unix and Mac, Qdos on DOS/Windows.
331 This has an effect only for external encoding (i.e. for output to
332 file and process), not for in-buffer or Lisp string encoding. */
333 static Lisp_Object system_eol_type;
334
335 #ifdef emacs
336
337 Lisp_Object Qcoding_system_p, Qcoding_system_error;
338
339 /* Coding system emacs-mule and raw-text are for converting only
340 end-of-line format. */
341 Lisp_Object Qemacs_mule, Qraw_text;
342 Lisp_Object Qutf_8_emacs;
343
344 /* Coding-systems are handed between Emacs Lisp programs and C internal
345 routines by the following three variables. */
346 /* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348 struct coding_system safe_terminal_coding;
349
350 #endif /* emacs */
351
352 Lisp_Object Qtranslation_table;
353 Lisp_Object Qtranslation_table_id;
354 Lisp_Object Qtranslation_table_for_decode;
355 Lisp_Object Qtranslation_table_for_encode;
356
357 /* Two special coding systems. */
358 Lisp_Object Vsjis_coding_system;
359 Lisp_Object Vbig5_coding_system;
360
361 /* ISO2022 section */
362
363 #define CODING_ISO_INITIAL(coding, reg) \
364 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
365 coding_attr_iso_initial), \
366 reg)))
367
368
369 #define CODING_ISO_REQUEST(coding, charset_id) \
370 (((charset_id) <= (coding)->max_charset_id \
371 ? ((coding)->safe_charsets[charset_id] != 255 \
372 ? (coding)->safe_charsets[charset_id] \
373 : -1) \
374 : -1))
375
376
377 #define CODING_ISO_FLAGS(coding) \
378 ((coding)->spec.iso_2022.flags)
379 #define CODING_ISO_DESIGNATION(coding, reg) \
380 ((coding)->spec.iso_2022.current_designation[reg])
381 #define CODING_ISO_INVOCATION(coding, plane) \
382 ((coding)->spec.iso_2022.current_invocation[plane])
383 #define CODING_ISO_SINGLE_SHIFTING(coding) \
384 ((coding)->spec.iso_2022.single_shifting)
385 #define CODING_ISO_BOL(coding) \
386 ((coding)->spec.iso_2022.bol)
387 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
388 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
389 #define CODING_ISO_CMP_STATUS(coding) \
390 (&(coding)->spec.iso_2022.cmp_status)
391 #define CODING_ISO_EXTSEGMENT_LEN(coding) \
392 ((coding)->spec.iso_2022.ctext_extended_segment_len)
393 #define CODING_ISO_EMBEDDED_UTF_8(coding) \
394 ((coding)->spec.iso_2022.embedded_utf_8)
395
396 /* Control characters of ISO2022. */
397 /* code */ /* function */
398 #define ISO_CODE_LF 0x0A /* line-feed */
399 #define ISO_CODE_CR 0x0D /* carriage-return */
400 #define ISO_CODE_SO 0x0E /* shift-out */
401 #define ISO_CODE_SI 0x0F /* shift-in */
402 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
403 #define ISO_CODE_ESC 0x1B /* escape */
404 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
405 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
406 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
407
408 /* All code (1-byte) of ISO2022 is classified into one of the
409 followings. */
410 enum iso_code_class_type
411 {
412 ISO_control_0, /* Control codes in the range
413 0x00..0x1F and 0x7F, except for the
414 following 5 codes. */
415 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
416 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
417 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
418 ISO_escape, /* ISO_CODE_SO (0x1B) */
419 ISO_control_1, /* Control codes in the range
420 0x80..0x9F, except for the
421 following 3 codes. */
422 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
423 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
424 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
425 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
426 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
427 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
428 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
429 };
430
431 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
432 `iso-flags' attribute of an iso2022 coding system. */
433
434 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
435 instead of the correct short-form sequence (e.g. ESC $ A). */
436 #define CODING_ISO_FLAG_LONG_FORM 0x0001
437
438 /* If set, reset graphic planes and registers at end-of-line to the
439 initial state. */
440 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
441
442 /* If set, reset graphic planes and registers before any control
443 characters to the initial state. */
444 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
445
446 /* If set, encode by 7-bit environment. */
447 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
448
449 /* If set, use locking-shift function. */
450 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
451
452 /* If set, use single-shift function. Overwrite
453 CODING_ISO_FLAG_LOCKING_SHIFT. */
454 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
455
456 /* If set, use designation escape sequence. */
457 #define CODING_ISO_FLAG_DESIGNATION 0x0040
458
459 /* If set, produce revision number sequence. */
460 #define CODING_ISO_FLAG_REVISION 0x0080
461
462 /* If set, produce ISO6429's direction specifying sequence. */
463 #define CODING_ISO_FLAG_DIRECTION 0x0100
464
465 /* If set, assume designation states are reset at beginning of line on
466 output. */
467 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
468
469 /* If set, designation sequence should be placed at beginning of line
470 on output. */
471 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
472
473 /* If set, do not encode unsafe characters on output. */
474 #define CODING_ISO_FLAG_SAFE 0x0800
475
476 /* If set, extra latin codes (128..159) are accepted as a valid code
477 on input. */
478 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
479
480 #define CODING_ISO_FLAG_COMPOSITION 0x2000
481
482 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
483
484 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
485
486 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
487
488 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
489
490 /* A character to be produced on output if encoding of the original
491 character is prohibited by CODING_ISO_FLAG_SAFE. */
492 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
493
494 /* UTF-8 section */
495 #define CODING_UTF_8_BOM(coding) \
496 ((coding)->spec.utf_8_bom)
497
498 /* UTF-16 section */
499 #define CODING_UTF_16_BOM(coding) \
500 ((coding)->spec.utf_16.bom)
501
502 #define CODING_UTF_16_ENDIAN(coding) \
503 ((coding)->spec.utf_16.endian)
504
505 #define CODING_UTF_16_SURROGATE(coding) \
506 ((coding)->spec.utf_16.surrogate)
507
508
509 /* CCL section */
510 #define CODING_CCL_DECODER(coding) \
511 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
512 #define CODING_CCL_ENCODER(coding) \
513 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
514 #define CODING_CCL_VALIDS(coding) \
515 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
516
517 /* Index for each coding category in `coding_categories' */
518
519 enum coding_category
520 {
521 coding_category_iso_7,
522 coding_category_iso_7_tight,
523 coding_category_iso_8_1,
524 coding_category_iso_8_2,
525 coding_category_iso_7_else,
526 coding_category_iso_8_else,
527 coding_category_utf_8_auto,
528 coding_category_utf_8_nosig,
529 coding_category_utf_8_sig,
530 coding_category_utf_16_auto,
531 coding_category_utf_16_be,
532 coding_category_utf_16_le,
533 coding_category_utf_16_be_nosig,
534 coding_category_utf_16_le_nosig,
535 coding_category_charset,
536 coding_category_sjis,
537 coding_category_big5,
538 coding_category_ccl,
539 coding_category_emacs_mule,
540 /* All above are targets of code detection. */
541 coding_category_raw_text,
542 coding_category_undecided,
543 coding_category_max
544 };
545
546 /* Definitions of flag bits used in detect_coding_XXXX. */
547 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
548 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
549 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
550 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
551 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
552 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
553 #define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
554 #define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
555 #define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
556 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
557 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
558 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
559 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
560 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
561 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
562 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
563 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
564 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
565 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
566 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
567
568 /* This value is returned if detect_coding_mask () find nothing other
569 than ASCII characters. */
570 #define CATEGORY_MASK_ANY \
571 (CATEGORY_MASK_ISO_7 \
572 | CATEGORY_MASK_ISO_7_TIGHT \
573 | CATEGORY_MASK_ISO_8_1 \
574 | CATEGORY_MASK_ISO_8_2 \
575 | CATEGORY_MASK_ISO_7_ELSE \
576 | CATEGORY_MASK_ISO_8_ELSE \
577 | CATEGORY_MASK_UTF_8_AUTO \
578 | CATEGORY_MASK_UTF_8_NOSIG \
579 | CATEGORY_MASK_UTF_8_SIG \
580 | CATEGORY_MASK_UTF_16_AUTO \
581 | CATEGORY_MASK_UTF_16_BE \
582 | CATEGORY_MASK_UTF_16_LE \
583 | CATEGORY_MASK_UTF_16_BE_NOSIG \
584 | CATEGORY_MASK_UTF_16_LE_NOSIG \
585 | CATEGORY_MASK_CHARSET \
586 | CATEGORY_MASK_SJIS \
587 | CATEGORY_MASK_BIG5 \
588 | CATEGORY_MASK_CCL \
589 | CATEGORY_MASK_EMACS_MULE)
590
591
592 #define CATEGORY_MASK_ISO_7BIT \
593 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
594
595 #define CATEGORY_MASK_ISO_8BIT \
596 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
597
598 #define CATEGORY_MASK_ISO_ELSE \
599 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
600
601 #define CATEGORY_MASK_ISO_ESCAPE \
602 (CATEGORY_MASK_ISO_7 \
603 | CATEGORY_MASK_ISO_7_TIGHT \
604 | CATEGORY_MASK_ISO_7_ELSE \
605 | CATEGORY_MASK_ISO_8_ELSE)
606
607 #define CATEGORY_MASK_ISO \
608 ( CATEGORY_MASK_ISO_7BIT \
609 | CATEGORY_MASK_ISO_8BIT \
610 | CATEGORY_MASK_ISO_ELSE)
611
612 #define CATEGORY_MASK_UTF_16 \
613 (CATEGORY_MASK_UTF_16_AUTO \
614 | CATEGORY_MASK_UTF_16_BE \
615 | CATEGORY_MASK_UTF_16_LE \
616 | CATEGORY_MASK_UTF_16_BE_NOSIG \
617 | CATEGORY_MASK_UTF_16_LE_NOSIG)
618
619 #define CATEGORY_MASK_UTF_8 \
620 (CATEGORY_MASK_UTF_8_AUTO \
621 | CATEGORY_MASK_UTF_8_NOSIG \
622 | CATEGORY_MASK_UTF_8_SIG)
623
624 /* Table of coding categories (Lisp symbols). This variable is for
625 internal use only. */
626 static Lisp_Object Vcoding_category_table;
627
628 /* Table of coding-categories ordered by priority. */
629 static enum coding_category coding_priorities[coding_category_max];
630
631 /* Nth element is a coding context for the coding system bound to the
632 Nth coding category. */
633 static struct coding_system coding_categories[coding_category_max];
634
635 /*** Commonly used macros and functions ***/
636
637 #ifndef min
638 #define min(a, b) ((a) < (b) ? (a) : (b))
639 #endif
640 #ifndef max
641 #define max(a, b) ((a) > (b) ? (a) : (b))
642 #endif
643
644 #define CODING_GET_INFO(coding, attrs, charset_list) \
645 do { \
646 (attrs) = CODING_ID_ATTRS ((coding)->id); \
647 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
648 } while (0)
649
650
651 /* Safely get one byte from the source text pointed by SRC which ends
652 at SRC_END, and set C to that byte. If there are not enough bytes
653 in the source, it jumps to `no_more_source'. If multibytep is
654 nonzero, and a multibyte character is found at SRC, set C to the
655 negative value of the character code. The caller should declare
656 and set these variables appropriately in advance:
657 src, src_end, multibytep */
658
659 #define ONE_MORE_BYTE(c) \
660 do { \
661 if (src == src_end) \
662 { \
663 if (src_base < src) \
664 record_conversion_result \
665 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
666 goto no_more_source; \
667 } \
668 c = *src++; \
669 if (multibytep && (c & 0x80)) \
670 { \
671 if ((c & 0xFE) == 0xC0) \
672 c = ((c & 1) << 6) | *src++; \
673 else \
674 { \
675 src--; \
676 c = - string_char (src, &src, NULL); \
677 record_conversion_result \
678 (coding, CODING_RESULT_INVALID_SRC); \
679 } \
680 } \
681 consumed_chars++; \
682 } while (0)
683
684 /* Safely get two bytes from the source text pointed by SRC which ends
685 at SRC_END, and set C1 and C2 to those bytes while skipping the
686 heading multibyte characters. If there are not enough bytes in the
687 source, it jumps to `no_more_source'. If multibytep is nonzero and
688 a multibyte character is found for C2, set C2 to the negative value
689 of the character code. The caller should declare and set these
690 variables appropriately in advance:
691 src, src_end, multibytep
692 It is intended that this macro is used in detect_coding_utf_16. */
693
694 #define TWO_MORE_BYTES(c1, c2) \
695 do { \
696 do { \
697 if (src == src_end) \
698 goto no_more_source; \
699 c1 = *src++; \
700 if (multibytep && (c1 & 0x80)) \
701 { \
702 if ((c1 & 0xFE) == 0xC0) \
703 c1 = ((c1 & 1) << 6) | *src++; \
704 else \
705 { \
706 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
707 c1 = -1; \
708 } \
709 } \
710 } while (c1 < 0); \
711 if (src == src_end) \
712 goto no_more_source; \
713 c2 = *src++; \
714 if (multibytep && (c2 & 0x80)) \
715 { \
716 if ((c2 & 0xFE) == 0xC0) \
717 c2 = ((c2 & 1) << 6) | *src++; \
718 else \
719 c2 = -1; \
720 } \
721 } while (0)
722
723
724 #define ONE_MORE_BYTE_NO_CHECK(c) \
725 do { \
726 c = *src++; \
727 if (multibytep && (c & 0x80)) \
728 { \
729 if ((c & 0xFE) == 0xC0) \
730 c = ((c & 1) << 6) | *src++; \
731 else \
732 { \
733 src--; \
734 c = - string_char (src, &src, NULL); \
735 record_conversion_result \
736 (coding, CODING_RESULT_INVALID_SRC); \
737 } \
738 } \
739 consumed_chars++; \
740 } while (0)
741
742
743 /* Store a byte C in the place pointed by DST and increment DST to the
744 next free point, and increment PRODUCED_CHARS. The caller should
745 assure that C is 0..127, and declare and set the variable `dst'
746 appropriately in advance.
747 */
748
749
750 #define EMIT_ONE_ASCII_BYTE(c) \
751 do { \
752 produced_chars++; \
753 *dst++ = (c); \
754 } while (0)
755
756
757 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
758
759 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
760 do { \
761 produced_chars += 2; \
762 *dst++ = (c1), *dst++ = (c2); \
763 } while (0)
764
765
766 /* Store a byte C in the place pointed by DST and increment DST to the
767 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
768 nonzero, store in an appropriate multibyte from. The caller should
769 declare and set the variables `dst' and `multibytep' appropriately
770 in advance. */
771
772 #define EMIT_ONE_BYTE(c) \
773 do { \
774 produced_chars++; \
775 if (multibytep) \
776 { \
777 int ch = (c); \
778 if (ch >= 0x80) \
779 ch = BYTE8_TO_CHAR (ch); \
780 CHAR_STRING_ADVANCE (ch, dst); \
781 } \
782 else \
783 *dst++ = (c); \
784 } while (0)
785
786
787 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
788
789 #define EMIT_TWO_BYTES(c1, c2) \
790 do { \
791 produced_chars += 2; \
792 if (multibytep) \
793 { \
794 int ch; \
795 \
796 ch = (c1); \
797 if (ch >= 0x80) \
798 ch = BYTE8_TO_CHAR (ch); \
799 CHAR_STRING_ADVANCE (ch, dst); \
800 ch = (c2); \
801 if (ch >= 0x80) \
802 ch = BYTE8_TO_CHAR (ch); \
803 CHAR_STRING_ADVANCE (ch, dst); \
804 } \
805 else \
806 { \
807 *dst++ = (c1); \
808 *dst++ = (c2); \
809 } \
810 } while (0)
811
812
813 #define EMIT_THREE_BYTES(c1, c2, c3) \
814 do { \
815 EMIT_ONE_BYTE (c1); \
816 EMIT_TWO_BYTES (c2, c3); \
817 } while (0)
818
819
820 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
821 do { \
822 EMIT_TWO_BYTES (c1, c2); \
823 EMIT_TWO_BYTES (c3, c4); \
824 } while (0)
825
826
827 /* Prototypes for static functions. */
828 static void record_conversion_result (struct coding_system *coding,
829 enum coding_result_code result);
830 static int detect_coding_utf_8 (struct coding_system *,
831 struct coding_detection_info *info);
832 static void decode_coding_utf_8 (struct coding_system *);
833 static int encode_coding_utf_8 (struct coding_system *);
834
835 static int detect_coding_utf_16 (struct coding_system *,
836 struct coding_detection_info *info);
837 static void decode_coding_utf_16 (struct coding_system *);
838 static int encode_coding_utf_16 (struct coding_system *);
839
840 static int detect_coding_iso_2022 (struct coding_system *,
841 struct coding_detection_info *info);
842 static void decode_coding_iso_2022 (struct coding_system *);
843 static int encode_coding_iso_2022 (struct coding_system *);
844
845 static int detect_coding_emacs_mule (struct coding_system *,
846 struct coding_detection_info *info);
847 static void decode_coding_emacs_mule (struct coding_system *);
848 static int encode_coding_emacs_mule (struct coding_system *);
849
850 static int detect_coding_sjis (struct coding_system *,
851 struct coding_detection_info *info);
852 static void decode_coding_sjis (struct coding_system *);
853 static int encode_coding_sjis (struct coding_system *);
854
855 static int detect_coding_big5 (struct coding_system *,
856 struct coding_detection_info *info);
857 static void decode_coding_big5 (struct coding_system *);
858 static int encode_coding_big5 (struct coding_system *);
859
860 static int detect_coding_ccl (struct coding_system *,
861 struct coding_detection_info *info);
862 static void decode_coding_ccl (struct coding_system *);
863 static int encode_coding_ccl (struct coding_system *);
864
865 static void decode_coding_raw_text (struct coding_system *);
866 static int encode_coding_raw_text (struct coding_system *);
867
868 static void coding_set_source (struct coding_system *);
869 static void coding_set_destination (struct coding_system *);
870 static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
871 static void coding_alloc_by_making_gap (struct coding_system *,
872 EMACS_INT, EMACS_INT);
873 static unsigned char *alloc_destination (struct coding_system *,
874 EMACS_INT, unsigned char *);
875 static void setup_iso_safe_charsets (Lisp_Object);
876 static unsigned char *encode_designation_at_bol (struct coding_system *,
877 int *, int *,
878 unsigned char *);
879 static int detect_eol (const unsigned char *,
880 EMACS_INT, enum coding_category);
881 static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
882 static void decode_eol (struct coding_system *);
883 static Lisp_Object get_translation_table (Lisp_Object, int, int *);
884 static Lisp_Object get_translation (Lisp_Object, int *, int *);
885 static int produce_chars (struct coding_system *, Lisp_Object, int);
886 static INLINE void produce_charset (struct coding_system *, int *,
887 EMACS_INT);
888 static void produce_annotation (struct coding_system *, EMACS_INT);
889 static int decode_coding (struct coding_system *);
890 static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
891 struct coding_system *,
892 int *, EMACS_INT *);
893 static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
894 struct coding_system *,
895 int *, EMACS_INT *);
896 static void consume_chars (struct coding_system *, Lisp_Object, int);
897 static int encode_coding (struct coding_system *);
898 static Lisp_Object make_conversion_work_buffer (int);
899 static Lisp_Object code_conversion_restore (Lisp_Object);
900 static INLINE int char_encodable_p (int, Lisp_Object);
901 static Lisp_Object make_subsidiaries (Lisp_Object);
902
903 static void
904 record_conversion_result (struct coding_system *coding,
905 enum coding_result_code result)
906 {
907 coding->result = result;
908 switch (result)
909 {
910 case CODING_RESULT_INSUFFICIENT_SRC:
911 Vlast_code_conversion_error = Qinsufficient_source;
912 break;
913 case CODING_RESULT_INCONSISTENT_EOL:
914 Vlast_code_conversion_error = Qinconsistent_eol;
915 break;
916 case CODING_RESULT_INVALID_SRC:
917 Vlast_code_conversion_error = Qinvalid_source;
918 break;
919 case CODING_RESULT_INTERRUPT:
920 Vlast_code_conversion_error = Qinterrupted;
921 break;
922 case CODING_RESULT_INSUFFICIENT_MEM:
923 Vlast_code_conversion_error = Qinsufficient_memory;
924 break;
925 case CODING_RESULT_INSUFFICIENT_DST:
926 /* Don't record this error in Vlast_code_conversion_error
927 because it happens just temporarily and is resolved when the
928 whole conversion is finished. */
929 break;
930 case CODING_RESULT_SUCCESS:
931 break;
932 default:
933 Vlast_code_conversion_error = intern ("Unknown error");
934 }
935 }
936
937 /* This wrapper macro is used to preserve validity of pointers into
938 buffer text across calls to decode_char, which could cause
939 relocation of buffers if it loads a charset map, because loading a
940 charset map allocates large structures. */
941 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
942 do { \
943 charset_map_loaded = 0; \
944 c = DECODE_CHAR (charset, code); \
945 if (charset_map_loaded) \
946 { \
947 const unsigned char *orig = coding->source; \
948 EMACS_INT offset; \
949 \
950 coding_set_source (coding); \
951 offset = coding->source - orig; \
952 src += offset; \
953 src_base += offset; \
954 src_end += offset; \
955 } \
956 } while (0)
957
958
959 /* If there are at least BYTES length of room at dst, allocate memory
960 for coding->destination and update dst and dst_end. We don't have
961 to take care of coding->source which will be relocated. It is
962 handled by calling coding_set_source in encode_coding. */
963
964 #define ASSURE_DESTINATION(bytes) \
965 do { \
966 if (dst + (bytes) >= dst_end) \
967 { \
968 int more_bytes = charbuf_end - charbuf + (bytes); \
969 \
970 dst = alloc_destination (coding, more_bytes, dst); \
971 dst_end = coding->destination + coding->dst_bytes; \
972 } \
973 } while (0)
974
975
976 /* Store multibyte form of the character C in P, and advance P to the
977 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
978 never calls MAYBE_UNIFY_CHAR. */
979
980 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
981 do { \
982 if ((c) <= MAX_1_BYTE_CHAR) \
983 *(p)++ = (c); \
984 else if ((c) <= MAX_2_BYTE_CHAR) \
985 *(p)++ = (0xC0 | ((c) >> 6)), \
986 *(p)++ = (0x80 | ((c) & 0x3F)); \
987 else if ((c) <= MAX_3_BYTE_CHAR) \
988 *(p)++ = (0xE0 | ((c) >> 12)), \
989 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
990 *(p)++ = (0x80 | ((c) & 0x3F)); \
991 else if ((c) <= MAX_4_BYTE_CHAR) \
992 *(p)++ = (0xF0 | (c >> 18)), \
993 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
994 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
995 *(p)++ = (0x80 | (c & 0x3F)); \
996 else if ((c) <= MAX_5_BYTE_CHAR) \
997 *(p)++ = 0xF8, \
998 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
999 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
1000 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1001 *(p)++ = (0x80 | (c & 0x3F)); \
1002 else \
1003 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
1004 } while (0)
1005
1006
1007 /* Return the character code of character whose multibyte form is at
1008 P, and advance P to the end of the multibyte form. This is like
1009 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1010
1011 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1012 (!((p)[0] & 0x80) \
1013 ? *(p)++ \
1014 : ! ((p)[0] & 0x20) \
1015 ? ((p) += 2, \
1016 ((((p)[-2] & 0x1F) << 6) \
1017 | ((p)[-1] & 0x3F) \
1018 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1019 : ! ((p)[0] & 0x10) \
1020 ? ((p) += 3, \
1021 ((((p)[-3] & 0x0F) << 12) \
1022 | (((p)[-2] & 0x3F) << 6) \
1023 | ((p)[-1] & 0x3F))) \
1024 : ! ((p)[0] & 0x08) \
1025 ? ((p) += 4, \
1026 ((((p)[-4] & 0xF) << 18) \
1027 | (((p)[-3] & 0x3F) << 12) \
1028 | (((p)[-2] & 0x3F) << 6) \
1029 | ((p)[-1] & 0x3F))) \
1030 : ((p) += 5, \
1031 ((((p)[-4] & 0x3F) << 18) \
1032 | (((p)[-3] & 0x3F) << 12) \
1033 | (((p)[-2] & 0x3F) << 6) \
1034 | ((p)[-1] & 0x3F))))
1035
1036
1037 static void
1038 coding_set_source (struct coding_system *coding)
1039 {
1040 if (BUFFERP (coding->src_object))
1041 {
1042 struct buffer *buf = XBUFFER (coding->src_object);
1043
1044 if (coding->src_pos < 0)
1045 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1046 else
1047 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1048 }
1049 else if (STRINGP (coding->src_object))
1050 {
1051 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1052 }
1053 else
1054 {
1055 /* Otherwise, the source is C string and is never relocated
1056 automatically. Thus we don't have to update anything. */
1057 }
1058 }
1059
1060 static void
1061 coding_set_destination (struct coding_system *coding)
1062 {
1063 if (BUFFERP (coding->dst_object))
1064 {
1065 if (coding->src_pos < 0)
1066 {
1067 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1068 coding->dst_bytes = (GAP_END_ADDR
1069 - (coding->src_bytes - coding->consumed)
1070 - coding->destination);
1071 }
1072 else
1073 {
1074 /* We are sure that coding->dst_pos_byte is before the gap
1075 of the buffer. */
1076 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1077 + coding->dst_pos_byte - BEG_BYTE);
1078 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1079 - coding->destination);
1080 }
1081 }
1082 else
1083 {
1084 /* Otherwise, the destination is C string and is never relocated
1085 automatically. Thus we don't have to update anything. */
1086 }
1087 }
1088
1089
1090 static void
1091 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1092 {
1093 coding->destination = (unsigned char *) xrealloc (coding->destination,
1094 coding->dst_bytes + bytes);
1095 coding->dst_bytes += bytes;
1096 }
1097
1098 static void
1099 coding_alloc_by_making_gap (struct coding_system *coding,
1100 EMACS_INT gap_head_used, EMACS_INT bytes)
1101 {
1102 if (EQ (coding->src_object, coding->dst_object))
1103 {
1104 /* The gap may contain the produced data at the head and not-yet
1105 consumed data at the tail. To preserve those data, we at
1106 first make the gap size to zero, then increase the gap
1107 size. */
1108 EMACS_INT add = GAP_SIZE;
1109
1110 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1111 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1112 make_gap (bytes);
1113 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1114 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1115 }
1116 else
1117 {
1118 Lisp_Object this_buffer;
1119
1120 this_buffer = Fcurrent_buffer ();
1121 set_buffer_internal (XBUFFER (coding->dst_object));
1122 make_gap (bytes);
1123 set_buffer_internal (XBUFFER (this_buffer));
1124 }
1125 }
1126
1127
1128 static unsigned char *
1129 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1130 unsigned char *dst)
1131 {
1132 EMACS_INT offset = dst - coding->destination;
1133
1134 if (BUFFERP (coding->dst_object))
1135 {
1136 struct buffer *buf = XBUFFER (coding->dst_object);
1137
1138 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1139 }
1140 else
1141 coding_alloc_by_realloc (coding, nbytes);
1142 coding_set_destination (coding);
1143 dst = coding->destination + offset;
1144 return dst;
1145 }
1146
1147 /** Macros for annotations. */
1148
1149 /* An annotation data is stored in the array coding->charbuf in this
1150 format:
1151 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1152 LENGTH is the number of elements in the annotation.
1153 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1154 NCHARS is the number of characters in the text annotated.
1155
1156 The format of the following elements depend on ANNOTATION_MASK.
1157
1158 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1159 follows:
1160 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1161
1162 NBYTES is the number of bytes specified in the header part of
1163 old-style emacs-mule encoding, or 0 for the other kind of
1164 composition.
1165
1166 METHOD is one of enum composition_method.
1167
1168 Optional COMPOSITION-COMPONENTS are characters and composition
1169 rules.
1170
1171 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1172 follows.
1173
1174 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1175 recover from an invalid annotation, and should be skipped by
1176 produce_annotation. */
1177
1178 /* Maximum length of the header of annotation data. */
1179 #define MAX_ANNOTATION_LENGTH 5
1180
1181 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1182 do { \
1183 *(buf)++ = -(len); \
1184 *(buf)++ = (mask); \
1185 *(buf)++ = (nchars); \
1186 coding->annotated = 1; \
1187 } while (0);
1188
1189 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
1190 do { \
1191 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1192 *buf++ = nbytes; \
1193 *buf++ = method; \
1194 } while (0)
1195
1196
1197 #define ADD_CHARSET_DATA(buf, nchars, id) \
1198 do { \
1199 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1200 *buf++ = id; \
1201 } while (0)
1202
1203 \f
1204 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1205
1206
1207
1208 \f
1209 /*** 3. UTF-8 ***/
1210
1211 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1212 Check if a text is encoded in UTF-8. If it is, return 1, else
1213 return 0. */
1214
1215 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1216 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1217 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1218 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1219 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1220 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1221
1222 #define UTF_BOM 0xFEFF
1223 #define UTF_8_BOM_1 0xEF
1224 #define UTF_8_BOM_2 0xBB
1225 #define UTF_8_BOM_3 0xBF
1226
1227 static int
1228 detect_coding_utf_8 (struct coding_system *coding,
1229 struct coding_detection_info *detect_info)
1230 {
1231 const unsigned char *src = coding->source, *src_base;
1232 const unsigned char *src_end = coding->source + coding->src_bytes;
1233 int multibytep = coding->src_multibyte;
1234 int consumed_chars = 0;
1235 int bom_found = 0;
1236 int found = 0;
1237
1238 detect_info->checked |= CATEGORY_MASK_UTF_8;
1239 /* A coding system of this category is always ASCII compatible. */
1240 src += coding->head_ascii;
1241
1242 while (1)
1243 {
1244 int c, c1, c2, c3, c4;
1245
1246 src_base = src;
1247 ONE_MORE_BYTE (c);
1248 if (c < 0 || UTF_8_1_OCTET_P (c))
1249 continue;
1250 ONE_MORE_BYTE (c1);
1251 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1252 break;
1253 if (UTF_8_2_OCTET_LEADING_P (c))
1254 {
1255 found = 1;
1256 continue;
1257 }
1258 ONE_MORE_BYTE (c2);
1259 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1260 break;
1261 if (UTF_8_3_OCTET_LEADING_P (c))
1262 {
1263 found = 1;
1264 if (src_base == coding->source
1265 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1266 bom_found = 1;
1267 continue;
1268 }
1269 ONE_MORE_BYTE (c3);
1270 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1271 break;
1272 if (UTF_8_4_OCTET_LEADING_P (c))
1273 {
1274 found = 1;
1275 continue;
1276 }
1277 ONE_MORE_BYTE (c4);
1278 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1279 break;
1280 if (UTF_8_5_OCTET_LEADING_P (c))
1281 {
1282 found = 1;
1283 continue;
1284 }
1285 break;
1286 }
1287 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1288 return 0;
1289
1290 no_more_source:
1291 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1292 {
1293 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1294 return 0;
1295 }
1296 if (bom_found)
1297 {
1298 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1299 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1300 }
1301 else
1302 {
1303 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1304 if (found)
1305 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1306 }
1307 return 1;
1308 }
1309
1310
1311 static void
1312 decode_coding_utf_8 (struct coding_system *coding)
1313 {
1314 const unsigned char *src = coding->source + coding->consumed;
1315 const unsigned char *src_end = coding->source + coding->src_bytes;
1316 const unsigned char *src_base;
1317 int *charbuf = coding->charbuf + coding->charbuf_used;
1318 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1319 int consumed_chars = 0, consumed_chars_base = 0;
1320 int multibytep = coding->src_multibyte;
1321 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1322 Lisp_Object attr, charset_list;
1323 int eol_crlf =
1324 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1325 int byte_after_cr = -1;
1326
1327 CODING_GET_INFO (coding, attr, charset_list);
1328
1329 if (bom != utf_without_bom)
1330 {
1331 int c1, c2, c3;
1332
1333 src_base = src;
1334 ONE_MORE_BYTE (c1);
1335 if (! UTF_8_3_OCTET_LEADING_P (c1))
1336 src = src_base;
1337 else
1338 {
1339 ONE_MORE_BYTE (c2);
1340 if (! UTF_8_EXTRA_OCTET_P (c2))
1341 src = src_base;
1342 else
1343 {
1344 ONE_MORE_BYTE (c3);
1345 if (! UTF_8_EXTRA_OCTET_P (c3))
1346 src = src_base;
1347 else
1348 {
1349 if ((c1 != UTF_8_BOM_1)
1350 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1351 src = src_base;
1352 else
1353 CODING_UTF_8_BOM (coding) = utf_without_bom;
1354 }
1355 }
1356 }
1357 }
1358 CODING_UTF_8_BOM (coding) = utf_without_bom;
1359
1360 while (1)
1361 {
1362 int c, c1, c2, c3, c4, c5;
1363
1364 src_base = src;
1365 consumed_chars_base = consumed_chars;
1366
1367 if (charbuf >= charbuf_end)
1368 {
1369 if (byte_after_cr >= 0)
1370 src_base--;
1371 break;
1372 }
1373
1374 if (byte_after_cr >= 0)
1375 c1 = byte_after_cr, byte_after_cr = -1;
1376 else
1377 ONE_MORE_BYTE (c1);
1378 if (c1 < 0)
1379 {
1380 c = - c1;
1381 }
1382 else if (UTF_8_1_OCTET_P (c1))
1383 {
1384 if (eol_crlf && c1 == '\r')
1385 ONE_MORE_BYTE (byte_after_cr);
1386 c = c1;
1387 }
1388 else
1389 {
1390 ONE_MORE_BYTE (c2);
1391 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1392 goto invalid_code;
1393 if (UTF_8_2_OCTET_LEADING_P (c1))
1394 {
1395 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1396 /* Reject overlong sequences here and below. Encoders
1397 producing them are incorrect, they can be misleading,
1398 and they mess up read/write invariance. */
1399 if (c < 128)
1400 goto invalid_code;
1401 }
1402 else
1403 {
1404 ONE_MORE_BYTE (c3);
1405 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1406 goto invalid_code;
1407 if (UTF_8_3_OCTET_LEADING_P (c1))
1408 {
1409 c = (((c1 & 0xF) << 12)
1410 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1411 if (c < 0x800
1412 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1413 goto invalid_code;
1414 }
1415 else
1416 {
1417 ONE_MORE_BYTE (c4);
1418 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1419 goto invalid_code;
1420 if (UTF_8_4_OCTET_LEADING_P (c1))
1421 {
1422 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1423 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1424 if (c < 0x10000)
1425 goto invalid_code;
1426 }
1427 else
1428 {
1429 ONE_MORE_BYTE (c5);
1430 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1431 goto invalid_code;
1432 if (UTF_8_5_OCTET_LEADING_P (c1))
1433 {
1434 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1435 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1436 | (c5 & 0x3F));
1437 if ((c > MAX_CHAR) || (c < 0x200000))
1438 goto invalid_code;
1439 }
1440 else
1441 goto invalid_code;
1442 }
1443 }
1444 }
1445 }
1446
1447 *charbuf++ = c;
1448 continue;
1449
1450 invalid_code:
1451 src = src_base;
1452 consumed_chars = consumed_chars_base;
1453 ONE_MORE_BYTE (c);
1454 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1455 coding->errors++;
1456 }
1457
1458 no_more_source:
1459 coding->consumed_char += consumed_chars_base;
1460 coding->consumed = src_base - coding->source;
1461 coding->charbuf_used = charbuf - coding->charbuf;
1462 }
1463
1464
1465 static int
1466 encode_coding_utf_8 (struct coding_system *coding)
1467 {
1468 int multibytep = coding->dst_multibyte;
1469 int *charbuf = coding->charbuf;
1470 int *charbuf_end = charbuf + coding->charbuf_used;
1471 unsigned char *dst = coding->destination + coding->produced;
1472 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1473 int produced_chars = 0;
1474 int c;
1475
1476 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1477 {
1478 ASSURE_DESTINATION (3);
1479 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1480 CODING_UTF_8_BOM (coding) = utf_without_bom;
1481 }
1482
1483 if (multibytep)
1484 {
1485 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1486
1487 while (charbuf < charbuf_end)
1488 {
1489 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1490
1491 ASSURE_DESTINATION (safe_room);
1492 c = *charbuf++;
1493 if (CHAR_BYTE8_P (c))
1494 {
1495 c = CHAR_TO_BYTE8 (c);
1496 EMIT_ONE_BYTE (c);
1497 }
1498 else
1499 {
1500 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1501 for (p = str; p < pend; p++)
1502 EMIT_ONE_BYTE (*p);
1503 }
1504 }
1505 }
1506 else
1507 {
1508 int safe_room = MAX_MULTIBYTE_LENGTH;
1509
1510 while (charbuf < charbuf_end)
1511 {
1512 ASSURE_DESTINATION (safe_room);
1513 c = *charbuf++;
1514 if (CHAR_BYTE8_P (c))
1515 *dst++ = CHAR_TO_BYTE8 (c);
1516 else
1517 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1518 produced_chars++;
1519 }
1520 }
1521 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1522 coding->produced_char += produced_chars;
1523 coding->produced = dst - coding->destination;
1524 return 0;
1525 }
1526
1527
1528 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1529 Check if a text is encoded in one of UTF-16 based coding systems.
1530 If it is, return 1, else return 0. */
1531
1532 #define UTF_16_HIGH_SURROGATE_P(val) \
1533 (((val) & 0xFC00) == 0xD800)
1534
1535 #define UTF_16_LOW_SURROGATE_P(val) \
1536 (((val) & 0xFC00) == 0xDC00)
1537
1538 #define UTF_16_INVALID_P(val) \
1539 (((val) == 0xFFFE) \
1540 || ((val) == 0xFFFF) \
1541 || UTF_16_LOW_SURROGATE_P (val))
1542
1543
1544 static int
1545 detect_coding_utf_16 (struct coding_system *coding,
1546 struct coding_detection_info *detect_info)
1547 {
1548 const unsigned char *src = coding->source;
1549 const unsigned char *src_end = coding->source + coding->src_bytes;
1550 int multibytep = coding->src_multibyte;
1551 int c1, c2;
1552
1553 detect_info->checked |= CATEGORY_MASK_UTF_16;
1554 if (coding->mode & CODING_MODE_LAST_BLOCK
1555 && (coding->src_chars & 1))
1556 {
1557 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1558 return 0;
1559 }
1560
1561 TWO_MORE_BYTES (c1, c2);
1562 if ((c1 == 0xFF) && (c2 == 0xFE))
1563 {
1564 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1565 | CATEGORY_MASK_UTF_16_AUTO);
1566 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1567 | CATEGORY_MASK_UTF_16_BE_NOSIG
1568 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1569 }
1570 else if ((c1 == 0xFE) && (c2 == 0xFF))
1571 {
1572 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1573 | CATEGORY_MASK_UTF_16_AUTO);
1574 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1575 | CATEGORY_MASK_UTF_16_BE_NOSIG
1576 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1577 }
1578 else if (c2 < 0)
1579 {
1580 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1581 return 0;
1582 }
1583 else
1584 {
1585 /* We check the dispersion of Eth and Oth bytes where E is even and
1586 O is odd. If both are high, we assume binary data.*/
1587 unsigned char e[256], o[256];
1588 unsigned e_num = 1, o_num = 1;
1589
1590 memset (e, 0, 256);
1591 memset (o, 0, 256);
1592 e[c1] = 1;
1593 o[c2] = 1;
1594
1595 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1596 |CATEGORY_MASK_UTF_16_BE
1597 | CATEGORY_MASK_UTF_16_LE);
1598
1599 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1600 != CATEGORY_MASK_UTF_16)
1601 {
1602 TWO_MORE_BYTES (c1, c2);
1603 if (c2 < 0)
1604 break;
1605 if (! e[c1])
1606 {
1607 e[c1] = 1;
1608 e_num++;
1609 if (e_num >= 128)
1610 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1611 }
1612 if (! o[c2])
1613 {
1614 o[c2] = 1;
1615 o_num++;
1616 if (o_num >= 128)
1617 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1618 }
1619 }
1620 return 0;
1621 }
1622
1623 no_more_source:
1624 return 1;
1625 }
1626
1627 static void
1628 decode_coding_utf_16 (struct coding_system *coding)
1629 {
1630 const unsigned char *src = coding->source + coding->consumed;
1631 const unsigned char *src_end = coding->source + coding->src_bytes;
1632 const unsigned char *src_base;
1633 int *charbuf = coding->charbuf + coding->charbuf_used;
1634 /* We may produces at most 3 chars in one loop. */
1635 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1636 int consumed_chars = 0, consumed_chars_base = 0;
1637 int multibytep = coding->src_multibyte;
1638 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1639 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1640 int surrogate = CODING_UTF_16_SURROGATE (coding);
1641 Lisp_Object attr, charset_list;
1642 int eol_crlf =
1643 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1644 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1645
1646 CODING_GET_INFO (coding, attr, charset_list);
1647
1648 if (bom == utf_with_bom)
1649 {
1650 int c, c1, c2;
1651
1652 src_base = src;
1653 ONE_MORE_BYTE (c1);
1654 ONE_MORE_BYTE (c2);
1655 c = (c1 << 8) | c2;
1656
1657 if (endian == utf_16_big_endian
1658 ? c != 0xFEFF : c != 0xFFFE)
1659 {
1660 /* The first two bytes are not BOM. Treat them as bytes
1661 for a normal character. */
1662 src = src_base;
1663 coding->errors++;
1664 }
1665 CODING_UTF_16_BOM (coding) = utf_without_bom;
1666 }
1667 else if (bom == utf_detect_bom)
1668 {
1669 /* We have already tried to detect BOM and failed in
1670 detect_coding. */
1671 CODING_UTF_16_BOM (coding) = utf_without_bom;
1672 }
1673
1674 while (1)
1675 {
1676 int c, c1, c2;
1677
1678 src_base = src;
1679 consumed_chars_base = consumed_chars;
1680
1681 if (charbuf >= charbuf_end)
1682 {
1683 if (byte_after_cr1 >= 0)
1684 src_base -= 2;
1685 break;
1686 }
1687
1688 if (byte_after_cr1 >= 0)
1689 c1 = byte_after_cr1, byte_after_cr1 = -1;
1690 else
1691 ONE_MORE_BYTE (c1);
1692 if (c1 < 0)
1693 {
1694 *charbuf++ = -c1;
1695 continue;
1696 }
1697 if (byte_after_cr2 >= 0)
1698 c2 = byte_after_cr2, byte_after_cr2 = -1;
1699 else
1700 ONE_MORE_BYTE (c2);
1701 if (c2 < 0)
1702 {
1703 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1704 *charbuf++ = -c2;
1705 continue;
1706 }
1707 c = (endian == utf_16_big_endian
1708 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1709
1710 if (surrogate)
1711 {
1712 if (! UTF_16_LOW_SURROGATE_P (c))
1713 {
1714 if (endian == utf_16_big_endian)
1715 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1716 else
1717 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1718 *charbuf++ = c1;
1719 *charbuf++ = c2;
1720 coding->errors++;
1721 if (UTF_16_HIGH_SURROGATE_P (c))
1722 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1723 else
1724 *charbuf++ = c;
1725 }
1726 else
1727 {
1728 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1729 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1730 *charbuf++ = 0x10000 + c;
1731 }
1732 }
1733 else
1734 {
1735 if (UTF_16_HIGH_SURROGATE_P (c))
1736 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1737 else
1738 {
1739 if (eol_crlf && c == '\r')
1740 {
1741 ONE_MORE_BYTE (byte_after_cr1);
1742 ONE_MORE_BYTE (byte_after_cr2);
1743 }
1744 *charbuf++ = c;
1745 }
1746 }
1747 }
1748
1749 no_more_source:
1750 coding->consumed_char += consumed_chars_base;
1751 coding->consumed = src_base - coding->source;
1752 coding->charbuf_used = charbuf - coding->charbuf;
1753 }
1754
1755 static int
1756 encode_coding_utf_16 (struct coding_system *coding)
1757 {
1758 int multibytep = coding->dst_multibyte;
1759 int *charbuf = coding->charbuf;
1760 int *charbuf_end = charbuf + coding->charbuf_used;
1761 unsigned char *dst = coding->destination + coding->produced;
1762 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1763 int safe_room = 8;
1764 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1765 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1766 int produced_chars = 0;
1767 Lisp_Object attrs, charset_list;
1768 int c;
1769
1770 CODING_GET_INFO (coding, attrs, charset_list);
1771
1772 if (bom != utf_without_bom)
1773 {
1774 ASSURE_DESTINATION (safe_room);
1775 if (big_endian)
1776 EMIT_TWO_BYTES (0xFE, 0xFF);
1777 else
1778 EMIT_TWO_BYTES (0xFF, 0xFE);
1779 CODING_UTF_16_BOM (coding) = utf_without_bom;
1780 }
1781
1782 while (charbuf < charbuf_end)
1783 {
1784 ASSURE_DESTINATION (safe_room);
1785 c = *charbuf++;
1786 if (c > MAX_UNICODE_CHAR)
1787 c = coding->default_char;
1788
1789 if (c < 0x10000)
1790 {
1791 if (big_endian)
1792 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1793 else
1794 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1795 }
1796 else
1797 {
1798 int c1, c2;
1799
1800 c -= 0x10000;
1801 c1 = (c >> 10) + 0xD800;
1802 c2 = (c & 0x3FF) + 0xDC00;
1803 if (big_endian)
1804 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1805 else
1806 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1807 }
1808 }
1809 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1810 coding->produced = dst - coding->destination;
1811 coding->produced_char += produced_chars;
1812 return 0;
1813 }
1814
1815 \f
1816 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1817
1818 /* Emacs' internal format for representation of multiple character
1819 sets is a kind of multi-byte encoding, i.e. characters are
1820 represented by variable-length sequences of one-byte codes.
1821
1822 ASCII characters and control characters (e.g. `tab', `newline') are
1823 represented by one-byte sequences which are their ASCII codes, in
1824 the range 0x00 through 0x7F.
1825
1826 8-bit characters of the range 0x80..0x9F are represented by
1827 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1828 code + 0x20).
1829
1830 8-bit characters of the range 0xA0..0xFF are represented by
1831 one-byte sequences which are their 8-bit code.
1832
1833 The other characters are represented by a sequence of `base
1834 leading-code', optional `extended leading-code', and one or two
1835 `position-code's. The length of the sequence is determined by the
1836 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1837 whereas extended leading-code and position-code take the range 0xA0
1838 through 0xFF. See `charset.h' for more details about leading-code
1839 and position-code.
1840
1841 --- CODE RANGE of Emacs' internal format ---
1842 character set range
1843 ------------- -----
1844 ascii 0x00..0x7F
1845 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1846 eight-bit-graphic 0xA0..0xBF
1847 ELSE 0x81..0x9D + [0xA0..0xFF]+
1848 ---------------------------------------------
1849
1850 As this is the internal character representation, the format is
1851 usually not used externally (i.e. in a file or in a data sent to a
1852 process). But, it is possible to have a text externally in this
1853 format (i.e. by encoding by the coding system `emacs-mule').
1854
1855 In that case, a sequence of one-byte codes has a slightly different
1856 form.
1857
1858 At first, all characters in eight-bit-control are represented by
1859 one-byte sequences which are their 8-bit code.
1860
1861 Next, character composition data are represented by the byte
1862 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1863 where,
1864 METHOD is 0xF2 plus one of composition method (enum
1865 composition_method),
1866
1867 BYTES is 0xA0 plus a byte length of this composition data,
1868
1869 CHARS is 0xA0 plus a number of characters composed by this
1870 data,
1871
1872 COMPONENTs are characters of multibyte form or composition
1873 rules encoded by two-byte of ASCII codes.
1874
1875 In addition, for backward compatibility, the following formats are
1876 also recognized as composition data on decoding.
1877
1878 0x80 MSEQ ...
1879 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1880
1881 Here,
1882 MSEQ is a multibyte form but in these special format:
1883 ASCII: 0xA0 ASCII_CODE+0x80,
1884 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1885 RULE is a one byte code of the range 0xA0..0xF0 that
1886 represents a composition rule.
1887 */
1888
1889 char emacs_mule_bytes[256];
1890
1891
1892 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1893 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1894 else return 0. */
1895
1896 static int
1897 detect_coding_emacs_mule (struct coding_system *coding,
1898 struct coding_detection_info *detect_info)
1899 {
1900 const unsigned char *src = coding->source, *src_base;
1901 const unsigned char *src_end = coding->source + coding->src_bytes;
1902 int multibytep = coding->src_multibyte;
1903 int consumed_chars = 0;
1904 int c;
1905 int found = 0;
1906
1907 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1908 /* A coding system of this category is always ASCII compatible. */
1909 src += coding->head_ascii;
1910
1911 while (1)
1912 {
1913 src_base = src;
1914 ONE_MORE_BYTE (c);
1915 if (c < 0)
1916 continue;
1917 if (c == 0x80)
1918 {
1919 /* Perhaps the start of composite character. We simply skip
1920 it because analyzing it is too heavy for detecting. But,
1921 at least, we check that the composite character
1922 constitutes of more than 4 bytes. */
1923 const unsigned char *src_base;
1924
1925 repeat:
1926 src_base = src;
1927 do
1928 {
1929 ONE_MORE_BYTE (c);
1930 }
1931 while (c >= 0xA0);
1932
1933 if (src - src_base <= 4)
1934 break;
1935 found = CATEGORY_MASK_EMACS_MULE;
1936 if (c == 0x80)
1937 goto repeat;
1938 }
1939
1940 if (c < 0x80)
1941 {
1942 if (c < 0x20
1943 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1944 break;
1945 }
1946 else
1947 {
1948 int more_bytes = emacs_mule_bytes[c] - 1;
1949
1950 while (more_bytes > 0)
1951 {
1952 ONE_MORE_BYTE (c);
1953 if (c < 0xA0)
1954 {
1955 src--; /* Unread the last byte. */
1956 break;
1957 }
1958 more_bytes--;
1959 }
1960 if (more_bytes != 0)
1961 break;
1962 found = CATEGORY_MASK_EMACS_MULE;
1963 }
1964 }
1965 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1966 return 0;
1967
1968 no_more_source:
1969 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1970 {
1971 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1972 return 0;
1973 }
1974 detect_info->found |= found;
1975 return 1;
1976 }
1977
1978
1979 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1980 character. If CMP_STATUS indicates that we must expect MSEQ or
1981 RULE described above, decode it and return the negative value of
1982 the decoded character or rule. If an invalid byte is found, return
1983 -1. If SRC is too short, return -2. */
1984
1985 int
1986 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1987 int *nbytes, int *nchars, int *id,
1988 struct composition_status *cmp_status)
1989 {
1990 const unsigned char *src_end = coding->source + coding->src_bytes;
1991 const unsigned char *src_base = src;
1992 int multibytep = coding->src_multibyte;
1993 int charset_id;
1994 unsigned code;
1995 int c;
1996 int consumed_chars = 0;
1997 int mseq_found = 0;
1998
1999 ONE_MORE_BYTE (c);
2000 if (c < 0)
2001 {
2002 c = -c;
2003 charset_id = emacs_mule_charset[0];
2004 }
2005 else
2006 {
2007 if (c >= 0xA0)
2008 {
2009 if (cmp_status->state != COMPOSING_NO
2010 && cmp_status->old_form)
2011 {
2012 if (cmp_status->state == COMPOSING_CHAR)
2013 {
2014 if (c == 0xA0)
2015 {
2016 ONE_MORE_BYTE (c);
2017 c -= 0x80;
2018 if (c < 0)
2019 goto invalid_code;
2020 }
2021 else
2022 c -= 0x20;
2023 mseq_found = 1;
2024 }
2025 else
2026 {
2027 *nbytes = src - src_base;
2028 *nchars = consumed_chars;
2029 return -c;
2030 }
2031 }
2032 else
2033 goto invalid_code;
2034 }
2035
2036 switch (emacs_mule_bytes[c])
2037 {
2038 case 2:
2039 if ((charset_id = emacs_mule_charset[c]) < 0)
2040 goto invalid_code;
2041 ONE_MORE_BYTE (c);
2042 if (c < 0xA0)
2043 goto invalid_code;
2044 code = c & 0x7F;
2045 break;
2046
2047 case 3:
2048 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2049 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2050 {
2051 ONE_MORE_BYTE (c);
2052 if (c < 0xA0 || (charset_id = emacs_mule_charset[c]) < 0)
2053 goto invalid_code;
2054 ONE_MORE_BYTE (c);
2055 if (c < 0xA0)
2056 goto invalid_code;
2057 code = c & 0x7F;
2058 }
2059 else
2060 {
2061 if ((charset_id = emacs_mule_charset[c]) < 0)
2062 goto invalid_code;
2063 ONE_MORE_BYTE (c);
2064 if (c < 0xA0)
2065 goto invalid_code;
2066 code = (c & 0x7F) << 8;
2067 ONE_MORE_BYTE (c);
2068 if (c < 0xA0)
2069 goto invalid_code;
2070 code |= c & 0x7F;
2071 }
2072 break;
2073
2074 case 4:
2075 ONE_MORE_BYTE (c);
2076 if (c < 0 || (charset_id = emacs_mule_charset[c]) < 0)
2077 goto invalid_code;
2078 ONE_MORE_BYTE (c);
2079 if (c < 0xA0)
2080 goto invalid_code;
2081 code = (c & 0x7F) << 8;
2082 ONE_MORE_BYTE (c);
2083 if (c < 0xA0)
2084 goto invalid_code;
2085 code |= c & 0x7F;
2086 break;
2087
2088 case 1:
2089 code = c;
2090 charset_id = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2091 break;
2092
2093 default:
2094 abort ();
2095 }
2096 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2097 CHARSET_FROM_ID (charset_id), code, c);
2098 if (c < 0)
2099 goto invalid_code;
2100 }
2101 *nbytes = src - src_base;
2102 *nchars = consumed_chars;
2103 if (id)
2104 *id = charset_id;
2105 return (mseq_found ? -c : c);
2106
2107 no_more_source:
2108 return -2;
2109
2110 invalid_code:
2111 return -1;
2112 }
2113
2114
2115 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2116
2117 /* Handle these composition sequence ('|': the end of header elements,
2118 BYTES and CHARS >= 0xA0):
2119
2120 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2121 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2122 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2123
2124 and these old form:
2125
2126 (4) relative composition: 0x80 | MSEQ ... MSEQ
2127 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2128
2129 When the starter 0x80 and the following header elements are found,
2130 this annotation header is produced.
2131
2132 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2133
2134 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2135 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2136
2137 Then, upon reading the following elements, these codes are produced
2138 until the composition end is found:
2139
2140 (1) CHAR ... CHAR
2141 (2) ALT ... ALT CHAR ... CHAR
2142 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2143 (4) CHAR ... CHAR
2144 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2145
2146 When the composition end is found, LENGTH and NCHARS in the
2147 annotation header is updated as below:
2148
2149 (1) LENGTH: unchanged, NCHARS: unchanged
2150 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2151 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2152 (4) LENGTH: unchanged, NCHARS: number of CHARs
2153 (5) LENGTH: unchanged, NCHARS: number of CHARs
2154
2155 If an error is found while composing, the annotation header is
2156 changed to the original composition header (plus filler -1s) as
2157 below:
2158
2159 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2160 (5) [ 0x80 0xFF -1 -1- -1 ]
2161
2162 and the sequence [ -2 DECODED-RULE ] is changed to the original
2163 byte sequence as below:
2164 o the original byte sequence is B: [ B -1 ]
2165 o the original byte sequence is B1 B2: [ B1 B2 ]
2166
2167 Most of the routines are implemented by macros because many
2168 variables and labels in the caller decode_coding_emacs_mule must be
2169 accessible, and they are usually called just once (thus doesn't
2170 increase the size of compiled object). */
2171
2172 /* Decode a composition rule represented by C as a component of
2173 composition sequence of Emacs 20 style. Set RULE to the decoded
2174 rule. */
2175
2176 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
2177 do { \
2178 int gref, nref; \
2179 \
2180 c -= 0xA0; \
2181 if (c < 0 || c >= 81) \
2182 goto invalid_code; \
2183 gref = c / 9, nref = c % 9; \
2184 if (gref == 4) gref = 10; \
2185 if (nref == 4) nref = 10; \
2186 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2187 } while (0)
2188
2189
2190 /* Decode a composition rule represented by C and the following byte
2191 at SRC as a component of composition sequence of Emacs 21 style.
2192 Set RULE to the decoded rule. */
2193
2194 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
2195 do { \
2196 int gref, nref; \
2197 \
2198 gref = c - 0x20; \
2199 if (gref < 0 || gref >= 81) \
2200 goto invalid_code; \
2201 ONE_MORE_BYTE (c); \
2202 nref = c - 0x20; \
2203 if (nref < 0 || nref >= 81) \
2204 goto invalid_code; \
2205 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
2206 } while (0)
2207
2208
2209 /* Start of Emacs 21 style format. The first three bytes at SRC are
2210 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2211 byte length of this composition information, CHARS is the number of
2212 characters composed by this composition. */
2213
2214 #define DECODE_EMACS_MULE_21_COMPOSITION() \
2215 do { \
2216 enum composition_method method = c - 0xF2; \
2217 int nbytes, nchars; \
2218 \
2219 ONE_MORE_BYTE (c); \
2220 if (c < 0) \
2221 goto invalid_code; \
2222 nbytes = c - 0xA0; \
2223 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
2224 goto invalid_code; \
2225 ONE_MORE_BYTE (c); \
2226 nchars = c - 0xA0; \
2227 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2228 goto invalid_code; \
2229 cmp_status->old_form = 0; \
2230 cmp_status->method = method; \
2231 if (method == COMPOSITION_RELATIVE) \
2232 cmp_status->state = COMPOSING_CHAR; \
2233 else \
2234 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2235 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2236 cmp_status->nchars = nchars; \
2237 cmp_status->ncomps = nbytes - 4; \
2238 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
2239 } while (0)
2240
2241
2242 /* Start of Emacs 20 style format for relative composition. */
2243
2244 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2245 do { \
2246 cmp_status->old_form = 1; \
2247 cmp_status->method = COMPOSITION_RELATIVE; \
2248 cmp_status->state = COMPOSING_CHAR; \
2249 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2250 cmp_status->nchars = cmp_status->ncomps = 0; \
2251 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2252 } while (0)
2253
2254
2255 /* Start of Emacs 20 style format for rule-base composition. */
2256
2257 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2258 do { \
2259 cmp_status->old_form = 1; \
2260 cmp_status->method = COMPOSITION_WITH_RULE; \
2261 cmp_status->state = COMPOSING_CHAR; \
2262 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2263 cmp_status->nchars = cmp_status->ncomps = 0; \
2264 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2265 } while (0)
2266
2267
2268 #define DECODE_EMACS_MULE_COMPOSITION_START() \
2269 do { \
2270 const unsigned char *current_src = src; \
2271 \
2272 ONE_MORE_BYTE (c); \
2273 if (c < 0) \
2274 goto invalid_code; \
2275 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2276 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2277 DECODE_EMACS_MULE_21_COMPOSITION (); \
2278 else if (c < 0xA0) \
2279 goto invalid_code; \
2280 else if (c < 0xC0) \
2281 { \
2282 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2283 /* Re-read C as a composition component. */ \
2284 src = current_src; \
2285 } \
2286 else if (c == 0xFF) \
2287 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2288 else \
2289 goto invalid_code; \
2290 } while (0)
2291
2292 #define EMACS_MULE_COMPOSITION_END() \
2293 do { \
2294 int idx = - cmp_status->length; \
2295 \
2296 if (cmp_status->old_form) \
2297 charbuf[idx + 2] = cmp_status->nchars; \
2298 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2299 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2300 cmp_status->state = COMPOSING_NO; \
2301 } while (0)
2302
2303
2304 static int
2305 emacs_mule_finish_composition (int *charbuf,
2306 struct composition_status *cmp_status)
2307 {
2308 int idx = - cmp_status->length;
2309 int new_chars;
2310
2311 if (cmp_status->old_form && cmp_status->nchars > 0)
2312 {
2313 charbuf[idx + 2] = cmp_status->nchars;
2314 new_chars = 0;
2315 if (cmp_status->method == COMPOSITION_WITH_RULE
2316 && cmp_status->state == COMPOSING_CHAR)
2317 {
2318 /* The last rule was invalid. */
2319 int rule = charbuf[-1] + 0xA0;
2320
2321 charbuf[-2] = BYTE8_TO_CHAR (rule);
2322 charbuf[-1] = -1;
2323 new_chars = 1;
2324 }
2325 }
2326 else
2327 {
2328 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2329
2330 if (cmp_status->method == COMPOSITION_WITH_RULE)
2331 {
2332 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2333 charbuf[idx++] = -3;
2334 charbuf[idx++] = 0;
2335 new_chars = 1;
2336 }
2337 else
2338 {
2339 int nchars = charbuf[idx + 1] + 0xA0;
2340 int nbytes = charbuf[idx + 2] + 0xA0;
2341
2342 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2343 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2344 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2345 charbuf[idx++] = -1;
2346 new_chars = 4;
2347 }
2348 }
2349 cmp_status->state = COMPOSING_NO;
2350 return new_chars;
2351 }
2352
2353 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2354 do { \
2355 if (cmp_status->state != COMPOSING_NO) \
2356 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2357 } while (0)
2358
2359
2360 static void
2361 decode_coding_emacs_mule (struct coding_system *coding)
2362 {
2363 const unsigned char *src = coding->source + coding->consumed;
2364 const unsigned char *src_end = coding->source + coding->src_bytes;
2365 const unsigned char *src_base;
2366 int *charbuf = coding->charbuf + coding->charbuf_used;
2367 /* We may produce two annotations (charset and composition) in one
2368 loop and one more charset annotation at the end. */
2369 int *charbuf_end
2370 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2371 int consumed_chars = 0, consumed_chars_base;
2372 int multibytep = coding->src_multibyte;
2373 Lisp_Object attrs, charset_list;
2374 int char_offset = coding->produced_char;
2375 int last_offset = char_offset;
2376 int last_id = charset_ascii;
2377 int eol_crlf =
2378 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2379 int byte_after_cr = -1;
2380 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2381
2382 CODING_GET_INFO (coding, attrs, charset_list);
2383
2384 if (cmp_status->state != COMPOSING_NO)
2385 {
2386 int i;
2387
2388 for (i = 0; i < cmp_status->length; i++)
2389 *charbuf++ = cmp_status->carryover[i];
2390 coding->annotated = 1;
2391 }
2392
2393 while (1)
2394 {
2395 int c, id;
2396
2397 src_base = src;
2398 consumed_chars_base = consumed_chars;
2399
2400 if (charbuf >= charbuf_end)
2401 {
2402 if (byte_after_cr >= 0)
2403 src_base--;
2404 break;
2405 }
2406
2407 if (byte_after_cr >= 0)
2408 c = byte_after_cr, byte_after_cr = -1;
2409 else
2410 ONE_MORE_BYTE (c);
2411
2412 if (c < 0 || c == 0x80)
2413 {
2414 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2415 if (c < 0)
2416 {
2417 *charbuf++ = -c;
2418 char_offset++;
2419 }
2420 else
2421 DECODE_EMACS_MULE_COMPOSITION_START ();
2422 continue;
2423 }
2424
2425 if (c < 0x80)
2426 {
2427 if (eol_crlf && c == '\r')
2428 ONE_MORE_BYTE (byte_after_cr);
2429 id = charset_ascii;
2430 if (cmp_status->state != COMPOSING_NO)
2431 {
2432 if (cmp_status->old_form)
2433 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2434 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2435 cmp_status->ncomps--;
2436 }
2437 }
2438 else
2439 {
2440 int nchars, nbytes;
2441 /* emacs_mule_char can load a charset map from a file, which
2442 allocates a large structure and might cause buffer text
2443 to be relocated as result. Thus, we need to remember the
2444 original pointer to buffer text, and fix up all related
2445 pointers after the call. */
2446 const unsigned char *orig = coding->source;
2447 EMACS_INT offset;
2448
2449 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2450 cmp_status);
2451 offset = coding->source - orig;
2452 if (offset)
2453 {
2454 src += offset;
2455 src_base += offset;
2456 src_end += offset;
2457 }
2458 if (c < 0)
2459 {
2460 if (c == -1)
2461 goto invalid_code;
2462 if (c == -2)
2463 break;
2464 }
2465 src = src_base + nbytes;
2466 consumed_chars = consumed_chars_base + nchars;
2467 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2468 cmp_status->ncomps -= nchars;
2469 }
2470
2471 /* Now if C >= 0, we found a normally encoded character, if C <
2472 0, we found an old-style composition component character or
2473 rule. */
2474
2475 if (cmp_status->state == COMPOSING_NO)
2476 {
2477 if (last_id != id)
2478 {
2479 if (last_id != charset_ascii)
2480 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2481 last_id);
2482 last_id = id;
2483 last_offset = char_offset;
2484 }
2485 *charbuf++ = c;
2486 char_offset++;
2487 }
2488 else if (cmp_status->state == COMPOSING_CHAR)
2489 {
2490 if (cmp_status->old_form)
2491 {
2492 if (c >= 0)
2493 {
2494 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2495 *charbuf++ = c;
2496 char_offset++;
2497 }
2498 else
2499 {
2500 *charbuf++ = -c;
2501 cmp_status->nchars++;
2502 cmp_status->length++;
2503 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2504 EMACS_MULE_COMPOSITION_END ();
2505 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2506 cmp_status->state = COMPOSING_RULE;
2507 }
2508 }
2509 else
2510 {
2511 *charbuf++ = c;
2512 cmp_status->length++;
2513 cmp_status->nchars--;
2514 if (cmp_status->nchars == 0)
2515 EMACS_MULE_COMPOSITION_END ();
2516 }
2517 }
2518 else if (cmp_status->state == COMPOSING_RULE)
2519 {
2520 int rule;
2521
2522 if (c >= 0)
2523 {
2524 EMACS_MULE_COMPOSITION_END ();
2525 *charbuf++ = c;
2526 char_offset++;
2527 }
2528 else
2529 {
2530 c = -c;
2531 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2532 if (rule < 0)
2533 goto invalid_code;
2534 *charbuf++ = -2;
2535 *charbuf++ = rule;
2536 cmp_status->length += 2;
2537 cmp_status->state = COMPOSING_CHAR;
2538 }
2539 }
2540 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2541 {
2542 *charbuf++ = c;
2543 cmp_status->length++;
2544 if (cmp_status->ncomps == 0)
2545 cmp_status->state = COMPOSING_CHAR;
2546 else if (cmp_status->ncomps > 0)
2547 {
2548 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2549 cmp_status->state = COMPOSING_COMPONENT_RULE;
2550 }
2551 else
2552 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2553 }
2554 else /* COMPOSING_COMPONENT_RULE */
2555 {
2556 int rule;
2557
2558 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2559 if (rule < 0)
2560 goto invalid_code;
2561 *charbuf++ = -2;
2562 *charbuf++ = rule;
2563 cmp_status->length += 2;
2564 cmp_status->ncomps--;
2565 if (cmp_status->ncomps > 0)
2566 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2567 else
2568 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2569 }
2570 continue;
2571
2572 invalid_code:
2573 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2574 src = src_base;
2575 consumed_chars = consumed_chars_base;
2576 ONE_MORE_BYTE (c);
2577 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2578 char_offset++;
2579 coding->errors++;
2580 }
2581
2582 no_more_source:
2583 if (cmp_status->state != COMPOSING_NO)
2584 {
2585 if (coding->mode & CODING_MODE_LAST_BLOCK)
2586 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2587 else
2588 {
2589 int i;
2590
2591 charbuf -= cmp_status->length;
2592 for (i = 0; i < cmp_status->length; i++)
2593 cmp_status->carryover[i] = charbuf[i];
2594 }
2595 }
2596 if (last_id != charset_ascii)
2597 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2598 coding->consumed_char += consumed_chars_base;
2599 coding->consumed = src_base - coding->source;
2600 coding->charbuf_used = charbuf - coding->charbuf;
2601 }
2602
2603
2604 #define EMACS_MULE_LEADING_CODES(id, codes) \
2605 do { \
2606 if (id < 0xA0) \
2607 codes[0] = id, codes[1] = 0; \
2608 else if (id < 0xE0) \
2609 codes[0] = 0x9A, codes[1] = id; \
2610 else if (id < 0xF0) \
2611 codes[0] = 0x9B, codes[1] = id; \
2612 else if (id < 0xF5) \
2613 codes[0] = 0x9C, codes[1] = id; \
2614 else \
2615 codes[0] = 0x9D, codes[1] = id; \
2616 } while (0);
2617
2618
2619 static int
2620 encode_coding_emacs_mule (struct coding_system *coding)
2621 {
2622 int multibytep = coding->dst_multibyte;
2623 int *charbuf = coding->charbuf;
2624 int *charbuf_end = charbuf + coding->charbuf_used;
2625 unsigned char *dst = coding->destination + coding->produced;
2626 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2627 int safe_room = 8;
2628 int produced_chars = 0;
2629 Lisp_Object attrs, charset_list;
2630 int c;
2631 int preferred_charset_id = -1;
2632
2633 CODING_GET_INFO (coding, attrs, charset_list);
2634 if (! EQ (charset_list, Vemacs_mule_charset_list))
2635 {
2636 CODING_ATTR_CHARSET_LIST (attrs)
2637 = charset_list = Vemacs_mule_charset_list;
2638 }
2639
2640 while (charbuf < charbuf_end)
2641 {
2642 ASSURE_DESTINATION (safe_room);
2643 c = *charbuf++;
2644
2645 if (c < 0)
2646 {
2647 /* Handle an annotation. */
2648 switch (*charbuf)
2649 {
2650 case CODING_ANNOTATE_COMPOSITION_MASK:
2651 /* Not yet implemented. */
2652 break;
2653 case CODING_ANNOTATE_CHARSET_MASK:
2654 preferred_charset_id = charbuf[3];
2655 if (preferred_charset_id >= 0
2656 && NILP (Fmemq (make_number (preferred_charset_id),
2657 charset_list)))
2658 preferred_charset_id = -1;
2659 break;
2660 default:
2661 abort ();
2662 }
2663 charbuf += -c - 1;
2664 continue;
2665 }
2666
2667 if (ASCII_CHAR_P (c))
2668 EMIT_ONE_ASCII_BYTE (c);
2669 else if (CHAR_BYTE8_P (c))
2670 {
2671 c = CHAR_TO_BYTE8 (c);
2672 EMIT_ONE_BYTE (c);
2673 }
2674 else
2675 {
2676 struct charset *charset;
2677 unsigned code;
2678 int dimension;
2679 int emacs_mule_id;
2680 unsigned char leading_codes[2];
2681
2682 if (preferred_charset_id >= 0)
2683 {
2684 charset = CHARSET_FROM_ID (preferred_charset_id);
2685 if (CHAR_CHARSET_P (c, charset))
2686 code = ENCODE_CHAR (charset, c);
2687 else
2688 charset = char_charset (c, charset_list, &code);
2689 }
2690 else
2691 charset = char_charset (c, charset_list, &code);
2692 if (! charset)
2693 {
2694 c = coding->default_char;
2695 if (ASCII_CHAR_P (c))
2696 {
2697 EMIT_ONE_ASCII_BYTE (c);
2698 continue;
2699 }
2700 charset = char_charset (c, charset_list, &code);
2701 }
2702 dimension = CHARSET_DIMENSION (charset);
2703 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2704 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2705 EMIT_ONE_BYTE (leading_codes[0]);
2706 if (leading_codes[1])
2707 EMIT_ONE_BYTE (leading_codes[1]);
2708 if (dimension == 1)
2709 EMIT_ONE_BYTE (code | 0x80);
2710 else
2711 {
2712 code |= 0x8080;
2713 EMIT_ONE_BYTE (code >> 8);
2714 EMIT_ONE_BYTE (code & 0xFF);
2715 }
2716 }
2717 }
2718 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2719 coding->produced_char += produced_chars;
2720 coding->produced = dst - coding->destination;
2721 return 0;
2722 }
2723
2724 \f
2725 /*** 7. ISO2022 handlers ***/
2726
2727 /* The following note describes the coding system ISO2022 briefly.
2728 Since the intention of this note is to help understand the
2729 functions in this file, some parts are NOT ACCURATE or are OVERLY
2730 SIMPLIFIED. For thorough understanding, please refer to the
2731 original document of ISO2022. This is equivalent to the standard
2732 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2733
2734 ISO2022 provides many mechanisms to encode several character sets
2735 in 7-bit and 8-bit environments. For 7-bit environments, all text
2736 is encoded using bytes less than 128. This may make the encoded
2737 text a little bit longer, but the text passes more easily through
2738 several types of gateway, some of which strip off the MSB (Most
2739 Significant Bit).
2740
2741 There are two kinds of character sets: control character sets and
2742 graphic character sets. The former contain control characters such
2743 as `newline' and `escape' to provide control functions (control
2744 functions are also provided by escape sequences). The latter
2745 contain graphic characters such as 'A' and '-'. Emacs recognizes
2746 two control character sets and many graphic character sets.
2747
2748 Graphic character sets are classified into one of the following
2749 four classes, according to the number of bytes (DIMENSION) and
2750 number of characters in one dimension (CHARS) of the set:
2751 - DIMENSION1_CHARS94
2752 - DIMENSION1_CHARS96
2753 - DIMENSION2_CHARS94
2754 - DIMENSION2_CHARS96
2755
2756 In addition, each character set is assigned an identification tag,
2757 unique for each set, called the "final character" (denoted as <F>
2758 hereafter). The <F> of each character set is decided by ECMA(*)
2759 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2760 (0x30..0x3F are for private use only).
2761
2762 Note (*): ECMA = European Computer Manufacturers Association
2763
2764 Here are examples of graphic character sets [NAME(<F>)]:
2765 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2766 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2767 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2768 o DIMENSION2_CHARS96 -- none for the moment
2769
2770 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2771 C0 [0x00..0x1F] -- control character plane 0
2772 GL [0x20..0x7F] -- graphic character plane 0
2773 C1 [0x80..0x9F] -- control character plane 1
2774 GR [0xA0..0xFF] -- graphic character plane 1
2775
2776 A control character set is directly designated and invoked to C0 or
2777 C1 by an escape sequence. The most common case is that:
2778 - ISO646's control character set is designated/invoked to C0, and
2779 - ISO6429's control character set is designated/invoked to C1,
2780 and usually these designations/invocations are omitted in encoded
2781 text. In a 7-bit environment, only C0 can be used, and a control
2782 character for C1 is encoded by an appropriate escape sequence to
2783 fit into the environment. All control characters for C1 are
2784 defined to have corresponding escape sequences.
2785
2786 A graphic character set is at first designated to one of four
2787 graphic registers (G0 through G3), then these graphic registers are
2788 invoked to GL or GR. These designations and invocations can be
2789 done independently. The most common case is that G0 is invoked to
2790 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2791 these invocations and designations are omitted in encoded text.
2792 In a 7-bit environment, only GL can be used.
2793
2794 When a graphic character set of CHARS94 is invoked to GL, codes
2795 0x20 and 0x7F of the GL area work as control characters SPACE and
2796 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2797 be used.
2798
2799 There are two ways of invocation: locking-shift and single-shift.
2800 With locking-shift, the invocation lasts until the next different
2801 invocation, whereas with single-shift, the invocation affects the
2802 following character only and doesn't affect the locking-shift
2803 state. Invocations are done by the following control characters or
2804 escape sequences:
2805
2806 ----------------------------------------------------------------------
2807 abbrev function cntrl escape seq description
2808 ----------------------------------------------------------------------
2809 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2810 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2811 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2812 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2813 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2814 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2815 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2816 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2817 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2818 ----------------------------------------------------------------------
2819 (*) These are not used by any known coding system.
2820
2821 Control characters for these functions are defined by macros
2822 ISO_CODE_XXX in `coding.h'.
2823
2824 Designations are done by the following escape sequences:
2825 ----------------------------------------------------------------------
2826 escape sequence description
2827 ----------------------------------------------------------------------
2828 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2829 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2830 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2831 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2832 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2833 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2834 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2835 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2836 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2837 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2838 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2839 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2840 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2841 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2842 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2843 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2844 ----------------------------------------------------------------------
2845
2846 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2847 of dimension 1, chars 94, and final character <F>, etc...
2848
2849 Note (*): Although these designations are not allowed in ISO2022,
2850 Emacs accepts them on decoding, and produces them on encoding
2851 CHARS96 character sets in a coding system which is characterized as
2852 7-bit environment, non-locking-shift, and non-single-shift.
2853
2854 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2855 '(' must be omitted. We refer to this as "short-form" hereafter.
2856
2857 Now you may notice that there are a lot of ways of encoding the
2858 same multilingual text in ISO2022. Actually, there exist many
2859 coding systems such as Compound Text (used in X11's inter client
2860 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2861 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2862 localized platforms), and all of these are variants of ISO2022.
2863
2864 In addition to the above, Emacs handles two more kinds of escape
2865 sequences: ISO6429's direction specification and Emacs' private
2866 sequence for specifying character composition.
2867
2868 ISO6429's direction specification takes the following form:
2869 o CSI ']' -- end of the current direction
2870 o CSI '0' ']' -- end of the current direction
2871 o CSI '1' ']' -- start of left-to-right text
2872 o CSI '2' ']' -- start of right-to-left text
2873 The control character CSI (0x9B: control sequence introducer) is
2874 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2875
2876 Character composition specification takes the following form:
2877 o ESC '0' -- start relative composition
2878 o ESC '1' -- end composition
2879 o ESC '2' -- start rule-base composition (*)
2880 o ESC '3' -- start relative composition with alternate chars (**)
2881 o ESC '4' -- start rule-base composition with alternate chars (**)
2882 Since these are not standard escape sequences of any ISO standard,
2883 the use of them with these meanings is restricted to Emacs only.
2884
2885 (*) This form is used only in Emacs 20.7 and older versions,
2886 but newer versions can safely decode it.
2887 (**) This form is used only in Emacs 21.1 and newer versions,
2888 and older versions can't decode it.
2889
2890 Here's a list of example usages of these composition escape
2891 sequences (categorized by `enum composition_method').
2892
2893 COMPOSITION_RELATIVE:
2894 ESC 0 CHAR [ CHAR ] ESC 1
2895 COMPOSITION_WITH_RULE:
2896 ESC 2 CHAR [ RULE CHAR ] ESC 1
2897 COMPOSITION_WITH_ALTCHARS:
2898 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2899 COMPOSITION_WITH_RULE_ALTCHARS:
2900 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2901
2902 enum iso_code_class_type iso_code_class[256];
2903
2904 #define SAFE_CHARSET_P(coding, id) \
2905 ((id) <= (coding)->max_charset_id \
2906 && (coding)->safe_charsets[id] != 255)
2907
2908
2909 #define SHIFT_OUT_OK(category) \
2910 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2911
2912 static void
2913 setup_iso_safe_charsets (Lisp_Object attrs)
2914 {
2915 Lisp_Object charset_list, safe_charsets;
2916 Lisp_Object request;
2917 Lisp_Object reg_usage;
2918 Lisp_Object tail;
2919 int reg94, reg96;
2920 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2921 int max_charset_id;
2922
2923 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2924 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2925 && ! EQ (charset_list, Viso_2022_charset_list))
2926 {
2927 CODING_ATTR_CHARSET_LIST (attrs)
2928 = charset_list = Viso_2022_charset_list;
2929 ASET (attrs, coding_attr_safe_charsets, Qnil);
2930 }
2931
2932 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2933 return;
2934
2935 max_charset_id = 0;
2936 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2937 {
2938 int id = XINT (XCAR (tail));
2939 if (max_charset_id < id)
2940 max_charset_id = id;
2941 }
2942
2943 safe_charsets = make_uninit_string (max_charset_id + 1);
2944 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2945 request = AREF (attrs, coding_attr_iso_request);
2946 reg_usage = AREF (attrs, coding_attr_iso_usage);
2947 reg94 = XINT (XCAR (reg_usage));
2948 reg96 = XINT (XCDR (reg_usage));
2949
2950 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2951 {
2952 Lisp_Object id;
2953 Lisp_Object reg;
2954 struct charset *charset;
2955
2956 id = XCAR (tail);
2957 charset = CHARSET_FROM_ID (XINT (id));
2958 reg = Fcdr (Fassq (id, request));
2959 if (! NILP (reg))
2960 SSET (safe_charsets, XINT (id), XINT (reg));
2961 else if (charset->iso_chars_96)
2962 {
2963 if (reg96 < 4)
2964 SSET (safe_charsets, XINT (id), reg96);
2965 }
2966 else
2967 {
2968 if (reg94 < 4)
2969 SSET (safe_charsets, XINT (id), reg94);
2970 }
2971 }
2972 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2973 }
2974
2975
2976 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2977 Check if a text is encoded in one of ISO-2022 based coding systems.
2978 If it is, return 1, else return 0. */
2979
2980 static int
2981 detect_coding_iso_2022 (struct coding_system *coding,
2982 struct coding_detection_info *detect_info)
2983 {
2984 const unsigned char *src = coding->source, *src_base = src;
2985 const unsigned char *src_end = coding->source + coding->src_bytes;
2986 int multibytep = coding->src_multibyte;
2987 int single_shifting = 0;
2988 int id;
2989 int c, c1;
2990 int consumed_chars = 0;
2991 int i;
2992 int rejected = 0;
2993 int found = 0;
2994 int composition_count = -1;
2995
2996 detect_info->checked |= CATEGORY_MASK_ISO;
2997
2998 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2999 {
3000 struct coding_system *this = &(coding_categories[i]);
3001 Lisp_Object attrs, val;
3002
3003 if (this->id < 0)
3004 continue;
3005 attrs = CODING_ID_ATTRS (this->id);
3006 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3007 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3008 setup_iso_safe_charsets (attrs);
3009 val = CODING_ATTR_SAFE_CHARSETS (attrs);
3010 this->max_charset_id = SCHARS (val) - 1;
3011 this->safe_charsets = SDATA (val);
3012 }
3013
3014 /* A coding system of this category is always ASCII compatible. */
3015 src += coding->head_ascii;
3016
3017 while (rejected != CATEGORY_MASK_ISO)
3018 {
3019 src_base = src;
3020 ONE_MORE_BYTE (c);
3021 switch (c)
3022 {
3023 case ISO_CODE_ESC:
3024 if (inhibit_iso_escape_detection)
3025 break;
3026 single_shifting = 0;
3027 ONE_MORE_BYTE (c);
3028 if (c >= '(' && c <= '/')
3029 {
3030 /* Designation sequence for a charset of dimension 1. */
3031 ONE_MORE_BYTE (c1);
3032 if (c1 < ' ' || c1 >= 0x80
3033 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3034 /* Invalid designation sequence. Just ignore. */
3035 break;
3036 }
3037 else if (c == '$')
3038 {
3039 /* Designation sequence for a charset of dimension 2. */
3040 ONE_MORE_BYTE (c);
3041 if (c >= '@' && c <= 'B')
3042 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3043 id = iso_charset_table[1][0][c];
3044 else if (c >= '(' && c <= '/')
3045 {
3046 ONE_MORE_BYTE (c1);
3047 if (c1 < ' ' || c1 >= 0x80
3048 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3049 /* Invalid designation sequence. Just ignore. */
3050 break;
3051 }
3052 else
3053 /* Invalid designation sequence. Just ignore it. */
3054 break;
3055 }
3056 else if (c == 'N' || c == 'O')
3057 {
3058 /* ESC <Fe> for SS2 or SS3. */
3059 single_shifting = 1;
3060 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3061 break;
3062 }
3063 else if (c == '1')
3064 {
3065 /* End of composition. */
3066 if (composition_count < 0
3067 || composition_count > MAX_COMPOSITION_COMPONENTS)
3068 /* Invalid */
3069 break;
3070 composition_count = -1;
3071 found |= CATEGORY_MASK_ISO;
3072 }
3073 else if (c >= '0' && c <= '4')
3074 {
3075 /* ESC <Fp> for start/end composition. */
3076 composition_count = 0;
3077 break;
3078 }
3079 else
3080 {
3081 /* Invalid escape sequence. Just ignore it. */
3082 break;
3083 }
3084
3085 /* We found a valid designation sequence for CHARSET. */
3086 rejected |= CATEGORY_MASK_ISO_8BIT;
3087 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3088 id))
3089 found |= CATEGORY_MASK_ISO_7;
3090 else
3091 rejected |= CATEGORY_MASK_ISO_7;
3092 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3093 id))
3094 found |= CATEGORY_MASK_ISO_7_TIGHT;
3095 else
3096 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3097 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3098 id))
3099 found |= CATEGORY_MASK_ISO_7_ELSE;
3100 else
3101 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3102 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3103 id))
3104 found |= CATEGORY_MASK_ISO_8_ELSE;
3105 else
3106 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3107 break;
3108
3109 case ISO_CODE_SO:
3110 case ISO_CODE_SI:
3111 /* Locking shift out/in. */
3112 if (inhibit_iso_escape_detection)
3113 break;
3114 single_shifting = 0;
3115 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3116 break;
3117
3118 case ISO_CODE_CSI:
3119 /* Control sequence introducer. */
3120 single_shifting = 0;
3121 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3122 found |= CATEGORY_MASK_ISO_8_ELSE;
3123 goto check_extra_latin;
3124
3125 case ISO_CODE_SS2:
3126 case ISO_CODE_SS3:
3127 /* Single shift. */
3128 if (inhibit_iso_escape_detection)
3129 break;
3130 single_shifting = 0;
3131 rejected |= CATEGORY_MASK_ISO_7BIT;
3132 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3133 & CODING_ISO_FLAG_SINGLE_SHIFT)
3134 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3135 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3136 & CODING_ISO_FLAG_SINGLE_SHIFT)
3137 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3138 if (single_shifting)
3139 break;
3140 goto check_extra_latin;
3141
3142 default:
3143 if (c < 0)
3144 continue;
3145 if (c < 0x80)
3146 {
3147 if (composition_count >= 0)
3148 composition_count++;
3149 single_shifting = 0;
3150 break;
3151 }
3152 if (c >= 0xA0)
3153 {
3154 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3155 found |= CATEGORY_MASK_ISO_8_1;
3156 /* Check the length of succeeding codes of the range
3157 0xA0..0FF. If the byte length is even, we include
3158 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3159 only when we are not single shifting. */
3160 if (! single_shifting
3161 && ! (rejected & CATEGORY_MASK_ISO_8_2))
3162 {
3163 int i = 1;
3164 while (src < src_end)
3165 {
3166 src_base = src;
3167 ONE_MORE_BYTE (c);
3168 if (c < 0xA0)
3169 {
3170 src = src_base;
3171 break;
3172 }
3173 i++;
3174 }
3175
3176 if (i & 1 && src < src_end)
3177 {
3178 rejected |= CATEGORY_MASK_ISO_8_2;
3179 if (composition_count >= 0)
3180 composition_count += i;
3181 }
3182 else
3183 {
3184 found |= CATEGORY_MASK_ISO_8_2;
3185 if (composition_count >= 0)
3186 composition_count += i / 2;
3187 }
3188 }
3189 break;
3190 }
3191 check_extra_latin:
3192 single_shifting = 0;
3193 if (! VECTORP (Vlatin_extra_code_table)
3194 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3195 {
3196 rejected = CATEGORY_MASK_ISO;
3197 break;
3198 }
3199 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3200 & CODING_ISO_FLAG_LATIN_EXTRA)
3201 found |= CATEGORY_MASK_ISO_8_1;
3202 else
3203 rejected |= CATEGORY_MASK_ISO_8_1;
3204 rejected |= CATEGORY_MASK_ISO_8_2;
3205 }
3206 }
3207 detect_info->rejected |= CATEGORY_MASK_ISO;
3208 return 0;
3209
3210 no_more_source:
3211 detect_info->rejected |= rejected;
3212 detect_info->found |= (found & ~rejected);
3213 return 1;
3214 }
3215
3216
3217 /* Set designation state into CODING. Set CHARS_96 to -1 if the
3218 escape sequence should be kept. */
3219 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
3220 do { \
3221 int id, prev; \
3222 \
3223 if (final < '0' || final >= 128 \
3224 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
3225 || !SAFE_CHARSET_P (coding, id)) \
3226 { \
3227 CODING_ISO_DESIGNATION (coding, reg) = -2; \
3228 chars_96 = -1; \
3229 break; \
3230 } \
3231 prev = CODING_ISO_DESIGNATION (coding, reg); \
3232 if (id == charset_jisx0201_roman) \
3233 { \
3234 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3235 id = charset_ascii; \
3236 } \
3237 else if (id == charset_jisx0208_1978) \
3238 { \
3239 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3240 id = charset_jisx0208; \
3241 } \
3242 CODING_ISO_DESIGNATION (coding, reg) = id; \
3243 /* If there was an invalid designation to REG previously, and this \
3244 designation is ASCII to REG, we should keep this designation \
3245 sequence. */ \
3246 if (prev == -2 && id == charset_ascii) \
3247 chars_96 = -1; \
3248 } while (0)
3249
3250
3251 /* Handle these composition sequence (ALT: alternate char):
3252
3253 (1) relative composition: ESC 0 CHAR ... ESC 1
3254 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3255 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3256 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3257
3258 When the start sequence (ESC 0/2/3/4) is found, this annotation
3259 header is produced.
3260
3261 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3262
3263 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3264 produced until the end sequence (ESC 1) is found:
3265
3266 (1) CHAR ... CHAR
3267 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3268 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3269 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3270
3271 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3272 annotation header is updated as below:
3273
3274 (1) LENGTH: unchanged, NCHARS: number of CHARs
3275 (2) LENGTH: unchanged, NCHARS: number of CHARs
3276 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3277 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3278
3279 If an error is found while composing, the annotation header is
3280 changed to:
3281
3282 [ ESC '0'/'2'/'3'/'4' -2 0 ]
3283
3284 and the sequence [ -2 DECODED-RULE ] is changed to the original
3285 byte sequence as below:
3286 o the original byte sequence is B: [ B -1 ]
3287 o the original byte sequence is B1 B2: [ B1 B2 ]
3288 and the sequence [ -1 -1 ] is changed to the original byte
3289 sequence:
3290 [ ESC '0' ]
3291 */
3292
3293 /* Decode a composition rule C1 and maybe one more byte from the
3294 source, and set RULE to the encoded composition rule, NBYTES to the
3295 length of the composition rule. If the rule is invalid, set RULE
3296 to some negative value. */
3297
3298 #define DECODE_COMPOSITION_RULE(rule, nbytes) \
3299 do { \
3300 rule = c1 - 32; \
3301 if (rule < 0) \
3302 break; \
3303 if (rule < 81) /* old format (before ver.21) */ \
3304 { \
3305 int gref = (rule) / 9; \
3306 int nref = (rule) % 9; \
3307 if (gref == 4) gref = 10; \
3308 if (nref == 4) nref = 10; \
3309 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
3310 nbytes = 1; \
3311 } \
3312 else /* new format (after ver.21) */ \
3313 { \
3314 int c; \
3315 \
3316 ONE_MORE_BYTE (c); \
3317 rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32); \
3318 if (rule >= 0) \
3319 rule += 0x100; /* to destinguish it from the old format */ \
3320 nbytes = 2; \
3321 } \
3322 } while (0)
3323
3324 #define ENCODE_COMPOSITION_RULE(rule) \
3325 do { \
3326 int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3327 \
3328 if (rule < 0x100) /* old format */ \
3329 { \
3330 if (gref == 10) gref = 4; \
3331 if (nref == 10) nref = 4; \
3332 charbuf[idx] = 32 + gref * 9 + nref; \
3333 charbuf[idx + 1] = -1; \
3334 new_chars++; \
3335 } \
3336 else /* new format */ \
3337 { \
3338 charbuf[idx] = 32 + 81 + gref; \
3339 charbuf[idx + 1] = 32 + nref; \
3340 new_chars += 2; \
3341 } \
3342 } while (0)
3343
3344 /* Finish the current composition as invalid. */
3345
3346 static int finish_composition (int *, struct composition_status *);
3347
3348 static int
3349 finish_composition (int *charbuf, struct composition_status *cmp_status)
3350 {
3351 int idx = - cmp_status->length;
3352 int new_chars;
3353
3354 /* Recover the original ESC sequence */
3355 charbuf[idx++] = ISO_CODE_ESC;
3356 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3357 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3358 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3359 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3360 : '4');
3361 charbuf[idx++] = -2;
3362 charbuf[idx++] = 0;
3363 charbuf[idx++] = -1;
3364 new_chars = cmp_status->nchars;
3365 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3366 for (; idx < 0; idx++)
3367 {
3368 int elt = charbuf[idx];
3369
3370 if (elt == -2)
3371 {
3372 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3373 idx++;
3374 }
3375 else if (elt == -1)
3376 {
3377 charbuf[idx++] = ISO_CODE_ESC;
3378 charbuf[idx] = '0';
3379 new_chars += 2;
3380 }
3381 }
3382 cmp_status->state = COMPOSING_NO;
3383 return new_chars;
3384 }
3385
3386 /* If characters are under composition, finish the composition. */
3387 #define MAYBE_FINISH_COMPOSITION() \
3388 do { \
3389 if (cmp_status->state != COMPOSING_NO) \
3390 char_offset += finish_composition (charbuf, cmp_status); \
3391 } while (0)
3392
3393 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3394
3395 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3396 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3397 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3398 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3399
3400 Produce this annotation sequence now:
3401
3402 [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3403 */
3404
3405 #define DECODE_COMPOSITION_START(c1) \
3406 do { \
3407 if (c1 == '0' \
3408 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3409 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3410 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3411 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3412 { \
3413 *charbuf++ = -1; \
3414 *charbuf++= -1; \
3415 cmp_status->state = COMPOSING_CHAR; \
3416 cmp_status->length += 2; \
3417 } \
3418 else \
3419 { \
3420 MAYBE_FINISH_COMPOSITION (); \
3421 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3422 : c1 == '2' ? COMPOSITION_WITH_RULE \
3423 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3424 : COMPOSITION_WITH_RULE_ALTCHARS); \
3425 cmp_status->state \
3426 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3427 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3428 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3429 cmp_status->nchars = cmp_status->ncomps = 0; \
3430 coding->annotated = 1; \
3431 } \
3432 } while (0)
3433
3434
3435 /* Handle composition end sequence ESC 1. */
3436
3437 #define DECODE_COMPOSITION_END() \
3438 do { \
3439 if (cmp_status->nchars == 0 \
3440 || ((cmp_status->state == COMPOSING_CHAR) \
3441 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
3442 { \
3443 MAYBE_FINISH_COMPOSITION (); \
3444 goto invalid_code; \
3445 } \
3446 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3447 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3448 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3449 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3450 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3451 char_offset += cmp_status->nchars; \
3452 cmp_status->state = COMPOSING_NO; \
3453 } while (0)
3454
3455 /* Store a composition rule RULE in charbuf, and update cmp_status. */
3456
3457 #define STORE_COMPOSITION_RULE(rule) \
3458 do { \
3459 *charbuf++ = -2; \
3460 *charbuf++ = rule; \
3461 cmp_status->length += 2; \
3462 cmp_status->state--; \
3463 } while (0)
3464
3465 /* Store a composed char or a component char C in charbuf, and update
3466 cmp_status. */
3467
3468 #define STORE_COMPOSITION_CHAR(c) \
3469 do { \
3470 *charbuf++ = (c); \
3471 cmp_status->length++; \
3472 if (cmp_status->state == COMPOSING_CHAR) \
3473 cmp_status->nchars++; \
3474 else \
3475 cmp_status->ncomps++; \
3476 if (cmp_status->method == COMPOSITION_WITH_RULE \
3477 || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS \
3478 && cmp_status->state == COMPOSING_COMPONENT_CHAR)) \
3479 cmp_status->state++; \
3480 } while (0)
3481
3482
3483 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3484
3485 static void
3486 decode_coding_iso_2022 (struct coding_system *coding)
3487 {
3488 const unsigned char *src = coding->source + coding->consumed;
3489 const unsigned char *src_end = coding->source + coding->src_bytes;
3490 const unsigned char *src_base;
3491 int *charbuf = coding->charbuf + coding->charbuf_used;
3492 /* We may produce two annotations (charset and composition) in one
3493 loop and one more charset annotation at the end. */
3494 int *charbuf_end
3495 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3496 int consumed_chars = 0, consumed_chars_base;
3497 int multibytep = coding->src_multibyte;
3498 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3499 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3500 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3501 int charset_id_2, charset_id_3;
3502 struct charset *charset;
3503 int c;
3504 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3505 Lisp_Object attrs, charset_list;
3506 int char_offset = coding->produced_char;
3507 int last_offset = char_offset;
3508 int last_id = charset_ascii;
3509 int eol_crlf =
3510 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3511 int byte_after_cr = -1;
3512 int i;
3513
3514 CODING_GET_INFO (coding, attrs, charset_list);
3515 setup_iso_safe_charsets (attrs);
3516 /* Charset list may have been changed. */
3517 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3518 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3519
3520 if (cmp_status->state != COMPOSING_NO)
3521 {
3522 for (i = 0; i < cmp_status->length; i++)
3523 *charbuf++ = cmp_status->carryover[i];
3524 coding->annotated = 1;
3525 }
3526
3527 while (1)
3528 {
3529 int c1, c2, c3;
3530
3531 src_base = src;
3532 consumed_chars_base = consumed_chars;
3533
3534 if (charbuf >= charbuf_end)
3535 {
3536 if (byte_after_cr >= 0)
3537 src_base--;
3538 break;
3539 }
3540
3541 if (byte_after_cr >= 0)
3542 c1 = byte_after_cr, byte_after_cr = -1;
3543 else
3544 ONE_MORE_BYTE (c1);
3545 if (c1 < 0)
3546 goto invalid_code;
3547
3548 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3549 {
3550 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3551 char_offset++;
3552 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3553 continue;
3554 }
3555
3556 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3557 {
3558 if (c1 == ISO_CODE_ESC)
3559 {
3560 if (src + 1 >= src_end)
3561 goto no_more_source;
3562 *charbuf++ = ISO_CODE_ESC;
3563 char_offset++;
3564 if (src[0] == '%' && src[1] == '@')
3565 {
3566 src += 2;
3567 consumed_chars += 2;
3568 char_offset += 2;
3569 /* We are sure charbuf can contain two more chars. */
3570 *charbuf++ = '%';
3571 *charbuf++ = '@';
3572 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3573 }
3574 }
3575 else
3576 {
3577 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3578 char_offset++;
3579 }
3580 continue;
3581 }
3582
3583 if ((cmp_status->state == COMPOSING_RULE
3584 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3585 && c1 != ISO_CODE_ESC)
3586 {
3587 int rule, nbytes;
3588
3589 DECODE_COMPOSITION_RULE (rule, nbytes);
3590 if (rule < 0)
3591 goto invalid_code;
3592 STORE_COMPOSITION_RULE (rule);
3593 continue;
3594 }
3595
3596 /* We produce at most one character. */
3597 switch (iso_code_class [c1])
3598 {
3599 case ISO_0x20_or_0x7F:
3600 if (charset_id_0 < 0
3601 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3602 /* This is SPACE or DEL. */
3603 charset = CHARSET_FROM_ID (charset_ascii);
3604 else
3605 charset = CHARSET_FROM_ID (charset_id_0);
3606 break;
3607
3608 case ISO_graphic_plane_0:
3609 if (charset_id_0 < 0)
3610 charset = CHARSET_FROM_ID (charset_ascii);
3611 else
3612 charset = CHARSET_FROM_ID (charset_id_0);
3613 break;
3614
3615 case ISO_0xA0_or_0xFF:
3616 if (charset_id_1 < 0
3617 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3618 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3619 goto invalid_code;
3620 /* This is a graphic character, we fall down ... */
3621
3622 case ISO_graphic_plane_1:
3623 if (charset_id_1 < 0)
3624 goto invalid_code;
3625 charset = CHARSET_FROM_ID (charset_id_1);
3626 break;
3627
3628 case ISO_control_0:
3629 if (eol_crlf && c1 == '\r')
3630 ONE_MORE_BYTE (byte_after_cr);
3631 MAYBE_FINISH_COMPOSITION ();
3632 charset = CHARSET_FROM_ID (charset_ascii);
3633 break;
3634
3635 case ISO_control_1:
3636 goto invalid_code;
3637
3638 case ISO_shift_out:
3639 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3640 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3641 goto invalid_code;
3642 CODING_ISO_INVOCATION (coding, 0) = 1;
3643 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644 continue;
3645
3646 case ISO_shift_in:
3647 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3648 goto invalid_code;
3649 CODING_ISO_INVOCATION (coding, 0) = 0;
3650 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3651 continue;
3652
3653 case ISO_single_shift_2_7:
3654 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3655 goto invalid_code;
3656 case ISO_single_shift_2:
3657 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3658 goto invalid_code;
3659 /* SS2 is handled as an escape sequence of ESC 'N' */
3660 c1 = 'N';
3661 goto label_escape_sequence;
3662
3663 case ISO_single_shift_3:
3664 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3665 goto invalid_code;
3666 /* SS2 is handled as an escape sequence of ESC 'O' */
3667 c1 = 'O';
3668 goto label_escape_sequence;
3669
3670 case ISO_control_sequence_introducer:
3671 /* CSI is handled as an escape sequence of ESC '[' ... */
3672 c1 = '[';
3673 goto label_escape_sequence;
3674
3675 case ISO_escape:
3676 ONE_MORE_BYTE (c1);
3677 label_escape_sequence:
3678 /* Escape sequences handled here are invocation,
3679 designation, direction specification, and character
3680 composition specification. */
3681 switch (c1)
3682 {
3683 case '&': /* revision of following character set */
3684 ONE_MORE_BYTE (c1);
3685 if (!(c1 >= '@' && c1 <= '~'))
3686 goto invalid_code;
3687 ONE_MORE_BYTE (c1);
3688 if (c1 != ISO_CODE_ESC)
3689 goto invalid_code;
3690 ONE_MORE_BYTE (c1);
3691 goto label_escape_sequence;
3692
3693 case '$': /* designation of 2-byte character set */
3694 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3695 goto invalid_code;
3696 {
3697 int reg, chars96;
3698
3699 ONE_MORE_BYTE (c1);
3700 if (c1 >= '@' && c1 <= 'B')
3701 { /* designation of JISX0208.1978, GB2312.1980,
3702 or JISX0208.1980 */
3703 reg = 0, chars96 = 0;
3704 }
3705 else if (c1 >= 0x28 && c1 <= 0x2B)
3706 { /* designation of DIMENSION2_CHARS94 character set */
3707 reg = c1 - 0x28, chars96 = 0;
3708 ONE_MORE_BYTE (c1);
3709 }
3710 else if (c1 >= 0x2C && c1 <= 0x2F)
3711 { /* designation of DIMENSION2_CHARS96 character set */
3712 reg = c1 - 0x2C, chars96 = 1;
3713 ONE_MORE_BYTE (c1);
3714 }
3715 else
3716 goto invalid_code;
3717 DECODE_DESIGNATION (reg, 2, chars96, c1);
3718 /* We must update these variables now. */
3719 if (reg == 0)
3720 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3721 else if (reg == 1)
3722 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3723 if (chars96 < 0)
3724 goto invalid_code;
3725 }
3726 continue;
3727
3728 case 'n': /* invocation of locking-shift-2 */
3729 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3730 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3731 goto invalid_code;
3732 CODING_ISO_INVOCATION (coding, 0) = 2;
3733 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3734 continue;
3735
3736 case 'o': /* invocation of locking-shift-3 */
3737 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3738 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3739 goto invalid_code;
3740 CODING_ISO_INVOCATION (coding, 0) = 3;
3741 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3742 continue;
3743
3744 case 'N': /* invocation of single-shift-2 */
3745 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3746 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3747 goto invalid_code;
3748 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3749 if (charset_id_2 < 0)
3750 charset = CHARSET_FROM_ID (charset_ascii);
3751 else
3752 charset = CHARSET_FROM_ID (charset_id_2);
3753 ONE_MORE_BYTE (c1);
3754 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3755 goto invalid_code;
3756 break;
3757
3758 case 'O': /* invocation of single-shift-3 */
3759 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3760 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3761 goto invalid_code;
3762 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3763 if (charset_id_3 < 0)
3764 charset = CHARSET_FROM_ID (charset_ascii);
3765 else
3766 charset = CHARSET_FROM_ID (charset_id_3);
3767 ONE_MORE_BYTE (c1);
3768 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3769 goto invalid_code;
3770 break;
3771
3772 case '0': case '2': case '3': case '4': /* start composition */
3773 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3774 goto invalid_code;
3775 if (last_id != charset_ascii)
3776 {
3777 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3778 last_id = charset_ascii;
3779 last_offset = char_offset;
3780 }
3781 DECODE_COMPOSITION_START (c1);
3782 continue;
3783
3784 case '1': /* end composition */
3785 if (cmp_status->state == COMPOSING_NO)
3786 goto invalid_code;
3787 DECODE_COMPOSITION_END ();
3788 continue;
3789
3790 case '[': /* specification of direction */
3791 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3792 goto invalid_code;
3793 /* For the moment, nested direction is not supported.
3794 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3795 left-to-right, and nonzero means right-to-left. */
3796 ONE_MORE_BYTE (c1);
3797 switch (c1)
3798 {
3799 case ']': /* end of the current direction */
3800 coding->mode &= ~CODING_MODE_DIRECTION;
3801
3802 case '0': /* end of the current direction */
3803 case '1': /* start of left-to-right direction */
3804 ONE_MORE_BYTE (c1);
3805 if (c1 == ']')
3806 coding->mode &= ~CODING_MODE_DIRECTION;
3807 else
3808 goto invalid_code;
3809 break;
3810
3811 case '2': /* start of right-to-left direction */
3812 ONE_MORE_BYTE (c1);
3813 if (c1 == ']')
3814 coding->mode |= CODING_MODE_DIRECTION;
3815 else
3816 goto invalid_code;
3817 break;
3818
3819 default:
3820 goto invalid_code;
3821 }
3822 continue;
3823
3824 case '%':
3825 ONE_MORE_BYTE (c1);
3826 if (c1 == '/')
3827 {
3828 /* CTEXT extended segment:
3829 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3830 We keep these bytes as is for the moment.
3831 They may be decoded by post-read-conversion. */
3832 int dim, M, L;
3833 int size;
3834
3835 ONE_MORE_BYTE (dim);
3836 if (dim < '0' || dim > '4')
3837 goto invalid_code;
3838 ONE_MORE_BYTE (M);
3839 if (M < 128)
3840 goto invalid_code;
3841 ONE_MORE_BYTE (L);
3842 if (L < 128)
3843 goto invalid_code;
3844 size = ((M - 128) * 128) + (L - 128);
3845 if (charbuf + 6 > charbuf_end)
3846 goto break_loop;
3847 *charbuf++ = ISO_CODE_ESC;
3848 *charbuf++ = '%';
3849 *charbuf++ = '/';
3850 *charbuf++ = dim;
3851 *charbuf++ = BYTE8_TO_CHAR (M);
3852 *charbuf++ = BYTE8_TO_CHAR (L);
3853 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3854 }
3855 else if (c1 == 'G')
3856 {
3857 /* XFree86 extension for embedding UTF-8 in CTEXT:
3858 ESC % G --UTF-8-BYTES-- ESC % @
3859 We keep these bytes as is for the moment.
3860 They may be decoded by post-read-conversion. */
3861 if (charbuf + 3 > charbuf_end)
3862 goto break_loop;
3863 *charbuf++ = ISO_CODE_ESC;
3864 *charbuf++ = '%';
3865 *charbuf++ = 'G';
3866 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3867 }
3868 else
3869 goto invalid_code;
3870 continue;
3871 break;
3872
3873 default:
3874 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3875 goto invalid_code;
3876 {
3877 int reg, chars96;
3878
3879 if (c1 >= 0x28 && c1 <= 0x2B)
3880 { /* designation of DIMENSION1_CHARS94 character set */
3881 reg = c1 - 0x28, chars96 = 0;
3882 ONE_MORE_BYTE (c1);
3883 }
3884 else if (c1 >= 0x2C && c1 <= 0x2F)
3885 { /* designation of DIMENSION1_CHARS96 character set */
3886 reg = c1 - 0x2C, chars96 = 1;
3887 ONE_MORE_BYTE (c1);
3888 }
3889 else
3890 goto invalid_code;
3891 DECODE_DESIGNATION (reg, 1, chars96, c1);
3892 /* We must update these variables now. */
3893 if (reg == 0)
3894 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3895 else if (reg == 1)
3896 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3897 if (chars96 < 0)
3898 goto invalid_code;
3899 }
3900 continue;
3901 }
3902 }
3903
3904 if (cmp_status->state == COMPOSING_NO
3905 && charset->id != charset_ascii
3906 && last_id != charset->id)
3907 {
3908 if (last_id != charset_ascii)
3909 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3910 last_id = charset->id;
3911 last_offset = char_offset;
3912 }
3913
3914 /* Now we know CHARSET and 1st position code C1 of a character.
3915 Produce a decoded character while getting 2nd and 3rd
3916 position codes C2, C3 if necessary. */
3917 if (CHARSET_DIMENSION (charset) > 1)
3918 {
3919 ONE_MORE_BYTE (c2);
3920 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3921 || ((c1 & 0x80) != (c2 & 0x80)))
3922 /* C2 is not in a valid range. */
3923 goto invalid_code;
3924 if (CHARSET_DIMENSION (charset) == 2)
3925 c1 = (c1 << 8) | c2;
3926 else
3927 {
3928 ONE_MORE_BYTE (c3);
3929 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3930 || ((c1 & 0x80) != (c3 & 0x80)))
3931 /* C3 is not in a valid range. */
3932 goto invalid_code;
3933 c1 = (c1 << 16) | (c2 << 8) | c2;
3934 }
3935 }
3936 c1 &= 0x7F7F7F;
3937 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3938 if (c < 0)
3939 {
3940 MAYBE_FINISH_COMPOSITION ();
3941 for (; src_base < src; src_base++, char_offset++)
3942 {
3943 if (ASCII_BYTE_P (*src_base))
3944 *charbuf++ = *src_base;
3945 else
3946 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3947 }
3948 }
3949 else if (cmp_status->state == COMPOSING_NO)
3950 {
3951 *charbuf++ = c;
3952 char_offset++;
3953 }
3954 else if ((cmp_status->state == COMPOSING_CHAR
3955 ? cmp_status->nchars
3956 : cmp_status->ncomps)
3957 >= MAX_COMPOSITION_COMPONENTS)
3958 {
3959 /* Too long composition. */
3960 MAYBE_FINISH_COMPOSITION ();
3961 *charbuf++ = c;
3962 char_offset++;
3963 }
3964 else
3965 STORE_COMPOSITION_CHAR (c);
3966 continue;
3967
3968 invalid_code:
3969 MAYBE_FINISH_COMPOSITION ();
3970 src = src_base;
3971 consumed_chars = consumed_chars_base;
3972 ONE_MORE_BYTE (c);
3973 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3974 char_offset++;
3975 coding->errors++;
3976 continue;
3977
3978 break_loop:
3979 break;
3980 }
3981
3982 no_more_source:
3983 if (cmp_status->state != COMPOSING_NO)
3984 {
3985 if (coding->mode & CODING_MODE_LAST_BLOCK)
3986 MAYBE_FINISH_COMPOSITION ();
3987 else
3988 {
3989 charbuf -= cmp_status->length;
3990 for (i = 0; i < cmp_status->length; i++)
3991 cmp_status->carryover[i] = charbuf[i];
3992 }
3993 }
3994 else if (last_id != charset_ascii)
3995 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3996 coding->consumed_char += consumed_chars_base;
3997 coding->consumed = src_base - coding->source;
3998 coding->charbuf_used = charbuf - coding->charbuf;
3999 }
4000
4001
4002 /* ISO2022 encoding stuff. */
4003
4004 /*
4005 It is not enough to say just "ISO2022" on encoding, we have to
4006 specify more details. In Emacs, each coding system of ISO2022
4007 variant has the following specifications:
4008 1. Initial designation to G0 thru G3.
4009 2. Allows short-form designation?
4010 3. ASCII should be designated to G0 before control characters?
4011 4. ASCII should be designated to G0 at end of line?
4012 5. 7-bit environment or 8-bit environment?
4013 6. Use locking-shift?
4014 7. Use Single-shift?
4015 And the following two are only for Japanese:
4016 8. Use ASCII in place of JIS0201-1976-Roman?
4017 9. Use JISX0208-1983 in place of JISX0208-1978?
4018 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4019 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
4020 details.
4021 */
4022
4023 /* Produce codes (escape sequence) for designating CHARSET to graphic
4024 register REG at DST, and increment DST. If <final-char> of CHARSET is
4025 '@', 'A', or 'B' and the coding system CODING allows, produce
4026 designation sequence of short-form. */
4027
4028 #define ENCODE_DESIGNATION(charset, reg, coding) \
4029 do { \
4030 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4031 const char *intermediate_char_94 = "()*+"; \
4032 const char *intermediate_char_96 = ",-./"; \
4033 int revision = -1; \
4034 int c; \
4035 \
4036 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
4037 revision = CHARSET_ISO_REVISION (charset); \
4038 \
4039 if (revision >= 0) \
4040 { \
4041 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
4042 EMIT_ONE_BYTE ('@' + revision); \
4043 } \
4044 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4045 if (CHARSET_DIMENSION (charset) == 1) \
4046 { \
4047 if (! CHARSET_ISO_CHARS_96 (charset)) \
4048 c = intermediate_char_94[reg]; \
4049 else \
4050 c = intermediate_char_96[reg]; \
4051 EMIT_ONE_ASCII_BYTE (c); \
4052 } \
4053 else \
4054 { \
4055 EMIT_ONE_ASCII_BYTE ('$'); \
4056 if (! CHARSET_ISO_CHARS_96 (charset)) \
4057 { \
4058 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
4059 || reg != 0 \
4060 || final_char < '@' || final_char > 'B') \
4061 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4062 } \
4063 else \
4064 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4065 } \
4066 EMIT_ONE_ASCII_BYTE (final_char); \
4067 \
4068 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4069 } while (0)
4070
4071
4072 /* The following two macros produce codes (control character or escape
4073 sequence) for ISO2022 single-shift functions (single-shift-2 and
4074 single-shift-3). */
4075
4076 #define ENCODE_SINGLE_SHIFT_2 \
4077 do { \
4078 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4079 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
4080 else \
4081 EMIT_ONE_BYTE (ISO_CODE_SS2); \
4082 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4083 } while (0)
4084
4085
4086 #define ENCODE_SINGLE_SHIFT_3 \
4087 do { \
4088 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4089 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
4090 else \
4091 EMIT_ONE_BYTE (ISO_CODE_SS3); \
4092 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4093 } while (0)
4094
4095
4096 /* The following four macros produce codes (control character or
4097 escape sequence) for ISO2022 locking-shift functions (shift-in,
4098 shift-out, locking-shift-2, and locking-shift-3). */
4099
4100 #define ENCODE_SHIFT_IN \
4101 do { \
4102 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
4103 CODING_ISO_INVOCATION (coding, 0) = 0; \
4104 } while (0)
4105
4106
4107 #define ENCODE_SHIFT_OUT \
4108 do { \
4109 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
4110 CODING_ISO_INVOCATION (coding, 0) = 1; \
4111 } while (0)
4112
4113
4114 #define ENCODE_LOCKING_SHIFT_2 \
4115 do { \
4116 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4117 CODING_ISO_INVOCATION (coding, 0) = 2; \
4118 } while (0)
4119
4120
4121 #define ENCODE_LOCKING_SHIFT_3 \
4122 do { \
4123 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
4124 CODING_ISO_INVOCATION (coding, 0) = 3; \
4125 } while (0)
4126
4127
4128 /* Produce codes for a DIMENSION1 character whose character set is
4129 CHARSET and whose position-code is C1. Designation and invocation
4130 sequences are also produced in advance if necessary. */
4131
4132 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
4133 do { \
4134 int id = CHARSET_ID (charset); \
4135 \
4136 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
4137 && id == charset_ascii) \
4138 { \
4139 id = charset_jisx0201_roman; \
4140 charset = CHARSET_FROM_ID (id); \
4141 } \
4142 \
4143 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4144 { \
4145 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4146 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4147 else \
4148 EMIT_ONE_BYTE (c1 | 0x80); \
4149 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4150 break; \
4151 } \
4152 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4153 { \
4154 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
4155 break; \
4156 } \
4157 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4158 { \
4159 EMIT_ONE_BYTE (c1 | 0x80); \
4160 break; \
4161 } \
4162 else \
4163 /* Since CHARSET is not yet invoked to any graphic planes, we \
4164 must invoke it, or, at first, designate it to some graphic \
4165 register. Then repeat the loop to actually produce the \
4166 character. */ \
4167 dst = encode_invocation_designation (charset, coding, dst, \
4168 &produced_chars); \
4169 } while (1)
4170
4171
4172 /* Produce codes for a DIMENSION2 character whose character set is
4173 CHARSET and whose position-codes are C1 and C2. Designation and
4174 invocation codes are also produced in advance if necessary. */
4175
4176 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
4177 do { \
4178 int id = CHARSET_ID (charset); \
4179 \
4180 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
4181 && id == charset_jisx0208) \
4182 { \
4183 id = charset_jisx0208_1978; \
4184 charset = CHARSET_FROM_ID (id); \
4185 } \
4186 \
4187 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
4188 { \
4189 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
4190 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4191 else \
4192 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4193 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
4194 break; \
4195 } \
4196 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
4197 { \
4198 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
4199 break; \
4200 } \
4201 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
4202 { \
4203 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
4204 break; \
4205 } \
4206 else \
4207 /* Since CHARSET is not yet invoked to any graphic planes, we \
4208 must invoke it, or, at first, designate it to some graphic \
4209 register. Then repeat the loop to actually produce the \
4210 character. */ \
4211 dst = encode_invocation_designation (charset, coding, dst, \
4212 &produced_chars); \
4213 } while (1)
4214
4215
4216 #define ENCODE_ISO_CHARACTER(charset, c) \
4217 do { \
4218 int code = ENCODE_CHAR ((charset), (c)); \
4219 \
4220 if (CHARSET_DIMENSION (charset) == 1) \
4221 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
4222 else \
4223 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4224 } while (0)
4225
4226
4227 /* Produce designation and invocation codes at a place pointed by DST
4228 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4229 Return new DST. */
4230
4231 unsigned char *
4232 encode_invocation_designation (struct charset *charset,
4233 struct coding_system *coding,
4234 unsigned char *dst, int *p_nchars)
4235 {
4236 int multibytep = coding->dst_multibyte;
4237 int produced_chars = *p_nchars;
4238 int reg; /* graphic register number */
4239 int id = CHARSET_ID (charset);
4240
4241 /* At first, check designations. */
4242 for (reg = 0; reg < 4; reg++)
4243 if (id == CODING_ISO_DESIGNATION (coding, reg))
4244 break;
4245
4246 if (reg >= 4)
4247 {
4248 /* CHARSET is not yet designated to any graphic registers. */
4249 /* At first check the requested designation. */
4250 reg = CODING_ISO_REQUEST (coding, id);
4251 if (reg < 0)
4252 /* Since CHARSET requests no special designation, designate it
4253 to graphic register 0. */
4254 reg = 0;
4255
4256 ENCODE_DESIGNATION (charset, reg, coding);
4257 }
4258
4259 if (CODING_ISO_INVOCATION (coding, 0) != reg
4260 && CODING_ISO_INVOCATION (coding, 1) != reg)
4261 {
4262 /* Since the graphic register REG is not invoked to any graphic
4263 planes, invoke it to graphic plane 0. */
4264 switch (reg)
4265 {
4266 case 0: /* graphic register 0 */
4267 ENCODE_SHIFT_IN;
4268 break;
4269
4270 case 1: /* graphic register 1 */
4271 ENCODE_SHIFT_OUT;
4272 break;
4273
4274 case 2: /* graphic register 2 */
4275 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4276 ENCODE_SINGLE_SHIFT_2;
4277 else
4278 ENCODE_LOCKING_SHIFT_2;
4279 break;
4280
4281 case 3: /* graphic register 3 */
4282 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4283 ENCODE_SINGLE_SHIFT_3;
4284 else
4285 ENCODE_LOCKING_SHIFT_3;
4286 break;
4287 }
4288 }
4289
4290 *p_nchars = produced_chars;
4291 return dst;
4292 }
4293
4294 /* The following three macros produce codes for indicating direction
4295 of text. */
4296 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
4297 do { \
4298 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
4299 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
4300 else \
4301 EMIT_ONE_BYTE (ISO_CODE_CSI); \
4302 } while (0)
4303
4304
4305 #define ENCODE_DIRECTION_R2L() \
4306 do { \
4307 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4308 EMIT_TWO_ASCII_BYTES ('2', ']'); \
4309 } while (0)
4310
4311
4312 #define ENCODE_DIRECTION_L2R() \
4313 do { \
4314 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
4315 EMIT_TWO_ASCII_BYTES ('0', ']'); \
4316 } while (0)
4317
4318
4319 /* Produce codes for designation and invocation to reset the graphic
4320 planes and registers to initial state. */
4321 #define ENCODE_RESET_PLANE_AND_REGISTER() \
4322 do { \
4323 int reg; \
4324 struct charset *charset; \
4325 \
4326 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
4327 ENCODE_SHIFT_IN; \
4328 for (reg = 0; reg < 4; reg++) \
4329 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
4330 && (CODING_ISO_DESIGNATION (coding, reg) \
4331 != CODING_ISO_INITIAL (coding, reg))) \
4332 { \
4333 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4334 ENCODE_DESIGNATION (charset, reg, coding); \
4335 } \
4336 } while (0)
4337
4338
4339 /* Produce designation sequences of charsets in the line started from
4340 SRC to a place pointed by DST, and return updated DST.
4341
4342 If the current block ends before any end-of-line, we may fail to
4343 find all the necessary designations. */
4344
4345 static unsigned char *
4346 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4347 int *charbuf_end, unsigned char *dst)
4348 {
4349 struct charset *charset;
4350 /* Table of charsets to be designated to each graphic register. */
4351 int r[4];
4352 int c, found = 0, reg;
4353 int produced_chars = 0;
4354 int multibytep = coding->dst_multibyte;
4355 Lisp_Object attrs;
4356 Lisp_Object charset_list;
4357
4358 attrs = CODING_ID_ATTRS (coding->id);
4359 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4360 if (EQ (charset_list, Qiso_2022))
4361 charset_list = Viso_2022_charset_list;
4362
4363 for (reg = 0; reg < 4; reg++)
4364 r[reg] = -1;
4365
4366 while (found < 4)
4367 {
4368 int id;
4369
4370 c = *charbuf++;
4371 if (c == '\n')
4372 break;
4373 charset = char_charset (c, charset_list, NULL);
4374 id = CHARSET_ID (charset);
4375 reg = CODING_ISO_REQUEST (coding, id);
4376 if (reg >= 0 && r[reg] < 0)
4377 {
4378 found++;
4379 r[reg] = id;
4380 }
4381 }
4382
4383 if (found)
4384 {
4385 for (reg = 0; reg < 4; reg++)
4386 if (r[reg] >= 0
4387 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4388 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4389 }
4390
4391 return dst;
4392 }
4393
4394 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4395
4396 static int
4397 encode_coding_iso_2022 (struct coding_system *coding)
4398 {
4399 int multibytep = coding->dst_multibyte;
4400 int *charbuf = coding->charbuf;
4401 int *charbuf_end = charbuf + coding->charbuf_used;
4402 unsigned char *dst = coding->destination + coding->produced;
4403 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4404 int safe_room = 16;
4405 int bol_designation
4406 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4407 && CODING_ISO_BOL (coding));
4408 int produced_chars = 0;
4409 Lisp_Object attrs, eol_type, charset_list;
4410 int ascii_compatible;
4411 int c;
4412 int preferred_charset_id = -1;
4413
4414 CODING_GET_INFO (coding, attrs, charset_list);
4415 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4416 if (VECTORP (eol_type))
4417 eol_type = Qunix;
4418
4419 setup_iso_safe_charsets (attrs);
4420 /* Charset list may have been changed. */
4421 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4422 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4423
4424 ascii_compatible
4425 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4426 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4427 | CODING_ISO_FLAG_LOCKING_SHIFT)));
4428
4429 while (charbuf < charbuf_end)
4430 {
4431 ASSURE_DESTINATION (safe_room);
4432
4433 if (bol_designation)
4434 {
4435 unsigned char *dst_prev = dst;
4436
4437 /* We have to produce designation sequences if any now. */
4438 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4439 bol_designation = 0;
4440 /* We are sure that designation sequences are all ASCII bytes. */
4441 produced_chars += dst - dst_prev;
4442 }
4443
4444 c = *charbuf++;
4445
4446 if (c < 0)
4447 {
4448 /* Handle an annotation. */
4449 switch (*charbuf)
4450 {
4451 case CODING_ANNOTATE_COMPOSITION_MASK:
4452 /* Not yet implemented. */
4453 break;
4454 case CODING_ANNOTATE_CHARSET_MASK:
4455 preferred_charset_id = charbuf[2];
4456 if (preferred_charset_id >= 0
4457 && NILP (Fmemq (make_number (preferred_charset_id),
4458 charset_list)))
4459 preferred_charset_id = -1;
4460 break;
4461 default:
4462 abort ();
4463 }
4464 charbuf += -c - 1;
4465 continue;
4466 }
4467
4468 /* Now encode the character C. */
4469 if (c < 0x20 || c == 0x7F)
4470 {
4471 if (c == '\n'
4472 || (c == '\r' && EQ (eol_type, Qmac)))
4473 {
4474 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4475 ENCODE_RESET_PLANE_AND_REGISTER ();
4476 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4477 {
4478 int i;
4479
4480 for (i = 0; i < 4; i++)
4481 CODING_ISO_DESIGNATION (coding, i)
4482 = CODING_ISO_INITIAL (coding, i);
4483 }
4484 bol_designation
4485 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4486 }
4487 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4488 ENCODE_RESET_PLANE_AND_REGISTER ();
4489 EMIT_ONE_ASCII_BYTE (c);
4490 }
4491 else if (ASCII_CHAR_P (c))
4492 {
4493 if (ascii_compatible)
4494 EMIT_ONE_ASCII_BYTE (c);
4495 else
4496 {
4497 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4498 ENCODE_ISO_CHARACTER (charset, c);
4499 }
4500 }
4501 else if (CHAR_BYTE8_P (c))
4502 {
4503 c = CHAR_TO_BYTE8 (c);
4504 EMIT_ONE_BYTE (c);
4505 }
4506 else
4507 {
4508 struct charset *charset;
4509
4510 if (preferred_charset_id >= 0)
4511 {
4512 charset = CHARSET_FROM_ID (preferred_charset_id);
4513 if (! CHAR_CHARSET_P (c, charset))
4514 charset = char_charset (c, charset_list, NULL);
4515 }
4516 else
4517 charset = char_charset (c, charset_list, NULL);
4518 if (!charset)
4519 {
4520 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4521 {
4522 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4523 charset = CHARSET_FROM_ID (charset_ascii);
4524 }
4525 else
4526 {
4527 c = coding->default_char;
4528 charset = char_charset (c, charset_list, NULL);
4529 }
4530 }
4531 ENCODE_ISO_CHARACTER (charset, c);
4532 }
4533 }
4534
4535 if (coding->mode & CODING_MODE_LAST_BLOCK
4536 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4537 {
4538 ASSURE_DESTINATION (safe_room);
4539 ENCODE_RESET_PLANE_AND_REGISTER ();
4540 }
4541 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4542 CODING_ISO_BOL (coding) = bol_designation;
4543 coding->produced_char += produced_chars;
4544 coding->produced = dst - coding->destination;
4545 return 0;
4546 }
4547
4548 \f
4549 /*** 8,9. SJIS and BIG5 handlers ***/
4550
4551 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4552 quite widely. So, for the moment, Emacs supports them in the bare
4553 C code. But, in the future, they may be supported only by CCL. */
4554
4555 /* SJIS is a coding system encoding three character sets: ASCII, right
4556 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4557 as is. A character of charset katakana-jisx0201 is encoded by
4558 "position-code + 0x80". A character of charset japanese-jisx0208
4559 is encoded in 2-byte but two position-codes are divided and shifted
4560 so that it fit in the range below.
4561
4562 --- CODE RANGE of SJIS ---
4563 (character set) (range)
4564 ASCII 0x00 .. 0x7F
4565 KATAKANA-JISX0201 0xA0 .. 0xDF
4566 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4567 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4568 -------------------------------
4569
4570 */
4571
4572 /* BIG5 is a coding system encoding two character sets: ASCII and
4573 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4574 character set and is encoded in two-byte.
4575
4576 --- CODE RANGE of BIG5 ---
4577 (character set) (range)
4578 ASCII 0x00 .. 0x7F
4579 Big5 (1st byte) 0xA1 .. 0xFE
4580 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4581 --------------------------
4582
4583 */
4584
4585 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4586 Check if a text is encoded in SJIS. If it is, return
4587 CATEGORY_MASK_SJIS, else return 0. */
4588
4589 static int
4590 detect_coding_sjis (struct coding_system *coding,
4591 struct coding_detection_info *detect_info)
4592 {
4593 const unsigned char *src = coding->source, *src_base;
4594 const unsigned char *src_end = coding->source + coding->src_bytes;
4595 int multibytep = coding->src_multibyte;
4596 int consumed_chars = 0;
4597 int found = 0;
4598 int c;
4599 Lisp_Object attrs, charset_list;
4600 int max_first_byte_of_2_byte_code;
4601
4602 CODING_GET_INFO (coding, attrs, charset_list);
4603 max_first_byte_of_2_byte_code
4604 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4605
4606 detect_info->checked |= CATEGORY_MASK_SJIS;
4607 /* A coding system of this category is always ASCII compatible. */
4608 src += coding->head_ascii;
4609
4610 while (1)
4611 {
4612 src_base = src;
4613 ONE_MORE_BYTE (c);
4614 if (c < 0x80)
4615 continue;
4616 if ((c >= 0x81 && c <= 0x9F)
4617 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4618 {
4619 ONE_MORE_BYTE (c);
4620 if (c < 0x40 || c == 0x7F || c > 0xFC)
4621 break;
4622 found = CATEGORY_MASK_SJIS;
4623 }
4624 else if (c >= 0xA0 && c < 0xE0)
4625 found = CATEGORY_MASK_SJIS;
4626 else
4627 break;
4628 }
4629 detect_info->rejected |= CATEGORY_MASK_SJIS;
4630 return 0;
4631
4632 no_more_source:
4633 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4634 {
4635 detect_info->rejected |= CATEGORY_MASK_SJIS;
4636 return 0;
4637 }
4638 detect_info->found |= found;
4639 return 1;
4640 }
4641
4642 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4643 Check if a text is encoded in BIG5. If it is, return
4644 CATEGORY_MASK_BIG5, else return 0. */
4645
4646 static int
4647 detect_coding_big5 (struct coding_system *coding,
4648 struct coding_detection_info *detect_info)
4649 {
4650 const unsigned char *src = coding->source, *src_base;
4651 const unsigned char *src_end = coding->source + coding->src_bytes;
4652 int multibytep = coding->src_multibyte;
4653 int consumed_chars = 0;
4654 int found = 0;
4655 int c;
4656
4657 detect_info->checked |= CATEGORY_MASK_BIG5;
4658 /* A coding system of this category is always ASCII compatible. */
4659 src += coding->head_ascii;
4660
4661 while (1)
4662 {
4663 src_base = src;
4664 ONE_MORE_BYTE (c);
4665 if (c < 0x80)
4666 continue;
4667 if (c >= 0xA1)
4668 {
4669 ONE_MORE_BYTE (c);
4670 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4671 return 0;
4672 found = CATEGORY_MASK_BIG5;
4673 }
4674 else
4675 break;
4676 }
4677 detect_info->rejected |= CATEGORY_MASK_BIG5;
4678 return 0;
4679
4680 no_more_source:
4681 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4682 {
4683 detect_info->rejected |= CATEGORY_MASK_BIG5;
4684 return 0;
4685 }
4686 detect_info->found |= found;
4687 return 1;
4688 }
4689
4690 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4691 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4692
4693 static void
4694 decode_coding_sjis (struct coding_system *coding)
4695 {
4696 const unsigned char *src = coding->source + coding->consumed;
4697 const unsigned char *src_end = coding->source + coding->src_bytes;
4698 const unsigned char *src_base;
4699 int *charbuf = coding->charbuf + coding->charbuf_used;
4700 /* We may produce one charset annotation in one loop and one more at
4701 the end. */
4702 int *charbuf_end
4703 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4704 int consumed_chars = 0, consumed_chars_base;
4705 int multibytep = coding->src_multibyte;
4706 struct charset *charset_roman, *charset_kanji, *charset_kana;
4707 struct charset *charset_kanji2;
4708 Lisp_Object attrs, charset_list, val;
4709 int char_offset = coding->produced_char;
4710 int last_offset = char_offset;
4711 int last_id = charset_ascii;
4712 int eol_crlf =
4713 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4714 int byte_after_cr = -1;
4715
4716 CODING_GET_INFO (coding, attrs, charset_list);
4717
4718 val = charset_list;
4719 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4720 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4721 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4722 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4723
4724 while (1)
4725 {
4726 int c, c1;
4727 struct charset *charset;
4728
4729 src_base = src;
4730 consumed_chars_base = consumed_chars;
4731
4732 if (charbuf >= charbuf_end)
4733 {
4734 if (byte_after_cr >= 0)
4735 src_base--;
4736 break;
4737 }
4738
4739 if (byte_after_cr >= 0)
4740 c = byte_after_cr, byte_after_cr = -1;
4741 else
4742 ONE_MORE_BYTE (c);
4743 if (c < 0)
4744 goto invalid_code;
4745 if (c < 0x80)
4746 {
4747 if (eol_crlf && c == '\r')
4748 ONE_MORE_BYTE (byte_after_cr);
4749 charset = charset_roman;
4750 }
4751 else if (c == 0x80 || c == 0xA0)
4752 goto invalid_code;
4753 else if (c >= 0xA1 && c <= 0xDF)
4754 {
4755 /* SJIS -> JISX0201-Kana */
4756 c &= 0x7F;
4757 charset = charset_kana;
4758 }
4759 else if (c <= 0xEF)
4760 {
4761 /* SJIS -> JISX0208 */
4762 ONE_MORE_BYTE (c1);
4763 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4764 goto invalid_code;
4765 c = (c << 8) | c1;
4766 SJIS_TO_JIS (c);
4767 charset = charset_kanji;
4768 }
4769 else if (c <= 0xFC && charset_kanji2)
4770 {
4771 /* SJIS -> JISX0213-2 */
4772 ONE_MORE_BYTE (c1);
4773 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4774 goto invalid_code;
4775 c = (c << 8) | c1;
4776 SJIS_TO_JIS2 (c);
4777 charset = charset_kanji2;
4778 }
4779 else
4780 goto invalid_code;
4781 if (charset->id != charset_ascii
4782 && last_id != charset->id)
4783 {
4784 if (last_id != charset_ascii)
4785 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4786 last_id = charset->id;
4787 last_offset = char_offset;
4788 }
4789 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4790 *charbuf++ = c;
4791 char_offset++;
4792 continue;
4793
4794 invalid_code:
4795 src = src_base;
4796 consumed_chars = consumed_chars_base;
4797 ONE_MORE_BYTE (c);
4798 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4799 char_offset++;
4800 coding->errors++;
4801 }
4802
4803 no_more_source:
4804 if (last_id != charset_ascii)
4805 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4806 coding->consumed_char += consumed_chars_base;
4807 coding->consumed = src_base - coding->source;
4808 coding->charbuf_used = charbuf - coding->charbuf;
4809 }
4810
4811 static void
4812 decode_coding_big5 (struct coding_system *coding)
4813 {
4814 const unsigned char *src = coding->source + coding->consumed;
4815 const unsigned char *src_end = coding->source + coding->src_bytes;
4816 const unsigned char *src_base;
4817 int *charbuf = coding->charbuf + coding->charbuf_used;
4818 /* We may produce one charset annotation in one loop and one more at
4819 the end. */
4820 int *charbuf_end
4821 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4822 int consumed_chars = 0, consumed_chars_base;
4823 int multibytep = coding->src_multibyte;
4824 struct charset *charset_roman, *charset_big5;
4825 Lisp_Object attrs, charset_list, val;
4826 int char_offset = coding->produced_char;
4827 int last_offset = char_offset;
4828 int last_id = charset_ascii;
4829 int eol_crlf =
4830 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4831 int byte_after_cr = -1;
4832
4833 CODING_GET_INFO (coding, attrs, charset_list);
4834 val = charset_list;
4835 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4836 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4837
4838 while (1)
4839 {
4840 int c, c1;
4841 struct charset *charset;
4842
4843 src_base = src;
4844 consumed_chars_base = consumed_chars;
4845
4846 if (charbuf >= charbuf_end)
4847 {
4848 if (byte_after_cr >= 0)
4849 src_base--;
4850 break;
4851 }
4852
4853 if (byte_after_cr >= 0)
4854 c = byte_after_cr, byte_after_cr = -1;
4855 else
4856 ONE_MORE_BYTE (c);
4857
4858 if (c < 0)
4859 goto invalid_code;
4860 if (c < 0x80)
4861 {
4862 if (eol_crlf && c == '\r')
4863 ONE_MORE_BYTE (byte_after_cr);
4864 charset = charset_roman;
4865 }
4866 else
4867 {
4868 /* BIG5 -> Big5 */
4869 if (c < 0xA1 || c > 0xFE)
4870 goto invalid_code;
4871 ONE_MORE_BYTE (c1);
4872 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4873 goto invalid_code;
4874 c = c << 8 | c1;
4875 charset = charset_big5;
4876 }
4877 if (charset->id != charset_ascii
4878 && last_id != charset->id)
4879 {
4880 if (last_id != charset_ascii)
4881 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4882 last_id = charset->id;
4883 last_offset = char_offset;
4884 }
4885 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4886 *charbuf++ = c;
4887 char_offset++;
4888 continue;
4889
4890 invalid_code:
4891 src = src_base;
4892 consumed_chars = consumed_chars_base;
4893 ONE_MORE_BYTE (c);
4894 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4895 char_offset++;
4896 coding->errors++;
4897 }
4898
4899 no_more_source:
4900 if (last_id != charset_ascii)
4901 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4902 coding->consumed_char += consumed_chars_base;
4903 coding->consumed = src_base - coding->source;
4904 coding->charbuf_used = charbuf - coding->charbuf;
4905 }
4906
4907 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4908 This function can encode charsets `ascii', `katakana-jisx0201',
4909 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4910 are sure that all these charsets are registered as official charset
4911 (i.e. do not have extended leading-codes). Characters of other
4912 charsets are produced without any encoding. If SJIS_P is 1, encode
4913 SJIS text, else encode BIG5 text. */
4914
4915 static int
4916 encode_coding_sjis (struct coding_system *coding)
4917 {
4918 int multibytep = coding->dst_multibyte;
4919 int *charbuf = coding->charbuf;
4920 int *charbuf_end = charbuf + coding->charbuf_used;
4921 unsigned char *dst = coding->destination + coding->produced;
4922 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4923 int safe_room = 4;
4924 int produced_chars = 0;
4925 Lisp_Object attrs, charset_list, val;
4926 int ascii_compatible;
4927 struct charset *charset_roman, *charset_kanji, *charset_kana;
4928 struct charset *charset_kanji2;
4929 int c;
4930
4931 CODING_GET_INFO (coding, attrs, charset_list);
4932 val = charset_list;
4933 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4934 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4935 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4936 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4937
4938 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4939
4940 while (charbuf < charbuf_end)
4941 {
4942 ASSURE_DESTINATION (safe_room);
4943 c = *charbuf++;
4944 /* Now encode the character C. */
4945 if (ASCII_CHAR_P (c) && ascii_compatible)
4946 EMIT_ONE_ASCII_BYTE (c);
4947 else if (CHAR_BYTE8_P (c))
4948 {
4949 c = CHAR_TO_BYTE8 (c);
4950 EMIT_ONE_BYTE (c);
4951 }
4952 else
4953 {
4954 unsigned code;
4955 struct charset *charset = char_charset (c, charset_list, &code);
4956
4957 if (!charset)
4958 {
4959 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4960 {
4961 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4962 charset = CHARSET_FROM_ID (charset_ascii);
4963 }
4964 else
4965 {
4966 c = coding->default_char;
4967 charset = char_charset (c, charset_list, &code);
4968 }
4969 }
4970 if (code == CHARSET_INVALID_CODE (charset))
4971 abort ();
4972 if (charset == charset_kanji)
4973 {
4974 int c1, c2;
4975 JIS_TO_SJIS (code);
4976 c1 = code >> 8, c2 = code & 0xFF;
4977 EMIT_TWO_BYTES (c1, c2);
4978 }
4979 else if (charset == charset_kana)
4980 EMIT_ONE_BYTE (code | 0x80);
4981 else if (charset_kanji2 && charset == charset_kanji2)
4982 {
4983 int c1, c2;
4984
4985 c1 = code >> 8;
4986 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4987 || c1 == 0x28
4988 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4989 {
4990 JIS_TO_SJIS2 (code);
4991 c1 = code >> 8, c2 = code & 0xFF;
4992 EMIT_TWO_BYTES (c1, c2);
4993 }
4994 else
4995 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4996 }
4997 else
4998 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4999 }
5000 }
5001 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5002 coding->produced_char += produced_chars;
5003 coding->produced = dst - coding->destination;
5004 return 0;
5005 }
5006
5007 static int
5008 encode_coding_big5 (struct coding_system *coding)
5009 {
5010 int multibytep = coding->dst_multibyte;
5011 int *charbuf = coding->charbuf;
5012 int *charbuf_end = charbuf + coding->charbuf_used;
5013 unsigned char *dst = coding->destination + coding->produced;
5014 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5015 int safe_room = 4;
5016 int produced_chars = 0;
5017 Lisp_Object attrs, charset_list, val;
5018 int ascii_compatible;
5019 struct charset *charset_roman, *charset_big5;
5020 int c;
5021
5022 CODING_GET_INFO (coding, attrs, charset_list);
5023 val = charset_list;
5024 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5025 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5026 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5027
5028 while (charbuf < charbuf_end)
5029 {
5030 ASSURE_DESTINATION (safe_room);
5031 c = *charbuf++;
5032 /* Now encode the character C. */
5033 if (ASCII_CHAR_P (c) && ascii_compatible)
5034 EMIT_ONE_ASCII_BYTE (c);
5035 else if (CHAR_BYTE8_P (c))
5036 {
5037 c = CHAR_TO_BYTE8 (c);
5038 EMIT_ONE_BYTE (c);
5039 }
5040 else
5041 {
5042 unsigned code;
5043 struct charset *charset = char_charset (c, charset_list, &code);
5044
5045 if (! charset)
5046 {
5047 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5048 {
5049 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5050 charset = CHARSET_FROM_ID (charset_ascii);
5051 }
5052 else
5053 {
5054 c = coding->default_char;
5055 charset = char_charset (c, charset_list, &code);
5056 }
5057 }
5058 if (code == CHARSET_INVALID_CODE (charset))
5059 abort ();
5060 if (charset == charset_big5)
5061 {
5062 int c1, c2;
5063
5064 c1 = code >> 8, c2 = code & 0xFF;
5065 EMIT_TWO_BYTES (c1, c2);
5066 }
5067 else
5068 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5069 }
5070 }
5071 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5072 coding->produced_char += produced_chars;
5073 coding->produced = dst - coding->destination;
5074 return 0;
5075 }
5076
5077 \f
5078 /*** 10. CCL handlers ***/
5079
5080 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5081 Check if a text is encoded in a coding system of which
5082 encoder/decoder are written in CCL program. If it is, return
5083 CATEGORY_MASK_CCL, else return 0. */
5084
5085 static int
5086 detect_coding_ccl (struct coding_system *coding,
5087 struct coding_detection_info *detect_info)
5088 {
5089 const unsigned char *src = coding->source, *src_base;
5090 const unsigned char *src_end = coding->source + coding->src_bytes;
5091 int multibytep = coding->src_multibyte;
5092 int consumed_chars = 0;
5093 int found = 0;
5094 unsigned char *valids;
5095 int head_ascii = coding->head_ascii;
5096 Lisp_Object attrs;
5097
5098 detect_info->checked |= CATEGORY_MASK_CCL;
5099
5100 coding = &coding_categories[coding_category_ccl];
5101 valids = CODING_CCL_VALIDS (coding);
5102 attrs = CODING_ID_ATTRS (coding->id);
5103 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5104 src += head_ascii;
5105
5106 while (1)
5107 {
5108 int c;
5109
5110 src_base = src;
5111 ONE_MORE_BYTE (c);
5112 if (c < 0 || ! valids[c])
5113 break;
5114 if ((valids[c] > 1))
5115 found = CATEGORY_MASK_CCL;
5116 }
5117 detect_info->rejected |= CATEGORY_MASK_CCL;
5118 return 0;
5119
5120 no_more_source:
5121 detect_info->found |= found;
5122 return 1;
5123 }
5124
5125 static void
5126 decode_coding_ccl (struct coding_system *coding)
5127 {
5128 const unsigned char *src = coding->source + coding->consumed;
5129 const unsigned char *src_end = coding->source + coding->src_bytes;
5130 int *charbuf = coding->charbuf + coding->charbuf_used;
5131 int *charbuf_end = coding->charbuf + coding->charbuf_size;
5132 int consumed_chars = 0;
5133 int multibytep = coding->src_multibyte;
5134 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5135 int source_charbuf[1024];
5136 int source_byteidx[1025];
5137 Lisp_Object attrs, charset_list;
5138
5139 CODING_GET_INFO (coding, attrs, charset_list);
5140
5141 while (1)
5142 {
5143 const unsigned char *p = src;
5144 int i = 0;
5145
5146 if (multibytep)
5147 {
5148 while (i < 1024 && p < src_end)
5149 {
5150 source_byteidx[i] = p - src;
5151 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5152 }
5153 source_byteidx[i] = p - src;
5154 }
5155 else
5156 while (i < 1024 && p < src_end)
5157 source_charbuf[i++] = *p++;
5158
5159 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5160 ccl->last_block = 1;
5161 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5162 charset_list);
5163 charbuf += ccl->produced;
5164 if (multibytep)
5165 src += source_byteidx[ccl->consumed];
5166 else
5167 src += ccl->consumed;
5168 consumed_chars += ccl->consumed;
5169 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5170 break;
5171 }
5172
5173 switch (ccl->status)
5174 {
5175 case CCL_STAT_SUSPEND_BY_SRC:
5176 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5177 break;
5178 case CCL_STAT_SUSPEND_BY_DST:
5179 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5180 break;
5181 case CCL_STAT_QUIT:
5182 case CCL_STAT_INVALID_CMD:
5183 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5184 break;
5185 default:
5186 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5187 break;
5188 }
5189 coding->consumed_char += consumed_chars;
5190 coding->consumed = src - coding->source;
5191 coding->charbuf_used = charbuf - coding->charbuf;
5192 }
5193
5194 static int
5195 encode_coding_ccl (struct coding_system *coding)
5196 {
5197 struct ccl_program *ccl = &coding->spec.ccl->ccl;
5198 int multibytep = coding->dst_multibyte;
5199 int *charbuf = coding->charbuf;
5200 int *charbuf_end = charbuf + coding->charbuf_used;
5201 unsigned char *dst = coding->destination + coding->produced;
5202 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5203 int destination_charbuf[1024];
5204 int i, produced_chars = 0;
5205 Lisp_Object attrs, charset_list;
5206
5207 CODING_GET_INFO (coding, attrs, charset_list);
5208 if (coding->consumed_char == coding->src_chars
5209 && coding->mode & CODING_MODE_LAST_BLOCK)
5210 ccl->last_block = 1;
5211
5212 while (charbuf < charbuf_end)
5213 {
5214 ccl_driver (ccl, charbuf, destination_charbuf,
5215 charbuf_end - charbuf, 1024, charset_list);
5216 if (multibytep)
5217 {
5218 ASSURE_DESTINATION (ccl->produced * 2);
5219 for (i = 0; i < ccl->produced; i++)
5220 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5221 }
5222 else
5223 {
5224 ASSURE_DESTINATION (ccl->produced);
5225 for (i = 0; i < ccl->produced; i++)
5226 *dst++ = destination_charbuf[i] & 0xFF;
5227 produced_chars += ccl->produced;
5228 }
5229 charbuf += ccl->consumed;
5230 if (ccl->status == CCL_STAT_QUIT
5231 || ccl->status == CCL_STAT_INVALID_CMD)
5232 break;
5233 }
5234
5235 switch (ccl->status)
5236 {
5237 case CCL_STAT_SUSPEND_BY_SRC:
5238 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5239 break;
5240 case CCL_STAT_SUSPEND_BY_DST:
5241 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5242 break;
5243 case CCL_STAT_QUIT:
5244 case CCL_STAT_INVALID_CMD:
5245 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5246 break;
5247 default:
5248 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5249 break;
5250 }
5251
5252 coding->produced_char += produced_chars;
5253 coding->produced = dst - coding->destination;
5254 return 0;
5255 }
5256
5257
5258 \f
5259 /*** 10, 11. no-conversion handlers ***/
5260
5261 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
5262
5263 static void
5264 decode_coding_raw_text (struct coding_system *coding)
5265 {
5266 int eol_crlf =
5267 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5268
5269 coding->chars_at_source = 1;
5270 coding->consumed_char = coding->src_chars;
5271 coding->consumed = coding->src_bytes;
5272 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5273 {
5274 coding->consumed_char--;
5275 coding->consumed--;
5276 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5277 }
5278 else
5279 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5280 }
5281
5282 static int
5283 encode_coding_raw_text (struct coding_system *coding)
5284 {
5285 int multibytep = coding->dst_multibyte;
5286 int *charbuf = coding->charbuf;
5287 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5288 unsigned char *dst = coding->destination + coding->produced;
5289 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5290 int produced_chars = 0;
5291 int c;
5292
5293 if (multibytep)
5294 {
5295 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5296
5297 if (coding->src_multibyte)
5298 while (charbuf < charbuf_end)
5299 {
5300 ASSURE_DESTINATION (safe_room);
5301 c = *charbuf++;
5302 if (ASCII_CHAR_P (c))
5303 EMIT_ONE_ASCII_BYTE (c);
5304 else if (CHAR_BYTE8_P (c))
5305 {
5306 c = CHAR_TO_BYTE8 (c);
5307 EMIT_ONE_BYTE (c);
5308 }
5309 else
5310 {
5311 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5312
5313 CHAR_STRING_ADVANCE (c, p1);
5314 while (p0 < p1)
5315 {
5316 EMIT_ONE_BYTE (*p0);
5317 p0++;
5318 }
5319 }
5320 }
5321 else
5322 while (charbuf < charbuf_end)
5323 {
5324 ASSURE_DESTINATION (safe_room);
5325 c = *charbuf++;
5326 EMIT_ONE_BYTE (c);
5327 }
5328 }
5329 else
5330 {
5331 if (coding->src_multibyte)
5332 {
5333 int safe_room = MAX_MULTIBYTE_LENGTH;
5334
5335 while (charbuf < charbuf_end)
5336 {
5337 ASSURE_DESTINATION (safe_room);
5338 c = *charbuf++;
5339 if (ASCII_CHAR_P (c))
5340 *dst++ = c;
5341 else if (CHAR_BYTE8_P (c))
5342 *dst++ = CHAR_TO_BYTE8 (c);
5343 else
5344 CHAR_STRING_ADVANCE (c, dst);
5345 }
5346 }
5347 else
5348 {
5349 ASSURE_DESTINATION (charbuf_end - charbuf);
5350 while (charbuf < charbuf_end && dst < dst_end)
5351 *dst++ = *charbuf++;
5352 }
5353 produced_chars = dst - (coding->destination + coding->produced);
5354 }
5355 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5356 coding->produced_char += produced_chars;
5357 coding->produced = dst - coding->destination;
5358 return 0;
5359 }
5360
5361 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5362 Check if a text is encoded in a charset-based coding system. If it
5363 is, return 1, else return 0. */
5364
5365 static int
5366 detect_coding_charset (struct coding_system *coding,
5367 struct coding_detection_info *detect_info)
5368 {
5369 const unsigned char *src = coding->source, *src_base;
5370 const unsigned char *src_end = coding->source + coding->src_bytes;
5371 int multibytep = coding->src_multibyte;
5372 int consumed_chars = 0;
5373 Lisp_Object attrs, valids, name;
5374 int found = 0;
5375 int head_ascii = coding->head_ascii;
5376 int check_latin_extra = 0;
5377
5378 detect_info->checked |= CATEGORY_MASK_CHARSET;
5379
5380 coding = &coding_categories[coding_category_charset];
5381 attrs = CODING_ID_ATTRS (coding->id);
5382 valids = AREF (attrs, coding_attr_charset_valids);
5383 name = CODING_ID_NAME (coding->id);
5384 if (strncmp (SSDATA (SYMBOL_NAME (name)),
5385 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5386 || strncmp (SSDATA (SYMBOL_NAME (name)),
5387 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5388 check_latin_extra = 1;
5389
5390 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5391 src += head_ascii;
5392
5393 while (1)
5394 {
5395 int c;
5396 Lisp_Object val;
5397 struct charset *charset;
5398 int dim, idx;
5399
5400 src_base = src;
5401 ONE_MORE_BYTE (c);
5402 if (c < 0)
5403 continue;
5404 val = AREF (valids, c);
5405 if (NILP (val))
5406 break;
5407 if (c >= 0x80)
5408 {
5409 if (c < 0xA0
5410 && check_latin_extra
5411 && (!VECTORP (Vlatin_extra_code_table)
5412 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5413 break;
5414 found = CATEGORY_MASK_CHARSET;
5415 }
5416 if (INTEGERP (val))
5417 {
5418 charset = CHARSET_FROM_ID (XFASTINT (val));
5419 dim = CHARSET_DIMENSION (charset);
5420 for (idx = 1; idx < dim; idx++)
5421 {
5422 if (src == src_end)
5423 goto too_short;
5424 ONE_MORE_BYTE (c);
5425 if (c < charset->code_space[(dim - 1 - idx) * 2]
5426 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5427 break;
5428 }
5429 if (idx < dim)
5430 break;
5431 }
5432 else
5433 {
5434 idx = 1;
5435 for (; CONSP (val); val = XCDR (val))
5436 {
5437 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5438 dim = CHARSET_DIMENSION (charset);
5439 while (idx < dim)
5440 {
5441 if (src == src_end)
5442 goto too_short;
5443 ONE_MORE_BYTE (c);
5444 if (c < charset->code_space[(dim - 1 - idx) * 4]
5445 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5446 break;
5447 idx++;
5448 }
5449 if (idx == dim)
5450 {
5451 val = Qnil;
5452 break;
5453 }
5454 }
5455 if (CONSP (val))
5456 break;
5457 }
5458 }
5459 too_short:
5460 detect_info->rejected |= CATEGORY_MASK_CHARSET;
5461 return 0;
5462
5463 no_more_source:
5464 detect_info->found |= found;
5465 return 1;
5466 }
5467
5468 static void
5469 decode_coding_charset (struct coding_system *coding)
5470 {
5471 const unsigned char *src = coding->source + coding->consumed;
5472 const unsigned char *src_end = coding->source + coding->src_bytes;
5473 const unsigned char *src_base;
5474 int *charbuf = coding->charbuf + coding->charbuf_used;
5475 /* We may produce one charset annotation in one loop and one more at
5476 the end. */
5477 int *charbuf_end
5478 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5479 int consumed_chars = 0, consumed_chars_base;
5480 int multibytep = coding->src_multibyte;
5481 Lisp_Object attrs, charset_list, valids;
5482 int char_offset = coding->produced_char;
5483 int last_offset = char_offset;
5484 int last_id = charset_ascii;
5485 int eol_crlf =
5486 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5487 int byte_after_cr = -1;
5488
5489 CODING_GET_INFO (coding, attrs, charset_list);
5490 valids = AREF (attrs, coding_attr_charset_valids);
5491
5492 while (1)
5493 {
5494 int c;
5495 Lisp_Object val;
5496 struct charset *charset;
5497 int dim;
5498 int len = 1;
5499 unsigned code;
5500
5501 src_base = src;
5502 consumed_chars_base = consumed_chars;
5503
5504 if (charbuf >= charbuf_end)
5505 {
5506 if (byte_after_cr >= 0)
5507 src_base--;
5508 break;
5509 }
5510
5511 if (byte_after_cr >= 0)
5512 {
5513 c = byte_after_cr;
5514 byte_after_cr = -1;
5515 }
5516 else
5517 {
5518 ONE_MORE_BYTE (c);
5519 if (eol_crlf && c == '\r')
5520 ONE_MORE_BYTE (byte_after_cr);
5521 }
5522 if (c < 0)
5523 goto invalid_code;
5524 code = c;
5525
5526 val = AREF (valids, c);
5527 if (! INTEGERP (val) && ! CONSP (val))
5528 goto invalid_code;
5529 if (INTEGERP (val))
5530 {
5531 charset = CHARSET_FROM_ID (XFASTINT (val));
5532 dim = CHARSET_DIMENSION (charset);
5533 while (len < dim)
5534 {
5535 ONE_MORE_BYTE (c);
5536 code = (code << 8) | c;
5537 len++;
5538 }
5539 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5540 charset, code, c);
5541 }
5542 else
5543 {
5544 /* VAL is a list of charset IDs. It is assured that the
5545 list is sorted by charset dimensions (smaller one
5546 comes first). */
5547 while (CONSP (val))
5548 {
5549 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5550 dim = CHARSET_DIMENSION (charset);
5551 while (len < dim)
5552 {
5553 ONE_MORE_BYTE (c);
5554 code = (code << 8) | c;
5555 len++;
5556 }
5557 CODING_DECODE_CHAR (coding, src, src_base,
5558 src_end, charset, code, c);
5559 if (c >= 0)
5560 break;
5561 val = XCDR (val);
5562 }
5563 }
5564 if (c < 0)
5565 goto invalid_code;
5566 if (charset->id != charset_ascii
5567 && last_id != charset->id)
5568 {
5569 if (last_id != charset_ascii)
5570 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5571 last_id = charset->id;
5572 last_offset = char_offset;
5573 }
5574
5575 *charbuf++ = c;
5576 char_offset++;
5577 continue;
5578
5579 invalid_code:
5580 src = src_base;
5581 consumed_chars = consumed_chars_base;
5582 ONE_MORE_BYTE (c);
5583 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5584 char_offset++;
5585 coding->errors++;
5586 }
5587
5588 no_more_source:
5589 if (last_id != charset_ascii)
5590 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5591 coding->consumed_char += consumed_chars_base;
5592 coding->consumed = src_base - coding->source;
5593 coding->charbuf_used = charbuf - coding->charbuf;
5594 }
5595
5596 static int
5597 encode_coding_charset (struct coding_system *coding)
5598 {
5599 int multibytep = coding->dst_multibyte;
5600 int *charbuf = coding->charbuf;
5601 int *charbuf_end = charbuf + coding->charbuf_used;
5602 unsigned char *dst = coding->destination + coding->produced;
5603 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5604 int safe_room = MAX_MULTIBYTE_LENGTH;
5605 int produced_chars = 0;
5606 Lisp_Object attrs, charset_list;
5607 int ascii_compatible;
5608 int c;
5609
5610 CODING_GET_INFO (coding, attrs, charset_list);
5611 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5612
5613 while (charbuf < charbuf_end)
5614 {
5615 struct charset *charset;
5616 unsigned code;
5617
5618 ASSURE_DESTINATION (safe_room);
5619 c = *charbuf++;
5620 if (ascii_compatible && ASCII_CHAR_P (c))
5621 EMIT_ONE_ASCII_BYTE (c);
5622 else if (CHAR_BYTE8_P (c))
5623 {
5624 c = CHAR_TO_BYTE8 (c);
5625 EMIT_ONE_BYTE (c);
5626 }
5627 else
5628 {
5629 charset = char_charset (c, charset_list, &code);
5630 if (charset)
5631 {
5632 if (CHARSET_DIMENSION (charset) == 1)
5633 EMIT_ONE_BYTE (code);
5634 else if (CHARSET_DIMENSION (charset) == 2)
5635 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5636 else if (CHARSET_DIMENSION (charset) == 3)
5637 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5638 else
5639 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5640 (code >> 8) & 0xFF, code & 0xFF);
5641 }
5642 else
5643 {
5644 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5645 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5646 else
5647 c = coding->default_char;
5648 EMIT_ONE_BYTE (c);
5649 }
5650 }
5651 }
5652
5653 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5654 coding->produced_char += produced_chars;
5655 coding->produced = dst - coding->destination;
5656 return 0;
5657 }
5658
5659 \f
5660 /*** 7. C library functions ***/
5661
5662 /* Setup coding context CODING from information about CODING_SYSTEM.
5663 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5664 CODING_SYSTEM is invalid, signal an error. */
5665
5666 void
5667 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5668 {
5669 Lisp_Object attrs;
5670 Lisp_Object eol_type;
5671 Lisp_Object coding_type;
5672 Lisp_Object val;
5673
5674 if (NILP (coding_system))
5675 coding_system = Qundecided;
5676
5677 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5678
5679 attrs = CODING_ID_ATTRS (coding->id);
5680 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5681
5682 coding->mode = 0;
5683 coding->head_ascii = -1;
5684 if (VECTORP (eol_type))
5685 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5686 | CODING_REQUIRE_DETECTION_MASK);
5687 else if (! EQ (eol_type, Qunix))
5688 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5689 | CODING_REQUIRE_ENCODING_MASK);
5690 else
5691 coding->common_flags = 0;
5692 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5693 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5694 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5695 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5696 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5697 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5698
5699 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5700 coding->max_charset_id = SCHARS (val) - 1;
5701 coding->safe_charsets = SDATA (val);
5702 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5703 coding->carryover_bytes = 0;
5704
5705 coding_type = CODING_ATTR_TYPE (attrs);
5706 if (EQ (coding_type, Qundecided))
5707 {
5708 coding->detector = NULL;
5709 coding->decoder = decode_coding_raw_text;
5710 coding->encoder = encode_coding_raw_text;
5711 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5712 }
5713 else if (EQ (coding_type, Qiso_2022))
5714 {
5715 int i;
5716 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5717
5718 /* Invoke graphic register 0 to plane 0. */
5719 CODING_ISO_INVOCATION (coding, 0) = 0;
5720 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5721 CODING_ISO_INVOCATION (coding, 1)
5722 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5723 /* Setup the initial status of designation. */
5724 for (i = 0; i < 4; i++)
5725 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5726 /* Not single shifting initially. */
5727 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5728 /* Beginning of buffer should also be regarded as bol. */
5729 CODING_ISO_BOL (coding) = 1;
5730 coding->detector = detect_coding_iso_2022;
5731 coding->decoder = decode_coding_iso_2022;
5732 coding->encoder = encode_coding_iso_2022;
5733 if (flags & CODING_ISO_FLAG_SAFE)
5734 coding->mode |= CODING_MODE_SAFE_ENCODING;
5735 coding->common_flags
5736 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5737 | CODING_REQUIRE_FLUSHING_MASK);
5738 if (flags & CODING_ISO_FLAG_COMPOSITION)
5739 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5740 if (flags & CODING_ISO_FLAG_DESIGNATION)
5741 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5742 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5743 {
5744 setup_iso_safe_charsets (attrs);
5745 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5746 coding->max_charset_id = SCHARS (val) - 1;
5747 coding->safe_charsets = SDATA (val);
5748 }
5749 CODING_ISO_FLAGS (coding) = flags;
5750 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5751 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5752 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5753 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5754 }
5755 else if (EQ (coding_type, Qcharset))
5756 {
5757 coding->detector = detect_coding_charset;
5758 coding->decoder = decode_coding_charset;
5759 coding->encoder = encode_coding_charset;
5760 coding->common_flags
5761 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5762 }
5763 else if (EQ (coding_type, Qutf_8))
5764 {
5765 val = AREF (attrs, coding_attr_utf_bom);
5766 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5767 : EQ (val, Qt) ? utf_with_bom
5768 : utf_without_bom);
5769 coding->detector = detect_coding_utf_8;
5770 coding->decoder = decode_coding_utf_8;
5771 coding->encoder = encode_coding_utf_8;
5772 coding->common_flags
5773 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5774 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5775 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5776 }
5777 else if (EQ (coding_type, Qutf_16))
5778 {
5779 val = AREF (attrs, coding_attr_utf_bom);
5780 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5781 : EQ (val, Qt) ? utf_with_bom
5782 : utf_without_bom);
5783 val = AREF (attrs, coding_attr_utf_16_endian);
5784 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5785 : utf_16_little_endian);
5786 CODING_UTF_16_SURROGATE (coding) = 0;
5787 coding->detector = detect_coding_utf_16;
5788 coding->decoder = decode_coding_utf_16;
5789 coding->encoder = encode_coding_utf_16;
5790 coding->common_flags
5791 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5792 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5793 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5794 }
5795 else if (EQ (coding_type, Qccl))
5796 {
5797 coding->detector = detect_coding_ccl;
5798 coding->decoder = decode_coding_ccl;
5799 coding->encoder = encode_coding_ccl;
5800 coding->common_flags
5801 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5802 | CODING_REQUIRE_FLUSHING_MASK);
5803 }
5804 else if (EQ (coding_type, Qemacs_mule))
5805 {
5806 coding->detector = detect_coding_emacs_mule;
5807 coding->decoder = decode_coding_emacs_mule;
5808 coding->encoder = encode_coding_emacs_mule;
5809 coding->common_flags
5810 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5811 coding->spec.emacs_mule.full_support = 1;
5812 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5813 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5814 {
5815 Lisp_Object tail, safe_charsets;
5816 int max_charset_id = 0;
5817
5818 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5819 tail = XCDR (tail))
5820 if (max_charset_id < XFASTINT (XCAR (tail)))
5821 max_charset_id = XFASTINT (XCAR (tail));
5822 safe_charsets = make_uninit_string (max_charset_id + 1);
5823 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5824 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5825 tail = XCDR (tail))
5826 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5827 coding->max_charset_id = max_charset_id;
5828 coding->safe_charsets = SDATA (safe_charsets);
5829 coding->spec.emacs_mule.full_support = 1;
5830 }
5831 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5832 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5833 }
5834 else if (EQ (coding_type, Qshift_jis))
5835 {
5836 coding->detector = detect_coding_sjis;
5837 coding->decoder = decode_coding_sjis;
5838 coding->encoder = encode_coding_sjis;
5839 coding->common_flags
5840 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5841 }
5842 else if (EQ (coding_type, Qbig5))
5843 {
5844 coding->detector = detect_coding_big5;
5845 coding->decoder = decode_coding_big5;
5846 coding->encoder = encode_coding_big5;
5847 coding->common_flags
5848 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5849 }
5850 else /* EQ (coding_type, Qraw_text) */
5851 {
5852 coding->detector = NULL;
5853 coding->decoder = decode_coding_raw_text;
5854 coding->encoder = encode_coding_raw_text;
5855 if (! EQ (eol_type, Qunix))
5856 {
5857 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5858 if (! VECTORP (eol_type))
5859 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5860 }
5861
5862 }
5863
5864 return;
5865 }
5866
5867 /* Return a list of charsets supported by CODING. */
5868
5869 Lisp_Object
5870 coding_charset_list (struct coding_system *coding)
5871 {
5872 Lisp_Object attrs, charset_list;
5873
5874 CODING_GET_INFO (coding, attrs, charset_list);
5875 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5876 {
5877 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5878
5879 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5880 charset_list = Viso_2022_charset_list;
5881 }
5882 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5883 {
5884 charset_list = Vemacs_mule_charset_list;
5885 }
5886 return charset_list;
5887 }
5888
5889
5890 /* Return a list of charsets supported by CODING-SYSTEM. */
5891
5892 Lisp_Object
5893 coding_system_charset_list (Lisp_Object coding_system)
5894 {
5895 int id;
5896 Lisp_Object attrs, charset_list;
5897
5898 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5899 attrs = CODING_ID_ATTRS (id);
5900
5901 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5902 {
5903 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5904
5905 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5906 charset_list = Viso_2022_charset_list;
5907 else
5908 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5909 }
5910 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5911 {
5912 charset_list = Vemacs_mule_charset_list;
5913 }
5914 else
5915 {
5916 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5917 }
5918 return charset_list;
5919 }
5920
5921
5922 /* Return raw-text or one of its subsidiaries that has the same
5923 eol_type as CODING-SYSTEM. */
5924
5925 Lisp_Object
5926 raw_text_coding_system (Lisp_Object coding_system)
5927 {
5928 Lisp_Object spec, attrs;
5929 Lisp_Object eol_type, raw_text_eol_type;
5930
5931 if (NILP (coding_system))
5932 return Qraw_text;
5933 spec = CODING_SYSTEM_SPEC (coding_system);
5934 attrs = AREF (spec, 0);
5935
5936 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5937 return coding_system;
5938
5939 eol_type = AREF (spec, 2);
5940 if (VECTORP (eol_type))
5941 return Qraw_text;
5942 spec = CODING_SYSTEM_SPEC (Qraw_text);
5943 raw_text_eol_type = AREF (spec, 2);
5944 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5945 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5946 : AREF (raw_text_eol_type, 2));
5947 }
5948
5949
5950 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5951 the subsidiary that has the same eol-spec as PARENT (if it is not
5952 nil and specifies end-of-line format) or the system's setting
5953 (system_eol_type). */
5954
5955 Lisp_Object
5956 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5957 {
5958 Lisp_Object spec, eol_type;
5959
5960 if (NILP (coding_system))
5961 coding_system = Qraw_text;
5962 spec = CODING_SYSTEM_SPEC (coding_system);
5963 eol_type = AREF (spec, 2);
5964 if (VECTORP (eol_type))
5965 {
5966 Lisp_Object parent_eol_type;
5967
5968 if (! NILP (parent))
5969 {
5970 Lisp_Object parent_spec;
5971
5972 parent_spec = CODING_SYSTEM_SPEC (parent);
5973 parent_eol_type = AREF (parent_spec, 2);
5974 if (VECTORP (parent_eol_type))
5975 parent_eol_type = system_eol_type;
5976 }
5977 else
5978 parent_eol_type = system_eol_type;
5979 if (EQ (parent_eol_type, Qunix))
5980 coding_system = AREF (eol_type, 0);
5981 else if (EQ (parent_eol_type, Qdos))
5982 coding_system = AREF (eol_type, 1);
5983 else if (EQ (parent_eol_type, Qmac))
5984 coding_system = AREF (eol_type, 2);
5985 }
5986 return coding_system;
5987 }
5988
5989
5990 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5991 decided for writing to a process. If not, complement them, and
5992 return a new coding system. */
5993
5994 Lisp_Object
5995 complement_process_encoding_system (Lisp_Object coding_system)
5996 {
5997 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5998 Lisp_Object spec, attrs;
5999 int i;
6000
6001 for (i = 0; i < 3; i++)
6002 {
6003 if (i == 1)
6004 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6005 else if (i == 2)
6006 coding_system = preferred_coding_system ();
6007 spec = CODING_SYSTEM_SPEC (coding_system);
6008 if (NILP (spec))
6009 continue;
6010 attrs = AREF (spec, 0);
6011 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6012 coding_base = CODING_ATTR_BASE_NAME (attrs);
6013 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6014 eol_base = coding_system;
6015 if (! NILP (coding_base) && ! NILP (eol_base))
6016 break;
6017 }
6018
6019 if (i > 0)
6020 /* The original CODING_SYSTEM didn't specify text-conversion or
6021 eol-conversion. Be sure that we return a fully complemented
6022 coding system. */
6023 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6024 return coding_system;
6025 }
6026
6027
6028 /* Emacs has a mechanism to automatically detect a coding system if it
6029 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6030 it's impossible to distinguish some coding systems accurately
6031 because they use the same range of codes. So, at first, coding
6032 systems are categorized into 7, those are:
6033
6034 o coding-category-emacs-mule
6035
6036 The category for a coding system which has the same code range
6037 as Emacs' internal format. Assigned the coding-system (Lisp
6038 symbol) `emacs-mule' by default.
6039
6040 o coding-category-sjis
6041
6042 The category for a coding system which has the same code range
6043 as SJIS. Assigned the coding-system (Lisp
6044 symbol) `japanese-shift-jis' by default.
6045
6046 o coding-category-iso-7
6047
6048 The category for a coding system which has the same code range
6049 as ISO2022 of 7-bit environment. This doesn't use any locking
6050 shift and single shift functions. This can encode/decode all
6051 charsets. Assigned the coding-system (Lisp symbol)
6052 `iso-2022-7bit' by default.
6053
6054 o coding-category-iso-7-tight
6055
6056 Same as coding-category-iso-7 except that this can
6057 encode/decode only the specified charsets.
6058
6059 o coding-category-iso-8-1
6060
6061 The category for a coding system which has the same code range
6062 as ISO2022 of 8-bit environment and graphic plane 1 used only
6063 for DIMENSION1 charset. This doesn't use any locking shift
6064 and single shift functions. Assigned the coding-system (Lisp
6065 symbol) `iso-latin-1' by default.
6066
6067 o coding-category-iso-8-2
6068
6069 The category for a coding system which has the same code range
6070 as ISO2022 of 8-bit environment and graphic plane 1 used only
6071 for DIMENSION2 charset. This doesn't use any locking shift
6072 and single shift functions. Assigned the coding-system (Lisp
6073 symbol) `japanese-iso-8bit' by default.
6074
6075 o coding-category-iso-7-else
6076
6077 The category for a coding system which has the same code range
6078 as ISO2022 of 7-bit environment but uses locking shift or
6079 single shift functions. Assigned the coding-system (Lisp
6080 symbol) `iso-2022-7bit-lock' by default.
6081
6082 o coding-category-iso-8-else
6083
6084 The category for a coding system which has the same code range
6085 as ISO2022 of 8-bit environment but uses locking shift or
6086 single shift functions. Assigned the coding-system (Lisp
6087 symbol) `iso-2022-8bit-ss2' by default.
6088
6089 o coding-category-big5
6090
6091 The category for a coding system which has the same code range
6092 as BIG5. Assigned the coding-system (Lisp symbol)
6093 `cn-big5' by default.
6094
6095 o coding-category-utf-8
6096
6097 The category for a coding system which has the same code range
6098 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
6099 symbol) `utf-8' by default.
6100
6101 o coding-category-utf-16-be
6102
6103 The category for a coding system in which a text has an
6104 Unicode signature (cf. Unicode Standard) in the order of BIG
6105 endian at the head. Assigned the coding-system (Lisp symbol)
6106 `utf-16-be' by default.
6107
6108 o coding-category-utf-16-le
6109
6110 The category for a coding system in which a text has an
6111 Unicode signature (cf. Unicode Standard) in the order of
6112 LITTLE endian at the head. Assigned the coding-system (Lisp
6113 symbol) `utf-16-le' by default.
6114
6115 o coding-category-ccl
6116
6117 The category for a coding system of which encoder/decoder is
6118 written in CCL programs. The default value is nil, i.e., no
6119 coding system is assigned.
6120
6121 o coding-category-binary
6122
6123 The category for a coding system not categorized in any of the
6124 above. Assigned the coding-system (Lisp symbol)
6125 `no-conversion' by default.
6126
6127 Each of them is a Lisp symbol and the value is an actual
6128 `coding-system's (this is also a Lisp symbol) assigned by a user.
6129 What Emacs does actually is to detect a category of coding system.
6130 Then, it uses a `coding-system' assigned to it. If Emacs can't
6131 decide only one possible category, it selects a category of the
6132 highest priority. Priorities of categories are also specified by a
6133 user in a Lisp variable `coding-category-list'.
6134
6135 */
6136
6137 #define EOL_SEEN_NONE 0
6138 #define EOL_SEEN_LF 1
6139 #define EOL_SEEN_CR 2
6140 #define EOL_SEEN_CRLF 4
6141
6142 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6143 SOURCE is encoded. If CATEGORY is one of
6144 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6145 two-byte, else they are encoded by one-byte.
6146
6147 Return one of EOL_SEEN_XXX. */
6148
6149 #define MAX_EOL_CHECK_COUNT 3
6150
6151 static int
6152 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6153 enum coding_category category)
6154 {
6155 const unsigned char *src = source, *src_end = src + src_bytes;
6156 unsigned char c;
6157 int total = 0;
6158 int eol_seen = EOL_SEEN_NONE;
6159
6160 if ((1 << category) & CATEGORY_MASK_UTF_16)
6161 {
6162 int msb, lsb;
6163
6164 msb = category == (coding_category_utf_16_le
6165 | coding_category_utf_16_le_nosig);
6166 lsb = 1 - msb;
6167
6168 while (src + 1 < src_end)
6169 {
6170 c = src[lsb];
6171 if (src[msb] == 0 && (c == '\n' || c == '\r'))
6172 {
6173 int this_eol;
6174
6175 if (c == '\n')
6176 this_eol = EOL_SEEN_LF;
6177 else if (src + 3 >= src_end
6178 || src[msb + 2] != 0
6179 || src[lsb + 2] != '\n')
6180 this_eol = EOL_SEEN_CR;
6181 else
6182 {
6183 this_eol = EOL_SEEN_CRLF;
6184 src += 2;
6185 }
6186
6187 if (eol_seen == EOL_SEEN_NONE)
6188 /* This is the first end-of-line. */
6189 eol_seen = this_eol;
6190 else if (eol_seen != this_eol)
6191 {
6192 /* The found type is different from what found before.
6193 Allow for stray ^M characters in DOS EOL files. */
6194 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6195 || (eol_seen == EOL_SEEN_CRLF
6196 && this_eol == EOL_SEEN_CR))
6197 eol_seen = EOL_SEEN_CRLF;
6198 else
6199 {
6200 eol_seen = EOL_SEEN_LF;
6201 break;
6202 }
6203 }
6204 if (++total == MAX_EOL_CHECK_COUNT)
6205 break;
6206 }
6207 src += 2;
6208 }
6209 }
6210 else
6211 while (src < src_end)
6212 {
6213 c = *src++;
6214 if (c == '\n' || c == '\r')
6215 {
6216 int this_eol;
6217
6218 if (c == '\n')
6219 this_eol = EOL_SEEN_LF;
6220 else if (src >= src_end || *src != '\n')
6221 this_eol = EOL_SEEN_CR;
6222 else
6223 this_eol = EOL_SEEN_CRLF, src++;
6224
6225 if (eol_seen == EOL_SEEN_NONE)
6226 /* This is the first end-of-line. */
6227 eol_seen = this_eol;
6228 else if (eol_seen != this_eol)
6229 {
6230 /* The found type is different from what found before.
6231 Allow for stray ^M characters in DOS EOL files. */
6232 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6233 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6234 eol_seen = EOL_SEEN_CRLF;
6235 else
6236 {
6237 eol_seen = EOL_SEEN_LF;
6238 break;
6239 }
6240 }
6241 if (++total == MAX_EOL_CHECK_COUNT)
6242 break;
6243 }
6244 }
6245 return eol_seen;
6246 }
6247
6248
6249 static Lisp_Object
6250 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6251 {
6252 Lisp_Object eol_type;
6253
6254 eol_type = CODING_ID_EOL_TYPE (coding->id);
6255 if (eol_seen & EOL_SEEN_LF)
6256 {
6257 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6258 eol_type = Qunix;
6259 }
6260 else if (eol_seen & EOL_SEEN_CRLF)
6261 {
6262 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6263 eol_type = Qdos;
6264 }
6265 else if (eol_seen & EOL_SEEN_CR)
6266 {
6267 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6268 eol_type = Qmac;
6269 }
6270 return eol_type;
6271 }
6272
6273 /* Detect how a text specified in CODING is encoded. If a coding
6274 system is detected, update fields of CODING by the detected coding
6275 system. */
6276
6277 void
6278 detect_coding (struct coding_system *coding)
6279 {
6280 const unsigned char *src, *src_end;
6281 int saved_mode = coding->mode;
6282
6283 coding->consumed = coding->consumed_char = 0;
6284 coding->produced = coding->produced_char = 0;
6285 coding_set_source (coding);
6286
6287 src_end = coding->source + coding->src_bytes;
6288 coding->head_ascii = 0;
6289
6290 /* If we have not yet decided the text encoding type, detect it
6291 now. */
6292 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6293 {
6294 int c, i;
6295 struct coding_detection_info detect_info;
6296 int null_byte_found = 0, eight_bit_found = 0;
6297
6298 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6299 for (src = coding->source; src < src_end; src++)
6300 {
6301 c = *src;
6302 if (c & 0x80)
6303 {
6304 eight_bit_found = 1;
6305 if (null_byte_found)
6306 break;
6307 }
6308 else if (c < 0x20)
6309 {
6310 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6311 && ! inhibit_iso_escape_detection
6312 && ! detect_info.checked)
6313 {
6314 if (detect_coding_iso_2022 (coding, &detect_info))
6315 {
6316 /* We have scanned the whole data. */
6317 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6318 {
6319 /* We didn't find an 8-bit code. We may
6320 have found a null-byte, but it's very
6321 rare that a binary file conforms to
6322 ISO-2022. */
6323 src = src_end;
6324 coding->head_ascii = src - coding->source;
6325 }
6326 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6327 break;
6328 }
6329 }
6330 else if (! c && !inhibit_null_byte_detection)
6331 {
6332 null_byte_found = 1;
6333 if (eight_bit_found)
6334 break;
6335 }
6336 if (! eight_bit_found)
6337 coding->head_ascii++;
6338 }
6339 else if (! eight_bit_found)
6340 coding->head_ascii++;
6341 }
6342
6343 if (null_byte_found || eight_bit_found
6344 || coding->head_ascii < coding->src_bytes
6345 || detect_info.found)
6346 {
6347 enum coding_category category;
6348 struct coding_system *this;
6349
6350 if (coding->head_ascii == coding->src_bytes)
6351 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6352 for (i = 0; i < coding_category_raw_text; i++)
6353 {
6354 category = coding_priorities[i];
6355 this = coding_categories + category;
6356 if (detect_info.found & (1 << category))
6357 break;
6358 }
6359 else
6360 {
6361 if (null_byte_found)
6362 {
6363 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6364 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6365 }
6366 for (i = 0; i < coding_category_raw_text; i++)
6367 {
6368 category = coding_priorities[i];
6369 this = coding_categories + category;
6370 if (this->id < 0)
6371 {
6372 /* No coding system of this category is defined. */
6373 detect_info.rejected |= (1 << category);
6374 }
6375 else if (category >= coding_category_raw_text)
6376 continue;
6377 else if (detect_info.checked & (1 << category))
6378 {
6379 if (detect_info.found & (1 << category))
6380 break;
6381 }
6382 else if ((*(this->detector)) (coding, &detect_info)
6383 && detect_info.found & (1 << category))
6384 {
6385 if (category == coding_category_utf_16_auto)
6386 {
6387 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6388 category = coding_category_utf_16_le;
6389 else
6390 category = coding_category_utf_16_be;
6391 }
6392 break;
6393 }
6394 }
6395 }
6396
6397 if (i < coding_category_raw_text)
6398 setup_coding_system (CODING_ID_NAME (this->id), coding);
6399 else if (null_byte_found)
6400 setup_coding_system (Qno_conversion, coding);
6401 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6402 == CATEGORY_MASK_ANY)
6403 setup_coding_system (Qraw_text, coding);
6404 else if (detect_info.rejected)
6405 for (i = 0; i < coding_category_raw_text; i++)
6406 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6407 {
6408 this = coding_categories + coding_priorities[i];
6409 setup_coding_system (CODING_ID_NAME (this->id), coding);
6410 break;
6411 }
6412 }
6413 }
6414 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6415 == coding_category_utf_8_auto)
6416 {
6417 Lisp_Object coding_systems;
6418 struct coding_detection_info detect_info;
6419
6420 coding_systems
6421 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6422 detect_info.found = detect_info.rejected = 0;
6423 coding->head_ascii = 0;
6424 if (CONSP (coding_systems)
6425 && detect_coding_utf_8 (coding, &detect_info))
6426 {
6427 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6428 setup_coding_system (XCAR (coding_systems), coding);
6429 else
6430 setup_coding_system (XCDR (coding_systems), coding);
6431 }
6432 }
6433 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6434 == coding_category_utf_16_auto)
6435 {
6436 Lisp_Object coding_systems;
6437 struct coding_detection_info detect_info;
6438
6439 coding_systems
6440 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6441 detect_info.found = detect_info.rejected = 0;
6442 coding->head_ascii = 0;
6443 if (CONSP (coding_systems)
6444 && detect_coding_utf_16 (coding, &detect_info))
6445 {
6446 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6447 setup_coding_system (XCAR (coding_systems), coding);
6448 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6449 setup_coding_system (XCDR (coding_systems), coding);
6450 }
6451 }
6452 coding->mode = saved_mode;
6453 }
6454
6455
6456 static void
6457 decode_eol (struct coding_system *coding)
6458 {
6459 Lisp_Object eol_type;
6460 unsigned char *p, *pbeg, *pend;
6461
6462 eol_type = CODING_ID_EOL_TYPE (coding->id);
6463 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6464 return;
6465
6466 if (NILP (coding->dst_object))
6467 pbeg = coding->destination;
6468 else
6469 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6470 pend = pbeg + coding->produced;
6471
6472 if (VECTORP (eol_type))
6473 {
6474 int eol_seen = EOL_SEEN_NONE;
6475
6476 for (p = pbeg; p < pend; p++)
6477 {
6478 if (*p == '\n')
6479 eol_seen |= EOL_SEEN_LF;
6480 else if (*p == '\r')
6481 {
6482 if (p + 1 < pend && *(p + 1) == '\n')
6483 {
6484 eol_seen |= EOL_SEEN_CRLF;
6485 p++;
6486 }
6487 else
6488 eol_seen |= EOL_SEEN_CR;
6489 }
6490 }
6491 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6492 if ((eol_seen & EOL_SEEN_CRLF) != 0
6493 && (eol_seen & EOL_SEEN_CR) != 0
6494 && (eol_seen & EOL_SEEN_LF) == 0)
6495 eol_seen = EOL_SEEN_CRLF;
6496 else if (eol_seen != EOL_SEEN_NONE
6497 && eol_seen != EOL_SEEN_LF
6498 && eol_seen != EOL_SEEN_CRLF
6499 && eol_seen != EOL_SEEN_CR)
6500 eol_seen = EOL_SEEN_LF;
6501 if (eol_seen != EOL_SEEN_NONE)
6502 eol_type = adjust_coding_eol_type (coding, eol_seen);
6503 }
6504
6505 if (EQ (eol_type, Qmac))
6506 {
6507 for (p = pbeg; p < pend; p++)
6508 if (*p == '\r')
6509 *p = '\n';
6510 }
6511 else if (EQ (eol_type, Qdos))
6512 {
6513 int n = 0;
6514
6515 if (NILP (coding->dst_object))
6516 {
6517 /* Start deleting '\r' from the tail to minimize the memory
6518 movement. */
6519 for (p = pend - 2; p >= pbeg; p--)
6520 if (*p == '\r')
6521 {
6522 memmove (p, p + 1, pend-- - p - 1);
6523 n++;
6524 }
6525 }
6526 else
6527 {
6528 int pos_byte = coding->dst_pos_byte;
6529 int pos = coding->dst_pos;
6530 int pos_end = pos + coding->produced_char - 1;
6531
6532 while (pos < pos_end)
6533 {
6534 p = BYTE_POS_ADDR (pos_byte);
6535 if (*p == '\r' && p[1] == '\n')
6536 {
6537 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6538 n++;
6539 pos_end--;
6540 }
6541 pos++;
6542 if (coding->dst_multibyte)
6543 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6544 else
6545 pos_byte++;
6546 }
6547 }
6548 coding->produced -= n;
6549 coding->produced_char -= n;
6550 }
6551 }
6552
6553
6554 /* Return a translation table (or list of them) from coding system
6555 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6556 decoding (ENCODEP is zero). */
6557
6558 static Lisp_Object
6559 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6560 {
6561 Lisp_Object standard, translation_table;
6562 Lisp_Object val;
6563
6564 if (NILP (Venable_character_translation))
6565 {
6566 if (max_lookup)
6567 *max_lookup = 0;
6568 return Qnil;
6569 }
6570 if (encodep)
6571 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6572 standard = Vstandard_translation_table_for_encode;
6573 else
6574 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6575 standard = Vstandard_translation_table_for_decode;
6576 if (NILP (translation_table))
6577 translation_table = standard;
6578 else
6579 {
6580 if (SYMBOLP (translation_table))
6581 translation_table = Fget (translation_table, Qtranslation_table);
6582 else if (CONSP (translation_table))
6583 {
6584 translation_table = Fcopy_sequence (translation_table);
6585 for (val = translation_table; CONSP (val); val = XCDR (val))
6586 if (SYMBOLP (XCAR (val)))
6587 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6588 }
6589 if (CHAR_TABLE_P (standard))
6590 {
6591 if (CONSP (translation_table))
6592 translation_table = nconc2 (translation_table,
6593 Fcons (standard, Qnil));
6594 else
6595 translation_table = Fcons (translation_table,
6596 Fcons (standard, Qnil));
6597 }
6598 }
6599
6600 if (max_lookup)
6601 {
6602 *max_lookup = 1;
6603 if (CHAR_TABLE_P (translation_table)
6604 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6605 {
6606 val = XCHAR_TABLE (translation_table)->extras[1];
6607 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6608 *max_lookup = XFASTINT (val);
6609 }
6610 else if (CONSP (translation_table))
6611 {
6612 Lisp_Object tail, val;
6613
6614 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6615 if (CHAR_TABLE_P (XCAR (tail))
6616 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6617 {
6618 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6619 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6620 *max_lookup = XFASTINT (val);
6621 }
6622 }
6623 }
6624 return translation_table;
6625 }
6626
6627 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6628 do { \
6629 trans = Qnil; \
6630 if (CHAR_TABLE_P (table)) \
6631 { \
6632 trans = CHAR_TABLE_REF (table, c); \
6633 if (CHARACTERP (trans)) \
6634 c = XFASTINT (trans), trans = Qnil; \
6635 } \
6636 else if (CONSP (table)) \
6637 { \
6638 Lisp_Object tail; \
6639 \
6640 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6641 if (CHAR_TABLE_P (XCAR (tail))) \
6642 { \
6643 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6644 if (CHARACTERP (trans)) \
6645 c = XFASTINT (trans), trans = Qnil; \
6646 else if (! NILP (trans)) \
6647 break; \
6648 } \
6649 } \
6650 } while (0)
6651
6652
6653 /* Return a translation of character(s) at BUF according to TRANS.
6654 TRANS is TO-CHAR or ((FROM . TO) ...) where
6655 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6656 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6657 translation is found, and Qnil if not found..
6658 If BUF is too short to lookup characters in FROM, return Qt. */
6659
6660 static Lisp_Object
6661 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6662 {
6663
6664 if (INTEGERP (trans))
6665 return trans;
6666 for (; CONSP (trans); trans = XCDR (trans))
6667 {
6668 Lisp_Object val = XCAR (trans);
6669 Lisp_Object from = XCAR (val);
6670 int len = ASIZE (from);
6671 int i;
6672
6673 for (i = 0; i < len; i++)
6674 {
6675 if (buf + i == buf_end)
6676 return Qt;
6677 if (XINT (AREF (from, i)) != buf[i])
6678 break;
6679 }
6680 if (i == len)
6681 return val;
6682 }
6683 return Qnil;
6684 }
6685
6686
6687 static int
6688 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6689 int last_block)
6690 {
6691 unsigned char *dst = coding->destination + coding->produced;
6692 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6693 EMACS_INT produced;
6694 EMACS_INT produced_chars = 0;
6695 int carryover = 0;
6696
6697 if (! coding->chars_at_source)
6698 {
6699 /* Source characters are in coding->charbuf. */
6700 int *buf = coding->charbuf;
6701 int *buf_end = buf + coding->charbuf_used;
6702
6703 if (EQ (coding->src_object, coding->dst_object))
6704 {
6705 coding_set_source (coding);
6706 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6707 }
6708
6709 while (buf < buf_end)
6710 {
6711 int c = *buf, i;
6712
6713 if (c >= 0)
6714 {
6715 int from_nchars = 1, to_nchars = 1;
6716 Lisp_Object trans = Qnil;
6717
6718 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6719 if (! NILP (trans))
6720 {
6721 trans = get_translation (trans, buf, buf_end);
6722 if (INTEGERP (trans))
6723 c = XINT (trans);
6724 else if (CONSP (trans))
6725 {
6726 from_nchars = ASIZE (XCAR (trans));
6727 trans = XCDR (trans);
6728 if (INTEGERP (trans))
6729 c = XINT (trans);
6730 else
6731 {
6732 to_nchars = ASIZE (trans);
6733 c = XINT (AREF (trans, 0));
6734 }
6735 }
6736 else if (EQ (trans, Qt) && ! last_block)
6737 break;
6738 }
6739
6740 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6741 {
6742 dst = alloc_destination (coding,
6743 buf_end - buf
6744 + MAX_MULTIBYTE_LENGTH * to_nchars,
6745 dst);
6746 if (EQ (coding->src_object, coding->dst_object))
6747 {
6748 coding_set_source (coding);
6749 dst_end = (((unsigned char *) coding->source)
6750 + coding->consumed);
6751 }
6752 else
6753 dst_end = coding->destination + coding->dst_bytes;
6754 }
6755
6756 for (i = 0; i < to_nchars; i++)
6757 {
6758 if (i > 0)
6759 c = XINT (AREF (trans, i));
6760 if (coding->dst_multibyte
6761 || ! CHAR_BYTE8_P (c))
6762 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6763 else
6764 *dst++ = CHAR_TO_BYTE8 (c);
6765 }
6766 produced_chars += to_nchars;
6767 buf += from_nchars;
6768 }
6769 else
6770 /* This is an annotation datum. (-C) is the length. */
6771 buf += -c;
6772 }
6773 carryover = buf_end - buf;
6774 }
6775 else
6776 {
6777 /* Source characters are at coding->source. */
6778 const unsigned char *src = coding->source;
6779 const unsigned char *src_end = src + coding->consumed;
6780
6781 if (EQ (coding->dst_object, coding->src_object))
6782 dst_end = (unsigned char *) src;
6783 if (coding->src_multibyte != coding->dst_multibyte)
6784 {
6785 if (coding->src_multibyte)
6786 {
6787 int multibytep = 1;
6788 EMACS_INT consumed_chars = 0;
6789
6790 while (1)
6791 {
6792 const unsigned char *src_base = src;
6793 int c;
6794
6795 ONE_MORE_BYTE (c);
6796 if (dst == dst_end)
6797 {
6798 if (EQ (coding->src_object, coding->dst_object))
6799 dst_end = (unsigned char *) src;
6800 if (dst == dst_end)
6801 {
6802 EMACS_INT offset = src - coding->source;
6803
6804 dst = alloc_destination (coding, src_end - src + 1,
6805 dst);
6806 dst_end = coding->destination + coding->dst_bytes;
6807 coding_set_source (coding);
6808 src = coding->source + offset;
6809 src_end = coding->source + coding->src_bytes;
6810 if (EQ (coding->src_object, coding->dst_object))
6811 dst_end = (unsigned char *) src;
6812 }
6813 }
6814 *dst++ = c;
6815 produced_chars++;
6816 }
6817 no_more_source:
6818 ;
6819 }
6820 else
6821 while (src < src_end)
6822 {
6823 int multibytep = 1;
6824 int c = *src++;
6825
6826 if (dst >= dst_end - 1)
6827 {
6828 if (EQ (coding->src_object, coding->dst_object))
6829 dst_end = (unsigned char *) src;
6830 if (dst >= dst_end - 1)
6831 {
6832 EMACS_INT offset = src - coding->source;
6833 EMACS_INT more_bytes;
6834
6835 if (EQ (coding->src_object, coding->dst_object))
6836 more_bytes = ((src_end - src) / 2) + 2;
6837 else
6838 more_bytes = src_end - src + 2;
6839 dst = alloc_destination (coding, more_bytes, dst);
6840 dst_end = coding->destination + coding->dst_bytes;
6841 coding_set_source (coding);
6842 src = coding->source + offset;
6843 src_end = coding->source + coding->src_bytes;
6844 if (EQ (coding->src_object, coding->dst_object))
6845 dst_end = (unsigned char *) src;
6846 }
6847 }
6848 EMIT_ONE_BYTE (c);
6849 }
6850 }
6851 else
6852 {
6853 if (!EQ (coding->src_object, coding->dst_object))
6854 {
6855 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6856
6857 if (require > 0)
6858 {
6859 EMACS_INT offset = src - coding->source;
6860
6861 dst = alloc_destination (coding, require, dst);
6862 coding_set_source (coding);
6863 src = coding->source + offset;
6864 src_end = coding->source + coding->src_bytes;
6865 }
6866 }
6867 produced_chars = coding->consumed_char;
6868 while (src < src_end)
6869 *dst++ = *src++;
6870 }
6871 }
6872
6873 produced = dst - (coding->destination + coding->produced);
6874 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6875 insert_from_gap (produced_chars, produced);
6876 coding->produced += produced;
6877 coding->produced_char += produced_chars;
6878 return carryover;
6879 }
6880
6881 /* Compose text in CODING->object according to the annotation data at
6882 CHARBUF. CHARBUF is an array:
6883 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6884 */
6885
6886 static INLINE void
6887 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6888 {
6889 int len;
6890 EMACS_INT to;
6891 enum composition_method method;
6892 Lisp_Object components;
6893
6894 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6895 to = pos + charbuf[2];
6896 method = (enum composition_method) (charbuf[4]);
6897
6898 if (method == COMPOSITION_RELATIVE)
6899 components = Qnil;
6900 else
6901 {
6902 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6903 int i, j;
6904
6905 if (method == COMPOSITION_WITH_RULE)
6906 len = charbuf[2] * 3 - 2;
6907 charbuf += MAX_ANNOTATION_LENGTH;
6908 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6909 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6910 {
6911 if (charbuf[i] >= 0)
6912 args[j] = make_number (charbuf[i]);
6913 else
6914 {
6915 i++;
6916 args[j] = make_number (charbuf[i] % 0x100);
6917 }
6918 }
6919 components = (i == j ? Fstring (j, args) : Fvector (j, args));
6920 }
6921 compose_text (pos, to, components, Qnil, coding->dst_object);
6922 }
6923
6924
6925 /* Put `charset' property on text in CODING->object according to
6926 the annotation data at CHARBUF. CHARBUF is an array:
6927 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6928 */
6929
6930 static INLINE void
6931 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6932 {
6933 EMACS_INT from = pos - charbuf[2];
6934 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6935
6936 Fput_text_property (make_number (from), make_number (pos),
6937 Qcharset, CHARSET_NAME (charset),
6938 coding->dst_object);
6939 }
6940
6941
6942 #define CHARBUF_SIZE 0x4000
6943
6944 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6945 do { \
6946 int size = CHARBUF_SIZE; \
6947 \
6948 coding->charbuf = NULL; \
6949 while (size > 1024) \
6950 { \
6951 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6952 if (coding->charbuf) \
6953 break; \
6954 size >>= 1; \
6955 } \
6956 if (! coding->charbuf) \
6957 { \
6958 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6959 return coding->result; \
6960 } \
6961 coding->charbuf_size = size; \
6962 } while (0)
6963
6964
6965 static void
6966 produce_annotation (struct coding_system *coding, EMACS_INT pos)
6967 {
6968 int *charbuf = coding->charbuf;
6969 int *charbuf_end = charbuf + coding->charbuf_used;
6970
6971 if (NILP (coding->dst_object))
6972 return;
6973
6974 while (charbuf < charbuf_end)
6975 {
6976 if (*charbuf >= 0)
6977 pos++, charbuf++;
6978 else
6979 {
6980 int len = -*charbuf;
6981
6982 if (len > 2)
6983 switch (charbuf[1])
6984 {
6985 case CODING_ANNOTATE_COMPOSITION_MASK:
6986 produce_composition (coding, charbuf, pos);
6987 break;
6988 case CODING_ANNOTATE_CHARSET_MASK:
6989 produce_charset (coding, charbuf, pos);
6990 break;
6991 }
6992 charbuf += len;
6993 }
6994 }
6995 }
6996
6997 /* Decode the data at CODING->src_object into CODING->dst_object.
6998 CODING->src_object is a buffer, a string, or nil.
6999 CODING->dst_object is a buffer.
7000
7001 If CODING->src_object is a buffer, it must be the current buffer.
7002 In this case, if CODING->src_pos is positive, it is a position of
7003 the source text in the buffer, otherwise, the source text is in the
7004 gap area of the buffer, and CODING->src_pos specifies the offset of
7005 the text from GPT (which must be the same as PT). If this is the
7006 same buffer as CODING->dst_object, CODING->src_pos must be
7007 negative.
7008
7009 If CODING->src_object is a string, CODING->src_pos is an index to
7010 that string.
7011
7012 If CODING->src_object is nil, CODING->source must already point to
7013 the non-relocatable memory area. In this case, CODING->src_pos is
7014 an offset from CODING->source.
7015
7016 The decoded data is inserted at the current point of the buffer
7017 CODING->dst_object.
7018 */
7019
7020 static int
7021 decode_coding (struct coding_system *coding)
7022 {
7023 Lisp_Object attrs;
7024 Lisp_Object undo_list;
7025 Lisp_Object translation_table;
7026 struct ccl_spec cclspec;
7027 int carryover;
7028 int i;
7029
7030 if (BUFFERP (coding->src_object)
7031 && coding->src_pos > 0
7032 && coding->src_pos < GPT
7033 && coding->src_pos + coding->src_chars > GPT)
7034 move_gap_both (coding->src_pos, coding->src_pos_byte);
7035
7036 undo_list = Qt;
7037 if (BUFFERP (coding->dst_object))
7038 {
7039 if (current_buffer != XBUFFER (coding->dst_object))
7040 set_buffer_internal (XBUFFER (coding->dst_object));
7041 if (GPT != PT)
7042 move_gap_both (PT, PT_BYTE);
7043 undo_list = BVAR (current_buffer, undo_list);
7044 BVAR (current_buffer, undo_list) = Qt;
7045 }
7046
7047 coding->consumed = coding->consumed_char = 0;
7048 coding->produced = coding->produced_char = 0;
7049 coding->chars_at_source = 0;
7050 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7051 coding->errors = 0;
7052
7053 ALLOC_CONVERSION_WORK_AREA (coding);
7054
7055 attrs = CODING_ID_ATTRS (coding->id);
7056 translation_table = get_translation_table (attrs, 0, NULL);
7057
7058 carryover = 0;
7059 if (coding->decoder == decode_coding_ccl)
7060 {
7061 coding->spec.ccl = &cclspec;
7062 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7063 }
7064 do
7065 {
7066 EMACS_INT pos = coding->dst_pos + coding->produced_char;
7067
7068 coding_set_source (coding);
7069 coding->annotated = 0;
7070 coding->charbuf_used = carryover;
7071 (*(coding->decoder)) (coding);
7072 coding_set_destination (coding);
7073 carryover = produce_chars (coding, translation_table, 0);
7074 if (coding->annotated)
7075 produce_annotation (coding, pos);
7076 for (i = 0; i < carryover; i++)
7077 coding->charbuf[i]
7078 = coding->charbuf[coding->charbuf_used - carryover + i];
7079 }
7080 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7081 || (coding->consumed < coding->src_bytes
7082 && (coding->result == CODING_RESULT_SUCCESS
7083 || coding->result == CODING_RESULT_INVALID_SRC)));
7084
7085 if (carryover > 0)
7086 {
7087 coding_set_destination (coding);
7088 coding->charbuf_used = carryover;
7089 produce_chars (coding, translation_table, 1);
7090 }
7091
7092 coding->carryover_bytes = 0;
7093 if (coding->consumed < coding->src_bytes)
7094 {
7095 int nbytes = coding->src_bytes - coding->consumed;
7096 const unsigned char *src;
7097
7098 coding_set_source (coding);
7099 coding_set_destination (coding);
7100 src = coding->source + coding->consumed;
7101
7102 if (coding->mode & CODING_MODE_LAST_BLOCK)
7103 {
7104 /* Flush out unprocessed data as binary chars. We are sure
7105 that the number of data is less than the size of
7106 coding->charbuf. */
7107 coding->charbuf_used = 0;
7108 coding->chars_at_source = 0;
7109
7110 while (nbytes-- > 0)
7111 {
7112 int c = *src++;
7113
7114 if (c & 0x80)
7115 c = BYTE8_TO_CHAR (c);
7116 coding->charbuf[coding->charbuf_used++] = c;
7117 }
7118 produce_chars (coding, Qnil, 1);
7119 }
7120 else
7121 {
7122 /* Record unprocessed bytes in coding->carryover. We are
7123 sure that the number of data is less than the size of
7124 coding->carryover. */
7125 unsigned char *p = coding->carryover;
7126
7127 if (nbytes > sizeof coding->carryover)
7128 nbytes = sizeof coding->carryover;
7129 coding->carryover_bytes = nbytes;
7130 while (nbytes-- > 0)
7131 *p++ = *src++;
7132 }
7133 coding->consumed = coding->src_bytes;
7134 }
7135
7136 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7137 && !inhibit_eol_conversion)
7138 decode_eol (coding);
7139 if (BUFFERP (coding->dst_object))
7140 {
7141 BVAR (current_buffer, undo_list) = undo_list;
7142 record_insert (coding->dst_pos, coding->produced_char);
7143 }
7144 return coding->result;
7145 }
7146
7147
7148 /* Extract an annotation datum from a composition starting at POS and
7149 ending before LIMIT of CODING->src_object (buffer or string), store
7150 the data in BUF, set *STOP to a starting position of the next
7151 composition (if any) or to LIMIT, and return the address of the
7152 next element of BUF.
7153
7154 If such an annotation is not found, set *STOP to a starting
7155 position of a composition after POS (if any) or to LIMIT, and
7156 return BUF. */
7157
7158 static INLINE int *
7159 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7160 struct coding_system *coding, int *buf,
7161 EMACS_INT *stop)
7162 {
7163 EMACS_INT start, end;
7164 Lisp_Object prop;
7165
7166 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7167 || end > limit)
7168 *stop = limit;
7169 else if (start > pos)
7170 *stop = start;
7171 else
7172 {
7173 if (start == pos)
7174 {
7175 /* We found a composition. Store the corresponding
7176 annotation data in BUF. */
7177 int *head = buf;
7178 enum composition_method method = COMPOSITION_METHOD (prop);
7179 int nchars = COMPOSITION_LENGTH (prop);
7180
7181 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7182 if (method != COMPOSITION_RELATIVE)
7183 {
7184 Lisp_Object components;
7185 int len, i, i_byte;
7186
7187 components = COMPOSITION_COMPONENTS (prop);
7188 if (VECTORP (components))
7189 {
7190 len = XVECTOR (components)->size;
7191 for (i = 0; i < len; i++)
7192 *buf++ = XINT (AREF (components, i));
7193 }
7194 else if (STRINGP (components))
7195 {
7196 len = SCHARS (components);
7197 i = i_byte = 0;
7198 while (i < len)
7199 {
7200 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7201 buf++;
7202 }
7203 }
7204 else if (INTEGERP (components))
7205 {
7206 len = 1;
7207 *buf++ = XINT (components);
7208 }
7209 else if (CONSP (components))
7210 {
7211 for (len = 0; CONSP (components);
7212 len++, components = XCDR (components))
7213 *buf++ = XINT (XCAR (components));
7214 }
7215 else
7216 abort ();
7217 *head -= len;
7218 }
7219 }
7220
7221 if (find_composition (end, limit, &start, &end, &prop,
7222 coding->src_object)
7223 && end <= limit)
7224 *stop = start;
7225 else
7226 *stop = limit;
7227 }
7228 return buf;
7229 }
7230
7231
7232 /* Extract an annotation datum from a text property `charset' at POS of
7233 CODING->src_object (buffer of string), store the data in BUF, set
7234 *STOP to the position where the value of `charset' property changes
7235 (limiting by LIMIT), and return the address of the next element of
7236 BUF.
7237
7238 If the property value is nil, set *STOP to the position where the
7239 property value is non-nil (limiting by LIMIT), and return BUF. */
7240
7241 static INLINE int *
7242 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7243 struct coding_system *coding, int *buf,
7244 EMACS_INT *stop)
7245 {
7246 Lisp_Object val, next;
7247 int id;
7248
7249 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7250 if (! NILP (val) && CHARSETP (val))
7251 id = XINT (CHARSET_SYMBOL_ID (val));
7252 else
7253 id = -1;
7254 ADD_CHARSET_DATA (buf, 0, id);
7255 next = Fnext_single_property_change (make_number (pos), Qcharset,
7256 coding->src_object,
7257 make_number (limit));
7258 *stop = XINT (next);
7259 return buf;
7260 }
7261
7262
7263 static void
7264 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7265 int max_lookup)
7266 {
7267 int *buf = coding->charbuf;
7268 int *buf_end = coding->charbuf + coding->charbuf_size;
7269 const unsigned char *src = coding->source + coding->consumed;
7270 const unsigned char *src_end = coding->source + coding->src_bytes;
7271 EMACS_INT pos = coding->src_pos + coding->consumed_char;
7272 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7273 int multibytep = coding->src_multibyte;
7274 Lisp_Object eol_type;
7275 int c;
7276 EMACS_INT stop, stop_composition, stop_charset;
7277 int *lookup_buf = NULL;
7278
7279 if (! NILP (translation_table))
7280 lookup_buf = alloca (sizeof (int) * max_lookup);
7281
7282 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7283 if (VECTORP (eol_type))
7284 eol_type = Qunix;
7285
7286 /* Note: composition handling is not yet implemented. */
7287 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7288
7289 if (NILP (coding->src_object))
7290 stop = stop_composition = stop_charset = end_pos;
7291 else
7292 {
7293 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7294 stop = stop_composition = pos;
7295 else
7296 stop = stop_composition = end_pos;
7297 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7298 stop = stop_charset = pos;
7299 else
7300 stop_charset = end_pos;
7301 }
7302
7303 /* Compensate for CRLF and conversion. */
7304 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7305 while (buf < buf_end)
7306 {
7307 Lisp_Object trans;
7308
7309 if (pos == stop)
7310 {
7311 if (pos == end_pos)
7312 break;
7313 if (pos == stop_composition)
7314 buf = handle_composition_annotation (pos, end_pos, coding,
7315 buf, &stop_composition);
7316 if (pos == stop_charset)
7317 buf = handle_charset_annotation (pos, end_pos, coding,
7318 buf, &stop_charset);
7319 stop = (stop_composition < stop_charset
7320 ? stop_composition : stop_charset);
7321 }
7322
7323 if (! multibytep)
7324 {
7325 EMACS_INT bytes;
7326
7327 if (coding->encoder == encode_coding_raw_text
7328 || coding->encoder == encode_coding_ccl)
7329 c = *src++, pos++;
7330 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7331 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7332 else
7333 c = BYTE8_TO_CHAR (*src), src++, pos++;
7334 }
7335 else
7336 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7337 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7338 c = '\n';
7339 if (! EQ (eol_type, Qunix))
7340 {
7341 if (c == '\n')
7342 {
7343 if (EQ (eol_type, Qdos))
7344 *buf++ = '\r';
7345 else
7346 c = '\r';
7347 }
7348 }
7349
7350 trans = Qnil;
7351 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7352 if (NILP (trans))
7353 *buf++ = c;
7354 else
7355 {
7356 int from_nchars = 1, to_nchars = 1;
7357 int *lookup_buf_end;
7358 const unsigned char *p = src;
7359 int i;
7360
7361 lookup_buf[0] = c;
7362 for (i = 1; i < max_lookup && p < src_end; i++)
7363 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7364 lookup_buf_end = lookup_buf + i;
7365 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7366 if (INTEGERP (trans))
7367 c = XINT (trans);
7368 else if (CONSP (trans))
7369 {
7370 from_nchars = ASIZE (XCAR (trans));
7371 trans = XCDR (trans);
7372 if (INTEGERP (trans))
7373 c = XINT (trans);
7374 else
7375 {
7376 to_nchars = ASIZE (trans);
7377 if (buf + to_nchars > buf_end)
7378 break;
7379 c = XINT (AREF (trans, 0));
7380 }
7381 }
7382 else
7383 break;
7384 *buf++ = c;
7385 for (i = 1; i < to_nchars; i++)
7386 *buf++ = XINT (AREF (trans, i));
7387 for (i = 1; i < from_nchars; i++, pos++)
7388 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7389 }
7390 }
7391
7392 coding->consumed = src - coding->source;
7393 coding->consumed_char = pos - coding->src_pos;
7394 coding->charbuf_used = buf - coding->charbuf;
7395 coding->chars_at_source = 0;
7396 }
7397
7398
7399 /* Encode the text at CODING->src_object into CODING->dst_object.
7400 CODING->src_object is a buffer or a string.
7401 CODING->dst_object is a buffer or nil.
7402
7403 If CODING->src_object is a buffer, it must be the current buffer.
7404 In this case, if CODING->src_pos is positive, it is a position of
7405 the source text in the buffer, otherwise. the source text is in the
7406 gap area of the buffer, and coding->src_pos specifies the offset of
7407 the text from GPT (which must be the same as PT). If this is the
7408 same buffer as CODING->dst_object, CODING->src_pos must be
7409 negative and CODING should not have `pre-write-conversion'.
7410
7411 If CODING->src_object is a string, CODING should not have
7412 `pre-write-conversion'.
7413
7414 If CODING->dst_object is a buffer, the encoded data is inserted at
7415 the current point of that buffer.
7416
7417 If CODING->dst_object is nil, the encoded data is placed at the
7418 memory area specified by CODING->destination. */
7419
7420 static int
7421 encode_coding (struct coding_system *coding)
7422 {
7423 Lisp_Object attrs;
7424 Lisp_Object translation_table;
7425 int max_lookup;
7426 struct ccl_spec cclspec;
7427
7428 attrs = CODING_ID_ATTRS (coding->id);
7429 if (coding->encoder == encode_coding_raw_text)
7430 translation_table = Qnil, max_lookup = 0;
7431 else
7432 translation_table = get_translation_table (attrs, 1, &max_lookup);
7433
7434 if (BUFFERP (coding->dst_object))
7435 {
7436 set_buffer_internal (XBUFFER (coding->dst_object));
7437 coding->dst_multibyte
7438 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7439 }
7440
7441 coding->consumed = coding->consumed_char = 0;
7442 coding->produced = coding->produced_char = 0;
7443 record_conversion_result (coding, CODING_RESULT_SUCCESS);
7444 coding->errors = 0;
7445
7446 ALLOC_CONVERSION_WORK_AREA (coding);
7447
7448 if (coding->encoder == encode_coding_ccl)
7449 {
7450 coding->spec.ccl = &cclspec;
7451 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7452 }
7453 do {
7454 coding_set_source (coding);
7455 consume_chars (coding, translation_table, max_lookup);
7456 coding_set_destination (coding);
7457 (*(coding->encoder)) (coding);
7458 } while (coding->consumed_char < coding->src_chars);
7459
7460 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7461 insert_from_gap (coding->produced_char, coding->produced);
7462
7463 return (coding->result);
7464 }
7465
7466
7467 /* Name (or base name) of work buffer for code conversion. */
7468 static Lisp_Object Vcode_conversion_workbuf_name;
7469
7470 /* A working buffer used by the top level conversion. Once it is
7471 created, it is never destroyed. It has the name
7472 Vcode_conversion_workbuf_name. The other working buffers are
7473 destroyed after the use is finished, and their names are modified
7474 versions of Vcode_conversion_workbuf_name. */
7475 static Lisp_Object Vcode_conversion_reused_workbuf;
7476
7477 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7478 static int reused_workbuf_in_use;
7479
7480
7481 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7482 multibyteness of returning buffer. */
7483
7484 static Lisp_Object
7485 make_conversion_work_buffer (int multibyte)
7486 {
7487 Lisp_Object name, workbuf;
7488 struct buffer *current;
7489
7490 if (reused_workbuf_in_use++)
7491 {
7492 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7493 workbuf = Fget_buffer_create (name);
7494 }
7495 else
7496 {
7497 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7498 Vcode_conversion_reused_workbuf
7499 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7500 workbuf = Vcode_conversion_reused_workbuf;
7501 }
7502 current = current_buffer;
7503 set_buffer_internal (XBUFFER (workbuf));
7504 /* We can't allow modification hooks to run in the work buffer. For
7505 instance, directory_files_internal assumes that file decoding
7506 doesn't compile new regexps. */
7507 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7508 Ferase_buffer ();
7509 BVAR (current_buffer, undo_list) = Qt;
7510 BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
7511 set_buffer_internal (current);
7512 return workbuf;
7513 }
7514
7515
7516 static Lisp_Object
7517 code_conversion_restore (Lisp_Object arg)
7518 {
7519 Lisp_Object current, workbuf;
7520 struct gcpro gcpro1;
7521
7522 GCPRO1 (arg);
7523 current = XCAR (arg);
7524 workbuf = XCDR (arg);
7525 if (! NILP (workbuf))
7526 {
7527 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7528 reused_workbuf_in_use = 0;
7529 else if (! NILP (Fbuffer_live_p (workbuf)))
7530 Fkill_buffer (workbuf);
7531 }
7532 set_buffer_internal (XBUFFER (current));
7533 UNGCPRO;
7534 return Qnil;
7535 }
7536
7537 Lisp_Object
7538 code_conversion_save (int with_work_buf, int multibyte)
7539 {
7540 Lisp_Object workbuf = Qnil;
7541
7542 if (with_work_buf)
7543 workbuf = make_conversion_work_buffer (multibyte);
7544 record_unwind_protect (code_conversion_restore,
7545 Fcons (Fcurrent_buffer (), workbuf));
7546 return workbuf;
7547 }
7548
7549 int
7550 decode_coding_gap (struct coding_system *coding,
7551 EMACS_INT chars, EMACS_INT bytes)
7552 {
7553 int count = SPECPDL_INDEX ();
7554 Lisp_Object attrs;
7555
7556 code_conversion_save (0, 0);
7557
7558 coding->src_object = Fcurrent_buffer ();
7559 coding->src_chars = chars;
7560 coding->src_bytes = bytes;
7561 coding->src_pos = -chars;
7562 coding->src_pos_byte = -bytes;
7563 coding->src_multibyte = chars < bytes;
7564 coding->dst_object = coding->src_object;
7565 coding->dst_pos = PT;
7566 coding->dst_pos_byte = PT_BYTE;
7567 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7568
7569 if (CODING_REQUIRE_DETECTION (coding))
7570 detect_coding (coding);
7571
7572 coding->mode |= CODING_MODE_LAST_BLOCK;
7573 current_buffer->text->inhibit_shrinking = 1;
7574 decode_coding (coding);
7575 current_buffer->text->inhibit_shrinking = 0;
7576
7577 attrs = CODING_ID_ATTRS (coding->id);
7578 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7579 {
7580 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7581 Lisp_Object val;
7582
7583 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7584 val = call1 (CODING_ATTR_POST_READ (attrs),
7585 make_number (coding->produced_char));
7586 CHECK_NATNUM (val);
7587 coding->produced_char += Z - prev_Z;
7588 coding->produced += Z_BYTE - prev_Z_BYTE;
7589 }
7590
7591 unbind_to (count, Qnil);
7592 return coding->result;
7593 }
7594
7595 int
7596 encode_coding_gap (struct coding_system *coding,
7597 EMACS_INT chars, EMACS_INT bytes)
7598 {
7599 int count = SPECPDL_INDEX ();
7600
7601 code_conversion_save (0, 0);
7602
7603 coding->src_object = Fcurrent_buffer ();
7604 coding->src_chars = chars;
7605 coding->src_bytes = bytes;
7606 coding->src_pos = -chars;
7607 coding->src_pos_byte = -bytes;
7608 coding->src_multibyte = chars < bytes;
7609 coding->dst_object = coding->src_object;
7610 coding->dst_pos = PT;
7611 coding->dst_pos_byte = PT_BYTE;
7612
7613 encode_coding (coding);
7614
7615 unbind_to (count, Qnil);
7616 return coding->result;
7617 }
7618
7619
7620 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7621 SRC_OBJECT into DST_OBJECT by coding context CODING.
7622
7623 SRC_OBJECT is a buffer, a string, or Qnil.
7624
7625 If it is a buffer, the text is at point of the buffer. FROM and TO
7626 are positions in the buffer.
7627
7628 If it is a string, the text is at the beginning of the string.
7629 FROM and TO are indices to the string.
7630
7631 If it is nil, the text is at coding->source. FROM and TO are
7632 indices to coding->source.
7633
7634 DST_OBJECT is a buffer, Qt, or Qnil.
7635
7636 If it is a buffer, the decoded text is inserted at point of the
7637 buffer. If the buffer is the same as SRC_OBJECT, the source text
7638 is deleted.
7639
7640 If it is Qt, a string is made from the decoded text, and
7641 set in CODING->dst_object.
7642
7643 If it is Qnil, the decoded text is stored at CODING->destination.
7644 The caller must allocate CODING->dst_bytes bytes at
7645 CODING->destination by xmalloc. If the decoded text is longer than
7646 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7647 */
7648
7649 void
7650 decode_coding_object (struct coding_system *coding,
7651 Lisp_Object src_object,
7652 EMACS_INT from, EMACS_INT from_byte,
7653 EMACS_INT to, EMACS_INT to_byte,
7654 Lisp_Object dst_object)
7655 {
7656 int count = SPECPDL_INDEX ();
7657 unsigned char *destination;
7658 EMACS_INT dst_bytes;
7659 EMACS_INT chars = to - from;
7660 EMACS_INT bytes = to_byte - from_byte;
7661 Lisp_Object attrs;
7662 int saved_pt = -1, saved_pt_byte;
7663 int need_marker_adjustment = 0;
7664 Lisp_Object old_deactivate_mark;
7665
7666 old_deactivate_mark = Vdeactivate_mark;
7667
7668 if (NILP (dst_object))
7669 {
7670 destination = coding->destination;
7671 dst_bytes = coding->dst_bytes;
7672 }
7673
7674 coding->src_object = src_object;
7675 coding->src_chars = chars;
7676 coding->src_bytes = bytes;
7677 coding->src_multibyte = chars < bytes;
7678
7679 if (STRINGP (src_object))
7680 {
7681 coding->src_pos = from;
7682 coding->src_pos_byte = from_byte;
7683 }
7684 else if (BUFFERP (src_object))
7685 {
7686 set_buffer_internal (XBUFFER (src_object));
7687 if (from != GPT)
7688 move_gap_both (from, from_byte);
7689 if (EQ (src_object, dst_object))
7690 {
7691 struct Lisp_Marker *tail;
7692
7693 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7694 {
7695 tail->need_adjustment
7696 = tail->charpos == (tail->insertion_type ? from : to);
7697 need_marker_adjustment |= tail->need_adjustment;
7698 }
7699 saved_pt = PT, saved_pt_byte = PT_BYTE;
7700 TEMP_SET_PT_BOTH (from, from_byte);
7701 current_buffer->text->inhibit_shrinking = 1;
7702 del_range_both (from, from_byte, to, to_byte, 1);
7703 coding->src_pos = -chars;
7704 coding->src_pos_byte = -bytes;
7705 }
7706 else
7707 {
7708 coding->src_pos = from;
7709 coding->src_pos_byte = from_byte;
7710 }
7711 }
7712
7713 if (CODING_REQUIRE_DETECTION (coding))
7714 detect_coding (coding);
7715 attrs = CODING_ID_ATTRS (coding->id);
7716
7717 if (EQ (dst_object, Qt)
7718 || (! NILP (CODING_ATTR_POST_READ (attrs))
7719 && NILP (dst_object)))
7720 {
7721 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7722 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7723 coding->dst_pos = BEG;
7724 coding->dst_pos_byte = BEG_BYTE;
7725 }
7726 else if (BUFFERP (dst_object))
7727 {
7728 code_conversion_save (0, 0);
7729 coding->dst_object = dst_object;
7730 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7731 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7732 coding->dst_multibyte
7733 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7734 }
7735 else
7736 {
7737 code_conversion_save (0, 0);
7738 coding->dst_object = Qnil;
7739 /* Most callers presume this will return a multibyte result, and they
7740 won't use `binary' or `raw-text' anyway, so let's not worry about
7741 CODING_FOR_UNIBYTE. */
7742 coding->dst_multibyte = 1;
7743 }
7744
7745 decode_coding (coding);
7746
7747 if (BUFFERP (coding->dst_object))
7748 set_buffer_internal (XBUFFER (coding->dst_object));
7749
7750 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7751 {
7752 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7753 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7754 Lisp_Object val;
7755
7756 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7757 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7758 old_deactivate_mark);
7759 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7760 make_number (coding->produced_char));
7761 UNGCPRO;
7762 CHECK_NATNUM (val);
7763 coding->produced_char += Z - prev_Z;
7764 coding->produced += Z_BYTE - prev_Z_BYTE;
7765 }
7766
7767 if (EQ (dst_object, Qt))
7768 {
7769 coding->dst_object = Fbuffer_string ();
7770 }
7771 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7772 {
7773 set_buffer_internal (XBUFFER (coding->dst_object));
7774 if (dst_bytes < coding->produced)
7775 {
7776 destination = xrealloc (destination, coding->produced);
7777 if (! destination)
7778 {
7779 record_conversion_result (coding,
7780 CODING_RESULT_INSUFFICIENT_MEM);
7781 unbind_to (count, Qnil);
7782 return;
7783 }
7784 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7785 move_gap_both (BEGV, BEGV_BYTE);
7786 memcpy (destination, BEGV_ADDR, coding->produced);
7787 coding->destination = destination;
7788 }
7789 }
7790
7791 if (saved_pt >= 0)
7792 {
7793 /* This is the case of:
7794 (BUFFERP (src_object) && EQ (src_object, dst_object))
7795 As we have moved PT while replacing the original buffer
7796 contents, we must recover it now. */
7797 set_buffer_internal (XBUFFER (src_object));
7798 current_buffer->text->inhibit_shrinking = 0;
7799 if (saved_pt < from)
7800 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7801 else if (saved_pt < from + chars)
7802 TEMP_SET_PT_BOTH (from, from_byte);
7803 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7804 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7805 saved_pt_byte + (coding->produced - bytes));
7806 else
7807 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7808 saved_pt_byte + (coding->produced - bytes));
7809
7810 if (need_marker_adjustment)
7811 {
7812 struct Lisp_Marker *tail;
7813
7814 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7815 if (tail->need_adjustment)
7816 {
7817 tail->need_adjustment = 0;
7818 if (tail->insertion_type)
7819 {
7820 tail->bytepos = from_byte;
7821 tail->charpos = from;
7822 }
7823 else
7824 {
7825 tail->bytepos = from_byte + coding->produced;
7826 tail->charpos
7827 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7828 ? tail->bytepos : from + coding->produced_char);
7829 }
7830 }
7831 }
7832 }
7833
7834 Vdeactivate_mark = old_deactivate_mark;
7835 unbind_to (count, coding->dst_object);
7836 }
7837
7838
7839 void
7840 encode_coding_object (struct coding_system *coding,
7841 Lisp_Object src_object,
7842 EMACS_INT from, EMACS_INT from_byte,
7843 EMACS_INT to, EMACS_INT to_byte,
7844 Lisp_Object dst_object)
7845 {
7846 int count = SPECPDL_INDEX ();
7847 EMACS_INT chars = to - from;
7848 EMACS_INT bytes = to_byte - from_byte;
7849 Lisp_Object attrs;
7850 int saved_pt = -1, saved_pt_byte;
7851 int need_marker_adjustment = 0;
7852 int kill_src_buffer = 0;
7853 Lisp_Object old_deactivate_mark;
7854
7855 old_deactivate_mark = Vdeactivate_mark;
7856
7857 coding->src_object = src_object;
7858 coding->src_chars = chars;
7859 coding->src_bytes = bytes;
7860 coding->src_multibyte = chars < bytes;
7861
7862 attrs = CODING_ID_ATTRS (coding->id);
7863
7864 if (EQ (src_object, dst_object))
7865 {
7866 struct Lisp_Marker *tail;
7867
7868 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7869 {
7870 tail->need_adjustment
7871 = tail->charpos == (tail->insertion_type ? from : to);
7872 need_marker_adjustment |= tail->need_adjustment;
7873 }
7874 }
7875
7876 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7877 {
7878 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7879 set_buffer_internal (XBUFFER (coding->src_object));
7880 if (STRINGP (src_object))
7881 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7882 else if (BUFFERP (src_object))
7883 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7884 else
7885 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7886
7887 if (EQ (src_object, dst_object))
7888 {
7889 set_buffer_internal (XBUFFER (src_object));
7890 saved_pt = PT, saved_pt_byte = PT_BYTE;
7891 del_range_both (from, from_byte, to, to_byte, 1);
7892 set_buffer_internal (XBUFFER (coding->src_object));
7893 }
7894
7895 {
7896 Lisp_Object args[3];
7897 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7898
7899 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7900 old_deactivate_mark);
7901 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7902 args[1] = make_number (BEG);
7903 args[2] = make_number (Z);
7904 safe_call (3, args);
7905 UNGCPRO;
7906 }
7907 if (XBUFFER (coding->src_object) != current_buffer)
7908 kill_src_buffer = 1;
7909 coding->src_object = Fcurrent_buffer ();
7910 if (BEG != GPT)
7911 move_gap_both (BEG, BEG_BYTE);
7912 coding->src_chars = Z - BEG;
7913 coding->src_bytes = Z_BYTE - BEG_BYTE;
7914 coding->src_pos = BEG;
7915 coding->src_pos_byte = BEG_BYTE;
7916 coding->src_multibyte = Z < Z_BYTE;
7917 }
7918 else if (STRINGP (src_object))
7919 {
7920 code_conversion_save (0, 0);
7921 coding->src_pos = from;
7922 coding->src_pos_byte = from_byte;
7923 }
7924 else if (BUFFERP (src_object))
7925 {
7926 code_conversion_save (0, 0);
7927 set_buffer_internal (XBUFFER (src_object));
7928 if (EQ (src_object, dst_object))
7929 {
7930 saved_pt = PT, saved_pt_byte = PT_BYTE;
7931 coding->src_object = del_range_1 (from, to, 1, 1);
7932 coding->src_pos = 0;
7933 coding->src_pos_byte = 0;
7934 }
7935 else
7936 {
7937 if (from < GPT && to >= GPT)
7938 move_gap_both (from, from_byte);
7939 coding->src_pos = from;
7940 coding->src_pos_byte = from_byte;
7941 }
7942 }
7943 else
7944 code_conversion_save (0, 0);
7945
7946 if (BUFFERP (dst_object))
7947 {
7948 coding->dst_object = dst_object;
7949 if (EQ (src_object, dst_object))
7950 {
7951 coding->dst_pos = from;
7952 coding->dst_pos_byte = from_byte;
7953 }
7954 else
7955 {
7956 struct buffer *current = current_buffer;
7957
7958 set_buffer_temp (XBUFFER (dst_object));
7959 coding->dst_pos = PT;
7960 coding->dst_pos_byte = PT_BYTE;
7961 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7962 set_buffer_temp (current);
7963 }
7964 coding->dst_multibyte
7965 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7966 }
7967 else if (EQ (dst_object, Qt))
7968 {
7969 coding->dst_object = Qnil;
7970 coding->dst_bytes = coding->src_chars;
7971 if (coding->dst_bytes == 0)
7972 coding->dst_bytes = 1;
7973 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7974 coding->dst_multibyte = 0;
7975 }
7976 else
7977 {
7978 coding->dst_object = Qnil;
7979 coding->dst_multibyte = 0;
7980 }
7981
7982 encode_coding (coding);
7983
7984 if (EQ (dst_object, Qt))
7985 {
7986 if (BUFFERP (coding->dst_object))
7987 coding->dst_object = Fbuffer_string ();
7988 else
7989 {
7990 coding->dst_object
7991 = make_unibyte_string ((char *) coding->destination,
7992 coding->produced);
7993 xfree (coding->destination);
7994 }
7995 }
7996
7997 if (saved_pt >= 0)
7998 {
7999 /* This is the case of:
8000 (BUFFERP (src_object) && EQ (src_object, dst_object))
8001 As we have moved PT while replacing the original buffer
8002 contents, we must recover it now. */
8003 set_buffer_internal (XBUFFER (src_object));
8004 if (saved_pt < from)
8005 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8006 else if (saved_pt < from + chars)
8007 TEMP_SET_PT_BOTH (from, from_byte);
8008 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8009 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8010 saved_pt_byte + (coding->produced - bytes));
8011 else
8012 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8013 saved_pt_byte + (coding->produced - bytes));
8014
8015 if (need_marker_adjustment)
8016 {
8017 struct Lisp_Marker *tail;
8018
8019 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8020 if (tail->need_adjustment)
8021 {
8022 tail->need_adjustment = 0;
8023 if (tail->insertion_type)
8024 {
8025 tail->bytepos = from_byte;
8026 tail->charpos = from;
8027 }
8028 else
8029 {
8030 tail->bytepos = from_byte + coding->produced;
8031 tail->charpos
8032 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8033 ? tail->bytepos : from + coding->produced_char);
8034 }
8035 }
8036 }
8037 }
8038
8039 if (kill_src_buffer)
8040 Fkill_buffer (coding->src_object);
8041
8042 Vdeactivate_mark = old_deactivate_mark;
8043 unbind_to (count, Qnil);
8044 }
8045
8046
8047 Lisp_Object
8048 preferred_coding_system (void)
8049 {
8050 int id = coding_categories[coding_priorities[0]].id;
8051
8052 return CODING_ID_NAME (id);
8053 }
8054
8055 \f
8056 #ifdef emacs
8057 /*** 8. Emacs Lisp library functions ***/
8058
8059 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8060 doc: /* Return t if OBJECT is nil or a coding-system.
8061 See the documentation of `define-coding-system' for information
8062 about coding-system objects. */)
8063 (Lisp_Object object)
8064 {
8065 if (NILP (object)
8066 || CODING_SYSTEM_ID (object) >= 0)
8067 return Qt;
8068 if (! SYMBOLP (object)
8069 || NILP (Fget (object, Qcoding_system_define_form)))
8070 return Qnil;
8071 return Qt;
8072 }
8073
8074 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8075 Sread_non_nil_coding_system, 1, 1, 0,
8076 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8077 (Lisp_Object prompt)
8078 {
8079 Lisp_Object val;
8080 do
8081 {
8082 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8083 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8084 }
8085 while (SCHARS (val) == 0);
8086 return (Fintern (val, Qnil));
8087 }
8088
8089 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8090 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8091 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8092 Ignores case when completing coding systems (all Emacs coding systems
8093 are lower-case). */)
8094 (Lisp_Object prompt, Lisp_Object default_coding_system)
8095 {
8096 Lisp_Object val;
8097 int count = SPECPDL_INDEX ();
8098
8099 if (SYMBOLP (default_coding_system))
8100 default_coding_system = SYMBOL_NAME (default_coding_system);
8101 specbind (Qcompletion_ignore_case, Qt);
8102 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8103 Qt, Qnil, Qcoding_system_history,
8104 default_coding_system, Qnil);
8105 unbind_to (count, Qnil);
8106 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8107 }
8108
8109 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8110 1, 1, 0,
8111 doc: /* Check validity of CODING-SYSTEM.
8112 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8113 It is valid if it is nil or a symbol defined as a coding system by the
8114 function `define-coding-system'. */)
8115 (Lisp_Object coding_system)
8116 {
8117 Lisp_Object define_form;
8118
8119 define_form = Fget (coding_system, Qcoding_system_define_form);
8120 if (! NILP (define_form))
8121 {
8122 Fput (coding_system, Qcoding_system_define_form, Qnil);
8123 safe_eval (define_form);
8124 }
8125 if (!NILP (Fcoding_system_p (coding_system)))
8126 return coding_system;
8127 xsignal1 (Qcoding_system_error, coding_system);
8128 }
8129
8130 \f
8131 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8132 HIGHEST is nonzero, return the coding system of the highest
8133 priority among the detected coding systems. Otherwise return a
8134 list of detected coding systems sorted by their priorities. If
8135 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8136 multibyte form but contains only ASCII and eight-bit chars.
8137 Otherwise, the bytes are raw bytes.
8138
8139 CODING-SYSTEM controls the detection as below:
8140
8141 If it is nil, detect both text-format and eol-format. If the
8142 text-format part of CODING-SYSTEM is already specified
8143 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8144 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8145 detect only text-format. */
8146
8147 Lisp_Object
8148 detect_coding_system (const unsigned char *src,
8149 EMACS_INT src_chars, EMACS_INT src_bytes,
8150 int highest, int multibytep,
8151 Lisp_Object coding_system)
8152 {
8153 const unsigned char *src_end = src + src_bytes;
8154 Lisp_Object attrs, eol_type;
8155 Lisp_Object val = Qnil;
8156 struct coding_system coding;
8157 int id;
8158 struct coding_detection_info detect_info;
8159 enum coding_category base_category;
8160 int null_byte_found = 0, eight_bit_found = 0;
8161
8162 if (NILP (coding_system))
8163 coding_system = Qundecided;
8164 setup_coding_system (coding_system, &coding);
8165 attrs = CODING_ID_ATTRS (coding.id);
8166 eol_type = CODING_ID_EOL_TYPE (coding.id);
8167 coding_system = CODING_ATTR_BASE_NAME (attrs);
8168
8169 coding.source = src;
8170 coding.src_chars = src_chars;
8171 coding.src_bytes = src_bytes;
8172 coding.src_multibyte = multibytep;
8173 coding.consumed = 0;
8174 coding.mode |= CODING_MODE_LAST_BLOCK;
8175 coding.head_ascii = 0;
8176
8177 detect_info.checked = detect_info.found = detect_info.rejected = 0;
8178
8179 /* At first, detect text-format if necessary. */
8180 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8181 if (base_category == coding_category_undecided)
8182 {
8183 enum coding_category category;
8184 struct coding_system *this;
8185 int c, i;
8186
8187 /* Skip all ASCII bytes except for a few ISO2022 controls. */
8188 for (; src < src_end; src++)
8189 {
8190 c = *src;
8191 if (c & 0x80)
8192 {
8193 eight_bit_found = 1;
8194 if (null_byte_found)
8195 break;
8196 }
8197 else if (c < 0x20)
8198 {
8199 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8200 && ! inhibit_iso_escape_detection
8201 && ! detect_info.checked)
8202 {
8203 if (detect_coding_iso_2022 (&coding, &detect_info))
8204 {
8205 /* We have scanned the whole data. */
8206 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8207 {
8208 /* We didn't find an 8-bit code. We may
8209 have found a null-byte, but it's very
8210 rare that a binary file confirm to
8211 ISO-2022. */
8212 src = src_end;
8213 coding.head_ascii = src - coding.source;
8214 }
8215 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8216 break;
8217 }
8218 }
8219 else if (! c && !inhibit_null_byte_detection)
8220 {
8221 null_byte_found = 1;
8222 if (eight_bit_found)
8223 break;
8224 }
8225 if (! eight_bit_found)
8226 coding.head_ascii++;
8227 }
8228 else if (! eight_bit_found)
8229 coding.head_ascii++;
8230 }
8231
8232 if (null_byte_found || eight_bit_found
8233 || coding.head_ascii < coding.src_bytes
8234 || detect_info.found)
8235 {
8236 if (coding.head_ascii == coding.src_bytes)
8237 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8238 for (i = 0; i < coding_category_raw_text; i++)
8239 {
8240 category = coding_priorities[i];
8241 this = coding_categories + category;
8242 if (detect_info.found & (1 << category))
8243 break;
8244 }
8245 else
8246 {
8247 if (null_byte_found)
8248 {
8249 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8250 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8251 }
8252 for (i = 0; i < coding_category_raw_text; i++)
8253 {
8254 category = coding_priorities[i];
8255 this = coding_categories + category;
8256
8257 if (this->id < 0)
8258 {
8259 /* No coding system of this category is defined. */
8260 detect_info.rejected |= (1 << category);
8261 }
8262 else if (category >= coding_category_raw_text)
8263 continue;
8264 else if (detect_info.checked & (1 << category))
8265 {
8266 if (highest
8267 && (detect_info.found & (1 << category)))
8268 break;
8269 }
8270 else if ((*(this->detector)) (&coding, &detect_info)
8271 && highest
8272 && (detect_info.found & (1 << category)))
8273 {
8274 if (category == coding_category_utf_16_auto)
8275 {
8276 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8277 category = coding_category_utf_16_le;
8278 else
8279 category = coding_category_utf_16_be;
8280 }
8281 break;
8282 }
8283 }
8284 }
8285 }
8286
8287 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8288 || null_byte_found)
8289 {
8290 detect_info.found = CATEGORY_MASK_RAW_TEXT;
8291 id = CODING_SYSTEM_ID (Qno_conversion);
8292 val = Fcons (make_number (id), Qnil);
8293 }
8294 else if (! detect_info.rejected && ! detect_info.found)
8295 {
8296 detect_info.found = CATEGORY_MASK_ANY;
8297 id = coding_categories[coding_category_undecided].id;
8298 val = Fcons (make_number (id), Qnil);
8299 }
8300 else if (highest)
8301 {
8302 if (detect_info.found)
8303 {
8304 detect_info.found = 1 << category;
8305 val = Fcons (make_number (this->id), Qnil);
8306 }
8307 else
8308 for (i = 0; i < coding_category_raw_text; i++)
8309 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8310 {
8311 detect_info.found = 1 << coding_priorities[i];
8312 id = coding_categories[coding_priorities[i]].id;
8313 val = Fcons (make_number (id), Qnil);
8314 break;
8315 }
8316 }
8317 else
8318 {
8319 int mask = detect_info.rejected | detect_info.found;
8320 int found = 0;
8321
8322 for (i = coding_category_raw_text - 1; i >= 0; i--)
8323 {
8324 category = coding_priorities[i];
8325 if (! (mask & (1 << category)))
8326 {
8327 found |= 1 << category;
8328 id = coding_categories[category].id;
8329 if (id >= 0)
8330 val = Fcons (make_number (id), val);
8331 }
8332 }
8333 for (i = coding_category_raw_text - 1; i >= 0; i--)
8334 {
8335 category = coding_priorities[i];
8336 if (detect_info.found & (1 << category))
8337 {
8338 id = coding_categories[category].id;
8339 val = Fcons (make_number (id), val);
8340 }
8341 }
8342 detect_info.found |= found;
8343 }
8344 }
8345 else if (base_category == coding_category_utf_8_auto)
8346 {
8347 if (detect_coding_utf_8 (&coding, &detect_info))
8348 {
8349 struct coding_system *this;
8350
8351 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8352 this = coding_categories + coding_category_utf_8_sig;
8353 else
8354 this = coding_categories + coding_category_utf_8_nosig;
8355 val = Fcons (make_number (this->id), Qnil);
8356 }
8357 }
8358 else if (base_category == coding_category_utf_16_auto)
8359 {
8360 if (detect_coding_utf_16 (&coding, &detect_info))
8361 {
8362 struct coding_system *this;
8363
8364 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8365 this = coding_categories + coding_category_utf_16_le;
8366 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8367 this = coding_categories + coding_category_utf_16_be;
8368 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8369 this = coding_categories + coding_category_utf_16_be_nosig;
8370 else
8371 this = coding_categories + coding_category_utf_16_le_nosig;
8372 val = Fcons (make_number (this->id), Qnil);
8373 }
8374 }
8375 else
8376 {
8377 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8378 val = Fcons (make_number (coding.id), Qnil);
8379 }
8380
8381 /* Then, detect eol-format if necessary. */
8382 {
8383 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8384 Lisp_Object tail;
8385
8386 if (VECTORP (eol_type))
8387 {
8388 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8389 {
8390 if (null_byte_found)
8391 normal_eol = EOL_SEEN_LF;
8392 else
8393 normal_eol = detect_eol (coding.source, src_bytes,
8394 coding_category_raw_text);
8395 }
8396 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8397 | CATEGORY_MASK_UTF_16_BE_NOSIG))
8398 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8399 coding_category_utf_16_be);
8400 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8401 | CATEGORY_MASK_UTF_16_LE_NOSIG))
8402 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8403 coding_category_utf_16_le);
8404 }
8405 else
8406 {
8407 if (EQ (eol_type, Qunix))
8408 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8409 else if (EQ (eol_type, Qdos))
8410 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8411 else
8412 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8413 }
8414
8415 for (tail = val; CONSP (tail); tail = XCDR (tail))
8416 {
8417 enum coding_category category;
8418 int this_eol;
8419
8420 id = XINT (XCAR (tail));
8421 attrs = CODING_ID_ATTRS (id);
8422 category = XINT (CODING_ATTR_CATEGORY (attrs));
8423 eol_type = CODING_ID_EOL_TYPE (id);
8424 if (VECTORP (eol_type))
8425 {
8426 if (category == coding_category_utf_16_be
8427 || category == coding_category_utf_16_be_nosig)
8428 this_eol = utf_16_be_eol;
8429 else if (category == coding_category_utf_16_le
8430 || category == coding_category_utf_16_le_nosig)
8431 this_eol = utf_16_le_eol;
8432 else
8433 this_eol = normal_eol;
8434
8435 if (this_eol == EOL_SEEN_LF)
8436 XSETCAR (tail, AREF (eol_type, 0));
8437 else if (this_eol == EOL_SEEN_CRLF)
8438 XSETCAR (tail, AREF (eol_type, 1));
8439 else if (this_eol == EOL_SEEN_CR)
8440 XSETCAR (tail, AREF (eol_type, 2));
8441 else
8442 XSETCAR (tail, CODING_ID_NAME (id));
8443 }
8444 else
8445 XSETCAR (tail, CODING_ID_NAME (id));
8446 }
8447 }
8448
8449 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8450 }
8451
8452
8453 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8454 2, 3, 0,
8455 doc: /* Detect coding system of the text in the region between START and END.
8456 Return a list of possible coding systems ordered by priority.
8457 The coding systems to try and their priorities follows what
8458 the function `coding-system-priority-list' (which see) returns.
8459
8460 If only ASCII characters are found (except for such ISO-2022 control
8461 characters as ESC), it returns a list of single element `undecided'
8462 or its subsidiary coding system according to a detected end-of-line
8463 format.
8464
8465 If optional argument HIGHEST is non-nil, return the coding system of
8466 highest priority. */)
8467 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8468 {
8469 int from, to;
8470 int from_byte, to_byte;
8471
8472 CHECK_NUMBER_COERCE_MARKER (start);
8473 CHECK_NUMBER_COERCE_MARKER (end);
8474
8475 validate_region (&start, &end);
8476 from = XINT (start), to = XINT (end);
8477 from_byte = CHAR_TO_BYTE (from);
8478 to_byte = CHAR_TO_BYTE (to);
8479
8480 if (from < GPT && to >= GPT)
8481 move_gap_both (to, to_byte);
8482
8483 return detect_coding_system (BYTE_POS_ADDR (from_byte),
8484 to - from, to_byte - from_byte,
8485 !NILP (highest),
8486 !NILP (BVAR (current_buffer
8487 , enable_multibyte_characters)),
8488 Qnil);
8489 }
8490
8491 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8492 1, 2, 0,
8493 doc: /* Detect coding system of the text in STRING.
8494 Return a list of possible coding systems ordered by priority.
8495 The coding systems to try and their priorities follows what
8496 the function `coding-system-priority-list' (which see) returns.
8497
8498 If only ASCII characters are found (except for such ISO-2022 control
8499 characters as ESC), it returns a list of single element `undecided'
8500 or its subsidiary coding system according to a detected end-of-line
8501 format.
8502
8503 If optional argument HIGHEST is non-nil, return the coding system of
8504 highest priority. */)
8505 (Lisp_Object string, Lisp_Object highest)
8506 {
8507 CHECK_STRING (string);
8508
8509 return detect_coding_system (SDATA (string),
8510 SCHARS (string), SBYTES (string),
8511 !NILP (highest), STRING_MULTIBYTE (string),
8512 Qnil);
8513 }
8514
8515
8516 static INLINE int
8517 char_encodable_p (int c, Lisp_Object attrs)
8518 {
8519 Lisp_Object tail;
8520 struct charset *charset;
8521 Lisp_Object translation_table;
8522
8523 translation_table = CODING_ATTR_TRANS_TBL (attrs);
8524 if (! NILP (translation_table))
8525 c = translate_char (translation_table, c);
8526 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8527 CONSP (tail); tail = XCDR (tail))
8528 {
8529 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8530 if (CHAR_CHARSET_P (c, charset))
8531 break;
8532 }
8533 return (! NILP (tail));
8534 }
8535
8536
8537 /* Return a list of coding systems that safely encode the text between
8538 START and END. If EXCLUDE is non-nil, it is a list of coding
8539 systems not to check. The returned list doesn't contain any such
8540 coding systems. In any case, if the text contains only ASCII or is
8541 unibyte, return t. */
8542
8543 DEFUN ("find-coding-systems-region-internal",
8544 Ffind_coding_systems_region_internal,
8545 Sfind_coding_systems_region_internal, 2, 3, 0,
8546 doc: /* Internal use only. */)
8547 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8548 {
8549 Lisp_Object coding_attrs_list, safe_codings;
8550 EMACS_INT start_byte, end_byte;
8551 const unsigned char *p, *pbeg, *pend;
8552 int c;
8553 Lisp_Object tail, elt, work_table;
8554
8555 if (STRINGP (start))
8556 {
8557 if (!STRING_MULTIBYTE (start)
8558 || SCHARS (start) == SBYTES (start))
8559 return Qt;
8560 start_byte = 0;
8561 end_byte = SBYTES (start);
8562 }
8563 else
8564 {
8565 CHECK_NUMBER_COERCE_MARKER (start);
8566 CHECK_NUMBER_COERCE_MARKER (end);
8567 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8568 args_out_of_range (start, end);
8569 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8570 return Qt;
8571 start_byte = CHAR_TO_BYTE (XINT (start));
8572 end_byte = CHAR_TO_BYTE (XINT (end));
8573 if (XINT (end) - XINT (start) == end_byte - start_byte)
8574 return Qt;
8575
8576 if (XINT (start) < GPT && XINT (end) > GPT)
8577 {
8578 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8579 move_gap_both (XINT (start), start_byte);
8580 else
8581 move_gap_both (XINT (end), end_byte);
8582 }
8583 }
8584
8585 coding_attrs_list = Qnil;
8586 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8587 if (NILP (exclude)
8588 || NILP (Fmemq (XCAR (tail), exclude)))
8589 {
8590 Lisp_Object attrs;
8591
8592 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8593 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8594 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8595 {
8596 ASET (attrs, coding_attr_trans_tbl,
8597 get_translation_table (attrs, 1, NULL));
8598 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8599 }
8600 }
8601
8602 if (STRINGP (start))
8603 p = pbeg = SDATA (start);
8604 else
8605 p = pbeg = BYTE_POS_ADDR (start_byte);
8606 pend = p + (end_byte - start_byte);
8607
8608 while (p < pend && ASCII_BYTE_P (*p)) p++;
8609 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8610
8611 work_table = Fmake_char_table (Qnil, Qnil);
8612 while (p < pend)
8613 {
8614 if (ASCII_BYTE_P (*p))
8615 p++;
8616 else
8617 {
8618 c = STRING_CHAR_ADVANCE (p);
8619 if (!NILP (char_table_ref (work_table, c)))
8620 /* This character was already checked. Ignore it. */
8621 continue;
8622
8623 charset_map_loaded = 0;
8624 for (tail = coding_attrs_list; CONSP (tail);)
8625 {
8626 elt = XCAR (tail);
8627 if (NILP (elt))
8628 tail = XCDR (tail);
8629 else if (char_encodable_p (c, elt))
8630 tail = XCDR (tail);
8631 else if (CONSP (XCDR (tail)))
8632 {
8633 XSETCAR (tail, XCAR (XCDR (tail)));
8634 XSETCDR (tail, XCDR (XCDR (tail)));
8635 }
8636 else
8637 {
8638 XSETCAR (tail, Qnil);
8639 tail = XCDR (tail);
8640 }
8641 }
8642 if (charset_map_loaded)
8643 {
8644 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8645
8646 if (STRINGP (start))
8647 pbeg = SDATA (start);
8648 else
8649 pbeg = BYTE_POS_ADDR (start_byte);
8650 p = pbeg + p_offset;
8651 pend = pbeg + pend_offset;
8652 }
8653 char_table_set (work_table, c, Qt);
8654 }
8655 }
8656
8657 safe_codings = list2 (Qraw_text, Qno_conversion);
8658 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8659 if (! NILP (XCAR (tail)))
8660 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8661
8662 return safe_codings;
8663 }
8664
8665
8666 DEFUN ("unencodable-char-position", Funencodable_char_position,
8667 Sunencodable_char_position, 3, 5, 0,
8668 doc: /*
8669 Return position of first un-encodable character in a region.
8670 START and END specify the region and CODING-SYSTEM specifies the
8671 encoding to check. Return nil if CODING-SYSTEM does encode the region.
8672
8673 If optional 4th argument COUNT is non-nil, it specifies at most how
8674 many un-encodable characters to search. In this case, the value is a
8675 list of positions.
8676
8677 If optional 5th argument STRING is non-nil, it is a string to search
8678 for un-encodable characters. In that case, START and END are indexes
8679 to the string. */)
8680 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8681 {
8682 int n;
8683 struct coding_system coding;
8684 Lisp_Object attrs, charset_list, translation_table;
8685 Lisp_Object positions;
8686 int from, to;
8687 const unsigned char *p, *stop, *pend;
8688 int ascii_compatible;
8689
8690 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8691 attrs = CODING_ID_ATTRS (coding.id);
8692 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8693 return Qnil;
8694 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8695 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8696 translation_table = get_translation_table (attrs, 1, NULL);
8697
8698 if (NILP (string))
8699 {
8700 validate_region (&start, &end);
8701 from = XINT (start);
8702 to = XINT (end);
8703 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8704 || (ascii_compatible
8705 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8706 return Qnil;
8707 p = CHAR_POS_ADDR (from);
8708 pend = CHAR_POS_ADDR (to);
8709 if (from < GPT && to >= GPT)
8710 stop = GPT_ADDR;
8711 else
8712 stop = pend;
8713 }
8714 else
8715 {
8716 CHECK_STRING (string);
8717 CHECK_NATNUM (start);
8718 CHECK_NATNUM (end);
8719 from = XINT (start);
8720 to = XINT (end);
8721 if (from > to
8722 || to > SCHARS (string))
8723 args_out_of_range_3 (string, start, end);
8724 if (! STRING_MULTIBYTE (string))
8725 return Qnil;
8726 p = SDATA (string) + string_char_to_byte (string, from);
8727 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8728 if (ascii_compatible && (to - from) == (pend - p))
8729 return Qnil;
8730 }
8731
8732 if (NILP (count))
8733 n = 1;
8734 else
8735 {
8736 CHECK_NATNUM (count);
8737 n = XINT (count);
8738 }
8739
8740 positions = Qnil;
8741 while (1)
8742 {
8743 int c;
8744
8745 if (ascii_compatible)
8746 while (p < stop && ASCII_BYTE_P (*p))
8747 p++, from++;
8748 if (p >= stop)
8749 {
8750 if (p >= pend)
8751 break;
8752 stop = pend;
8753 p = GAP_END_ADDR;
8754 }
8755
8756 c = STRING_CHAR_ADVANCE (p);
8757 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8758 && ! char_charset (translate_char (translation_table, c),
8759 charset_list, NULL))
8760 {
8761 positions = Fcons (make_number (from), positions);
8762 n--;
8763 if (n == 0)
8764 break;
8765 }
8766
8767 from++;
8768 }
8769
8770 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8771 }
8772
8773
8774 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8775 Scheck_coding_systems_region, 3, 3, 0,
8776 doc: /* Check if the region is encodable by coding systems.
8777
8778 START and END are buffer positions specifying the region.
8779 CODING-SYSTEM-LIST is a list of coding systems to check.
8780
8781 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8782 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8783 whole region, POS0, POS1, ... are buffer positions where non-encodable
8784 characters are found.
8785
8786 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8787 value is nil.
8788
8789 START may be a string. In that case, check if the string is
8790 encodable, and the value contains indices to the string instead of
8791 buffer positions. END is ignored.
8792
8793 If the current buffer (or START if it is a string) is unibyte, the value
8794 is nil. */)
8795 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8796 {
8797 Lisp_Object list;
8798 EMACS_INT start_byte, end_byte;
8799 int pos;
8800 const unsigned char *p, *pbeg, *pend;
8801 int c;
8802 Lisp_Object tail, elt, attrs;
8803
8804 if (STRINGP (start))
8805 {
8806 if (!STRING_MULTIBYTE (start)
8807 || SCHARS (start) == SBYTES (start))
8808 return Qnil;
8809 start_byte = 0;
8810 end_byte = SBYTES (start);
8811 pos = 0;
8812 }
8813 else
8814 {
8815 CHECK_NUMBER_COERCE_MARKER (start);
8816 CHECK_NUMBER_COERCE_MARKER (end);
8817 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8818 args_out_of_range (start, end);
8819 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8820 return Qnil;
8821 start_byte = CHAR_TO_BYTE (XINT (start));
8822 end_byte = CHAR_TO_BYTE (XINT (end));
8823 if (XINT (end) - XINT (start) == end_byte - start_byte)
8824 return Qnil;
8825
8826 if (XINT (start) < GPT && XINT (end) > GPT)
8827 {
8828 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8829 move_gap_both (XINT (start), start_byte);
8830 else
8831 move_gap_both (XINT (end), end_byte);
8832 }
8833 pos = XINT (start);
8834 }
8835
8836 list = Qnil;
8837 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8838 {
8839 elt = XCAR (tail);
8840 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8841 ASET (attrs, coding_attr_trans_tbl,
8842 get_translation_table (attrs, 1, NULL));
8843 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8844 }
8845
8846 if (STRINGP (start))
8847 p = pbeg = SDATA (start);
8848 else
8849 p = pbeg = BYTE_POS_ADDR (start_byte);
8850 pend = p + (end_byte - start_byte);
8851
8852 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8853 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8854
8855 while (p < pend)
8856 {
8857 if (ASCII_BYTE_P (*p))
8858 p++;
8859 else
8860 {
8861 c = STRING_CHAR_ADVANCE (p);
8862
8863 charset_map_loaded = 0;
8864 for (tail = list; CONSP (tail); tail = XCDR (tail))
8865 {
8866 elt = XCDR (XCAR (tail));
8867 if (! char_encodable_p (c, XCAR (elt)))
8868 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8869 }
8870 if (charset_map_loaded)
8871 {
8872 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8873
8874 if (STRINGP (start))
8875 pbeg = SDATA (start);
8876 else
8877 pbeg = BYTE_POS_ADDR (start_byte);
8878 p = pbeg + p_offset;
8879 pend = pbeg + pend_offset;
8880 }
8881 }
8882 pos++;
8883 }
8884
8885 tail = list;
8886 list = Qnil;
8887 for (; CONSP (tail); tail = XCDR (tail))
8888 {
8889 elt = XCAR (tail);
8890 if (CONSP (XCDR (XCDR (elt))))
8891 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8892 list);
8893 }
8894
8895 return list;
8896 }
8897
8898
8899 Lisp_Object
8900 code_convert_region (Lisp_Object start, Lisp_Object end,
8901 Lisp_Object coding_system, Lisp_Object dst_object,
8902 int encodep, int norecord)
8903 {
8904 struct coding_system coding;
8905 EMACS_INT from, from_byte, to, to_byte;
8906 Lisp_Object src_object;
8907
8908 CHECK_NUMBER_COERCE_MARKER (start);
8909 CHECK_NUMBER_COERCE_MARKER (end);
8910 if (NILP (coding_system))
8911 coding_system = Qno_conversion;
8912 else
8913 CHECK_CODING_SYSTEM (coding_system);
8914 src_object = Fcurrent_buffer ();
8915 if (NILP (dst_object))
8916 dst_object = src_object;
8917 else if (! EQ (dst_object, Qt))
8918 CHECK_BUFFER (dst_object);
8919
8920 validate_region (&start, &end);
8921 from = XFASTINT (start);
8922 from_byte = CHAR_TO_BYTE (from);
8923 to = XFASTINT (end);
8924 to_byte = CHAR_TO_BYTE (to);
8925
8926 setup_coding_system (coding_system, &coding);
8927 coding.mode |= CODING_MODE_LAST_BLOCK;
8928
8929 if (encodep)
8930 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8931 dst_object);
8932 else
8933 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8934 dst_object);
8935 if (! norecord)
8936 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8937
8938 return (BUFFERP (dst_object)
8939 ? make_number (coding.produced_char)
8940 : coding.dst_object);
8941 }
8942
8943
8944 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8945 3, 4, "r\nzCoding system: ",
8946 doc: /* Decode the current region from the specified coding system.
8947 When called from a program, takes four arguments:
8948 START, END, CODING-SYSTEM, and DESTINATION.
8949 START and END are buffer positions.
8950
8951 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8952 If nil, the region between START and END is replaced by the decoded text.
8953 If buffer, the decoded text is inserted in that buffer after point (point
8954 does not move).
8955 In those cases, the length of the decoded text is returned.
8956 If DESTINATION is t, the decoded text is returned.
8957
8958 This function sets `last-coding-system-used' to the precise coding system
8959 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8960 not fully specified.) */)
8961 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8962 {
8963 return code_convert_region (start, end, coding_system, destination, 0, 0);
8964 }
8965
8966 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8967 3, 4, "r\nzCoding system: ",
8968 doc: /* Encode the current region by specified coding system.
8969 When called from a program, takes four arguments:
8970 START, END, CODING-SYSTEM and DESTINATION.
8971 START and END are buffer positions.
8972
8973 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8974 If nil, the region between START and END is replace by the encoded text.
8975 If buffer, the encoded text is inserted in that buffer after point (point
8976 does not move).
8977 In those cases, the length of the encoded text is returned.
8978 If DESTINATION is t, the encoded text is returned.
8979
8980 This function sets `last-coding-system-used' to the precise coding system
8981 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8982 not fully specified.) */)
8983 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8984 {
8985 return code_convert_region (start, end, coding_system, destination, 1, 0);
8986 }
8987
8988 Lisp_Object
8989 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8990 Lisp_Object dst_object, int encodep, int nocopy, int norecord)
8991 {
8992 struct coding_system coding;
8993 EMACS_INT chars, bytes;
8994
8995 CHECK_STRING (string);
8996 if (NILP (coding_system))
8997 {
8998 if (! norecord)
8999 Vlast_coding_system_used = Qno_conversion;
9000 if (NILP (dst_object))
9001 return (nocopy ? Fcopy_sequence (string) : string);
9002 }
9003
9004 if (NILP (coding_system))
9005 coding_system = Qno_conversion;
9006 else
9007 CHECK_CODING_SYSTEM (coding_system);
9008 if (NILP (dst_object))
9009 dst_object = Qt;
9010 else if (! EQ (dst_object, Qt))
9011 CHECK_BUFFER (dst_object);
9012
9013 setup_coding_system (coding_system, &coding);
9014 coding.mode |= CODING_MODE_LAST_BLOCK;
9015 chars = SCHARS (string);
9016 bytes = SBYTES (string);
9017 if (encodep)
9018 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9019 else
9020 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9021 if (! norecord)
9022 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9023
9024 return (BUFFERP (dst_object)
9025 ? make_number (coding.produced_char)
9026 : coding.dst_object);
9027 }
9028
9029
9030 /* Encode or decode STRING according to CODING_SYSTEM.
9031 Do not set Vlast_coding_system_used.
9032
9033 This function is called only from macros DECODE_FILE and
9034 ENCODE_FILE, thus we ignore character composition. */
9035
9036 Lisp_Object
9037 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9038 int encodep)
9039 {
9040 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9041 }
9042
9043
9044 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9045 2, 4, 0,
9046 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9047
9048 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9049 if the decoding operation is trivial.
9050
9051 Optional fourth arg BUFFER non-nil means that the decoded text is
9052 inserted in that buffer after point (point does not move). In this
9053 case, the return value is the length of the decoded text.
9054
9055 This function sets `last-coding-system-used' to the precise coding system
9056 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9057 not fully specified.) */)
9058 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9059 {
9060 return code_convert_string (string, coding_system, buffer,
9061 0, ! NILP (nocopy), 0);
9062 }
9063
9064 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9065 2, 4, 0,
9066 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9067
9068 Optional third arg NOCOPY non-nil means it is OK to return STRING
9069 itself if the encoding operation is trivial.
9070
9071 Optional fourth arg BUFFER non-nil means that the encoded text is
9072 inserted in that buffer after point (point does not move). In this
9073 case, the return value is the length of the encoded text.
9074
9075 This function sets `last-coding-system-used' to the precise coding system
9076 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9077 not fully specified.) */)
9078 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9079 {
9080 return code_convert_string (string, coding_system, buffer,
9081 1, ! NILP (nocopy), 1);
9082 }
9083
9084 \f
9085 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9086 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9087 Return the corresponding character. */)
9088 (Lisp_Object code)
9089 {
9090 Lisp_Object spec, attrs, val;
9091 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9092 int c;
9093
9094 CHECK_NATNUM (code);
9095 c = XFASTINT (code);
9096 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9097 attrs = AREF (spec, 0);
9098
9099 if (ASCII_BYTE_P (c)
9100 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9101 return code;
9102
9103 val = CODING_ATTR_CHARSET_LIST (attrs);
9104 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9105 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9106 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9107
9108 if (c <= 0x7F)
9109 charset = charset_roman;
9110 else if (c >= 0xA0 && c < 0xDF)
9111 {
9112 charset = charset_kana;
9113 c -= 0x80;
9114 }
9115 else
9116 {
9117 int s1 = c >> 8, s2 = c & 0xFF;
9118
9119 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9120 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9121 error ("Invalid code: %d", code);
9122 SJIS_TO_JIS (c);
9123 charset = charset_kanji;
9124 }
9125 c = DECODE_CHAR (charset, c);
9126 if (c < 0)
9127 error ("Invalid code: %d", code);
9128 return make_number (c);
9129 }
9130
9131
9132 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9133 doc: /* Encode a Japanese character CH to shift_jis encoding.
9134 Return the corresponding code in SJIS. */)
9135 (Lisp_Object ch)
9136 {
9137 Lisp_Object spec, attrs, charset_list;
9138 int c;
9139 struct charset *charset;
9140 unsigned code;
9141
9142 CHECK_CHARACTER (ch);
9143 c = XFASTINT (ch);
9144 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9145 attrs = AREF (spec, 0);
9146
9147 if (ASCII_CHAR_P (c)
9148 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9149 return ch;
9150
9151 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9152 charset = char_charset (c, charset_list, &code);
9153 if (code == CHARSET_INVALID_CODE (charset))
9154 error ("Can't encode by shift_jis encoding: %d", c);
9155 JIS_TO_SJIS (code);
9156
9157 return make_number (code);
9158 }
9159
9160 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9161 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9162 Return the corresponding character. */)
9163 (Lisp_Object code)
9164 {
9165 Lisp_Object spec, attrs, val;
9166 struct charset *charset_roman, *charset_big5, *charset;
9167 int c;
9168
9169 CHECK_NATNUM (code);
9170 c = XFASTINT (code);
9171 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9172 attrs = AREF (spec, 0);
9173
9174 if (ASCII_BYTE_P (c)
9175 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9176 return code;
9177
9178 val = CODING_ATTR_CHARSET_LIST (attrs);
9179 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9180 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9181
9182 if (c <= 0x7F)
9183 charset = charset_roman;
9184 else
9185 {
9186 int b1 = c >> 8, b2 = c & 0x7F;
9187 if (b1 < 0xA1 || b1 > 0xFE
9188 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9189 error ("Invalid code: %d", code);
9190 charset = charset_big5;
9191 }
9192 c = DECODE_CHAR (charset, (unsigned )c);
9193 if (c < 0)
9194 error ("Invalid code: %d", code);
9195 return make_number (c);
9196 }
9197
9198 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9199 doc: /* Encode the Big5 character CH to BIG5 coding system.
9200 Return the corresponding character code in Big5. */)
9201 (Lisp_Object ch)
9202 {
9203 Lisp_Object spec, attrs, charset_list;
9204 struct charset *charset;
9205 int c;
9206 unsigned code;
9207
9208 CHECK_CHARACTER (ch);
9209 c = XFASTINT (ch);
9210 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9211 attrs = AREF (spec, 0);
9212 if (ASCII_CHAR_P (c)
9213 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9214 return ch;
9215
9216 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9217 charset = char_charset (c, charset_list, &code);
9218 if (code == CHARSET_INVALID_CODE (charset))
9219 error ("Can't encode by Big5 encoding: %d", c);
9220
9221 return make_number (code);
9222 }
9223
9224 \f
9225 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9226 Sset_terminal_coding_system_internal, 1, 2, 0,
9227 doc: /* Internal use only. */)
9228 (Lisp_Object coding_system, Lisp_Object terminal)
9229 {
9230 struct terminal *term = get_terminal (terminal, 1);
9231 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9232 CHECK_SYMBOL (coding_system);
9233 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9234 /* We had better not send unsafe characters to terminal. */
9235 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9236 /* Character composition should be disabled. */
9237 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9238 terminal_coding->src_multibyte = 1;
9239 terminal_coding->dst_multibyte = 0;
9240 if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9241 term->charset_list = coding_charset_list (terminal_coding);
9242 else
9243 term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9244 return Qnil;
9245 }
9246
9247 DEFUN ("set-safe-terminal-coding-system-internal",
9248 Fset_safe_terminal_coding_system_internal,
9249 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9250 doc: /* Internal use only. */)
9251 (Lisp_Object coding_system)
9252 {
9253 CHECK_SYMBOL (coding_system);
9254 setup_coding_system (Fcheck_coding_system (coding_system),
9255 &safe_terminal_coding);
9256 /* Character composition should be disabled. */
9257 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9258 safe_terminal_coding.src_multibyte = 1;
9259 safe_terminal_coding.dst_multibyte = 0;
9260 return Qnil;
9261 }
9262
9263 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9264 Sterminal_coding_system, 0, 1, 0,
9265 doc: /* Return coding system specified for terminal output on the given terminal.
9266 TERMINAL may be a terminal object, a frame, or nil for the selected
9267 frame's terminal device. */)
9268 (Lisp_Object terminal)
9269 {
9270 struct coding_system *terminal_coding
9271 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9272 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9273
9274 /* For backward compatibility, return nil if it is `undecided'. */
9275 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9276 }
9277
9278 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9279 Sset_keyboard_coding_system_internal, 1, 2, 0,
9280 doc: /* Internal use only. */)
9281 (Lisp_Object coding_system, Lisp_Object terminal)
9282 {
9283 struct terminal *t = get_terminal (terminal, 1);
9284 CHECK_SYMBOL (coding_system);
9285 if (NILP (coding_system))
9286 coding_system = Qno_conversion;
9287 else
9288 Fcheck_coding_system (coding_system);
9289 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9290 /* Character composition should be disabled. */
9291 TERMINAL_KEYBOARD_CODING (t)->common_flags
9292 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9293 return Qnil;
9294 }
9295
9296 DEFUN ("keyboard-coding-system",
9297 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9298 doc: /* Return coding system specified for decoding keyboard input. */)
9299 (Lisp_Object terminal)
9300 {
9301 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9302 (get_terminal (terminal, 1))->id);
9303 }
9304
9305 \f
9306 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9307 Sfind_operation_coding_system, 1, MANY, 0,
9308 doc: /* Choose a coding system for an operation based on the target name.
9309 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9310 DECODING-SYSTEM is the coding system to use for decoding
9311 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9312 for encoding (in case OPERATION does encoding).
9313
9314 The first argument OPERATION specifies an I/O primitive:
9315 For file I/O, `insert-file-contents' or `write-region'.
9316 For process I/O, `call-process', `call-process-region', or `start-process'.
9317 For network I/O, `open-network-stream'.
9318
9319 The remaining arguments should be the same arguments that were passed
9320 to the primitive. Depending on which primitive, one of those arguments
9321 is selected as the TARGET. For example, if OPERATION does file I/O,
9322 whichever argument specifies the file name is TARGET.
9323
9324 TARGET has a meaning which depends on OPERATION:
9325 For file I/O, TARGET is a file name (except for the special case below).
9326 For process I/O, TARGET is a process name.
9327 For network I/O, TARGET is a service name or a port number.
9328
9329 This function looks up what is specified for TARGET in
9330 `file-coding-system-alist', `process-coding-system-alist',
9331 or `network-coding-system-alist' depending on OPERATION.
9332 They may specify a coding system, a cons of coding systems,
9333 or a function symbol to call.
9334 In the last case, we call the function with one argument,
9335 which is a list of all the arguments given to this function.
9336 If the function can't decide a coding system, it can return
9337 `undecided' so that the normal code-detection is performed.
9338
9339 If OPERATION is `insert-file-contents', the argument corresponding to
9340 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9341 file name to look up, and BUFFER is a buffer that contains the file's
9342 contents (not yet decoded). If `file-coding-system-alist' specifies a
9343 function to call for FILENAME, that function should examine the
9344 contents of BUFFER instead of reading the file.
9345
9346 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
9347 (int nargs, Lisp_Object *args)
9348 {
9349 Lisp_Object operation, target_idx, target, val;
9350 register Lisp_Object chain;
9351
9352 if (nargs < 2)
9353 error ("Too few arguments");
9354 operation = args[0];
9355 if (!SYMBOLP (operation)
9356 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9357 error ("Invalid first argument");
9358 if (nargs < 1 + XINT (target_idx))
9359 error ("Too few arguments for operation: %s",
9360 SDATA (SYMBOL_NAME (operation)));
9361 target = args[XINT (target_idx) + 1];
9362 if (!(STRINGP (target)
9363 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9364 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9365 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9366 error ("Invalid %dth argument", XINT (target_idx) + 1);
9367 if (CONSP (target))
9368 target = XCAR (target);
9369
9370 chain = ((EQ (operation, Qinsert_file_contents)
9371 || EQ (operation, Qwrite_region))
9372 ? Vfile_coding_system_alist
9373 : (EQ (operation, Qopen_network_stream)
9374 ? Vnetwork_coding_system_alist
9375 : Vprocess_coding_system_alist));
9376 if (NILP (chain))
9377 return Qnil;
9378
9379 for (; CONSP (chain); chain = XCDR (chain))
9380 {
9381 Lisp_Object elt;
9382
9383 elt = XCAR (chain);
9384 if (CONSP (elt)
9385 && ((STRINGP (target)
9386 && STRINGP (XCAR (elt))
9387 && fast_string_match (XCAR (elt), target) >= 0)
9388 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9389 {
9390 val = XCDR (elt);
9391 /* Here, if VAL is both a valid coding system and a valid
9392 function symbol, we return VAL as a coding system. */
9393 if (CONSP (val))
9394 return val;
9395 if (! SYMBOLP (val))
9396 return Qnil;
9397 if (! NILP (Fcoding_system_p (val)))
9398 return Fcons (val, val);
9399 if (! NILP (Ffboundp (val)))
9400 {
9401 /* We use call1 rather than safe_call1
9402 so as to get bug reports about functions called here
9403 which don't handle the current interface. */
9404 val = call1 (val, Flist (nargs, args));
9405 if (CONSP (val))
9406 return val;
9407 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9408 return Fcons (val, val);
9409 }
9410 return Qnil;
9411 }
9412 }
9413 return Qnil;
9414 }
9415
9416 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9417 Sset_coding_system_priority, 0, MANY, 0,
9418 doc: /* Assign higher priority to the coding systems given as arguments.
9419 If multiple coding systems belong to the same category,
9420 all but the first one are ignored.
9421
9422 usage: (set-coding-system-priority &rest coding-systems) */)
9423 (int nargs, Lisp_Object *args)
9424 {
9425 int i, j;
9426 int changed[coding_category_max];
9427 enum coding_category priorities[coding_category_max];
9428
9429 memset (changed, 0, sizeof changed);
9430
9431 for (i = j = 0; i < nargs; i++)
9432 {
9433 enum coding_category category;
9434 Lisp_Object spec, attrs;
9435
9436 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9437 attrs = AREF (spec, 0);
9438 category = XINT (CODING_ATTR_CATEGORY (attrs));
9439 if (changed[category])
9440 /* Ignore this coding system because a coding system of the
9441 same category already had a higher priority. */
9442 continue;
9443 changed[category] = 1;
9444 priorities[j++] = category;
9445 if (coding_categories[category].id >= 0
9446 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9447 setup_coding_system (args[i], &coding_categories[category]);
9448 Fset (AREF (Vcoding_category_table, category), args[i]);
9449 }
9450
9451 /* Now we have decided top J priorities. Reflect the order of the
9452 original priorities to the remaining priorities. */
9453
9454 for (i = j, j = 0; i < coding_category_max; i++, j++)
9455 {
9456 while (j < coding_category_max
9457 && changed[coding_priorities[j]])
9458 j++;
9459 if (j == coding_category_max)
9460 abort ();
9461 priorities[i] = coding_priorities[j];
9462 }
9463
9464 memcpy (coding_priorities, priorities, sizeof priorities);
9465
9466 /* Update `coding-category-list'. */
9467 Vcoding_category_list = Qnil;
9468 for (i = coding_category_max - 1; i >= 0; i--)
9469 Vcoding_category_list
9470 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9471 Vcoding_category_list);
9472
9473 return Qnil;
9474 }
9475
9476 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9477 Scoding_system_priority_list, 0, 1, 0,
9478 doc: /* Return a list of coding systems ordered by their priorities.
9479 The list contains a subset of coding systems; i.e. coding systems
9480 assigned to each coding category (see `coding-category-list').
9481
9482 HIGHESTP non-nil means just return the highest priority one. */)
9483 (Lisp_Object highestp)
9484 {
9485 int i;
9486 Lisp_Object val;
9487
9488 for (i = 0, val = Qnil; i < coding_category_max; i++)
9489 {
9490 enum coding_category category = coding_priorities[i];
9491 int id = coding_categories[category].id;
9492 Lisp_Object attrs;
9493
9494 if (id < 0)
9495 continue;
9496 attrs = CODING_ID_ATTRS (id);
9497 if (! NILP (highestp))
9498 return CODING_ATTR_BASE_NAME (attrs);
9499 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9500 }
9501 return Fnreverse (val);
9502 }
9503
9504 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9505
9506 static Lisp_Object
9507 make_subsidiaries (Lisp_Object base)
9508 {
9509 Lisp_Object subsidiaries;
9510 int base_name_len = SBYTES (SYMBOL_NAME (base));
9511 char *buf = (char *) alloca (base_name_len + 6);
9512 int i;
9513
9514 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9515 subsidiaries = Fmake_vector (make_number (3), Qnil);
9516 for (i = 0; i < 3; i++)
9517 {
9518 memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
9519 ASET (subsidiaries, i, intern (buf));
9520 }
9521 return subsidiaries;
9522 }
9523
9524
9525 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9526 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9527 doc: /* For internal use only.
9528 usage: (define-coding-system-internal ...) */)
9529 (int nargs, Lisp_Object *args)
9530 {
9531 Lisp_Object name;
9532 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9533 Lisp_Object attrs; /* Vector of attributes. */
9534 Lisp_Object eol_type;
9535 Lisp_Object aliases;
9536 Lisp_Object coding_type, charset_list, safe_charsets;
9537 enum coding_category category;
9538 Lisp_Object tail, val;
9539 int max_charset_id = 0;
9540 int i;
9541
9542 if (nargs < coding_arg_max)
9543 goto short_args;
9544
9545 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9546
9547 name = args[coding_arg_name];
9548 CHECK_SYMBOL (name);
9549 CODING_ATTR_BASE_NAME (attrs) = name;
9550
9551 val = args[coding_arg_mnemonic];
9552 if (! STRINGP (val))
9553 CHECK_CHARACTER (val);
9554 CODING_ATTR_MNEMONIC (attrs) = val;
9555
9556 coding_type = args[coding_arg_coding_type];
9557 CHECK_SYMBOL (coding_type);
9558 CODING_ATTR_TYPE (attrs) = coding_type;
9559
9560 charset_list = args[coding_arg_charset_list];
9561 if (SYMBOLP (charset_list))
9562 {
9563 if (EQ (charset_list, Qiso_2022))
9564 {
9565 if (! EQ (coding_type, Qiso_2022))
9566 error ("Invalid charset-list");
9567 charset_list = Viso_2022_charset_list;
9568 }
9569 else if (EQ (charset_list, Qemacs_mule))
9570 {
9571 if (! EQ (coding_type, Qemacs_mule))
9572 error ("Invalid charset-list");
9573 charset_list = Vemacs_mule_charset_list;
9574 }
9575 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9576 if (max_charset_id < XFASTINT (XCAR (tail)))
9577 max_charset_id = XFASTINT (XCAR (tail));
9578 }
9579 else
9580 {
9581 charset_list = Fcopy_sequence (charset_list);
9582 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9583 {
9584 struct charset *charset;
9585
9586 val = XCAR (tail);
9587 CHECK_CHARSET_GET_CHARSET (val, charset);
9588 if (EQ (coding_type, Qiso_2022)
9589 ? CHARSET_ISO_FINAL (charset) < 0
9590 : EQ (coding_type, Qemacs_mule)
9591 ? CHARSET_EMACS_MULE_ID (charset) < 0
9592 : 0)
9593 error ("Can't handle charset `%s'",
9594 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9595
9596 XSETCAR (tail, make_number (charset->id));
9597 if (max_charset_id < charset->id)
9598 max_charset_id = charset->id;
9599 }
9600 }
9601 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9602
9603 safe_charsets = make_uninit_string (max_charset_id + 1);
9604 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9605 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9606 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9607 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9608
9609 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9610
9611 val = args[coding_arg_decode_translation_table];
9612 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9613 CHECK_SYMBOL (val);
9614 CODING_ATTR_DECODE_TBL (attrs) = val;
9615
9616 val = args[coding_arg_encode_translation_table];
9617 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9618 CHECK_SYMBOL (val);
9619 CODING_ATTR_ENCODE_TBL (attrs) = val;
9620
9621 val = args[coding_arg_post_read_conversion];
9622 CHECK_SYMBOL (val);
9623 CODING_ATTR_POST_READ (attrs) = val;
9624
9625 val = args[coding_arg_pre_write_conversion];
9626 CHECK_SYMBOL (val);
9627 CODING_ATTR_PRE_WRITE (attrs) = val;
9628
9629 val = args[coding_arg_default_char];
9630 if (NILP (val))
9631 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9632 else
9633 {
9634 CHECK_CHARACTER (val);
9635 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9636 }
9637
9638 val = args[coding_arg_for_unibyte];
9639 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9640
9641 val = args[coding_arg_plist];
9642 CHECK_LIST (val);
9643 CODING_ATTR_PLIST (attrs) = val;
9644
9645 if (EQ (coding_type, Qcharset))
9646 {
9647 /* Generate a lisp vector of 256 elements. Each element is nil,
9648 integer, or a list of charset IDs.
9649
9650 If Nth element is nil, the byte code N is invalid in this
9651 coding system.
9652
9653 If Nth element is a number NUM, N is the first byte of a
9654 charset whose ID is NUM.
9655
9656 If Nth element is a list of charset IDs, N is the first byte
9657 of one of them. The list is sorted by dimensions of the
9658 charsets. A charset of smaller dimension comes first. */
9659 val = Fmake_vector (make_number (256), Qnil);
9660
9661 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9662 {
9663 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9664 int dim = CHARSET_DIMENSION (charset);
9665 int idx = (dim - 1) * 4;
9666
9667 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9668 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9669
9670 for (i = charset->code_space[idx];
9671 i <= charset->code_space[idx + 1]; i++)
9672 {
9673 Lisp_Object tmp, tmp2;
9674 int dim2;
9675
9676 tmp = AREF (val, i);
9677 if (NILP (tmp))
9678 tmp = XCAR (tail);
9679 else if (NUMBERP (tmp))
9680 {
9681 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9682 if (dim < dim2)
9683 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9684 else
9685 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9686 }
9687 else
9688 {
9689 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9690 {
9691 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9692 if (dim < dim2)
9693 break;
9694 }
9695 if (NILP (tmp2))
9696 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9697 else
9698 {
9699 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9700 XSETCAR (tmp2, XCAR (tail));
9701 }
9702 }
9703 ASET (val, i, tmp);
9704 }
9705 }
9706 ASET (attrs, coding_attr_charset_valids, val);
9707 category = coding_category_charset;
9708 }
9709 else if (EQ (coding_type, Qccl))
9710 {
9711 Lisp_Object valids;
9712
9713 if (nargs < coding_arg_ccl_max)
9714 goto short_args;
9715
9716 val = args[coding_arg_ccl_decoder];
9717 CHECK_CCL_PROGRAM (val);
9718 if (VECTORP (val))
9719 val = Fcopy_sequence (val);
9720 ASET (attrs, coding_attr_ccl_decoder, val);
9721
9722 val = args[coding_arg_ccl_encoder];
9723 CHECK_CCL_PROGRAM (val);
9724 if (VECTORP (val))
9725 val = Fcopy_sequence (val);
9726 ASET (attrs, coding_attr_ccl_encoder, val);
9727
9728 val = args[coding_arg_ccl_valids];
9729 valids = Fmake_string (make_number (256), make_number (0));
9730 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9731 {
9732 int from, to;
9733
9734 val = Fcar (tail);
9735 if (INTEGERP (val))
9736 {
9737 from = to = XINT (val);
9738 if (from < 0 || from > 255)
9739 args_out_of_range_3 (val, make_number (0), make_number (255));
9740 }
9741 else
9742 {
9743 CHECK_CONS (val);
9744 CHECK_NATNUM_CAR (val);
9745 CHECK_NATNUM_CDR (val);
9746 from = XINT (XCAR (val));
9747 if (from > 255)
9748 args_out_of_range_3 (XCAR (val),
9749 make_number (0), make_number (255));
9750 to = XINT (XCDR (val));
9751 if (to < from || to > 255)
9752 args_out_of_range_3 (XCDR (val),
9753 XCAR (val), make_number (255));
9754 }
9755 for (i = from; i <= to; i++)
9756 SSET (valids, i, 1);
9757 }
9758 ASET (attrs, coding_attr_ccl_valids, valids);
9759
9760 category = coding_category_ccl;
9761 }
9762 else if (EQ (coding_type, Qutf_16))
9763 {
9764 Lisp_Object bom, endian;
9765
9766 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9767
9768 if (nargs < coding_arg_utf16_max)
9769 goto short_args;
9770
9771 bom = args[coding_arg_utf16_bom];
9772 if (! NILP (bom) && ! EQ (bom, Qt))
9773 {
9774 CHECK_CONS (bom);
9775 val = XCAR (bom);
9776 CHECK_CODING_SYSTEM (val);
9777 val = XCDR (bom);
9778 CHECK_CODING_SYSTEM (val);
9779 }
9780 ASET (attrs, coding_attr_utf_bom, bom);
9781
9782 endian = args[coding_arg_utf16_endian];
9783 CHECK_SYMBOL (endian);
9784 if (NILP (endian))
9785 endian = Qbig;
9786 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9787 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9788 ASET (attrs, coding_attr_utf_16_endian, endian);
9789
9790 category = (CONSP (bom)
9791 ? coding_category_utf_16_auto
9792 : NILP (bom)
9793 ? (EQ (endian, Qbig)
9794 ? coding_category_utf_16_be_nosig
9795 : coding_category_utf_16_le_nosig)
9796 : (EQ (endian, Qbig)
9797 ? coding_category_utf_16_be
9798 : coding_category_utf_16_le));
9799 }
9800 else if (EQ (coding_type, Qiso_2022))
9801 {
9802 Lisp_Object initial, reg_usage, request, flags;
9803 int i;
9804
9805 if (nargs < coding_arg_iso2022_max)
9806 goto short_args;
9807
9808 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9809 CHECK_VECTOR (initial);
9810 for (i = 0; i < 4; i++)
9811 {
9812 val = Faref (initial, make_number (i));
9813 if (! NILP (val))
9814 {
9815 struct charset *charset;
9816
9817 CHECK_CHARSET_GET_CHARSET (val, charset);
9818 ASET (initial, i, make_number (CHARSET_ID (charset)));
9819 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9820 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9821 }
9822 else
9823 ASET (initial, i, make_number (-1));
9824 }
9825
9826 reg_usage = args[coding_arg_iso2022_reg_usage];
9827 CHECK_CONS (reg_usage);
9828 CHECK_NUMBER_CAR (reg_usage);
9829 CHECK_NUMBER_CDR (reg_usage);
9830
9831 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9832 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9833 {
9834 int id;
9835 Lisp_Object tmp;
9836
9837 val = Fcar (tail);
9838 CHECK_CONS (val);
9839 tmp = XCAR (val);
9840 CHECK_CHARSET_GET_ID (tmp, id);
9841 CHECK_NATNUM_CDR (val);
9842 if (XINT (XCDR (val)) >= 4)
9843 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9844 XSETCAR (val, make_number (id));
9845 }
9846
9847 flags = args[coding_arg_iso2022_flags];
9848 CHECK_NATNUM (flags);
9849 i = XINT (flags);
9850 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9851 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9852
9853 ASET (attrs, coding_attr_iso_initial, initial);
9854 ASET (attrs, coding_attr_iso_usage, reg_usage);
9855 ASET (attrs, coding_attr_iso_request, request);
9856 ASET (attrs, coding_attr_iso_flags, flags);
9857 setup_iso_safe_charsets (attrs);
9858
9859 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9860 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9861 | CODING_ISO_FLAG_SINGLE_SHIFT))
9862 ? coding_category_iso_7_else
9863 : EQ (args[coding_arg_charset_list], Qiso_2022)
9864 ? coding_category_iso_7
9865 : coding_category_iso_7_tight);
9866 else
9867 {
9868 int id = XINT (AREF (initial, 1));
9869
9870 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9871 || EQ (args[coding_arg_charset_list], Qiso_2022)
9872 || id < 0)
9873 ? coding_category_iso_8_else
9874 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9875 ? coding_category_iso_8_1
9876 : coding_category_iso_8_2);
9877 }
9878 if (category != coding_category_iso_8_1
9879 && category != coding_category_iso_8_2)
9880 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9881 }
9882 else if (EQ (coding_type, Qemacs_mule))
9883 {
9884 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9885 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9886 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9887 category = coding_category_emacs_mule;
9888 }
9889 else if (EQ (coding_type, Qshift_jis))
9890 {
9891
9892 struct charset *charset;
9893
9894 if (XINT (Flength (charset_list)) != 3
9895 && XINT (Flength (charset_list)) != 4)
9896 error ("There should be three or four charsets");
9897
9898 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9899 if (CHARSET_DIMENSION (charset) != 1)
9900 error ("Dimension of charset %s is not one",
9901 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9902 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9903 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9904
9905 charset_list = XCDR (charset_list);
9906 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9907 if (CHARSET_DIMENSION (charset) != 1)
9908 error ("Dimension of charset %s is not one",
9909 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9910
9911 charset_list = XCDR (charset_list);
9912 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9913 if (CHARSET_DIMENSION (charset) != 2)
9914 error ("Dimension of charset %s is not two",
9915 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9916
9917 charset_list = XCDR (charset_list);
9918 if (! NILP (charset_list))
9919 {
9920 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9921 if (CHARSET_DIMENSION (charset) != 2)
9922 error ("Dimension of charset %s is not two",
9923 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9924 }
9925
9926 category = coding_category_sjis;
9927 Vsjis_coding_system = name;
9928 }
9929 else if (EQ (coding_type, Qbig5))
9930 {
9931 struct charset *charset;
9932
9933 if (XINT (Flength (charset_list)) != 2)
9934 error ("There should be just two charsets");
9935
9936 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9937 if (CHARSET_DIMENSION (charset) != 1)
9938 error ("Dimension of charset %s is not one",
9939 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9940 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9941 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9942
9943 charset_list = XCDR (charset_list);
9944 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9945 if (CHARSET_DIMENSION (charset) != 2)
9946 error ("Dimension of charset %s is not two",
9947 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9948
9949 category = coding_category_big5;
9950 Vbig5_coding_system = name;
9951 }
9952 else if (EQ (coding_type, Qraw_text))
9953 {
9954 category = coding_category_raw_text;
9955 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9956 }
9957 else if (EQ (coding_type, Qutf_8))
9958 {
9959 Lisp_Object bom;
9960
9961 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9962
9963 if (nargs < coding_arg_utf8_max)
9964 goto short_args;
9965
9966 bom = args[coding_arg_utf8_bom];
9967 if (! NILP (bom) && ! EQ (bom, Qt))
9968 {
9969 CHECK_CONS (bom);
9970 val = XCAR (bom);
9971 CHECK_CODING_SYSTEM (val);
9972 val = XCDR (bom);
9973 CHECK_CODING_SYSTEM (val);
9974 }
9975 ASET (attrs, coding_attr_utf_bom, bom);
9976
9977 category = (CONSP (bom) ? coding_category_utf_8_auto
9978 : NILP (bom) ? coding_category_utf_8_nosig
9979 : coding_category_utf_8_sig);
9980 }
9981 else if (EQ (coding_type, Qundecided))
9982 category = coding_category_undecided;
9983 else
9984 error ("Invalid coding system type: %s",
9985 SDATA (SYMBOL_NAME (coding_type)));
9986
9987 CODING_ATTR_CATEGORY (attrs) = make_number (category);
9988 CODING_ATTR_PLIST (attrs)
9989 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9990 CODING_ATTR_PLIST (attrs)));
9991 CODING_ATTR_PLIST (attrs)
9992 = Fcons (QCascii_compatible_p,
9993 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9994 CODING_ATTR_PLIST (attrs)));
9995
9996 eol_type = args[coding_arg_eol_type];
9997 if (! NILP (eol_type)
9998 && ! EQ (eol_type, Qunix)
9999 && ! EQ (eol_type, Qdos)
10000 && ! EQ (eol_type, Qmac))
10001 error ("Invalid eol-type");
10002
10003 aliases = Fcons (name, Qnil);
10004
10005 if (NILP (eol_type))
10006 {
10007 eol_type = make_subsidiaries (name);
10008 for (i = 0; i < 3; i++)
10009 {
10010 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10011
10012 this_name = AREF (eol_type, i);
10013 this_aliases = Fcons (this_name, Qnil);
10014 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10015 this_spec = Fmake_vector (make_number (3), attrs);
10016 ASET (this_spec, 1, this_aliases);
10017 ASET (this_spec, 2, this_eol_type);
10018 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10019 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10020 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10021 if (NILP (val))
10022 Vcoding_system_alist
10023 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10024 Vcoding_system_alist);
10025 }
10026 }
10027
10028 spec_vec = Fmake_vector (make_number (3), attrs);
10029 ASET (spec_vec, 1, aliases);
10030 ASET (spec_vec, 2, eol_type);
10031
10032 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10033 Vcoding_system_list = Fcons (name, Vcoding_system_list);
10034 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10035 if (NILP (val))
10036 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10037 Vcoding_system_alist);
10038
10039 {
10040 int id = coding_categories[category].id;
10041
10042 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10043 setup_coding_system (name, &coding_categories[category]);
10044 }
10045
10046 return Qnil;
10047
10048 short_args:
10049 return Fsignal (Qwrong_number_of_arguments,
10050 Fcons (intern ("define-coding-system-internal"),
10051 make_number (nargs)));
10052 }
10053
10054
10055 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10056 3, 3, 0,
10057 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10058 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10059 {
10060 Lisp_Object spec, attrs;
10061
10062 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10063 attrs = AREF (spec, 0);
10064 if (EQ (prop, QCmnemonic))
10065 {
10066 if (! STRINGP (val))
10067 CHECK_CHARACTER (val);
10068 CODING_ATTR_MNEMONIC (attrs) = val;
10069 }
10070 else if (EQ (prop, QCdefault_char))
10071 {
10072 if (NILP (val))
10073 val = make_number (' ');
10074 else
10075 CHECK_CHARACTER (val);
10076 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10077 }
10078 else if (EQ (prop, QCdecode_translation_table))
10079 {
10080 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10081 CHECK_SYMBOL (val);
10082 CODING_ATTR_DECODE_TBL (attrs) = val;
10083 }
10084 else if (EQ (prop, QCencode_translation_table))
10085 {
10086 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10087 CHECK_SYMBOL (val);
10088 CODING_ATTR_ENCODE_TBL (attrs) = val;
10089 }
10090 else if (EQ (prop, QCpost_read_conversion))
10091 {
10092 CHECK_SYMBOL (val);
10093 CODING_ATTR_POST_READ (attrs) = val;
10094 }
10095 else if (EQ (prop, QCpre_write_conversion))
10096 {
10097 CHECK_SYMBOL (val);
10098 CODING_ATTR_PRE_WRITE (attrs) = val;
10099 }
10100 else if (EQ (prop, QCascii_compatible_p))
10101 {
10102 CODING_ATTR_ASCII_COMPAT (attrs) = val;
10103 }
10104
10105 CODING_ATTR_PLIST (attrs)
10106 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10107 return val;
10108 }
10109
10110
10111 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10112 Sdefine_coding_system_alias, 2, 2, 0,
10113 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10114 (Lisp_Object alias, Lisp_Object coding_system)
10115 {
10116 Lisp_Object spec, aliases, eol_type, val;
10117
10118 CHECK_SYMBOL (alias);
10119 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10120 aliases = AREF (spec, 1);
10121 /* ALIASES should be a list of length more than zero, and the first
10122 element is a base coding system. Append ALIAS at the tail of the
10123 list. */
10124 while (!NILP (XCDR (aliases)))
10125 aliases = XCDR (aliases);
10126 XSETCDR (aliases, Fcons (alias, Qnil));
10127
10128 eol_type = AREF (spec, 2);
10129 if (VECTORP (eol_type))
10130 {
10131 Lisp_Object subsidiaries;
10132 int i;
10133
10134 subsidiaries = make_subsidiaries (alias);
10135 for (i = 0; i < 3; i++)
10136 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10137 AREF (eol_type, i));
10138 }
10139
10140 Fputhash (alias, spec, Vcoding_system_hash_table);
10141 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10142 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10143 if (NILP (val))
10144 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10145 Vcoding_system_alist);
10146
10147 return Qnil;
10148 }
10149
10150 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10151 1, 1, 0,
10152 doc: /* Return the base of CODING-SYSTEM.
10153 Any alias or subsidiary coding system is not a base coding system. */)
10154 (Lisp_Object coding_system)
10155 {
10156 Lisp_Object spec, attrs;
10157
10158 if (NILP (coding_system))
10159 return (Qno_conversion);
10160 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10161 attrs = AREF (spec, 0);
10162 return CODING_ATTR_BASE_NAME (attrs);
10163 }
10164
10165 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10166 1, 1, 0,
10167 doc: "Return the property list of CODING-SYSTEM.")
10168 (Lisp_Object coding_system)
10169 {
10170 Lisp_Object spec, attrs;
10171
10172 if (NILP (coding_system))
10173 coding_system = Qno_conversion;
10174 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10175 attrs = AREF (spec, 0);
10176 return CODING_ATTR_PLIST (attrs);
10177 }
10178
10179
10180 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10181 1, 1, 0,
10182 doc: /* Return the list of aliases of CODING-SYSTEM. */)
10183 (Lisp_Object coding_system)
10184 {
10185 Lisp_Object spec;
10186
10187 if (NILP (coding_system))
10188 coding_system = Qno_conversion;
10189 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10190 return AREF (spec, 1);
10191 }
10192
10193 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10194 Scoding_system_eol_type, 1, 1, 0,
10195 doc: /* Return eol-type of CODING-SYSTEM.
10196 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10197
10198 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10199 and CR respectively.
10200
10201 A vector value indicates that a format of end-of-line should be
10202 detected automatically. Nth element of the vector is the subsidiary
10203 coding system whose eol-type is N. */)
10204 (Lisp_Object coding_system)
10205 {
10206 Lisp_Object spec, eol_type;
10207 int n;
10208
10209 if (NILP (coding_system))
10210 coding_system = Qno_conversion;
10211 if (! CODING_SYSTEM_P (coding_system))
10212 return Qnil;
10213 spec = CODING_SYSTEM_SPEC (coding_system);
10214 eol_type = AREF (spec, 2);
10215 if (VECTORP (eol_type))
10216 return Fcopy_sequence (eol_type);
10217 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10218 return make_number (n);
10219 }
10220
10221 #endif /* emacs */
10222
10223 \f
10224 /*** 9. Post-amble ***/
10225
10226 void
10227 init_coding_once (void)
10228 {
10229 int i;
10230
10231 for (i = 0; i < coding_category_max; i++)
10232 {
10233 coding_categories[i].id = -1;
10234 coding_priorities[i] = i;
10235 }
10236
10237 /* ISO2022 specific initialize routine. */
10238 for (i = 0; i < 0x20; i++)
10239 iso_code_class[i] = ISO_control_0;
10240 for (i = 0x21; i < 0x7F; i++)
10241 iso_code_class[i] = ISO_graphic_plane_0;
10242 for (i = 0x80; i < 0xA0; i++)
10243 iso_code_class[i] = ISO_control_1;
10244 for (i = 0xA1; i < 0xFF; i++)
10245 iso_code_class[i] = ISO_graphic_plane_1;
10246 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10247 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10248 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10249 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10250 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10251 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10252 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10253 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10254 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10255
10256 for (i = 0; i < 256; i++)
10257 {
10258 emacs_mule_bytes[i] = 1;
10259 }
10260 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10261 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10262 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10263 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10264 }
10265
10266 #ifdef emacs
10267
10268 void
10269 syms_of_coding (void)
10270 {
10271 staticpro (&Vcoding_system_hash_table);
10272 {
10273 Lisp_Object args[2];
10274 args[0] = QCtest;
10275 args[1] = Qeq;
10276 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10277 }
10278
10279 staticpro (&Vsjis_coding_system);
10280 Vsjis_coding_system = Qnil;
10281
10282 staticpro (&Vbig5_coding_system);
10283 Vbig5_coding_system = Qnil;
10284
10285 staticpro (&Vcode_conversion_reused_workbuf);
10286 Vcode_conversion_reused_workbuf = Qnil;
10287
10288 staticpro (&Vcode_conversion_workbuf_name);
10289 Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10290
10291 reused_workbuf_in_use = 0;
10292
10293 DEFSYM (Qcharset, "charset");
10294 DEFSYM (Qtarget_idx, "target-idx");
10295 DEFSYM (Qcoding_system_history, "coding-system-history");
10296 Fset (Qcoding_system_history, Qnil);
10297
10298 /* Target FILENAME is the first argument. */
10299 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10300 /* Target FILENAME is the third argument. */
10301 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10302
10303 DEFSYM (Qcall_process, "call-process");
10304 /* Target PROGRAM is the first argument. */
10305 Fput (Qcall_process, Qtarget_idx, make_number (0));
10306
10307 DEFSYM (Qcall_process_region, "call-process-region");
10308 /* Target PROGRAM is the third argument. */
10309 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10310
10311 DEFSYM (Qstart_process, "start-process");
10312 /* Target PROGRAM is the third argument. */
10313 Fput (Qstart_process, Qtarget_idx, make_number (2));
10314
10315 DEFSYM (Qopen_network_stream, "open-network-stream");
10316 /* Target SERVICE is the fourth argument. */
10317 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10318
10319 DEFSYM (Qcoding_system, "coding-system");
10320 DEFSYM (Qcoding_aliases, "coding-aliases");
10321
10322 DEFSYM (Qeol_type, "eol-type");
10323 DEFSYM (Qunix, "unix");
10324 DEFSYM (Qdos, "dos");
10325
10326 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10327 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10328 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10329 DEFSYM (Qdefault_char, "default-char");
10330 DEFSYM (Qundecided, "undecided");
10331 DEFSYM (Qno_conversion, "no-conversion");
10332 DEFSYM (Qraw_text, "raw-text");
10333
10334 DEFSYM (Qiso_2022, "iso-2022");
10335
10336 DEFSYM (Qutf_8, "utf-8");
10337 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10338
10339 DEFSYM (Qutf_16, "utf-16");
10340 DEFSYM (Qbig, "big");
10341 DEFSYM (Qlittle, "little");
10342
10343 DEFSYM (Qshift_jis, "shift-jis");
10344 DEFSYM (Qbig5, "big5");
10345
10346 DEFSYM (Qcoding_system_p, "coding-system-p");
10347
10348 DEFSYM (Qcoding_system_error, "coding-system-error");
10349 Fput (Qcoding_system_error, Qerror_conditions,
10350 pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10351 Fput (Qcoding_system_error, Qerror_message,
10352 make_pure_c_string ("Invalid coding system"));
10353
10354 /* Intern this now in case it isn't already done.
10355 Setting this variable twice is harmless.
10356 But don't staticpro it here--that is done in alloc.c. */
10357 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10358
10359 DEFSYM (Qtranslation_table, "translation-table");
10360 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10361 DEFSYM (Qtranslation_table_id, "translation-table-id");
10362 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10363 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10364
10365 DEFSYM (Qvalid_codes, "valid-codes");
10366
10367 DEFSYM (Qemacs_mule, "emacs-mule");
10368
10369 DEFSYM (QCcategory, ":category");
10370 DEFSYM (QCmnemonic, ":mnemonic");
10371 DEFSYM (QCdefault_char, ":default-char");
10372 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10373 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10374 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10375 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10376 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10377
10378 Vcoding_category_table
10379 = Fmake_vector (make_number (coding_category_max), Qnil);
10380 staticpro (&Vcoding_category_table);
10381 /* Followings are target of code detection. */
10382 ASET (Vcoding_category_table, coding_category_iso_7,
10383 intern_c_string ("coding-category-iso-7"));
10384 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10385 intern_c_string ("coding-category-iso-7-tight"));
10386 ASET (Vcoding_category_table, coding_category_iso_8_1,
10387 intern_c_string ("coding-category-iso-8-1"));
10388 ASET (Vcoding_category_table, coding_category_iso_8_2,
10389 intern_c_string ("coding-category-iso-8-2"));
10390 ASET (Vcoding_category_table, coding_category_iso_7_else,
10391 intern_c_string ("coding-category-iso-7-else"));
10392 ASET (Vcoding_category_table, coding_category_iso_8_else,
10393 intern_c_string ("coding-category-iso-8-else"));
10394 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10395 intern_c_string ("coding-category-utf-8-auto"));
10396 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10397 intern_c_string ("coding-category-utf-8"));
10398 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10399 intern_c_string ("coding-category-utf-8-sig"));
10400 ASET (Vcoding_category_table, coding_category_utf_16_be,
10401 intern_c_string ("coding-category-utf-16-be"));
10402 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10403 intern_c_string ("coding-category-utf-16-auto"));
10404 ASET (Vcoding_category_table, coding_category_utf_16_le,
10405 intern_c_string ("coding-category-utf-16-le"));
10406 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10407 intern_c_string ("coding-category-utf-16-be-nosig"));
10408 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10409 intern_c_string ("coding-category-utf-16-le-nosig"));
10410 ASET (Vcoding_category_table, coding_category_charset,
10411 intern_c_string ("coding-category-charset"));
10412 ASET (Vcoding_category_table, coding_category_sjis,
10413 intern_c_string ("coding-category-sjis"));
10414 ASET (Vcoding_category_table, coding_category_big5,
10415 intern_c_string ("coding-category-big5"));
10416 ASET (Vcoding_category_table, coding_category_ccl,
10417 intern_c_string ("coding-category-ccl"));
10418 ASET (Vcoding_category_table, coding_category_emacs_mule,
10419 intern_c_string ("coding-category-emacs-mule"));
10420 /* Followings are NOT target of code detection. */
10421 ASET (Vcoding_category_table, coding_category_raw_text,
10422 intern_c_string ("coding-category-raw-text"));
10423 ASET (Vcoding_category_table, coding_category_undecided,
10424 intern_c_string ("coding-category-undecided"));
10425
10426 DEFSYM (Qinsufficient_source, "insufficient-source");
10427 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10428 DEFSYM (Qinvalid_source, "invalid-source");
10429 DEFSYM (Qinterrupted, "interrupted");
10430 DEFSYM (Qinsufficient_memory, "insufficient-memory");
10431 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10432
10433 defsubr (&Scoding_system_p);
10434 defsubr (&Sread_coding_system);
10435 defsubr (&Sread_non_nil_coding_system);
10436 defsubr (&Scheck_coding_system);
10437 defsubr (&Sdetect_coding_region);
10438 defsubr (&Sdetect_coding_string);
10439 defsubr (&Sfind_coding_systems_region_internal);
10440 defsubr (&Sunencodable_char_position);
10441 defsubr (&Scheck_coding_systems_region);
10442 defsubr (&Sdecode_coding_region);
10443 defsubr (&Sencode_coding_region);
10444 defsubr (&Sdecode_coding_string);
10445 defsubr (&Sencode_coding_string);
10446 defsubr (&Sdecode_sjis_char);
10447 defsubr (&Sencode_sjis_char);
10448 defsubr (&Sdecode_big5_char);
10449 defsubr (&Sencode_big5_char);
10450 defsubr (&Sset_terminal_coding_system_internal);
10451 defsubr (&Sset_safe_terminal_coding_system_internal);
10452 defsubr (&Sterminal_coding_system);
10453 defsubr (&Sset_keyboard_coding_system_internal);
10454 defsubr (&Skeyboard_coding_system);
10455 defsubr (&Sfind_operation_coding_system);
10456 defsubr (&Sset_coding_system_priority);
10457 defsubr (&Sdefine_coding_system_internal);
10458 defsubr (&Sdefine_coding_system_alias);
10459 defsubr (&Scoding_system_put);
10460 defsubr (&Scoding_system_base);
10461 defsubr (&Scoding_system_plist);
10462 defsubr (&Scoding_system_aliases);
10463 defsubr (&Scoding_system_eol_type);
10464 defsubr (&Scoding_system_priority_list);
10465
10466 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10467 doc: /* List of coding systems.
10468
10469 Do not alter the value of this variable manually. This variable should be
10470 updated by the functions `define-coding-system' and
10471 `define-coding-system-alias'. */);
10472 Vcoding_system_list = Qnil;
10473
10474 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10475 doc: /* Alist of coding system names.
10476 Each element is one element list of coding system name.
10477 This variable is given to `completing-read' as COLLECTION argument.
10478
10479 Do not alter the value of this variable manually. This variable should be
10480 updated by the functions `make-coding-system' and
10481 `define-coding-system-alias'. */);
10482 Vcoding_system_alist = Qnil;
10483
10484 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10485 doc: /* List of coding-categories (symbols) ordered by priority.
10486
10487 On detecting a coding system, Emacs tries code detection algorithms
10488 associated with each coding-category one by one in this order. When
10489 one algorithm agrees with a byte sequence of source text, the coding
10490 system bound to the corresponding coding-category is selected.
10491
10492 Don't modify this variable directly, but use `set-coding-system-priority'. */);
10493 {
10494 int i;
10495
10496 Vcoding_category_list = Qnil;
10497 for (i = coding_category_max - 1; i >= 0; i--)
10498 Vcoding_category_list
10499 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10500 Vcoding_category_list);
10501 }
10502
10503 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10504 doc: /* Specify the coding system for read operations.
10505 It is useful to bind this variable with `let', but do not set it globally.
10506 If the value is a coding system, it is used for decoding on read operation.
10507 If not, an appropriate element is used from one of the coding system alists.
10508 There are three such tables: `file-coding-system-alist',
10509 `process-coding-system-alist', and `network-coding-system-alist'. */);
10510 Vcoding_system_for_read = Qnil;
10511
10512 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10513 doc: /* Specify the coding system for write operations.
10514 Programs bind this variable with `let', but you should not set it globally.
10515 If the value is a coding system, it is used for encoding of output,
10516 when writing it to a file and when sending it to a file or subprocess.
10517
10518 If this does not specify a coding system, an appropriate element
10519 is used from one of the coding system alists.
10520 There are three such tables: `file-coding-system-alist',
10521 `process-coding-system-alist', and `network-coding-system-alist'.
10522 For output to files, if the above procedure does not specify a coding system,
10523 the value of `buffer-file-coding-system' is used. */);
10524 Vcoding_system_for_write = Qnil;
10525
10526 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10527 doc: /*
10528 Coding system used in the latest file or process I/O. */);
10529 Vlast_coding_system_used = Qnil;
10530
10531 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10532 doc: /*
10533 Error status of the last code conversion.
10534
10535 When an error was detected in the last code conversion, this variable
10536 is set to one of the following symbols.
10537 `insufficient-source'
10538 `inconsistent-eol'
10539 `invalid-source'
10540 `interrupted'
10541 `insufficient-memory'
10542 When no error was detected, the value doesn't change. So, to check
10543 the error status of a code conversion by this variable, you must
10544 explicitly set this variable to nil before performing code
10545 conversion. */);
10546 Vlast_code_conversion_error = Qnil;
10547
10548 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10549 doc: /*
10550 *Non-nil means always inhibit code conversion of end-of-line format.
10551 See info node `Coding Systems' and info node `Text and Binary' concerning
10552 such conversion. */);
10553 inhibit_eol_conversion = 0;
10554
10555 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10556 doc: /*
10557 Non-nil means process buffer inherits coding system of process output.
10558 Bind it to t if the process output is to be treated as if it were a file
10559 read from some filesystem. */);
10560 inherit_process_coding_system = 0;
10561
10562 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10563 doc: /*
10564 Alist to decide a coding system to use for a file I/O operation.
10565 The format is ((PATTERN . VAL) ...),
10566 where PATTERN is a regular expression matching a file name,
10567 VAL is a coding system, a cons of coding systems, or a function symbol.
10568 If VAL is a coding system, it is used for both decoding and encoding
10569 the file contents.
10570 If VAL is a cons of coding systems, the car part is used for decoding,
10571 and the cdr part is used for encoding.
10572 If VAL is a function symbol, the function must return a coding system
10573 or a cons of coding systems which are used as above. The function is
10574 called with an argument that is a list of the arguments with which
10575 `find-operation-coding-system' was called. If the function can't decide
10576 a coding system, it can return `undecided' so that the normal
10577 code-detection is performed.
10578
10579 See also the function `find-operation-coding-system'
10580 and the variable `auto-coding-alist'. */);
10581 Vfile_coding_system_alist = Qnil;
10582
10583 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10584 doc: /*
10585 Alist to decide a coding system to use for a process I/O operation.
10586 The format is ((PATTERN . VAL) ...),
10587 where PATTERN is a regular expression matching a program name,
10588 VAL is a coding system, a cons of coding systems, or a function symbol.
10589 If VAL is a coding system, it is used for both decoding what received
10590 from the program and encoding what sent to the program.
10591 If VAL is a cons of coding systems, the car part is used for decoding,
10592 and the cdr part is used for encoding.
10593 If VAL is a function symbol, the function must return a coding system
10594 or a cons of coding systems which are used as above.
10595
10596 See also the function `find-operation-coding-system'. */);
10597 Vprocess_coding_system_alist = Qnil;
10598
10599 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10600 doc: /*
10601 Alist to decide a coding system to use for a network I/O operation.
10602 The format is ((PATTERN . VAL) ...),
10603 where PATTERN is a regular expression matching a network service name
10604 or is a port number to connect to,
10605 VAL is a coding system, a cons of coding systems, or a function symbol.
10606 If VAL is a coding system, it is used for both decoding what received
10607 from the network stream and encoding what sent to the network stream.
10608 If VAL is a cons of coding systems, the car part is used for decoding,
10609 and the cdr part is used for encoding.
10610 If VAL is a function symbol, the function must return a coding system
10611 or a cons of coding systems which are used as above.
10612
10613 See also the function `find-operation-coding-system'. */);
10614 Vnetwork_coding_system_alist = Qnil;
10615
10616 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10617 doc: /* Coding system to use with system messages.
10618 Also used for decoding keyboard input on X Window system. */);
10619 Vlocale_coding_system = Qnil;
10620
10621 /* The eol mnemonics are reset in startup.el system-dependently. */
10622 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10623 doc: /*
10624 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
10625 eol_mnemonic_unix = make_pure_c_string (":");
10626
10627 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10628 doc: /*
10629 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
10630 eol_mnemonic_dos = make_pure_c_string ("\\");
10631
10632 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10633 doc: /*
10634 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
10635 eol_mnemonic_mac = make_pure_c_string ("/");
10636
10637 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10638 doc: /*
10639 *String displayed in mode line when end-of-line format is not yet determined. */);
10640 eol_mnemonic_undecided = make_pure_c_string (":");
10641
10642 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10643 doc: /*
10644 *Non-nil enables character translation while encoding and decoding. */);
10645 Venable_character_translation = Qt;
10646
10647 DEFVAR_LISP ("standard-translation-table-for-decode",
10648 Vstandard_translation_table_for_decode,
10649 doc: /* Table for translating characters while decoding. */);
10650 Vstandard_translation_table_for_decode = Qnil;
10651
10652 DEFVAR_LISP ("standard-translation-table-for-encode",
10653 Vstandard_translation_table_for_encode,
10654 doc: /* Table for translating characters while encoding. */);
10655 Vstandard_translation_table_for_encode = Qnil;
10656
10657 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10658 doc: /* Alist of charsets vs revision numbers.
10659 While encoding, if a charset (car part of an element) is found,
10660 designate it with the escape sequence identifying revision (cdr part
10661 of the element). */);
10662 Vcharset_revision_table = Qnil;
10663
10664 DEFVAR_LISP ("default-process-coding-system",
10665 Vdefault_process_coding_system,
10666 doc: /* Cons of coding systems used for process I/O by default.
10667 The car part is used for decoding a process output,
10668 the cdr part is used for encoding a text to be sent to a process. */);
10669 Vdefault_process_coding_system = Qnil;
10670
10671 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10672 doc: /*
10673 Table of extra Latin codes in the range 128..159 (inclusive).
10674 This is a vector of length 256.
10675 If Nth element is non-nil, the existence of code N in a file
10676 \(or output of subprocess) doesn't prevent it to be detected as
10677 a coding system of ISO 2022 variant which has a flag
10678 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10679 or reading output of a subprocess.
10680 Only 128th through 159th elements have a meaning. */);
10681 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10682
10683 DEFVAR_LISP ("select-safe-coding-system-function",
10684 Vselect_safe_coding_system_function,
10685 doc: /*
10686 Function to call to select safe coding system for encoding a text.
10687
10688 If set, this function is called to force a user to select a proper
10689 coding system which can encode the text in the case that a default
10690 coding system used in each operation can't encode the text. The
10691 function should take care that the buffer is not modified while
10692 the coding system is being selected.
10693
10694 The default value is `select-safe-coding-system' (which see). */);
10695 Vselect_safe_coding_system_function = Qnil;
10696
10697 DEFVAR_BOOL ("coding-system-require-warning",
10698 coding_system_require_warning,
10699 doc: /* Internal use only.
10700 If non-nil, on writing a file, `select-safe-coding-system-function' is
10701 called even if `coding-system-for-write' is non-nil. The command
10702 `universal-coding-system-argument' binds this variable to t temporarily. */);
10703 coding_system_require_warning = 0;
10704
10705
10706 DEFVAR_BOOL ("inhibit-iso-escape-detection",
10707 inhibit_iso_escape_detection,
10708 doc: /*
10709 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10710
10711 When Emacs reads text, it tries to detect how the text is encoded.
10712 This code detection is sensitive to escape sequences. If Emacs sees
10713 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10714 of the ISO2022 encodings, and decodes text by the corresponding coding
10715 system (e.g. `iso-2022-7bit').
10716
10717 However, there may be a case that you want to read escape sequences in
10718 a file as is. In such a case, you can set this variable to non-nil.
10719 Then the code detection will ignore any escape sequences, and no text is
10720 detected as encoded in some ISO-2022 encoding. The result is that all
10721 escape sequences become visible in a buffer.
10722
10723 The default value is nil, and it is strongly recommended not to change
10724 it. That is because many Emacs Lisp source files that contain
10725 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10726 in Emacs's distribution, and they won't be decoded correctly on
10727 reading if you suppress escape sequence detection.
10728
10729 The other way to read escape sequences in a file without decoding is
10730 to explicitly specify some coding system that doesn't use ISO-2022
10731 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
10732 inhibit_iso_escape_detection = 0;
10733
10734 DEFVAR_BOOL ("inhibit-null-byte-detection",
10735 inhibit_null_byte_detection,
10736 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10737 By default, Emacs treats it as binary data, and does not attempt to
10738 decode it. The effect is as if you specified `no-conversion' for
10739 reading that text.
10740
10741 Set this to non-nil when a regular text happens to include null bytes.
10742 Examples are Index nodes of Info files and null-byte delimited output
10743 from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10744 decode text as usual. */);
10745 inhibit_null_byte_detection = 0;
10746
10747 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10748 doc: /* Char table for translating self-inserting characters.
10749 This is applied to the result of input methods, not their input.
10750 See also `keyboard-translate-table'.
10751
10752 Use of this variable for character code unification was rendered
10753 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10754 internal character representation. */);
10755 Vtranslation_table_for_input = Qnil;
10756
10757 {
10758 Lisp_Object args[coding_arg_max];
10759 Lisp_Object plist[16];
10760 int i;
10761
10762 for (i = 0; i < coding_arg_max; i++)
10763 args[i] = Qnil;
10764
10765 plist[0] = intern_c_string (":name");
10766 plist[1] = args[coding_arg_name] = Qno_conversion;
10767 plist[2] = intern_c_string (":mnemonic");
10768 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10769 plist[4] = intern_c_string (":coding-type");
10770 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10771 plist[6] = intern_c_string (":ascii-compatible-p");
10772 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10773 plist[8] = intern_c_string (":default-char");
10774 plist[9] = args[coding_arg_default_char] = make_number (0);
10775 plist[10] = intern_c_string (":for-unibyte");
10776 plist[11] = args[coding_arg_for_unibyte] = Qt;
10777 plist[12] = intern_c_string (":docstring");
10778 plist[13] = make_pure_c_string ("Do no conversion.\n\
10779 \n\
10780 When you visit a file with this coding, the file is read into a\n\
10781 unibyte buffer as is, thus each byte of a file is treated as a\n\
10782 character.");
10783 plist[14] = intern_c_string (":eol-type");
10784 plist[15] = args[coding_arg_eol_type] = Qunix;
10785 args[coding_arg_plist] = Flist (16, plist);
10786 Fdefine_coding_system_internal (coding_arg_max, args);
10787
10788 plist[1] = args[coding_arg_name] = Qundecided;
10789 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10790 plist[5] = args[coding_arg_coding_type] = Qundecided;
10791 /* This is already set.
10792 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10793 plist[8] = intern_c_string (":charset-list");
10794 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10795 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10796 plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10797 plist[15] = args[coding_arg_eol_type] = Qnil;
10798 args[coding_arg_plist] = Flist (16, plist);
10799 Fdefine_coding_system_internal (coding_arg_max, args);
10800 }
10801
10802 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10803
10804 {
10805 int i;
10806
10807 for (i = 0; i < coding_category_max; i++)
10808 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10809 }
10810 #if defined (DOS_NT)
10811 system_eol_type = Qdos;
10812 #else
10813 system_eol_type = Qunix;
10814 #endif
10815 staticpro (&system_eol_type);
10816 }
10817
10818 char *
10819 emacs_strerror (int error_number)
10820 {
10821 char *str;
10822
10823 synchronize_system_messages_locale ();
10824 str = strerror (error_number);
10825
10826 if (! NILP (Vlocale_coding_system))
10827 {
10828 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10829 Vlocale_coding_system,
10830 0);
10831 str = SSDATA (dec);
10832 }
10833
10834 return str;
10835 }
10836
10837 #endif /* emacs */