(decode_coding_object): Revert part of last change.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006, 2007, 2008 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 2005, 2006, 2007, 2008
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
11
12 This file is part of GNU Emacs.
13
14 GNU Emacs is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 3, or (at your option)
17 any later version.
18
19 GNU Emacs is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with GNU Emacs; see the file COPYING. If not, write to
26 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27 Boston, MA 02110-1301, USA. */
28
29 /*** TABLE OF CONTENTS ***
30
31 0. General comments
32 1. Preamble
33 2. Emacs' internal format (emacs-utf-8) handlers
34 3. UTF-8 handlers
35 4. UTF-16 handlers
36 5. Charset-base coding systems handlers
37 6. emacs-mule (old Emacs' internal format) handlers
38 7. ISO2022 handlers
39 8. Shift-JIS and BIG5 handlers
40 9. CCL handlers
41 10. C library functions
42 11. Emacs Lisp library functions
43 12. Postamble
44
45 */
46
47 /*** 0. General comments ***
48
49
50 CODING SYSTEM
51
52 A coding system is an object for an encoding mechanism that contains
53 information about how to convert byte sequences to character
54 sequences and vice versa. When we say "decode", it means converting
55 a byte sequence of a specific coding system into a character
56 sequence that is represented by Emacs' internal coding system
57 `emacs-utf-8', and when we say "encode", it means converting a
58 character sequence of emacs-utf-8 to a byte sequence of a specific
59 coding system.
60
61 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
62 C level, a coding system is represented by a vector of attributes
63 stored in the hash table Vcharset_hash_table. The conversion from
64 coding system symbol to attributes vector is done by looking up
65 Vcharset_hash_table by the symbol.
66
67 Coding systems are classified into the following types depending on
68 the encoding mechanism. Here's a brief description of the types.
69
70 o UTF-8
71
72 o UTF-16
73
74 o Charset-base coding system
75
76 A coding system defined by one or more (coded) character sets.
77 Decoding and encoding are done by a code converter defined for each
78 character set.
79
80 o Old Emacs internal format (emacs-mule)
81
82 The coding system adopted by old versions of Emacs (20 and 21).
83
84 o ISO2022-base coding system
85
86 The most famous coding system for multiple character sets. X's
87 Compound Text, various EUCs (Extended Unix Code), and coding systems
88 used in the Internet communication such as ISO-2022-JP are all
89 variants of ISO2022.
90
91 o SJIS (or Shift-JIS or MS-Kanji-Code)
92
93 A coding system to encode character sets: ASCII, JISX0201, and
94 JISX0208. Widely used for PC's in Japan. Details are described in
95 section 8.
96
97 o BIG5
98
99 A coding system to encode character sets: ASCII and Big5. Widely
100 used for Chinese (mainly in Taiwan and Hong Kong). Details are
101 described in section 8. In this file, when we write "big5" (all
102 lowercase), we mean the coding system, and when we write "Big5"
103 (capitalized), we mean the character set.
104
105 o CCL
106
107 If a user wants to decode/encode text encoded in a coding system
108 not listed above, he can supply a decoder and an encoder for it in
109 CCL (Code Conversion Language) programs. Emacs executes the CCL
110 program while decoding/encoding.
111
112 o Raw-text
113
114 A coding system for text containing raw eight-bit data. Emacs
115 treats each byte of source text as a character (except for
116 end-of-line conversion).
117
118 o No-conversion
119
120 Like raw text, but don't do end-of-line conversion.
121
122
123 END-OF-LINE FORMAT
124
125 How text end-of-line is encoded depends on operating system. For
126 instance, Unix's format is just one byte of LF (line-feed) code,
127 whereas DOS's format is two-byte sequence of `carriage-return' and
128 `line-feed' codes. MacOS's format is usually one byte of
129 `carriage-return'.
130
131 Since text character encoding and end-of-line encoding are
132 independent, any coding system described above can take any format
133 of end-of-line (except for no-conversion).
134
135 STRUCT CODING_SYSTEM
136
137 Before using a coding system for code conversion (i.e. decoding and
138 encoding), we setup a structure of type `struct coding_system'.
139 This structure keeps various information about a specific code
140 conversion (e.g. the location of source and destination data).
141
142 */
143
144 /* COMMON MACROS */
145
146
147 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
148
149 These functions check if a byte sequence specified as a source in
150 CODING conforms to the format of XXX, and update the members of
151 DETECT_INFO.
152
153 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
154
155 Below is the template of these functions. */
156
157 #if 0
158 static int
159 detect_coding_XXX (coding, detect_info)
160 struct coding_system *coding;
161 struct coding_detection_info *detect_info;
162 {
163 const unsigned char *src = coding->source;
164 const unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int consumed_chars = 0;
167 int found = 0;
168 ...;
169
170 while (1)
171 {
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
175
176 if (! __C_conforms_to_XXX___ (c))
177 break;
178 if (! __C_strongly_suggests_XXX__ (c))
179 found = CATEGORY_MASK_XXX;
180 }
181 /* The byte sequence is invalid for XXX. */
182 detect_info->rejected |= CATEGORY_MASK_XXX;
183 return 0;
184
185 no_more_source:
186 /* The source exausted successfully. */
187 detect_info->found |= found;
188 return 1;
189 }
190 #endif
191
192 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
193
194 These functions decode a byte sequence specified as a source by
195 CODING. The resulting multibyte text goes to a place pointed to by
196 CODING->charbuf, the length of which should not exceed
197 CODING->charbuf_size;
198
199 These functions set the information of original and decoded texts in
200 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
201 They also set CODING->result to one of CODING_RESULT_XXX indicating
202 how the decoding is finished.
203
204 Below is the template of these functions. */
205
206 #if 0
207 static void
208 decode_coding_XXXX (coding)
209 struct coding_system *coding;
210 {
211 const unsigned char *src = coding->source + coding->consumed;
212 const unsigned char *src_end = coding->source + coding->src_bytes;
213 /* SRC_BASE remembers the start position in source in each loop.
214 The loop will be exited when there's not enough source code, or
215 when there's no room in CHARBUF for a decoded character. */
216 const unsigned char *src_base;
217 /* A buffer to produce decoded characters. */
218 int *charbuf = coding->charbuf + coding->charbuf_used;
219 int *charbuf_end = coding->charbuf + coding->charbuf_size;
220 int multibytep = coding->src_multibyte;
221
222 while (1)
223 {
224 src_base = src;
225 if (charbuf < charbuf_end)
226 /* No more room to produce a decoded character. */
227 break;
228 ONE_MORE_BYTE (c);
229 /* Decode it. */
230 }
231
232 no_more_source:
233 if (src_base < src_end
234 && coding->mode & CODING_MODE_LAST_BLOCK)
235 /* If the source ends by partial bytes to construct a character,
236 treat them as eight-bit raw data. */
237 while (src_base < src_end && charbuf < charbuf_end)
238 *charbuf++ = *src_base++;
239 /* Remember how many bytes and characters we consumed. If the
240 source is multibyte, the bytes and chars are not identical. */
241 coding->consumed = coding->consumed_char = src_base - coding->source;
242 /* Remember how many characters we produced. */
243 coding->charbuf_used = charbuf - coding->charbuf;
244 }
245 #endif
246
247 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
248
249 These functions encode SRC_BYTES length text at SOURCE of Emacs'
250 internal multibyte format by CODING. The resulting byte sequence
251 goes to a place pointed to by DESTINATION, the length of which
252 should not exceed DST_BYTES.
253
254 These functions set the information of original and encoded texts in
255 the members produced, produced_char, consumed, and consumed_char of
256 the structure *CODING. They also set the member result to one of
257 CODING_RESULT_XXX indicating how the encoding finished.
258
259 DST_BYTES zero means that source area and destination area are
260 overlapped, which means that we can produce a encoded text until it
261 reaches at the head of not-yet-encoded source text.
262
263 Below is a template of these functions. */
264 #if 0
265 static void
266 encode_coding_XXX (coding)
267 struct coding_system *coding;
268 {
269 int multibytep = coding->dst_multibyte;
270 int *charbuf = coding->charbuf;
271 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
272 unsigned char *dst = coding->destination + coding->produced;
273 unsigned char *dst_end = coding->destination + coding->dst_bytes;
274 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
275 int produced_chars = 0;
276
277 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
278 {
279 int c = *charbuf;
280 /* Encode C into DST, and increment DST. */
281 }
282 label_no_more_destination:
283 /* How many chars and bytes we produced. */
284 coding->produced_char += produced_chars;
285 coding->produced = dst - coding->destination;
286 }
287 #endif
288
289 \f
290 /*** 1. Preamble ***/
291
292 #include <config.h>
293 #include <stdio.h>
294
295 #include "lisp.h"
296 #include "buffer.h"
297 #include "character.h"
298 #include "charset.h"
299 #include "ccl.h"
300 #include "composite.h"
301 #include "coding.h"
302 #include "window.h"
303 #include "frame.h"
304 #include "termhooks.h"
305
306 Lisp_Object Vcoding_system_hash_table;
307
308 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
309 Lisp_Object Qunix, Qdos;
310 extern Lisp_Object Qmac; /* frame.c */
311 Lisp_Object Qbuffer_file_coding_system;
312 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
313 Lisp_Object Qdefault_char;
314 Lisp_Object Qno_conversion, Qundecided;
315 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
316 Lisp_Object Qbig, Qlittle;
317 Lisp_Object Qcoding_system_history;
318 Lisp_Object Qvalid_codes;
319 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
320 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
321 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
322 Lisp_Object QCascii_compatible_p;
323
324 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
325 Lisp_Object Qcall_process, Qcall_process_region;
326 Lisp_Object Qstart_process, Qopen_network_stream;
327 Lisp_Object Qtarget_idx;
328
329 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
330 Lisp_Object Qinterrupted, Qinsufficient_memory;
331
332 extern Lisp_Object Qcompletion_ignore_case;
333
334 /* If a symbol has this property, evaluate the value to define the
335 symbol as a coding system. */
336 static Lisp_Object Qcoding_system_define_form;
337
338 int coding_system_require_warning;
339
340 Lisp_Object Vselect_safe_coding_system_function;
341
342 /* Mnemonic string for each format of end-of-line. */
343 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
344 /* Mnemonic string to indicate format of end-of-line is not yet
345 decided. */
346 Lisp_Object eol_mnemonic_undecided;
347
348 /* Format of end-of-line decided by system. This is Qunix on
349 Unix and Mac, Qdos on DOS/Windows.
350 This has an effect only for external encoding (i.e. for output to
351 file and process), not for in-buffer or Lisp string encoding. */
352 static Lisp_Object system_eol_type;
353
354 #ifdef emacs
355
356 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
357
358 Lisp_Object Qcoding_system_p, Qcoding_system_error;
359
360 /* Coding system emacs-mule and raw-text are for converting only
361 end-of-line format. */
362 Lisp_Object Qemacs_mule, Qraw_text;
363 Lisp_Object Qutf_8_emacs;
364
365 /* Coding-systems are handed between Emacs Lisp programs and C internal
366 routines by the following three variables. */
367 /* Coding-system for reading files and receiving data from process. */
368 Lisp_Object Vcoding_system_for_read;
369 /* Coding-system for writing files and sending data to process. */
370 Lisp_Object Vcoding_system_for_write;
371 /* Coding-system actually used in the latest I/O. */
372 Lisp_Object Vlast_coding_system_used;
373 /* Set to non-nil when an error is detected while code conversion. */
374 Lisp_Object Vlast_code_conversion_error;
375 /* A vector of length 256 which contains information about special
376 Latin codes (especially for dealing with Microsoft codes). */
377 Lisp_Object Vlatin_extra_code_table;
378
379 /* Flag to inhibit code conversion of end-of-line format. */
380 int inhibit_eol_conversion;
381
382 /* Flag to inhibit ISO2022 escape sequence detection. */
383 int inhibit_iso_escape_detection;
384
385 /* Flag to make buffer-file-coding-system inherit from process-coding. */
386 int inherit_process_coding_system;
387
388 /* Coding system to be used to encode text for terminal display when
389 terminal coding system is nil. */
390 struct coding_system safe_terminal_coding;
391
392 Lisp_Object Vfile_coding_system_alist;
393 Lisp_Object Vprocess_coding_system_alist;
394 Lisp_Object Vnetwork_coding_system_alist;
395
396 Lisp_Object Vlocale_coding_system;
397
398 #endif /* emacs */
399
400 /* Flag to tell if we look up translation table on character code
401 conversion. */
402 Lisp_Object Venable_character_translation;
403 /* Standard translation table to look up on decoding (reading). */
404 Lisp_Object Vstandard_translation_table_for_decode;
405 /* Standard translation table to look up on encoding (writing). */
406 Lisp_Object Vstandard_translation_table_for_encode;
407
408 Lisp_Object Qtranslation_table;
409 Lisp_Object Qtranslation_table_id;
410 Lisp_Object Qtranslation_table_for_decode;
411 Lisp_Object Qtranslation_table_for_encode;
412
413 /* Alist of charsets vs revision number. */
414 static Lisp_Object Vcharset_revision_table;
415
416 /* Default coding systems used for process I/O. */
417 Lisp_Object Vdefault_process_coding_system;
418
419 /* Char table for translating Quail and self-inserting input. */
420 Lisp_Object Vtranslation_table_for_input;
421
422 /* Two special coding systems. */
423 Lisp_Object Vsjis_coding_system;
424 Lisp_Object Vbig5_coding_system;
425
426 /* ISO2022 section */
427
428 #define CODING_ISO_INITIAL(coding, reg) \
429 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
430 coding_attr_iso_initial), \
431 reg)))
432
433
434 #define CODING_ISO_REQUEST(coding, charset_id) \
435 ((charset_id <= (coding)->max_charset_id \
436 ? (coding)->safe_charsets[charset_id] \
437 : -1))
438
439
440 #define CODING_ISO_FLAGS(coding) \
441 ((coding)->spec.iso_2022.flags)
442 #define CODING_ISO_DESIGNATION(coding, reg) \
443 ((coding)->spec.iso_2022.current_designation[reg])
444 #define CODING_ISO_INVOCATION(coding, plane) \
445 ((coding)->spec.iso_2022.current_invocation[plane])
446 #define CODING_ISO_SINGLE_SHIFTING(coding) \
447 ((coding)->spec.iso_2022.single_shifting)
448 #define CODING_ISO_BOL(coding) \
449 ((coding)->spec.iso_2022.bol)
450 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
451 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
452
453 /* Control characters of ISO2022. */
454 /* code */ /* function */
455 #define ISO_CODE_LF 0x0A /* line-feed */
456 #define ISO_CODE_CR 0x0D /* carriage-return */
457 #define ISO_CODE_SO 0x0E /* shift-out */
458 #define ISO_CODE_SI 0x0F /* shift-in */
459 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
460 #define ISO_CODE_ESC 0x1B /* escape */
461 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
462 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
463 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
464
465 /* All code (1-byte) of ISO2022 is classified into one of the
466 followings. */
467 enum iso_code_class_type
468 {
469 ISO_control_0, /* Control codes in the range
470 0x00..0x1F and 0x7F, except for the
471 following 5 codes. */
472 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
473 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
474 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
475 ISO_escape, /* ISO_CODE_SO (0x1B) */
476 ISO_control_1, /* Control codes in the range
477 0x80..0x9F, except for the
478 following 3 codes. */
479 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
480 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
481 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
482 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
483 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
484 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
485 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
486 };
487
488 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
489 `iso-flags' attribute of an iso2022 coding system. */
490
491 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
492 instead of the correct short-form sequence (e.g. ESC $ A). */
493 #define CODING_ISO_FLAG_LONG_FORM 0x0001
494
495 /* If set, reset graphic planes and registers at end-of-line to the
496 initial state. */
497 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
498
499 /* If set, reset graphic planes and registers before any control
500 characters to the initial state. */
501 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
502
503 /* If set, encode by 7-bit environment. */
504 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
505
506 /* If set, use locking-shift function. */
507 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
508
509 /* If set, use single-shift function. Overwrite
510 CODING_ISO_FLAG_LOCKING_SHIFT. */
511 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
512
513 /* If set, use designation escape sequence. */
514 #define CODING_ISO_FLAG_DESIGNATION 0x0040
515
516 /* If set, produce revision number sequence. */
517 #define CODING_ISO_FLAG_REVISION 0x0080
518
519 /* If set, produce ISO6429's direction specifying sequence. */
520 #define CODING_ISO_FLAG_DIRECTION 0x0100
521
522 /* If set, assume designation states are reset at beginning of line on
523 output. */
524 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
525
526 /* If set, designation sequence should be placed at beginning of line
527 on output. */
528 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
529
530 /* If set, do not encode unsafe charactes on output. */
531 #define CODING_ISO_FLAG_SAFE 0x0800
532
533 /* If set, extra latin codes (128..159) are accepted as a valid code
534 on input. */
535 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
536
537 #define CODING_ISO_FLAG_COMPOSITION 0x2000
538
539 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
540
541 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
542
543 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
544
545 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
546
547 /* A character to be produced on output if encoding of the original
548 character is prohibited by CODING_ISO_FLAG_SAFE. */
549 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
550
551
552 /* UTF-16 section */
553 #define CODING_UTF_16_BOM(coding) \
554 ((coding)->spec.utf_16.bom)
555
556 #define CODING_UTF_16_ENDIAN(coding) \
557 ((coding)->spec.utf_16.endian)
558
559 #define CODING_UTF_16_SURROGATE(coding) \
560 ((coding)->spec.utf_16.surrogate)
561
562
563 /* CCL section */
564 #define CODING_CCL_DECODER(coding) \
565 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
566 #define CODING_CCL_ENCODER(coding) \
567 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
568 #define CODING_CCL_VALIDS(coding) \
569 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
570
571 /* Index for each coding category in `coding_categories' */
572
573 enum coding_category
574 {
575 coding_category_iso_7,
576 coding_category_iso_7_tight,
577 coding_category_iso_8_1,
578 coding_category_iso_8_2,
579 coding_category_iso_7_else,
580 coding_category_iso_8_else,
581 coding_category_utf_8,
582 coding_category_utf_16_auto,
583 coding_category_utf_16_be,
584 coding_category_utf_16_le,
585 coding_category_utf_16_be_nosig,
586 coding_category_utf_16_le_nosig,
587 coding_category_charset,
588 coding_category_sjis,
589 coding_category_big5,
590 coding_category_ccl,
591 coding_category_emacs_mule,
592 /* All above are targets of code detection. */
593 coding_category_raw_text,
594 coding_category_undecided,
595 coding_category_max
596 };
597
598 /* Definitions of flag bits used in detect_coding_XXXX. */
599 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
600 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
601 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
602 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
603 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
604 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
605 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
606 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
607 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
608 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
609 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
610 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
611 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
612 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
613 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
614 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
615 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
616 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
617
618 /* This value is returned if detect_coding_mask () find nothing other
619 than ASCII characters. */
620 #define CATEGORY_MASK_ANY \
621 (CATEGORY_MASK_ISO_7 \
622 | CATEGORY_MASK_ISO_7_TIGHT \
623 | CATEGORY_MASK_ISO_8_1 \
624 | CATEGORY_MASK_ISO_8_2 \
625 | CATEGORY_MASK_ISO_7_ELSE \
626 | CATEGORY_MASK_ISO_8_ELSE \
627 | CATEGORY_MASK_UTF_8 \
628 | CATEGORY_MASK_UTF_16_BE \
629 | CATEGORY_MASK_UTF_16_LE \
630 | CATEGORY_MASK_UTF_16_BE_NOSIG \
631 | CATEGORY_MASK_UTF_16_LE_NOSIG \
632 | CATEGORY_MASK_CHARSET \
633 | CATEGORY_MASK_SJIS \
634 | CATEGORY_MASK_BIG5 \
635 | CATEGORY_MASK_CCL \
636 | CATEGORY_MASK_EMACS_MULE)
637
638
639 #define CATEGORY_MASK_ISO_7BIT \
640 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
641
642 #define CATEGORY_MASK_ISO_8BIT \
643 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
644
645 #define CATEGORY_MASK_ISO_ELSE \
646 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
647
648 #define CATEGORY_MASK_ISO_ESCAPE \
649 (CATEGORY_MASK_ISO_7 \
650 | CATEGORY_MASK_ISO_7_TIGHT \
651 | CATEGORY_MASK_ISO_7_ELSE \
652 | CATEGORY_MASK_ISO_8_ELSE)
653
654 #define CATEGORY_MASK_ISO \
655 ( CATEGORY_MASK_ISO_7BIT \
656 | CATEGORY_MASK_ISO_8BIT \
657 | CATEGORY_MASK_ISO_ELSE)
658
659 #define CATEGORY_MASK_UTF_16 \
660 (CATEGORY_MASK_UTF_16_BE \
661 | CATEGORY_MASK_UTF_16_LE \
662 | CATEGORY_MASK_UTF_16_BE_NOSIG \
663 | CATEGORY_MASK_UTF_16_LE_NOSIG)
664
665
666 /* List of symbols `coding-category-xxx' ordered by priority. This
667 variable is exposed to Emacs Lisp. */
668 static Lisp_Object Vcoding_category_list;
669
670 /* Table of coding categories (Lisp symbols). This variable is for
671 internal use oly. */
672 static Lisp_Object Vcoding_category_table;
673
674 /* Table of coding-categories ordered by priority. */
675 static enum coding_category coding_priorities[coding_category_max];
676
677 /* Nth element is a coding context for the coding system bound to the
678 Nth coding category. */
679 static struct coding_system coding_categories[coding_category_max];
680
681 /*** Commonly used macros and functions ***/
682
683 #ifndef min
684 #define min(a, b) ((a) < (b) ? (a) : (b))
685 #endif
686 #ifndef max
687 #define max(a, b) ((a) > (b) ? (a) : (b))
688 #endif
689
690 #define CODING_GET_INFO(coding, attrs, charset_list) \
691 do { \
692 (attrs) = CODING_ID_ATTRS ((coding)->id); \
693 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
694 } while (0)
695
696
697 /* Safely get one byte from the source text pointed by SRC which ends
698 at SRC_END, and set C to that byte. If there are not enough bytes
699 in the source, it jumps to `no_more_source'. If multibytep is
700 nonzero, and a multibyte character is found at SRC, set C to the
701 negative value of the character code. The caller should declare
702 and set these variables appropriately in advance:
703 src, src_end, multibytep */
704
705 #define ONE_MORE_BYTE(c) \
706 do { \
707 if (src == src_end) \
708 { \
709 if (src_base < src) \
710 record_conversion_result \
711 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
712 goto no_more_source; \
713 } \
714 c = *src++; \
715 if (multibytep && (c & 0x80)) \
716 { \
717 if ((c & 0xFE) == 0xC0) \
718 c = ((c & 1) << 6) | *src++; \
719 else \
720 { \
721 src--; \
722 c = - string_char (src, &src, NULL); \
723 record_conversion_result \
724 (coding, CODING_RESULT_INVALID_SRC); \
725 } \
726 } \
727 consumed_chars++; \
728 } while (0)
729
730
731 #define ONE_MORE_BYTE_NO_CHECK(c) \
732 do { \
733 c = *src++; \
734 if (multibytep && (c & 0x80)) \
735 { \
736 if ((c & 0xFE) == 0xC0) \
737 c = ((c & 1) << 6) | *src++; \
738 else \
739 { \
740 src--; \
741 c = - string_char (src, &src, NULL); \
742 record_conversion_result \
743 (coding, CODING_RESULT_INVALID_SRC); \
744 } \
745 } \
746 consumed_chars++; \
747 } while (0)
748
749
750 /* Store a byte C in the place pointed by DST and increment DST to the
751 next free point, and increment PRODUCED_CHARS. The caller should
752 assure that C is 0..127, and declare and set the variable `dst'
753 appropriately in advance.
754 */
755
756
757 #define EMIT_ONE_ASCII_BYTE(c) \
758 do { \
759 produced_chars++; \
760 *dst++ = (c); \
761 } while (0)
762
763
764 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
765
766 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
767 do { \
768 produced_chars += 2; \
769 *dst++ = (c1), *dst++ = (c2); \
770 } while (0)
771
772
773 /* Store a byte C in the place pointed by DST and increment DST to the
774 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
775 nonzero, store in an appropriate multibyte from. The caller should
776 declare and set the variables `dst' and `multibytep' appropriately
777 in advance. */
778
779 #define EMIT_ONE_BYTE(c) \
780 do { \
781 produced_chars++; \
782 if (multibytep) \
783 { \
784 int ch = (c); \
785 if (ch >= 0x80) \
786 ch = BYTE8_TO_CHAR (ch); \
787 CHAR_STRING_ADVANCE (ch, dst); \
788 } \
789 else \
790 *dst++ = (c); \
791 } while (0)
792
793
794 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
795
796 #define EMIT_TWO_BYTES(c1, c2) \
797 do { \
798 produced_chars += 2; \
799 if (multibytep) \
800 { \
801 int ch; \
802 \
803 ch = (c1); \
804 if (ch >= 0x80) \
805 ch = BYTE8_TO_CHAR (ch); \
806 CHAR_STRING_ADVANCE (ch, dst); \
807 ch = (c2); \
808 if (ch >= 0x80) \
809 ch = BYTE8_TO_CHAR (ch); \
810 CHAR_STRING_ADVANCE (ch, dst); \
811 } \
812 else \
813 { \
814 *dst++ = (c1); \
815 *dst++ = (c2); \
816 } \
817 } while (0)
818
819
820 #define EMIT_THREE_BYTES(c1, c2, c3) \
821 do { \
822 EMIT_ONE_BYTE (c1); \
823 EMIT_TWO_BYTES (c2, c3); \
824 } while (0)
825
826
827 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
828 do { \
829 EMIT_TWO_BYTES (c1, c2); \
830 EMIT_TWO_BYTES (c3, c4); \
831 } while (0)
832
833
834 /* Prototypes for static functions. */
835 static void record_conversion_result P_ ((struct coding_system *coding,
836 enum coding_result_code result));
837 static int detect_coding_utf_8 P_ ((struct coding_system *,
838 struct coding_detection_info *info));
839 static void decode_coding_utf_8 P_ ((struct coding_system *));
840 static int encode_coding_utf_8 P_ ((struct coding_system *));
841
842 static int detect_coding_utf_16 P_ ((struct coding_system *,
843 struct coding_detection_info *info));
844 static void decode_coding_utf_16 P_ ((struct coding_system *));
845 static int encode_coding_utf_16 P_ ((struct coding_system *));
846
847 static int detect_coding_iso_2022 P_ ((struct coding_system *,
848 struct coding_detection_info *info));
849 static void decode_coding_iso_2022 P_ ((struct coding_system *));
850 static int encode_coding_iso_2022 P_ ((struct coding_system *));
851
852 static int detect_coding_emacs_mule P_ ((struct coding_system *,
853 struct coding_detection_info *info));
854 static void decode_coding_emacs_mule P_ ((struct coding_system *));
855 static int encode_coding_emacs_mule P_ ((struct coding_system *));
856
857 static int detect_coding_sjis P_ ((struct coding_system *,
858 struct coding_detection_info *info));
859 static void decode_coding_sjis P_ ((struct coding_system *));
860 static int encode_coding_sjis P_ ((struct coding_system *));
861
862 static int detect_coding_big5 P_ ((struct coding_system *,
863 struct coding_detection_info *info));
864 static void decode_coding_big5 P_ ((struct coding_system *));
865 static int encode_coding_big5 P_ ((struct coding_system *));
866
867 static int detect_coding_ccl P_ ((struct coding_system *,
868 struct coding_detection_info *info));
869 static void decode_coding_ccl P_ ((struct coding_system *));
870 static int encode_coding_ccl P_ ((struct coding_system *));
871
872 static void decode_coding_raw_text P_ ((struct coding_system *));
873 static int encode_coding_raw_text P_ ((struct coding_system *));
874
875 static void coding_set_source P_ ((struct coding_system *));
876 static void coding_set_destination P_ ((struct coding_system *));
877 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
878 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
879 EMACS_INT, EMACS_INT));
880 static unsigned char *alloc_destination P_ ((struct coding_system *,
881 EMACS_INT, unsigned char *));
882 static void setup_iso_safe_charsets P_ ((Lisp_Object));
883 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
884 int *, int *,
885 unsigned char *));
886 static int detect_eol P_ ((const unsigned char *,
887 EMACS_INT, enum coding_category));
888 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
889 static void decode_eol P_ ((struct coding_system *));
890 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
891 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
892 int, int *, int *));
893 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
894 static INLINE void produce_composition P_ ((struct coding_system *, int *,
895 EMACS_INT));
896 static INLINE void produce_charset P_ ((struct coding_system *, int *,
897 EMACS_INT));
898 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
899 static int decode_coding P_ ((struct coding_system *));
900 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
901 struct coding_system *,
902 int *, EMACS_INT *));
903 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
904 struct coding_system *,
905 int *, EMACS_INT *));
906 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
907 static int encode_coding P_ ((struct coding_system *));
908 static Lisp_Object make_conversion_work_buffer P_ ((int));
909 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
910 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
911 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
912
913 static void
914 record_conversion_result (struct coding_system *coding,
915 enum coding_result_code result)
916 {
917 coding->result = result;
918 switch (result)
919 {
920 case CODING_RESULT_INSUFFICIENT_SRC:
921 Vlast_code_conversion_error = Qinsufficient_source;
922 break;
923 case CODING_RESULT_INCONSISTENT_EOL:
924 Vlast_code_conversion_error = Qinconsistent_eol;
925 break;
926 case CODING_RESULT_INVALID_SRC:
927 Vlast_code_conversion_error = Qinvalid_source;
928 break;
929 case CODING_RESULT_INTERRUPT:
930 Vlast_code_conversion_error = Qinterrupted;
931 break;
932 case CODING_RESULT_INSUFFICIENT_MEM:
933 Vlast_code_conversion_error = Qinsufficient_memory;
934 break;
935 default:
936 Vlast_code_conversion_error = intern ("Unknown error");
937 }
938 }
939
940 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
941 do { \
942 charset_map_loaded = 0; \
943 c = DECODE_CHAR (charset, code); \
944 if (charset_map_loaded) \
945 { \
946 const unsigned char *orig = coding->source; \
947 EMACS_INT offset; \
948 \
949 coding_set_source (coding); \
950 offset = coding->source - orig; \
951 src += offset; \
952 src_base += offset; \
953 src_end += offset; \
954 } \
955 } while (0)
956
957
958 /* If there are at least BYTES length of room at dst, allocate memory
959 for coding->destination and update dst and dst_end. We don't have
960 to take care of coding->source which will be relocated. It is
961 handled by calling coding_set_source in encode_coding. */
962
963 #define ASSURE_DESTINATION(bytes) \
964 do { \
965 if (dst + (bytes) >= dst_end) \
966 { \
967 int more_bytes = charbuf_end - charbuf + (bytes); \
968 \
969 dst = alloc_destination (coding, more_bytes, dst); \
970 dst_end = coding->destination + coding->dst_bytes; \
971 } \
972 } while (0)
973
974
975 /* Store multibyte form of the character C in P, and advance P to the
976 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
977 never calls MAYBE_UNIFY_CHAR. */
978
979 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) \
980 do { \
981 if ((c) <= MAX_1_BYTE_CHAR) \
982 *(p)++ = (c); \
983 else if ((c) <= MAX_2_BYTE_CHAR) \
984 *(p)++ = (0xC0 | ((c) >> 6)), \
985 *(p)++ = (0x80 | ((c) & 0x3F)); \
986 else if ((c) <= MAX_3_BYTE_CHAR) \
987 *(p)++ = (0xE0 | ((c) >> 12)), \
988 *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \
989 *(p)++ = (0x80 | ((c) & 0x3F)); \
990 else if ((c) <= MAX_4_BYTE_CHAR) \
991 *(p)++ = (0xF0 | (c >> 18)), \
992 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
993 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
994 *(p)++ = (0x80 | (c & 0x3F)); \
995 else if ((c) <= MAX_5_BYTE_CHAR) \
996 *(p)++ = 0xF8, \
997 *(p)++ = (0x80 | ((c >> 18) & 0x0F)), \
998 *(p)++ = (0x80 | ((c >> 12) & 0x3F)), \
999 *(p)++ = (0x80 | ((c >> 6) & 0x3F)), \
1000 *(p)++ = (0x80 | (c & 0x3F)); \
1001 else \
1002 (p) += BYTE8_STRING ((c) - 0x3FFF80, p); \
1003 } while (0)
1004
1005
1006 /* Return the character code of character whose multibyte form is at
1007 P, and advance P to the end of the multibyte form. This is like
1008 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1009
1010 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1011 (!((p)[0] & 0x80) \
1012 ? *(p)++ \
1013 : ! ((p)[0] & 0x20) \
1014 ? ((p) += 2, \
1015 ((((p)[-2] & 0x1F) << 6) \
1016 | ((p)[-1] & 0x3F) \
1017 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1018 : ! ((p)[0] & 0x10) \
1019 ? ((p) += 3, \
1020 ((((p)[-3] & 0x0F) << 12) \
1021 | (((p)[-2] & 0x3F) << 6) \
1022 | ((p)[-1] & 0x3F))) \
1023 : ! ((p)[0] & 0x08) \
1024 ? ((p) += 4, \
1025 ((((p)[-4] & 0xF) << 18) \
1026 | (((p)[-3] & 0x3F) << 12) \
1027 | (((p)[-2] & 0x3F) << 6) \
1028 | ((p)[-1] & 0x3F))) \
1029 : ((p) += 5, \
1030 ((((p)[-4] & 0x3F) << 18) \
1031 | (((p)[-3] & 0x3F) << 12) \
1032 | (((p)[-2] & 0x3F) << 6) \
1033 | ((p)[-1] & 0x3F))))
1034
1035
1036 static void
1037 coding_set_source (coding)
1038 struct coding_system *coding;
1039 {
1040 if (BUFFERP (coding->src_object))
1041 {
1042 struct buffer *buf = XBUFFER (coding->src_object);
1043
1044 if (coding->src_pos < 0)
1045 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1046 else
1047 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1048 }
1049 else if (STRINGP (coding->src_object))
1050 {
1051 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1052 }
1053 else
1054 /* Otherwise, the source is C string and is never relocated
1055 automatically. Thus we don't have to update anything. */
1056 ;
1057 }
1058
1059 static void
1060 coding_set_destination (coding)
1061 struct coding_system *coding;
1062 {
1063 if (BUFFERP (coding->dst_object))
1064 {
1065 if (coding->src_pos < 0)
1066 {
1067 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1068 coding->dst_bytes = (GAP_END_ADDR
1069 - (coding->src_bytes - coding->consumed)
1070 - coding->destination);
1071 }
1072 else
1073 {
1074 /* We are sure that coding->dst_pos_byte is before the gap
1075 of the buffer. */
1076 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1077 + coding->dst_pos_byte - BEG_BYTE);
1078 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1079 - coding->destination);
1080 }
1081 }
1082 else
1083 /* Otherwise, the destination is C string and is never relocated
1084 automatically. Thus we don't have to update anything. */
1085 ;
1086 }
1087
1088
1089 static void
1090 coding_alloc_by_realloc (coding, bytes)
1091 struct coding_system *coding;
1092 EMACS_INT bytes;
1093 {
1094 coding->destination = (unsigned char *) xrealloc (coding->destination,
1095 coding->dst_bytes + bytes);
1096 coding->dst_bytes += bytes;
1097 }
1098
1099 static void
1100 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1101 struct coding_system *coding;
1102 EMACS_INT gap_head_used, bytes;
1103 {
1104 if (EQ (coding->src_object, coding->dst_object))
1105 {
1106 /* The gap may contain the produced data at the head and not-yet
1107 consumed data at the tail. To preserve those data, we at
1108 first make the gap size to zero, then increase the gap
1109 size. */
1110 EMACS_INT add = GAP_SIZE;
1111
1112 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1113 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1114 make_gap (bytes);
1115 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1116 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1117 }
1118 else
1119 {
1120 Lisp_Object this_buffer;
1121
1122 this_buffer = Fcurrent_buffer ();
1123 set_buffer_internal (XBUFFER (coding->dst_object));
1124 make_gap (bytes);
1125 set_buffer_internal (XBUFFER (this_buffer));
1126 }
1127 }
1128
1129
1130 static unsigned char *
1131 alloc_destination (coding, nbytes, dst)
1132 struct coding_system *coding;
1133 EMACS_INT nbytes;
1134 unsigned char *dst;
1135 {
1136 EMACS_INT offset = dst - coding->destination;
1137
1138 if (BUFFERP (coding->dst_object))
1139 {
1140 struct buffer *buf = XBUFFER (coding->dst_object);
1141
1142 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1143 }
1144 else
1145 coding_alloc_by_realloc (coding, nbytes);
1146 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1147 coding_set_destination (coding);
1148 dst = coding->destination + offset;
1149 return dst;
1150 }
1151
1152 /** Macros for annotations. */
1153
1154 /* Maximum length of annotation data (sum of annotations for
1155 composition and charset). */
1156 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1157
1158 /* An annotation data is stored in the array coding->charbuf in this
1159 format:
1160 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1161 LENGTH is the number of elements in the annotation.
1162 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1163 NCHARS is the number of characters in the text annotated.
1164
1165 The format of the following elements depend on ANNOTATION_MASK.
1166
1167 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1168 follows:
1169 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1170 METHOD is one of enum composition_method.
1171 Optionnal COMPOSITION-COMPONENTS are characters and composition
1172 rules.
1173
1174 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1175 follows. */
1176
1177 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1178 do { \
1179 *(buf)++ = -(len); \
1180 *(buf)++ = (mask); \
1181 *(buf)++ = (nchars); \
1182 coding->annotated = 1; \
1183 } while (0);
1184
1185 #define ADD_COMPOSITION_DATA(buf, nchars, method) \
1186 do { \
1187 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1188 *buf++ = method; \
1189 } while (0)
1190
1191
1192 #define ADD_CHARSET_DATA(buf, nchars, id) \
1193 do { \
1194 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1195 *buf++ = id; \
1196 } while (0)
1197
1198 \f
1199 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1200
1201
1202
1203 \f
1204 /*** 3. UTF-8 ***/
1205
1206 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1207 Check if a text is encoded in UTF-8. If it is, return 1, else
1208 return 0. */
1209
1210 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1211 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1212 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1213 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1214 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1215 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1216
1217 static int
1218 detect_coding_utf_8 (coding, detect_info)
1219 struct coding_system *coding;
1220 struct coding_detection_info *detect_info;
1221 {
1222 const unsigned char *src = coding->source, *src_base;
1223 const unsigned char *src_end = coding->source + coding->src_bytes;
1224 int multibytep = coding->src_multibyte;
1225 int consumed_chars = 0;
1226 int found = 0;
1227
1228 detect_info->checked |= CATEGORY_MASK_UTF_8;
1229 /* A coding system of this category is always ASCII compatible. */
1230 src += coding->head_ascii;
1231
1232 while (1)
1233 {
1234 int c, c1, c2, c3, c4;
1235
1236 src_base = src;
1237 ONE_MORE_BYTE (c);
1238 if (c < 0 || UTF_8_1_OCTET_P (c))
1239 continue;
1240 ONE_MORE_BYTE (c1);
1241 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1242 break;
1243 if (UTF_8_2_OCTET_LEADING_P (c))
1244 {
1245 found = CATEGORY_MASK_UTF_8;
1246 continue;
1247 }
1248 ONE_MORE_BYTE (c2);
1249 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1250 break;
1251 if (UTF_8_3_OCTET_LEADING_P (c))
1252 {
1253 found = CATEGORY_MASK_UTF_8;
1254 continue;
1255 }
1256 ONE_MORE_BYTE (c3);
1257 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1258 break;
1259 if (UTF_8_4_OCTET_LEADING_P (c))
1260 {
1261 found = CATEGORY_MASK_UTF_8;
1262 continue;
1263 }
1264 ONE_MORE_BYTE (c4);
1265 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1266 break;
1267 if (UTF_8_5_OCTET_LEADING_P (c))
1268 {
1269 found = CATEGORY_MASK_UTF_8;
1270 continue;
1271 }
1272 break;
1273 }
1274 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1275 return 0;
1276
1277 no_more_source:
1278 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1279 {
1280 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1281 return 0;
1282 }
1283 detect_info->found |= found;
1284 return 1;
1285 }
1286
1287
1288 static void
1289 decode_coding_utf_8 (coding)
1290 struct coding_system *coding;
1291 {
1292 const unsigned char *src = coding->source + coding->consumed;
1293 const unsigned char *src_end = coding->source + coding->src_bytes;
1294 const unsigned char *src_base;
1295 int *charbuf = coding->charbuf + coding->charbuf_used;
1296 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1297 int consumed_chars = 0, consumed_chars_base;
1298 int multibytep = coding->src_multibyte;
1299 Lisp_Object attr, charset_list;
1300 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1301 int byte_after_cr = -1;
1302
1303 CODING_GET_INFO (coding, attr, charset_list);
1304
1305 while (1)
1306 {
1307 int c, c1, c2, c3, c4, c5;
1308
1309 src_base = src;
1310 consumed_chars_base = consumed_chars;
1311
1312 if (charbuf >= charbuf_end)
1313 break;
1314
1315 if (byte_after_cr >= 0)
1316 c1 = byte_after_cr, byte_after_cr = -1;
1317 else
1318 ONE_MORE_BYTE (c1);
1319 if (c1 < 0)
1320 {
1321 c = - c1;
1322 }
1323 else if (UTF_8_1_OCTET_P(c1))
1324 {
1325 if (eol_crlf && c1 == '\r')
1326 ONE_MORE_BYTE (byte_after_cr);
1327 c = c1;
1328 }
1329 else
1330 {
1331 ONE_MORE_BYTE (c2);
1332 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1333 goto invalid_code;
1334 if (UTF_8_2_OCTET_LEADING_P (c1))
1335 {
1336 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1337 /* Reject overlong sequences here and below. Encoders
1338 producing them are incorrect, they can be misleading,
1339 and they mess up read/write invariance. */
1340 if (c < 128)
1341 goto invalid_code;
1342 }
1343 else
1344 {
1345 ONE_MORE_BYTE (c3);
1346 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1347 goto invalid_code;
1348 if (UTF_8_3_OCTET_LEADING_P (c1))
1349 {
1350 c = (((c1 & 0xF) << 12)
1351 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1352 if (c < 0x800
1353 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1354 goto invalid_code;
1355 }
1356 else
1357 {
1358 ONE_MORE_BYTE (c4);
1359 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1360 goto invalid_code;
1361 if (UTF_8_4_OCTET_LEADING_P (c1))
1362 {
1363 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1364 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1365 if (c < 0x10000)
1366 goto invalid_code;
1367 }
1368 else
1369 {
1370 ONE_MORE_BYTE (c5);
1371 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1372 goto invalid_code;
1373 if (UTF_8_5_OCTET_LEADING_P (c1))
1374 {
1375 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1376 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1377 | (c5 & 0x3F));
1378 if ((c > MAX_CHAR) || (c < 0x200000))
1379 goto invalid_code;
1380 }
1381 else
1382 goto invalid_code;
1383 }
1384 }
1385 }
1386 }
1387
1388 *charbuf++ = c;
1389 continue;
1390
1391 invalid_code:
1392 src = src_base;
1393 consumed_chars = consumed_chars_base;
1394 ONE_MORE_BYTE (c);
1395 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1396 coding->errors++;
1397 }
1398
1399 no_more_source:
1400 coding->consumed_char += consumed_chars_base;
1401 coding->consumed = src_base - coding->source;
1402 coding->charbuf_used = charbuf - coding->charbuf;
1403 }
1404
1405
1406 static int
1407 encode_coding_utf_8 (coding)
1408 struct coding_system *coding;
1409 {
1410 int multibytep = coding->dst_multibyte;
1411 int *charbuf = coding->charbuf;
1412 int *charbuf_end = charbuf + coding->charbuf_used;
1413 unsigned char *dst = coding->destination + coding->produced;
1414 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1415 int produced_chars = 0;
1416 int c;
1417
1418 if (multibytep)
1419 {
1420 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1421
1422 while (charbuf < charbuf_end)
1423 {
1424 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1425
1426 ASSURE_DESTINATION (safe_room);
1427 c = *charbuf++;
1428 if (CHAR_BYTE8_P (c))
1429 {
1430 c = CHAR_TO_BYTE8 (c);
1431 EMIT_ONE_BYTE (c);
1432 }
1433 else
1434 {
1435 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1436 for (p = str; p < pend; p++)
1437 EMIT_ONE_BYTE (*p);
1438 }
1439 }
1440 }
1441 else
1442 {
1443 int safe_room = MAX_MULTIBYTE_LENGTH;
1444
1445 while (charbuf < charbuf_end)
1446 {
1447 ASSURE_DESTINATION (safe_room);
1448 c = *charbuf++;
1449 if (CHAR_BYTE8_P (c))
1450 *dst++ = CHAR_TO_BYTE8 (c);
1451 else
1452 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1453 produced_chars++;
1454 }
1455 }
1456 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1457 coding->produced_char += produced_chars;
1458 coding->produced = dst - coding->destination;
1459 return 0;
1460 }
1461
1462
1463 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1464 Check if a text is encoded in one of UTF-16 based coding systems.
1465 If it is, return 1, else return 0. */
1466
1467 #define UTF_16_HIGH_SURROGATE_P(val) \
1468 (((val) & 0xFC00) == 0xD800)
1469
1470 #define UTF_16_LOW_SURROGATE_P(val) \
1471 (((val) & 0xFC00) == 0xDC00)
1472
1473 #define UTF_16_INVALID_P(val) \
1474 (((val) == 0xFFFE) \
1475 || ((val) == 0xFFFF) \
1476 || UTF_16_LOW_SURROGATE_P (val))
1477
1478
1479 static int
1480 detect_coding_utf_16 (coding, detect_info)
1481 struct coding_system *coding;
1482 struct coding_detection_info *detect_info;
1483 {
1484 const unsigned char *src = coding->source, *src_base = src;
1485 const unsigned char *src_end = coding->source + coding->src_bytes;
1486 int multibytep = coding->src_multibyte;
1487 int consumed_chars = 0;
1488 int c1, c2;
1489
1490 detect_info->checked |= CATEGORY_MASK_UTF_16;
1491 if (coding->mode & CODING_MODE_LAST_BLOCK
1492 && (coding->src_chars & 1))
1493 {
1494 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1495 return 0;
1496 }
1497
1498 ONE_MORE_BYTE (c1);
1499 ONE_MORE_BYTE (c2);
1500 if ((c1 == 0xFF) && (c2 == 0xFE))
1501 {
1502 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1503 | CATEGORY_MASK_UTF_16_AUTO);
1504 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1505 | CATEGORY_MASK_UTF_16_BE_NOSIG
1506 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1507 }
1508 else if ((c1 == 0xFE) && (c2 == 0xFF))
1509 {
1510 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1511 | CATEGORY_MASK_UTF_16_AUTO);
1512 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1513 | CATEGORY_MASK_UTF_16_BE_NOSIG
1514 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1515 }
1516 else if (c1 >= 0 && c2 >= 0)
1517 {
1518 detect_info->rejected
1519 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1520 }
1521 no_more_source:
1522 return 1;
1523 }
1524
1525 static void
1526 decode_coding_utf_16 (coding)
1527 struct coding_system *coding;
1528 {
1529 const unsigned char *src = coding->source + coding->consumed;
1530 const unsigned char *src_end = coding->source + coding->src_bytes;
1531 const unsigned char *src_base;
1532 int *charbuf = coding->charbuf + coding->charbuf_used;
1533 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1534 int consumed_chars = 0, consumed_chars_base;
1535 int multibytep = coding->src_multibyte;
1536 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1537 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1538 int surrogate = CODING_UTF_16_SURROGATE (coding);
1539 Lisp_Object attr, charset_list;
1540 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1541 int byte_after_cr1 = -1, byte_after_cr2 = -1;
1542
1543 CODING_GET_INFO (coding, attr, charset_list);
1544
1545 if (bom == utf_16_with_bom)
1546 {
1547 int c, c1, c2;
1548
1549 src_base = src;
1550 ONE_MORE_BYTE (c1);
1551 ONE_MORE_BYTE (c2);
1552 c = (c1 << 8) | c2;
1553
1554 if (endian == utf_16_big_endian
1555 ? c != 0xFEFF : c != 0xFFFE)
1556 {
1557 /* The first two bytes are not BOM. Treat them as bytes
1558 for a normal character. */
1559 src = src_base;
1560 coding->errors++;
1561 }
1562 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1563 }
1564 else if (bom == utf_16_detect_bom)
1565 {
1566 /* We have already tried to detect BOM and failed in
1567 detect_coding. */
1568 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1569 }
1570
1571 while (1)
1572 {
1573 int c, c1, c2;
1574
1575 src_base = src;
1576 consumed_chars_base = consumed_chars;
1577
1578 if (charbuf + 2 >= charbuf_end)
1579 break;
1580
1581 if (byte_after_cr1 >= 0)
1582 c1 = byte_after_cr1, byte_after_cr1 = -1;
1583 else
1584 ONE_MORE_BYTE (c1);
1585 if (c1 < 0)
1586 {
1587 *charbuf++ = -c1;
1588 continue;
1589 }
1590 if (byte_after_cr2 >= 0)
1591 c2 = byte_after_cr2, byte_after_cr2 = -1;
1592 else
1593 ONE_MORE_BYTE (c2);
1594 if (c2 < 0)
1595 {
1596 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1597 *charbuf++ = -c2;
1598 continue;
1599 }
1600 c = (endian == utf_16_big_endian
1601 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1602
1603 if (surrogate)
1604 {
1605 if (! UTF_16_LOW_SURROGATE_P (c))
1606 {
1607 if (endian == utf_16_big_endian)
1608 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1609 else
1610 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1611 *charbuf++ = c1;
1612 *charbuf++ = c2;
1613 coding->errors++;
1614 if (UTF_16_HIGH_SURROGATE_P (c))
1615 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1616 else
1617 *charbuf++ = c;
1618 }
1619 else
1620 {
1621 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1622 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1623 *charbuf++ = 0x10000 + c;
1624 }
1625 }
1626 else
1627 {
1628 if (UTF_16_HIGH_SURROGATE_P (c))
1629 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1630 else
1631 {
1632 if (eol_crlf && c == '\r')
1633 {
1634 ONE_MORE_BYTE (byte_after_cr1);
1635 ONE_MORE_BYTE (byte_after_cr2);
1636 }
1637 *charbuf++ = c;
1638 }
1639 }
1640 }
1641
1642 no_more_source:
1643 coding->consumed_char += consumed_chars_base;
1644 coding->consumed = src_base - coding->source;
1645 coding->charbuf_used = charbuf - coding->charbuf;
1646 }
1647
1648 static int
1649 encode_coding_utf_16 (coding)
1650 struct coding_system *coding;
1651 {
1652 int multibytep = coding->dst_multibyte;
1653 int *charbuf = coding->charbuf;
1654 int *charbuf_end = charbuf + coding->charbuf_used;
1655 unsigned char *dst = coding->destination + coding->produced;
1656 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1657 int safe_room = 8;
1658 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1659 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1660 int produced_chars = 0;
1661 Lisp_Object attrs, charset_list;
1662 int c;
1663
1664 CODING_GET_INFO (coding, attrs, charset_list);
1665
1666 if (bom != utf_16_without_bom)
1667 {
1668 ASSURE_DESTINATION (safe_room);
1669 if (big_endian)
1670 EMIT_TWO_BYTES (0xFE, 0xFF);
1671 else
1672 EMIT_TWO_BYTES (0xFF, 0xFE);
1673 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1674 }
1675
1676 while (charbuf < charbuf_end)
1677 {
1678 ASSURE_DESTINATION (safe_room);
1679 c = *charbuf++;
1680 if (c >= MAX_UNICODE_CHAR)
1681 c = coding->default_char;
1682
1683 if (c < 0x10000)
1684 {
1685 if (big_endian)
1686 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1687 else
1688 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1689 }
1690 else
1691 {
1692 int c1, c2;
1693
1694 c -= 0x10000;
1695 c1 = (c >> 10) + 0xD800;
1696 c2 = (c & 0x3FF) + 0xDC00;
1697 if (big_endian)
1698 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1699 else
1700 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1701 }
1702 }
1703 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1704 coding->produced = dst - coding->destination;
1705 coding->produced_char += produced_chars;
1706 return 0;
1707 }
1708
1709 \f
1710 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1711
1712 /* Emacs' internal format for representation of multiple character
1713 sets is a kind of multi-byte encoding, i.e. characters are
1714 represented by variable-length sequences of one-byte codes.
1715
1716 ASCII characters and control characters (e.g. `tab', `newline') are
1717 represented by one-byte sequences which are their ASCII codes, in
1718 the range 0x00 through 0x7F.
1719
1720 8-bit characters of the range 0x80..0x9F are represented by
1721 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1722 code + 0x20).
1723
1724 8-bit characters of the range 0xA0..0xFF are represented by
1725 one-byte sequences which are their 8-bit code.
1726
1727 The other characters are represented by a sequence of `base
1728 leading-code', optional `extended leading-code', and one or two
1729 `position-code's. The length of the sequence is determined by the
1730 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1731 whereas extended leading-code and position-code take the range 0xA0
1732 through 0xFF. See `charset.h' for more details about leading-code
1733 and position-code.
1734
1735 --- CODE RANGE of Emacs' internal format ---
1736 character set range
1737 ------------- -----
1738 ascii 0x00..0x7F
1739 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1740 eight-bit-graphic 0xA0..0xBF
1741 ELSE 0x81..0x9D + [0xA0..0xFF]+
1742 ---------------------------------------------
1743
1744 As this is the internal character representation, the format is
1745 usually not used externally (i.e. in a file or in a data sent to a
1746 process). But, it is possible to have a text externally in this
1747 format (i.e. by encoding by the coding system `emacs-mule').
1748
1749 In that case, a sequence of one-byte codes has a slightly different
1750 form.
1751
1752 At first, all characters in eight-bit-control are represented by
1753 one-byte sequences which are their 8-bit code.
1754
1755 Next, character composition data are represented by the byte
1756 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1757 where,
1758 METHOD is 0xF0 plus one of composition method (enum
1759 composition_method),
1760
1761 BYTES is 0xA0 plus a byte length of this composition data,
1762
1763 CHARS is 0x20 plus a number of characters composed by this
1764 data,
1765
1766 COMPONENTs are characters of multibye form or composition
1767 rules encoded by two-byte of ASCII codes.
1768
1769 In addition, for backward compatibility, the following formats are
1770 also recognized as composition data on decoding.
1771
1772 0x80 MSEQ ...
1773 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1774
1775 Here,
1776 MSEQ is a multibyte form but in these special format:
1777 ASCII: 0xA0 ASCII_CODE+0x80,
1778 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1779 RULE is a one byte code of the range 0xA0..0xF0 that
1780 represents a composition rule.
1781 */
1782
1783 char emacs_mule_bytes[256];
1784
1785 int
1786 emacs_mule_char (coding, src, nbytes, nchars, id)
1787 struct coding_system *coding;
1788 const unsigned char *src;
1789 int *nbytes, *nchars, *id;
1790 {
1791 const unsigned char *src_end = coding->source + coding->src_bytes;
1792 const unsigned char *src_base = src;
1793 int multibytep = coding->src_multibyte;
1794 struct charset *charset;
1795 unsigned code;
1796 int c;
1797 int consumed_chars = 0;
1798
1799 ONE_MORE_BYTE (c);
1800 if (c < 0)
1801 {
1802 c = -c;
1803 charset = emacs_mule_charset[0];
1804 }
1805 else
1806 {
1807 if (c >= 0xA0)
1808 {
1809 /* Old style component character of a composition. */
1810 if (c == 0xA0)
1811 {
1812 ONE_MORE_BYTE (c);
1813 c -= 0x80;
1814 }
1815 else
1816 c -= 0x20;
1817 }
1818
1819 switch (emacs_mule_bytes[c])
1820 {
1821 case 2:
1822 if (! (charset = emacs_mule_charset[c]))
1823 goto invalid_code;
1824 ONE_MORE_BYTE (c);
1825 if (c < 0xA0)
1826 goto invalid_code;
1827 code = c & 0x7F;
1828 break;
1829
1830 case 3:
1831 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1832 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1833 {
1834 ONE_MORE_BYTE (c);
1835 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1836 goto invalid_code;
1837 ONE_MORE_BYTE (c);
1838 if (c < 0xA0)
1839 goto invalid_code;
1840 code = c & 0x7F;
1841 }
1842 else
1843 {
1844 if (! (charset = emacs_mule_charset[c]))
1845 goto invalid_code;
1846 ONE_MORE_BYTE (c);
1847 if (c < 0xA0)
1848 goto invalid_code;
1849 code = (c & 0x7F) << 8;
1850 ONE_MORE_BYTE (c);
1851 if (c < 0xA0)
1852 goto invalid_code;
1853 code |= c & 0x7F;
1854 }
1855 break;
1856
1857 case 4:
1858 ONE_MORE_BYTE (c);
1859 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1860 goto invalid_code;
1861 ONE_MORE_BYTE (c);
1862 if (c < 0xA0)
1863 goto invalid_code;
1864 code = (c & 0x7F) << 8;
1865 ONE_MORE_BYTE (c);
1866 if (c < 0xA0)
1867 goto invalid_code;
1868 code |= c & 0x7F;
1869 break;
1870
1871 case 1:
1872 code = c;
1873 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1874 ? charset_ascii : charset_eight_bit);
1875 break;
1876
1877 default:
1878 abort ();
1879 }
1880 c = DECODE_CHAR (charset, code);
1881 if (c < 0)
1882 goto invalid_code;
1883 }
1884 *nbytes = src - src_base;
1885 *nchars = consumed_chars;
1886 if (id)
1887 *id = charset->id;
1888 return c;
1889
1890 no_more_source:
1891 return -2;
1892
1893 invalid_code:
1894 return -1;
1895 }
1896
1897
1898 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1899 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1900 else return 0. */
1901
1902 static int
1903 detect_coding_emacs_mule (coding, detect_info)
1904 struct coding_system *coding;
1905 struct coding_detection_info *detect_info;
1906 {
1907 const unsigned char *src = coding->source, *src_base;
1908 const unsigned char *src_end = coding->source + coding->src_bytes;
1909 int multibytep = coding->src_multibyte;
1910 int consumed_chars = 0;
1911 int c;
1912 int found = 0;
1913
1914 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1915 /* A coding system of this category is always ASCII compatible. */
1916 src += coding->head_ascii;
1917
1918 while (1)
1919 {
1920 src_base = src;
1921 ONE_MORE_BYTE (c);
1922 if (c < 0)
1923 continue;
1924 if (c == 0x80)
1925 {
1926 /* Perhaps the start of composite character. We simple skip
1927 it because analyzing it is too heavy for detecting. But,
1928 at least, we check that the composite character
1929 constitues of more than 4 bytes. */
1930 const unsigned char *src_base;
1931
1932 repeat:
1933 src_base = src;
1934 do
1935 {
1936 ONE_MORE_BYTE (c);
1937 }
1938 while (c >= 0xA0);
1939
1940 if (src - src_base <= 4)
1941 break;
1942 found = CATEGORY_MASK_EMACS_MULE;
1943 if (c == 0x80)
1944 goto repeat;
1945 }
1946
1947 if (c < 0x80)
1948 {
1949 if (c < 0x20
1950 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1951 break;
1952 }
1953 else
1954 {
1955 int more_bytes = emacs_mule_bytes[*src_base] - 1;
1956
1957 while (more_bytes > 0)
1958 {
1959 ONE_MORE_BYTE (c);
1960 if (c < 0xA0)
1961 {
1962 src--; /* Unread the last byte. */
1963 break;
1964 }
1965 more_bytes--;
1966 }
1967 if (more_bytes != 0)
1968 break;
1969 found = CATEGORY_MASK_EMACS_MULE;
1970 }
1971 }
1972 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1973 return 0;
1974
1975 no_more_source:
1976 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1977 {
1978 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1979 return 0;
1980 }
1981 detect_info->found |= found;
1982 return 1;
1983 }
1984
1985
1986 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1987
1988 /* Decode a character represented as a component of composition
1989 sequence of Emacs 20/21 style at SRC. Set C to that character and
1990 update SRC to the head of next character (or an encoded composition
1991 rule). If SRC doesn't points a composition component, set C to -1.
1992 If SRC points an invalid byte sequence, global exit by a return
1993 value 0. */
1994
1995 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1996 do \
1997 { \
1998 int c; \
1999 int nbytes, nchars; \
2000 \
2001 if (src == src_end) \
2002 break; \
2003 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
2004 if (c < 0) \
2005 { \
2006 if (c == -2) \
2007 break; \
2008 goto invalid_code; \
2009 } \
2010 *buf++ = c; \
2011 src += nbytes; \
2012 consumed_chars += nchars; \
2013 } \
2014 while (0)
2015
2016
2017 /* Decode a composition rule represented as a component of composition
2018 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
2019 and increment BUF. If SRC points an invalid byte sequence, set C
2020 to -1. */
2021
2022 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
2023 do { \
2024 int c, gref, nref; \
2025 \
2026 if (src >= src_end) \
2027 goto invalid_code; \
2028 ONE_MORE_BYTE_NO_CHECK (c); \
2029 c -= 0xA0; \
2030 if (c < 0 || c >= 81) \
2031 goto invalid_code; \
2032 \
2033 gref = c / 9, nref = c % 9; \
2034 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2035 } while (0)
2036
2037
2038 /* Decode a composition rule represented as a component of composition
2039 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
2040 and increment BUF. If SRC points an invalid byte sequence, set C
2041 to -1. */
2042
2043 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
2044 do { \
2045 int gref, nref; \
2046 \
2047 if (src + 1>= src_end) \
2048 goto invalid_code; \
2049 ONE_MORE_BYTE_NO_CHECK (gref); \
2050 gref -= 0x20; \
2051 ONE_MORE_BYTE_NO_CHECK (nref); \
2052 nref -= 0x20; \
2053 if (gref < 0 || gref >= 81 \
2054 || nref < 0 || nref >= 81) \
2055 goto invalid_code; \
2056 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2057 } while (0)
2058
2059
2060 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
2061 do { \
2062 /* Emacs 21 style format. The first three bytes at SRC are \
2063 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
2064 the byte length of this composition information, CHARS is the \
2065 number of characters composed by this composition. */ \
2066 enum composition_method method = c - 0xF2; \
2067 int *charbuf_base = charbuf; \
2068 int consumed_chars_limit; \
2069 int nbytes, nchars; \
2070 \
2071 ONE_MORE_BYTE (c); \
2072 if (c < 0) \
2073 goto invalid_code; \
2074 nbytes = c - 0xA0; \
2075 if (nbytes < 3) \
2076 goto invalid_code; \
2077 ONE_MORE_BYTE (c); \
2078 if (c < 0) \
2079 goto invalid_code; \
2080 nchars = c - 0xA0; \
2081 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2082 consumed_chars_limit = consumed_chars_base + nbytes; \
2083 if (method != COMPOSITION_RELATIVE) \
2084 { \
2085 int i = 0; \
2086 while (consumed_chars < consumed_chars_limit) \
2087 { \
2088 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
2089 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
2090 else \
2091 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
2092 i++; \
2093 } \
2094 if (consumed_chars < consumed_chars_limit) \
2095 goto invalid_code; \
2096 charbuf_base[0] -= i; \
2097 } \
2098 } while (0)
2099
2100
2101 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2102 do { \
2103 /* Emacs 20 style format for relative composition. */ \
2104 /* Store multibyte form of characters to be composed. */ \
2105 enum composition_method method = COMPOSITION_RELATIVE; \
2106 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2107 int *buf = components; \
2108 int i, j; \
2109 \
2110 src = src_base; \
2111 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2112 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2113 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2114 if (i < 2) \
2115 goto invalid_code; \
2116 ADD_COMPOSITION_DATA (charbuf, i, method); \
2117 for (j = 0; j < i; j++) \
2118 *charbuf++ = components[j]; \
2119 } while (0)
2120
2121
2122 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2123 do { \
2124 /* Emacs 20 style format for rule-base composition. */ \
2125 /* Store multibyte form of characters to be composed. */ \
2126 enum composition_method method = COMPOSITION_WITH_RULE; \
2127 int *charbuf_base = charbuf; \
2128 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2129 int *buf = components; \
2130 int i, j; \
2131 \
2132 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2133 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
2134 { \
2135 if (*src < 0xA0) \
2136 break; \
2137 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
2138 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2139 } \
2140 if (i <= 1 || (buf - components) % 2 == 0) \
2141 goto invalid_code; \
2142 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
2143 goto no_more_source; \
2144 ADD_COMPOSITION_DATA (charbuf, i, method); \
2145 i = i * 2 - 1; \
2146 for (j = 0; j < i; j++) \
2147 *charbuf++ = components[j]; \
2148 charbuf_base[0] -= i; \
2149 for (j = 0; j < i; j += 2) \
2150 *charbuf++ = components[j]; \
2151 } while (0)
2152
2153
2154 static void
2155 decode_coding_emacs_mule (coding)
2156 struct coding_system *coding;
2157 {
2158 const unsigned char *src = coding->source + coding->consumed;
2159 const unsigned char *src_end = coding->source + coding->src_bytes;
2160 const unsigned char *src_base;
2161 int *charbuf = coding->charbuf + coding->charbuf_used;
2162 int *charbuf_end
2163 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2164 int consumed_chars = 0, consumed_chars_base;
2165 int multibytep = coding->src_multibyte;
2166 Lisp_Object attrs, charset_list;
2167 int char_offset = coding->produced_char;
2168 int last_offset = char_offset;
2169 int last_id = charset_ascii;
2170 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2171 int byte_after_cr = -1;
2172
2173 CODING_GET_INFO (coding, attrs, charset_list);
2174
2175 while (1)
2176 {
2177 int c;
2178
2179 src_base = src;
2180 consumed_chars_base = consumed_chars;
2181
2182 if (charbuf >= charbuf_end)
2183 break;
2184
2185 if (byte_after_cr >= 0)
2186 c = byte_after_cr, byte_after_cr = -1;
2187 else
2188 ONE_MORE_BYTE (c);
2189 if (c < 0)
2190 {
2191 *charbuf++ = -c;
2192 char_offset++;
2193 }
2194 else if (c < 0x80)
2195 {
2196 if (eol_crlf && c == '\r')
2197 ONE_MORE_BYTE (byte_after_cr);
2198 *charbuf++ = c;
2199 char_offset++;
2200 }
2201 else if (c == 0x80)
2202 {
2203 ONE_MORE_BYTE (c);
2204 if (c < 0)
2205 goto invalid_code;
2206 if (c - 0xF2 >= COMPOSITION_RELATIVE
2207 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2208 DECODE_EMACS_MULE_21_COMPOSITION (c);
2209 else if (c < 0xC0)
2210 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2211 else if (c == 0xFF)
2212 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2213 else
2214 goto invalid_code;
2215 }
2216 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2217 {
2218 int nbytes, nchars;
2219 int id;
2220
2221 src = src_base;
2222 consumed_chars = consumed_chars_base;
2223 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2224 if (c < 0)
2225 {
2226 if (c == -2)
2227 break;
2228 goto invalid_code;
2229 }
2230 if (last_id != id)
2231 {
2232 if (last_id != charset_ascii)
2233 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2234 last_id = id;
2235 last_offset = char_offset;
2236 }
2237 *charbuf++ = c;
2238 src += nbytes;
2239 consumed_chars += nchars;
2240 char_offset++;
2241 }
2242 else
2243 goto invalid_code;
2244 continue;
2245
2246 invalid_code:
2247 src = src_base;
2248 consumed_chars = consumed_chars_base;
2249 ONE_MORE_BYTE (c);
2250 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2251 char_offset++;
2252 coding->errors++;
2253 }
2254
2255 no_more_source:
2256 if (last_id != charset_ascii)
2257 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2258 coding->consumed_char += consumed_chars_base;
2259 coding->consumed = src_base - coding->source;
2260 coding->charbuf_used = charbuf - coding->charbuf;
2261 }
2262
2263
2264 #define EMACS_MULE_LEADING_CODES(id, codes) \
2265 do { \
2266 if (id < 0xA0) \
2267 codes[0] = id, codes[1] = 0; \
2268 else if (id < 0xE0) \
2269 codes[0] = 0x9A, codes[1] = id; \
2270 else if (id < 0xF0) \
2271 codes[0] = 0x9B, codes[1] = id; \
2272 else if (id < 0xF5) \
2273 codes[0] = 0x9C, codes[1] = id; \
2274 else \
2275 codes[0] = 0x9D, codes[1] = id; \
2276 } while (0);
2277
2278
2279 static int
2280 encode_coding_emacs_mule (coding)
2281 struct coding_system *coding;
2282 {
2283 int multibytep = coding->dst_multibyte;
2284 int *charbuf = coding->charbuf;
2285 int *charbuf_end = charbuf + coding->charbuf_used;
2286 unsigned char *dst = coding->destination + coding->produced;
2287 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2288 int safe_room = 8;
2289 int produced_chars = 0;
2290 Lisp_Object attrs, charset_list;
2291 int c;
2292 int preferred_charset_id = -1;
2293
2294 CODING_GET_INFO (coding, attrs, charset_list);
2295 if (! EQ (charset_list, Vemacs_mule_charset_list))
2296 {
2297 CODING_ATTR_CHARSET_LIST (attrs)
2298 = charset_list = Vemacs_mule_charset_list;
2299 }
2300
2301 while (charbuf < charbuf_end)
2302 {
2303 ASSURE_DESTINATION (safe_room);
2304 c = *charbuf++;
2305
2306 if (c < 0)
2307 {
2308 /* Handle an annotation. */
2309 switch (*charbuf)
2310 {
2311 case CODING_ANNOTATE_COMPOSITION_MASK:
2312 /* Not yet implemented. */
2313 break;
2314 case CODING_ANNOTATE_CHARSET_MASK:
2315 preferred_charset_id = charbuf[3];
2316 if (preferred_charset_id >= 0
2317 && NILP (Fmemq (make_number (preferred_charset_id),
2318 charset_list)))
2319 preferred_charset_id = -1;
2320 break;
2321 default:
2322 abort ();
2323 }
2324 charbuf += -c - 1;
2325 continue;
2326 }
2327
2328 if (ASCII_CHAR_P (c))
2329 EMIT_ONE_ASCII_BYTE (c);
2330 else if (CHAR_BYTE8_P (c))
2331 {
2332 c = CHAR_TO_BYTE8 (c);
2333 EMIT_ONE_BYTE (c);
2334 }
2335 else
2336 {
2337 struct charset *charset;
2338 unsigned code;
2339 int dimension;
2340 int emacs_mule_id;
2341 unsigned char leading_codes[2];
2342
2343 if (preferred_charset_id >= 0)
2344 {
2345 charset = CHARSET_FROM_ID (preferred_charset_id);
2346 if (! CHAR_CHARSET_P (c, charset))
2347 charset = char_charset (c, charset_list, NULL);
2348 }
2349 else
2350 charset = char_charset (c, charset_list, &code);
2351 if (! charset)
2352 {
2353 c = coding->default_char;
2354 if (ASCII_CHAR_P (c))
2355 {
2356 EMIT_ONE_ASCII_BYTE (c);
2357 continue;
2358 }
2359 charset = char_charset (c, charset_list, &code);
2360 }
2361 dimension = CHARSET_DIMENSION (charset);
2362 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2363 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2364 EMIT_ONE_BYTE (leading_codes[0]);
2365 if (leading_codes[1])
2366 EMIT_ONE_BYTE (leading_codes[1]);
2367 if (dimension == 1)
2368 EMIT_ONE_BYTE (code | 0x80);
2369 else
2370 {
2371 code |= 0x8080;
2372 EMIT_ONE_BYTE (code >> 8);
2373 EMIT_ONE_BYTE (code & 0xFF);
2374 }
2375 }
2376 }
2377 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2378 coding->produced_char += produced_chars;
2379 coding->produced = dst - coding->destination;
2380 return 0;
2381 }
2382
2383 \f
2384 /*** 7. ISO2022 handlers ***/
2385
2386 /* The following note describes the coding system ISO2022 briefly.
2387 Since the intention of this note is to help understand the
2388 functions in this file, some parts are NOT ACCURATE or are OVERLY
2389 SIMPLIFIED. For thorough understanding, please refer to the
2390 original document of ISO2022. This is equivalent to the standard
2391 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2392
2393 ISO2022 provides many mechanisms to encode several character sets
2394 in 7-bit and 8-bit environments. For 7-bit environments, all text
2395 is encoded using bytes less than 128. This may make the encoded
2396 text a little bit longer, but the text passes more easily through
2397 several types of gateway, some of which strip off the MSB (Most
2398 Significant Bit).
2399
2400 There are two kinds of character sets: control character sets and
2401 graphic character sets. The former contain control characters such
2402 as `newline' and `escape' to provide control functions (control
2403 functions are also provided by escape sequences). The latter
2404 contain graphic characters such as 'A' and '-'. Emacs recognizes
2405 two control character sets and many graphic character sets.
2406
2407 Graphic character sets are classified into one of the following
2408 four classes, according to the number of bytes (DIMENSION) and
2409 number of characters in one dimension (CHARS) of the set:
2410 - DIMENSION1_CHARS94
2411 - DIMENSION1_CHARS96
2412 - DIMENSION2_CHARS94
2413 - DIMENSION2_CHARS96
2414
2415 In addition, each character set is assigned an identification tag,
2416 unique for each set, called the "final character" (denoted as <F>
2417 hereafter). The <F> of each character set is decided by ECMA(*)
2418 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2419 (0x30..0x3F are for private use only).
2420
2421 Note (*): ECMA = European Computer Manufacturers Association
2422
2423 Here are examples of graphic character sets [NAME(<F>)]:
2424 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2425 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2426 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2427 o DIMENSION2_CHARS96 -- none for the moment
2428
2429 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2430 C0 [0x00..0x1F] -- control character plane 0
2431 GL [0x20..0x7F] -- graphic character plane 0
2432 C1 [0x80..0x9F] -- control character plane 1
2433 GR [0xA0..0xFF] -- graphic character plane 1
2434
2435 A control character set is directly designated and invoked to C0 or
2436 C1 by an escape sequence. The most common case is that:
2437 - ISO646's control character set is designated/invoked to C0, and
2438 - ISO6429's control character set is designated/invoked to C1,
2439 and usually these designations/invocations are omitted in encoded
2440 text. In a 7-bit environment, only C0 can be used, and a control
2441 character for C1 is encoded by an appropriate escape sequence to
2442 fit into the environment. All control characters for C1 are
2443 defined to have corresponding escape sequences.
2444
2445 A graphic character set is at first designated to one of four
2446 graphic registers (G0 through G3), then these graphic registers are
2447 invoked to GL or GR. These designations and invocations can be
2448 done independently. The most common case is that G0 is invoked to
2449 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2450 these invocations and designations are omitted in encoded text.
2451 In a 7-bit environment, only GL can be used.
2452
2453 When a graphic character set of CHARS94 is invoked to GL, codes
2454 0x20 and 0x7F of the GL area work as control characters SPACE and
2455 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2456 be used.
2457
2458 There are two ways of invocation: locking-shift and single-shift.
2459 With locking-shift, the invocation lasts until the next different
2460 invocation, whereas with single-shift, the invocation affects the
2461 following character only and doesn't affect the locking-shift
2462 state. Invocations are done by the following control characters or
2463 escape sequences:
2464
2465 ----------------------------------------------------------------------
2466 abbrev function cntrl escape seq description
2467 ----------------------------------------------------------------------
2468 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2469 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2470 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2471 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2472 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2473 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2474 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2475 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2476 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2477 ----------------------------------------------------------------------
2478 (*) These are not used by any known coding system.
2479
2480 Control characters for these functions are defined by macros
2481 ISO_CODE_XXX in `coding.h'.
2482
2483 Designations are done by the following escape sequences:
2484 ----------------------------------------------------------------------
2485 escape sequence description
2486 ----------------------------------------------------------------------
2487 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2488 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2489 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2490 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2491 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2492 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2493 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2494 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2495 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2496 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2497 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2498 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2499 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2500 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2501 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2502 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2503 ----------------------------------------------------------------------
2504
2505 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2506 of dimension 1, chars 94, and final character <F>, etc...
2507
2508 Note (*): Although these designations are not allowed in ISO2022,
2509 Emacs accepts them on decoding, and produces them on encoding
2510 CHARS96 character sets in a coding system which is characterized as
2511 7-bit environment, non-locking-shift, and non-single-shift.
2512
2513 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2514 '(' must be omitted. We refer to this as "short-form" hereafter.
2515
2516 Now you may notice that there are a lot of ways of encoding the
2517 same multilingual text in ISO2022. Actually, there exist many
2518 coding systems such as Compound Text (used in X11's inter client
2519 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2520 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2521 localized platforms), and all of these are variants of ISO2022.
2522
2523 In addition to the above, Emacs handles two more kinds of escape
2524 sequences: ISO6429's direction specification and Emacs' private
2525 sequence for specifying character composition.
2526
2527 ISO6429's direction specification takes the following form:
2528 o CSI ']' -- end of the current direction
2529 o CSI '0' ']' -- end of the current direction
2530 o CSI '1' ']' -- start of left-to-right text
2531 o CSI '2' ']' -- start of right-to-left text
2532 The control character CSI (0x9B: control sequence introducer) is
2533 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2534
2535 Character composition specification takes the following form:
2536 o ESC '0' -- start relative composition
2537 o ESC '1' -- end composition
2538 o ESC '2' -- start rule-base composition (*)
2539 o ESC '3' -- start relative composition with alternate chars (**)
2540 o ESC '4' -- start rule-base composition with alternate chars (**)
2541 Since these are not standard escape sequences of any ISO standard,
2542 the use of them with these meanings is restricted to Emacs only.
2543
2544 (*) This form is used only in Emacs 20.7 and older versions,
2545 but newer versions can safely decode it.
2546 (**) This form is used only in Emacs 21.1 and newer versions,
2547 and older versions can't decode it.
2548
2549 Here's a list of example usages of these composition escape
2550 sequences (categorized by `enum composition_method').
2551
2552 COMPOSITION_RELATIVE:
2553 ESC 0 CHAR [ CHAR ] ESC 1
2554 COMPOSITION_WITH_RULE:
2555 ESC 2 CHAR [ RULE CHAR ] ESC 1
2556 COMPOSITION_WITH_ALTCHARS:
2557 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2558 COMPOSITION_WITH_RULE_ALTCHARS:
2559 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2560
2561 enum iso_code_class_type iso_code_class[256];
2562
2563 #define SAFE_CHARSET_P(coding, id) \
2564 ((id) <= (coding)->max_charset_id \
2565 && (coding)->safe_charsets[id] >= 0)
2566
2567
2568 #define SHIFT_OUT_OK(category) \
2569 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2570
2571 static void
2572 setup_iso_safe_charsets (attrs)
2573 Lisp_Object attrs;
2574 {
2575 Lisp_Object charset_list, safe_charsets;
2576 Lisp_Object request;
2577 Lisp_Object reg_usage;
2578 Lisp_Object tail;
2579 int reg94, reg96;
2580 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2581 int max_charset_id;
2582
2583 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2584 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2585 && ! EQ (charset_list, Viso_2022_charset_list))
2586 {
2587 CODING_ATTR_CHARSET_LIST (attrs)
2588 = charset_list = Viso_2022_charset_list;
2589 ASET (attrs, coding_attr_safe_charsets, Qnil);
2590 }
2591
2592 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2593 return;
2594
2595 max_charset_id = 0;
2596 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2597 {
2598 int id = XINT (XCAR (tail));
2599 if (max_charset_id < id)
2600 max_charset_id = id;
2601 }
2602
2603 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2604 make_number (255));
2605 request = AREF (attrs, coding_attr_iso_request);
2606 reg_usage = AREF (attrs, coding_attr_iso_usage);
2607 reg94 = XINT (XCAR (reg_usage));
2608 reg96 = XINT (XCDR (reg_usage));
2609
2610 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2611 {
2612 Lisp_Object id;
2613 Lisp_Object reg;
2614 struct charset *charset;
2615
2616 id = XCAR (tail);
2617 charset = CHARSET_FROM_ID (XINT (id));
2618 reg = Fcdr (Fassq (id, request));
2619 if (! NILP (reg))
2620 SSET (safe_charsets, XINT (id), XINT (reg));
2621 else if (charset->iso_chars_96)
2622 {
2623 if (reg96 < 4)
2624 SSET (safe_charsets, XINT (id), reg96);
2625 }
2626 else
2627 {
2628 if (reg94 < 4)
2629 SSET (safe_charsets, XINT (id), reg94);
2630 }
2631 }
2632 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2633 }
2634
2635
2636 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2637 Check if a text is encoded in one of ISO-2022 based codig systems.
2638 If it is, return 1, else return 0. */
2639
2640 static int
2641 detect_coding_iso_2022 (coding, detect_info)
2642 struct coding_system *coding;
2643 struct coding_detection_info *detect_info;
2644 {
2645 const unsigned char *src = coding->source, *src_base = src;
2646 const unsigned char *src_end = coding->source + coding->src_bytes;
2647 int multibytep = coding->src_multibyte;
2648 int single_shifting = 0;
2649 int id;
2650 int c, c1;
2651 int consumed_chars = 0;
2652 int i;
2653 int rejected = 0;
2654 int found = 0;
2655
2656 detect_info->checked |= CATEGORY_MASK_ISO;
2657
2658 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2659 {
2660 struct coding_system *this = &(coding_categories[i]);
2661 Lisp_Object attrs, val;
2662
2663 attrs = CODING_ID_ATTRS (this->id);
2664 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2665 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2666 setup_iso_safe_charsets (attrs);
2667 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2668 this->max_charset_id = SCHARS (val) - 1;
2669 this->safe_charsets = (char *) SDATA (val);
2670 }
2671
2672 /* A coding system of this category is always ASCII compatible. */
2673 src += coding->head_ascii;
2674
2675 while (rejected != CATEGORY_MASK_ISO)
2676 {
2677 src_base = src;
2678 ONE_MORE_BYTE (c);
2679 switch (c)
2680 {
2681 case ISO_CODE_ESC:
2682 if (inhibit_iso_escape_detection)
2683 break;
2684 single_shifting = 0;
2685 ONE_MORE_BYTE (c);
2686 if (c >= '(' && c <= '/')
2687 {
2688 /* Designation sequence for a charset of dimension 1. */
2689 ONE_MORE_BYTE (c1);
2690 if (c1 < ' ' || c1 >= 0x80
2691 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2692 /* Invalid designation sequence. Just ignore. */
2693 break;
2694 }
2695 else if (c == '$')
2696 {
2697 /* Designation sequence for a charset of dimension 2. */
2698 ONE_MORE_BYTE (c);
2699 if (c >= '@' && c <= 'B')
2700 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2701 id = iso_charset_table[1][0][c];
2702 else if (c >= '(' && c <= '/')
2703 {
2704 ONE_MORE_BYTE (c1);
2705 if (c1 < ' ' || c1 >= 0x80
2706 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2707 /* Invalid designation sequence. Just ignore. */
2708 break;
2709 }
2710 else
2711 /* Invalid designation sequence. Just ignore it. */
2712 break;
2713 }
2714 else if (c == 'N' || c == 'O')
2715 {
2716 /* ESC <Fe> for SS2 or SS3. */
2717 single_shifting = 1;
2718 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2719 break;
2720 }
2721 else if (c >= '0' && c <= '4')
2722 {
2723 /* ESC <Fp> for start/end composition. */
2724 found |= CATEGORY_MASK_ISO;
2725 break;
2726 }
2727 else
2728 {
2729 /* Invalid escape sequence. Just ignore it. */
2730 break;
2731 }
2732
2733 /* We found a valid designation sequence for CHARSET. */
2734 rejected |= CATEGORY_MASK_ISO_8BIT;
2735 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2736 id))
2737 found |= CATEGORY_MASK_ISO_7;
2738 else
2739 rejected |= CATEGORY_MASK_ISO_7;
2740 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2741 id))
2742 found |= CATEGORY_MASK_ISO_7_TIGHT;
2743 else
2744 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2745 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2746 id))
2747 found |= CATEGORY_MASK_ISO_7_ELSE;
2748 else
2749 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2750 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2751 id))
2752 found |= CATEGORY_MASK_ISO_8_ELSE;
2753 else
2754 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2755 break;
2756
2757 case ISO_CODE_SO:
2758 case ISO_CODE_SI:
2759 /* Locking shift out/in. */
2760 if (inhibit_iso_escape_detection)
2761 break;
2762 single_shifting = 0;
2763 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2764 break;
2765
2766 case ISO_CODE_CSI:
2767 /* Control sequence introducer. */
2768 single_shifting = 0;
2769 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2770 found |= CATEGORY_MASK_ISO_8_ELSE;
2771 goto check_extra_latin;
2772
2773 case ISO_CODE_SS2:
2774 case ISO_CODE_SS3:
2775 /* Single shift. */
2776 if (inhibit_iso_escape_detection)
2777 break;
2778 single_shifting = 0;
2779 rejected |= CATEGORY_MASK_ISO_7BIT;
2780 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2781 & CODING_ISO_FLAG_SINGLE_SHIFT)
2782 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2783 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2784 & CODING_ISO_FLAG_SINGLE_SHIFT)
2785 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2786 if (single_shifting)
2787 break;
2788 goto check_extra_latin;
2789
2790 default:
2791 if (c < 0)
2792 continue;
2793 if (c < 0x80)
2794 {
2795 single_shifting = 0;
2796 break;
2797 }
2798 if (c >= 0xA0)
2799 {
2800 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2801 found |= CATEGORY_MASK_ISO_8_1;
2802 /* Check the length of succeeding codes of the range
2803 0xA0..0FF. If the byte length is even, we include
2804 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2805 only when we are not single shifting. */
2806 if (! single_shifting
2807 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2808 {
2809 int i = 1;
2810 while (src < src_end)
2811 {
2812 ONE_MORE_BYTE (c);
2813 if (c < 0xA0)
2814 break;
2815 i++;
2816 }
2817
2818 if (i & 1 && src < src_end)
2819 rejected |= CATEGORY_MASK_ISO_8_2;
2820 else
2821 found |= CATEGORY_MASK_ISO_8_2;
2822 }
2823 break;
2824 }
2825 check_extra_latin:
2826 single_shifting = 0;
2827 if (! VECTORP (Vlatin_extra_code_table)
2828 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2829 {
2830 rejected = CATEGORY_MASK_ISO;
2831 break;
2832 }
2833 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2834 & CODING_ISO_FLAG_LATIN_EXTRA)
2835 found |= CATEGORY_MASK_ISO_8_1;
2836 else
2837 rejected |= CATEGORY_MASK_ISO_8_1;
2838 rejected |= CATEGORY_MASK_ISO_8_2;
2839 }
2840 }
2841 detect_info->rejected |= CATEGORY_MASK_ISO;
2842 return 0;
2843
2844 no_more_source:
2845 detect_info->rejected |= rejected;
2846 detect_info->found |= (found & ~rejected);
2847 return 1;
2848 }
2849
2850
2851 /* Set designation state into CODING. Set CHARS_96 to -1 if the
2852 escape sequence should be kept. */
2853 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2854 do { \
2855 int id, prev; \
2856 \
2857 if (final < '0' || final >= 128 \
2858 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2859 || !SAFE_CHARSET_P (coding, id)) \
2860 { \
2861 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2862 chars_96 = -1; \
2863 break; \
2864 } \
2865 prev = CODING_ISO_DESIGNATION (coding, reg); \
2866 if (id == charset_jisx0201_roman) \
2867 { \
2868 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2869 id = charset_ascii; \
2870 } \
2871 else if (id == charset_jisx0208_1978) \
2872 { \
2873 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2874 id = charset_jisx0208; \
2875 } \
2876 CODING_ISO_DESIGNATION (coding, reg) = id; \
2877 /* If there was an invalid designation to REG previously, and this \
2878 designation is ASCII to REG, we should keep this designation \
2879 sequence. */ \
2880 if (prev == -2 && id == charset_ascii) \
2881 chars_96 = -1; \
2882 } while (0)
2883
2884
2885 #define MAYBE_FINISH_COMPOSITION() \
2886 do { \
2887 int i; \
2888 if (composition_state == COMPOSING_NO) \
2889 break; \
2890 /* It is assured that we have enough room for producing \
2891 characters stored in the table `components'. */ \
2892 if (charbuf + component_idx > charbuf_end) \
2893 goto no_more_source; \
2894 composition_state = COMPOSING_NO; \
2895 if (method == COMPOSITION_RELATIVE \
2896 || method == COMPOSITION_WITH_ALTCHARS) \
2897 { \
2898 for (i = 0; i < component_idx; i++) \
2899 *charbuf++ = components[i]; \
2900 char_offset += component_idx; \
2901 } \
2902 else \
2903 { \
2904 for (i = 0; i < component_idx; i += 2) \
2905 *charbuf++ = components[i]; \
2906 char_offset += (component_idx / 2) + 1; \
2907 } \
2908 } while (0)
2909
2910
2911 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2912 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2913 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2914 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2915 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2916 */
2917
2918 #define DECODE_COMPOSITION_START(c1) \
2919 do { \
2920 if (c1 == '0' \
2921 && composition_state == COMPOSING_COMPONENT_RULE) \
2922 { \
2923 component_len = component_idx; \
2924 composition_state = COMPOSING_CHAR; \
2925 } \
2926 else \
2927 { \
2928 const unsigned char *p; \
2929 \
2930 MAYBE_FINISH_COMPOSITION (); \
2931 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2932 goto no_more_source; \
2933 for (p = src; p < src_end - 1; p++) \
2934 if (*p == ISO_CODE_ESC && p[1] == '1') \
2935 break; \
2936 if (p == src_end - 1) \
2937 { \
2938 /* The current composition doesn't end in the current \
2939 source. */ \
2940 record_conversion_result \
2941 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
2942 goto no_more_source; \
2943 } \
2944 \
2945 /* This is surely the start of a composition. */ \
2946 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2947 : c1 == '2' ? COMPOSITION_WITH_RULE \
2948 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2949 : COMPOSITION_WITH_RULE_ALTCHARS); \
2950 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2951 : COMPOSING_COMPONENT_CHAR); \
2952 component_idx = component_len = 0; \
2953 } \
2954 } while (0)
2955
2956
2957 /* Handle compositoin end sequence ESC 1. */
2958
2959 #define DECODE_COMPOSITION_END() \
2960 do { \
2961 int nchars = (component_len > 0 ? component_idx - component_len \
2962 : method == COMPOSITION_RELATIVE ? component_idx \
2963 : (component_idx + 1) / 2); \
2964 int i; \
2965 int *saved_charbuf = charbuf; \
2966 \
2967 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2968 if (method != COMPOSITION_RELATIVE) \
2969 { \
2970 if (component_len == 0) \
2971 for (i = 0; i < component_idx; i++) \
2972 *charbuf++ = components[i]; \
2973 else \
2974 for (i = 0; i < component_len; i++) \
2975 *charbuf++ = components[i]; \
2976 *saved_charbuf = saved_charbuf - charbuf; \
2977 } \
2978 if (method == COMPOSITION_WITH_RULE) \
2979 for (i = 0; i < component_idx; i += 2, char_offset++) \
2980 *charbuf++ = components[i]; \
2981 else \
2982 for (i = component_len; i < component_idx; i++, char_offset++) \
2983 *charbuf++ = components[i]; \
2984 coding->annotated = 1; \
2985 composition_state = COMPOSING_NO; \
2986 } while (0)
2987
2988
2989 /* Decode a composition rule from the byte C1 (and maybe one more byte
2990 from SRC) and store one encoded composition rule in
2991 coding->cmp_data. */
2992
2993 #define DECODE_COMPOSITION_RULE(c1) \
2994 do { \
2995 (c1) -= 32; \
2996 if (c1 < 81) /* old format (before ver.21) */ \
2997 { \
2998 int gref = (c1) / 9; \
2999 int nref = (c1) % 9; \
3000 if (gref == 4) gref = 10; \
3001 if (nref == 4) nref = 10; \
3002 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
3003 } \
3004 else if (c1 < 93) /* new format (after ver.21) */ \
3005 { \
3006 ONE_MORE_BYTE (c2); \
3007 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
3008 } \
3009 else \
3010 c1 = 0; \
3011 } while (0)
3012
3013
3014 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3015
3016 static void
3017 decode_coding_iso_2022 (coding)
3018 struct coding_system *coding;
3019 {
3020 const unsigned char *src = coding->source + coding->consumed;
3021 const unsigned char *src_end = coding->source + coding->src_bytes;
3022 const unsigned char *src_base;
3023 int *charbuf = coding->charbuf + coding->charbuf_used;
3024 int *charbuf_end
3025 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
3026 int consumed_chars = 0, consumed_chars_base;
3027 int multibytep = coding->src_multibyte;
3028 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3029 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3030 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3031 int charset_id_2, charset_id_3;
3032 struct charset *charset;
3033 int c;
3034 /* For handling composition sequence. */
3035 #define COMPOSING_NO 0
3036 #define COMPOSING_CHAR 1
3037 #define COMPOSING_RULE 2
3038 #define COMPOSING_COMPONENT_CHAR 3
3039 #define COMPOSING_COMPONENT_RULE 4
3040
3041 int composition_state = COMPOSING_NO;
3042 enum composition_method method;
3043 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3044 int component_idx;
3045 int component_len;
3046 Lisp_Object attrs, charset_list;
3047 int char_offset = coding->produced_char;
3048 int last_offset = char_offset;
3049 int last_id = charset_ascii;
3050 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3051 int byte_after_cr = -1;
3052
3053 CODING_GET_INFO (coding, attrs, charset_list);
3054 setup_iso_safe_charsets (attrs);
3055 /* Charset list may have been changed. */
3056 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3057 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3058
3059 while (1)
3060 {
3061 int c1, c2;
3062
3063 src_base = src;
3064 consumed_chars_base = consumed_chars;
3065
3066 if (charbuf >= charbuf_end)
3067 break;
3068
3069 if (byte_after_cr >= 0)
3070 c1 = byte_after_cr, byte_after_cr = -1;
3071 else
3072 ONE_MORE_BYTE (c1);
3073 if (c1 < 0)
3074 goto invalid_code;
3075
3076 /* We produce at most one character. */
3077 switch (iso_code_class [c1])
3078 {
3079 case ISO_0x20_or_0x7F:
3080 if (composition_state != COMPOSING_NO)
3081 {
3082 if (composition_state == COMPOSING_RULE
3083 || composition_state == COMPOSING_COMPONENT_RULE)
3084 {
3085 DECODE_COMPOSITION_RULE (c1);
3086 components[component_idx++] = c1;
3087 composition_state--;
3088 continue;
3089 }
3090 }
3091 if (charset_id_0 < 0
3092 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3093 /* This is SPACE or DEL. */
3094 charset = CHARSET_FROM_ID (charset_ascii);
3095 else
3096 charset = CHARSET_FROM_ID (charset_id_0);
3097 break;
3098
3099 case ISO_graphic_plane_0:
3100 if (composition_state != COMPOSING_NO)
3101 {
3102 if (composition_state == COMPOSING_RULE
3103 || composition_state == COMPOSING_COMPONENT_RULE)
3104 {
3105 DECODE_COMPOSITION_RULE (c1);
3106 components[component_idx++] = c1;
3107 composition_state--;
3108 continue;
3109 }
3110 }
3111 if (charset_id_0 < 0)
3112 charset = CHARSET_FROM_ID (charset_ascii);
3113 else
3114 charset = CHARSET_FROM_ID (charset_id_0);
3115 break;
3116
3117 case ISO_0xA0_or_0xFF:
3118 if (charset_id_1 < 0
3119 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3120 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3121 goto invalid_code;
3122 /* This is a graphic character, we fall down ... */
3123
3124 case ISO_graphic_plane_1:
3125 if (charset_id_1 < 0)
3126 goto invalid_code;
3127 charset = CHARSET_FROM_ID (charset_id_1);
3128 break;
3129
3130 case ISO_control_0:
3131 if (eol_crlf && c1 == '\r')
3132 ONE_MORE_BYTE (byte_after_cr);
3133 MAYBE_FINISH_COMPOSITION ();
3134 charset = CHARSET_FROM_ID (charset_ascii);
3135 break;
3136
3137 case ISO_control_1:
3138 MAYBE_FINISH_COMPOSITION ();
3139 goto invalid_code;
3140
3141 case ISO_shift_out:
3142 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3143 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3144 goto invalid_code;
3145 CODING_ISO_INVOCATION (coding, 0) = 1;
3146 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3147 continue;
3148
3149 case ISO_shift_in:
3150 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3151 goto invalid_code;
3152 CODING_ISO_INVOCATION (coding, 0) = 0;
3153 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3154 continue;
3155
3156 case ISO_single_shift_2_7:
3157 case ISO_single_shift_2:
3158 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3159 goto invalid_code;
3160 /* SS2 is handled as an escape sequence of ESC 'N' */
3161 c1 = 'N';
3162 goto label_escape_sequence;
3163
3164 case ISO_single_shift_3:
3165 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3166 goto invalid_code;
3167 /* SS2 is handled as an escape sequence of ESC 'O' */
3168 c1 = 'O';
3169 goto label_escape_sequence;
3170
3171 case ISO_control_sequence_introducer:
3172 /* CSI is handled as an escape sequence of ESC '[' ... */
3173 c1 = '[';
3174 goto label_escape_sequence;
3175
3176 case ISO_escape:
3177 ONE_MORE_BYTE (c1);
3178 label_escape_sequence:
3179 /* Escape sequences handled here are invocation,
3180 designation, direction specification, and character
3181 composition specification. */
3182 switch (c1)
3183 {
3184 case '&': /* revision of following character set */
3185 ONE_MORE_BYTE (c1);
3186 if (!(c1 >= '@' && c1 <= '~'))
3187 goto invalid_code;
3188 ONE_MORE_BYTE (c1);
3189 if (c1 != ISO_CODE_ESC)
3190 goto invalid_code;
3191 ONE_MORE_BYTE (c1);
3192 goto label_escape_sequence;
3193
3194 case '$': /* designation of 2-byte character set */
3195 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3196 goto invalid_code;
3197 {
3198 int reg, chars96;
3199
3200 ONE_MORE_BYTE (c1);
3201 if (c1 >= '@' && c1 <= 'B')
3202 { /* designation of JISX0208.1978, GB2312.1980,
3203 or JISX0208.1980 */
3204 reg = 0, chars96 = 0;
3205 }
3206 else if (c1 >= 0x28 && c1 <= 0x2B)
3207 { /* designation of DIMENSION2_CHARS94 character set */
3208 reg = c1 - 0x28, chars96 = 0;
3209 ONE_MORE_BYTE (c1);
3210 }
3211 else if (c1 >= 0x2C && c1 <= 0x2F)
3212 { /* designation of DIMENSION2_CHARS96 character set */
3213 reg = c1 - 0x2C, chars96 = 1;
3214 ONE_MORE_BYTE (c1);
3215 }
3216 else
3217 goto invalid_code;
3218 DECODE_DESIGNATION (reg, 2, chars96, c1);
3219 /* We must update these variables now. */
3220 if (reg == 0)
3221 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3222 else if (reg == 1)
3223 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3224 if (chars96 < 0)
3225 goto invalid_code;
3226 }
3227 continue;
3228
3229 case 'n': /* invocation of locking-shift-2 */
3230 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3231 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3232 goto invalid_code;
3233 CODING_ISO_INVOCATION (coding, 0) = 2;
3234 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3235 continue;
3236
3237 case 'o': /* invocation of locking-shift-3 */
3238 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3239 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3240 goto invalid_code;
3241 CODING_ISO_INVOCATION (coding, 0) = 3;
3242 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3243 continue;
3244
3245 case 'N': /* invocation of single-shift-2 */
3246 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3247 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3248 goto invalid_code;
3249 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3250 if (charset_id_2 < 0)
3251 charset = CHARSET_FROM_ID (charset_ascii);
3252 else
3253 charset = CHARSET_FROM_ID (charset_id_2);
3254 ONE_MORE_BYTE (c1);
3255 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3256 goto invalid_code;
3257 break;
3258
3259 case 'O': /* invocation of single-shift-3 */
3260 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3261 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3262 goto invalid_code;
3263 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3264 if (charset_id_3 < 0)
3265 charset = CHARSET_FROM_ID (charset_ascii);
3266 else
3267 charset = CHARSET_FROM_ID (charset_id_3);
3268 ONE_MORE_BYTE (c1);
3269 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3270 goto invalid_code;
3271 break;
3272
3273 case '0': case '2': case '3': case '4': /* start composition */
3274 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3275 goto invalid_code;
3276 DECODE_COMPOSITION_START (c1);
3277 continue;
3278
3279 case '1': /* end composition */
3280 if (composition_state == COMPOSING_NO)
3281 goto invalid_code;
3282 DECODE_COMPOSITION_END ();
3283 continue;
3284
3285 case '[': /* specification of direction */
3286 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3287 goto invalid_code;
3288 /* For the moment, nested direction is not supported.
3289 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3290 left-to-right, and nozero means right-to-left. */
3291 ONE_MORE_BYTE (c1);
3292 switch (c1)
3293 {
3294 case ']': /* end of the current direction */
3295 coding->mode &= ~CODING_MODE_DIRECTION;
3296
3297 case '0': /* end of the current direction */
3298 case '1': /* start of left-to-right direction */
3299 ONE_MORE_BYTE (c1);
3300 if (c1 == ']')
3301 coding->mode &= ~CODING_MODE_DIRECTION;
3302 else
3303 goto invalid_code;
3304 break;
3305
3306 case '2': /* start of right-to-left direction */
3307 ONE_MORE_BYTE (c1);
3308 if (c1 == ']')
3309 coding->mode |= CODING_MODE_DIRECTION;
3310 else
3311 goto invalid_code;
3312 break;
3313
3314 default:
3315 goto invalid_code;
3316 }
3317 continue;
3318
3319 case '%':
3320 ONE_MORE_BYTE (c1);
3321 if (c1 == '/')
3322 {
3323 /* CTEXT extended segment:
3324 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3325 We keep these bytes as is for the moment.
3326 They may be decoded by post-read-conversion. */
3327 int dim, M, L;
3328 int size;
3329
3330 ONE_MORE_BYTE (dim);
3331 ONE_MORE_BYTE (M);
3332 ONE_MORE_BYTE (L);
3333 size = ((M - 128) * 128) + (L - 128);
3334 if (charbuf + 8 + size > charbuf_end)
3335 goto break_loop;
3336 *charbuf++ = ISO_CODE_ESC;
3337 *charbuf++ = '%';
3338 *charbuf++ = '/';
3339 *charbuf++ = dim;
3340 *charbuf++ = BYTE8_TO_CHAR (M);
3341 *charbuf++ = BYTE8_TO_CHAR (L);
3342 while (size-- > 0)
3343 {
3344 ONE_MORE_BYTE (c1);
3345 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3346 }
3347 }
3348 else if (c1 == 'G')
3349 {
3350 /* XFree86 extension for embedding UTF-8 in CTEXT:
3351 ESC % G --UTF-8-BYTES-- ESC % @
3352 We keep these bytes as is for the moment.
3353 They may be decoded by post-read-conversion. */
3354 int *p = charbuf;
3355
3356 if (p + 6 > charbuf_end)
3357 goto break_loop;
3358 *p++ = ISO_CODE_ESC;
3359 *p++ = '%';
3360 *p++ = 'G';
3361 while (p < charbuf_end)
3362 {
3363 ONE_MORE_BYTE (c1);
3364 if (c1 == ISO_CODE_ESC
3365 && src + 1 < src_end
3366 && src[0] == '%'
3367 && src[1] == '@')
3368 {
3369 src += 2;
3370 break;
3371 }
3372 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3373 }
3374 if (p + 3 > charbuf_end)
3375 goto break_loop;
3376 *p++ = ISO_CODE_ESC;
3377 *p++ = '%';
3378 *p++ = '@';
3379 charbuf = p;
3380 }
3381 else
3382 goto invalid_code;
3383 continue;
3384 break;
3385
3386 default:
3387 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3388 goto invalid_code;
3389 {
3390 int reg, chars96;
3391
3392 if (c1 >= 0x28 && c1 <= 0x2B)
3393 { /* designation of DIMENSION1_CHARS94 character set */
3394 reg = c1 - 0x28, chars96 = 0;
3395 ONE_MORE_BYTE (c1);
3396 }
3397 else if (c1 >= 0x2C && c1 <= 0x2F)
3398 { /* designation of DIMENSION1_CHARS96 character set */
3399 reg = c1 - 0x2C, chars96 = 1;
3400 ONE_MORE_BYTE (c1);
3401 }
3402 else
3403 goto invalid_code;
3404 DECODE_DESIGNATION (reg, 1, chars96, c1);
3405 /* We must update these variables now. */
3406 if (reg == 0)
3407 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3408 else if (reg == 1)
3409 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3410 if (chars96 < 0)
3411 goto invalid_code;
3412 }
3413 continue;
3414 }
3415 }
3416
3417 if (charset->id != charset_ascii
3418 && last_id != charset->id)
3419 {
3420 if (last_id != charset_ascii)
3421 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3422 last_id = charset->id;
3423 last_offset = char_offset;
3424 }
3425
3426 /* Now we know CHARSET and 1st position code C1 of a character.
3427 Produce a decoded character while getting 2nd position code
3428 C2 if necessary. */
3429 c1 &= 0x7F;
3430 if (CHARSET_DIMENSION (charset) > 1)
3431 {
3432 ONE_MORE_BYTE (c2);
3433 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3434 /* C2 is not in a valid range. */
3435 goto invalid_code;
3436 c1 = (c1 << 8) | (c2 & 0x7F);
3437 if (CHARSET_DIMENSION (charset) > 2)
3438 {
3439 ONE_MORE_BYTE (c2);
3440 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3441 /* C2 is not in a valid range. */
3442 goto invalid_code;
3443 c1 = (c1 << 8) | (c2 & 0x7F);
3444 }
3445 }
3446
3447 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3448 if (c < 0)
3449 {
3450 MAYBE_FINISH_COMPOSITION ();
3451 for (; src_base < src; src_base++, char_offset++)
3452 {
3453 if (ASCII_BYTE_P (*src_base))
3454 *charbuf++ = *src_base;
3455 else
3456 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3457 }
3458 }
3459 else if (composition_state == COMPOSING_NO)
3460 {
3461 *charbuf++ = c;
3462 char_offset++;
3463 }
3464 else
3465 {
3466 components[component_idx++] = c;
3467 if (method == COMPOSITION_WITH_RULE
3468 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3469 && composition_state == COMPOSING_COMPONENT_CHAR))
3470 composition_state++;
3471 }
3472 continue;
3473
3474 invalid_code:
3475 MAYBE_FINISH_COMPOSITION ();
3476 src = src_base;
3477 consumed_chars = consumed_chars_base;
3478 ONE_MORE_BYTE (c);
3479 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3480 char_offset++;
3481 coding->errors++;
3482 continue;
3483
3484 break_loop:
3485 break;
3486 }
3487
3488 no_more_source:
3489 if (last_id != charset_ascii)
3490 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3491 coding->consumed_char += consumed_chars_base;
3492 coding->consumed = src_base - coding->source;
3493 coding->charbuf_used = charbuf - coding->charbuf;
3494 }
3495
3496
3497 /* ISO2022 encoding stuff. */
3498
3499 /*
3500 It is not enough to say just "ISO2022" on encoding, we have to
3501 specify more details. In Emacs, each coding system of ISO2022
3502 variant has the following specifications:
3503 1. Initial designation to G0 thru G3.
3504 2. Allows short-form designation?
3505 3. ASCII should be designated to G0 before control characters?
3506 4. ASCII should be designated to G0 at end of line?
3507 5. 7-bit environment or 8-bit environment?
3508 6. Use locking-shift?
3509 7. Use Single-shift?
3510 And the following two are only for Japanese:
3511 8. Use ASCII in place of JIS0201-1976-Roman?
3512 9. Use JISX0208-1983 in place of JISX0208-1978?
3513 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3514 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3515 details.
3516 */
3517
3518 /* Produce codes (escape sequence) for designating CHARSET to graphic
3519 register REG at DST, and increment DST. If <final-char> of CHARSET is
3520 '@', 'A', or 'B' and the coding system CODING allows, produce
3521 designation sequence of short-form. */
3522
3523 #define ENCODE_DESIGNATION(charset, reg, coding) \
3524 do { \
3525 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3526 char *intermediate_char_94 = "()*+"; \
3527 char *intermediate_char_96 = ",-./"; \
3528 int revision = -1; \
3529 int c; \
3530 \
3531 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3532 revision = CHARSET_ISO_REVISION (charset); \
3533 \
3534 if (revision >= 0) \
3535 { \
3536 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3537 EMIT_ONE_BYTE ('@' + revision); \
3538 } \
3539 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3540 if (CHARSET_DIMENSION (charset) == 1) \
3541 { \
3542 if (! CHARSET_ISO_CHARS_96 (charset)) \
3543 c = intermediate_char_94[reg]; \
3544 else \
3545 c = intermediate_char_96[reg]; \
3546 EMIT_ONE_ASCII_BYTE (c); \
3547 } \
3548 else \
3549 { \
3550 EMIT_ONE_ASCII_BYTE ('$'); \
3551 if (! CHARSET_ISO_CHARS_96 (charset)) \
3552 { \
3553 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3554 || reg != 0 \
3555 || final_char < '@' || final_char > 'B') \
3556 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3557 } \
3558 else \
3559 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3560 } \
3561 EMIT_ONE_ASCII_BYTE (final_char); \
3562 \
3563 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3564 } while (0)
3565
3566
3567 /* The following two macros produce codes (control character or escape
3568 sequence) for ISO2022 single-shift functions (single-shift-2 and
3569 single-shift-3). */
3570
3571 #define ENCODE_SINGLE_SHIFT_2 \
3572 do { \
3573 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3574 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3575 else \
3576 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3577 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3578 } while (0)
3579
3580
3581 #define ENCODE_SINGLE_SHIFT_3 \
3582 do { \
3583 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3584 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3585 else \
3586 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3587 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3588 } while (0)
3589
3590
3591 /* The following four macros produce codes (control character or
3592 escape sequence) for ISO2022 locking-shift functions (shift-in,
3593 shift-out, locking-shift-2, and locking-shift-3). */
3594
3595 #define ENCODE_SHIFT_IN \
3596 do { \
3597 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3598 CODING_ISO_INVOCATION (coding, 0) = 0; \
3599 } while (0)
3600
3601
3602 #define ENCODE_SHIFT_OUT \
3603 do { \
3604 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3605 CODING_ISO_INVOCATION (coding, 0) = 1; \
3606 } while (0)
3607
3608
3609 #define ENCODE_LOCKING_SHIFT_2 \
3610 do { \
3611 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3612 CODING_ISO_INVOCATION (coding, 0) = 2; \
3613 } while (0)
3614
3615
3616 #define ENCODE_LOCKING_SHIFT_3 \
3617 do { \
3618 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3619 CODING_ISO_INVOCATION (coding, 0) = 3; \
3620 } while (0)
3621
3622
3623 /* Produce codes for a DIMENSION1 character whose character set is
3624 CHARSET and whose position-code is C1. Designation and invocation
3625 sequences are also produced in advance if necessary. */
3626
3627 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3628 do { \
3629 int id = CHARSET_ID (charset); \
3630 \
3631 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3632 && id == charset_ascii) \
3633 { \
3634 id = charset_jisx0201_roman; \
3635 charset = CHARSET_FROM_ID (id); \
3636 } \
3637 \
3638 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3639 { \
3640 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3641 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3642 else \
3643 EMIT_ONE_BYTE (c1 | 0x80); \
3644 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3645 break; \
3646 } \
3647 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3648 { \
3649 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3650 break; \
3651 } \
3652 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3653 { \
3654 EMIT_ONE_BYTE (c1 | 0x80); \
3655 break; \
3656 } \
3657 else \
3658 /* Since CHARSET is not yet invoked to any graphic planes, we \
3659 must invoke it, or, at first, designate it to some graphic \
3660 register. Then repeat the loop to actually produce the \
3661 character. */ \
3662 dst = encode_invocation_designation (charset, coding, dst, \
3663 &produced_chars); \
3664 } while (1)
3665
3666
3667 /* Produce codes for a DIMENSION2 character whose character set is
3668 CHARSET and whose position-codes are C1 and C2. Designation and
3669 invocation codes are also produced in advance if necessary. */
3670
3671 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3672 do { \
3673 int id = CHARSET_ID (charset); \
3674 \
3675 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3676 && id == charset_jisx0208) \
3677 { \
3678 id = charset_jisx0208_1978; \
3679 charset = CHARSET_FROM_ID (id); \
3680 } \
3681 \
3682 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3683 { \
3684 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3685 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3686 else \
3687 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3688 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3689 break; \
3690 } \
3691 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3692 { \
3693 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3694 break; \
3695 } \
3696 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3697 { \
3698 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3699 break; \
3700 } \
3701 else \
3702 /* Since CHARSET is not yet invoked to any graphic planes, we \
3703 must invoke it, or, at first, designate it to some graphic \
3704 register. Then repeat the loop to actually produce the \
3705 character. */ \
3706 dst = encode_invocation_designation (charset, coding, dst, \
3707 &produced_chars); \
3708 } while (1)
3709
3710
3711 #define ENCODE_ISO_CHARACTER(charset, c) \
3712 do { \
3713 int code = ENCODE_CHAR ((charset),(c)); \
3714 \
3715 if (CHARSET_DIMENSION (charset) == 1) \
3716 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3717 else \
3718 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3719 } while (0)
3720
3721
3722 /* Produce designation and invocation codes at a place pointed by DST
3723 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3724 Return new DST. */
3725
3726 unsigned char *
3727 encode_invocation_designation (charset, coding, dst, p_nchars)
3728 struct charset *charset;
3729 struct coding_system *coding;
3730 unsigned char *dst;
3731 int *p_nchars;
3732 {
3733 int multibytep = coding->dst_multibyte;
3734 int produced_chars = *p_nchars;
3735 int reg; /* graphic register number */
3736 int id = CHARSET_ID (charset);
3737
3738 /* At first, check designations. */
3739 for (reg = 0; reg < 4; reg++)
3740 if (id == CODING_ISO_DESIGNATION (coding, reg))
3741 break;
3742
3743 if (reg >= 4)
3744 {
3745 /* CHARSET is not yet designated to any graphic registers. */
3746 /* At first check the requested designation. */
3747 reg = CODING_ISO_REQUEST (coding, id);
3748 if (reg < 0)
3749 /* Since CHARSET requests no special designation, designate it
3750 to graphic register 0. */
3751 reg = 0;
3752
3753 ENCODE_DESIGNATION (charset, reg, coding);
3754 }
3755
3756 if (CODING_ISO_INVOCATION (coding, 0) != reg
3757 && CODING_ISO_INVOCATION (coding, 1) != reg)
3758 {
3759 /* Since the graphic register REG is not invoked to any graphic
3760 planes, invoke it to graphic plane 0. */
3761 switch (reg)
3762 {
3763 case 0: /* graphic register 0 */
3764 ENCODE_SHIFT_IN;
3765 break;
3766
3767 case 1: /* graphic register 1 */
3768 ENCODE_SHIFT_OUT;
3769 break;
3770
3771 case 2: /* graphic register 2 */
3772 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3773 ENCODE_SINGLE_SHIFT_2;
3774 else
3775 ENCODE_LOCKING_SHIFT_2;
3776 break;
3777
3778 case 3: /* graphic register 3 */
3779 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3780 ENCODE_SINGLE_SHIFT_3;
3781 else
3782 ENCODE_LOCKING_SHIFT_3;
3783 break;
3784 }
3785 }
3786
3787 *p_nchars = produced_chars;
3788 return dst;
3789 }
3790
3791 /* The following three macros produce codes for indicating direction
3792 of text. */
3793 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3794 do { \
3795 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3796 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3797 else \
3798 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3799 } while (0)
3800
3801
3802 #define ENCODE_DIRECTION_R2L() \
3803 do { \
3804 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3805 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3806 } while (0)
3807
3808
3809 #define ENCODE_DIRECTION_L2R() \
3810 do { \
3811 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3812 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3813 } while (0)
3814
3815
3816 /* Produce codes for designation and invocation to reset the graphic
3817 planes and registers to initial state. */
3818 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3819 do { \
3820 int reg; \
3821 struct charset *charset; \
3822 \
3823 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3824 ENCODE_SHIFT_IN; \
3825 for (reg = 0; reg < 4; reg++) \
3826 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3827 && (CODING_ISO_DESIGNATION (coding, reg) \
3828 != CODING_ISO_INITIAL (coding, reg))) \
3829 { \
3830 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3831 ENCODE_DESIGNATION (charset, reg, coding); \
3832 } \
3833 } while (0)
3834
3835
3836 /* Produce designation sequences of charsets in the line started from
3837 SRC to a place pointed by DST, and return updated DST.
3838
3839 If the current block ends before any end-of-line, we may fail to
3840 find all the necessary designations. */
3841
3842 static unsigned char *
3843 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3844 struct coding_system *coding;
3845 int *charbuf, *charbuf_end;
3846 unsigned char *dst;
3847 {
3848 struct charset *charset;
3849 /* Table of charsets to be designated to each graphic register. */
3850 int r[4];
3851 int c, found = 0, reg;
3852 int produced_chars = 0;
3853 int multibytep = coding->dst_multibyte;
3854 Lisp_Object attrs;
3855 Lisp_Object charset_list;
3856
3857 attrs = CODING_ID_ATTRS (coding->id);
3858 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3859 if (EQ (charset_list, Qiso_2022))
3860 charset_list = Viso_2022_charset_list;
3861
3862 for (reg = 0; reg < 4; reg++)
3863 r[reg] = -1;
3864
3865 while (found < 4)
3866 {
3867 int id;
3868
3869 c = *charbuf++;
3870 if (c == '\n')
3871 break;
3872 charset = char_charset (c, charset_list, NULL);
3873 id = CHARSET_ID (charset);
3874 reg = CODING_ISO_REQUEST (coding, id);
3875 if (reg >= 0 && r[reg] < 0)
3876 {
3877 found++;
3878 r[reg] = id;
3879 }
3880 }
3881
3882 if (found)
3883 {
3884 for (reg = 0; reg < 4; reg++)
3885 if (r[reg] >= 0
3886 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3887 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3888 }
3889
3890 return dst;
3891 }
3892
3893 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3894
3895 static int
3896 encode_coding_iso_2022 (coding)
3897 struct coding_system *coding;
3898 {
3899 int multibytep = coding->dst_multibyte;
3900 int *charbuf = coding->charbuf;
3901 int *charbuf_end = charbuf + coding->charbuf_used;
3902 unsigned char *dst = coding->destination + coding->produced;
3903 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3904 int safe_room = 16;
3905 int bol_designation
3906 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3907 && CODING_ISO_BOL (coding));
3908 int produced_chars = 0;
3909 Lisp_Object attrs, eol_type, charset_list;
3910 int ascii_compatible;
3911 int c;
3912 int preferred_charset_id = -1;
3913
3914 CODING_GET_INFO (coding, attrs, charset_list);
3915 eol_type = CODING_ID_EOL_TYPE (coding->id);
3916 if (VECTORP (eol_type))
3917 eol_type = Qunix;
3918
3919 setup_iso_safe_charsets (attrs);
3920 /* Charset list may have been changed. */
3921 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3922 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3923
3924 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3925
3926 while (charbuf < charbuf_end)
3927 {
3928 ASSURE_DESTINATION (safe_room);
3929
3930 if (bol_designation)
3931 {
3932 unsigned char *dst_prev = dst;
3933
3934 /* We have to produce designation sequences if any now. */
3935 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3936 bol_designation = 0;
3937 /* We are sure that designation sequences are all ASCII bytes. */
3938 produced_chars += dst - dst_prev;
3939 }
3940
3941 c = *charbuf++;
3942
3943 if (c < 0)
3944 {
3945 /* Handle an annotation. */
3946 switch (*charbuf)
3947 {
3948 case CODING_ANNOTATE_COMPOSITION_MASK:
3949 /* Not yet implemented. */
3950 break;
3951 case CODING_ANNOTATE_CHARSET_MASK:
3952 preferred_charset_id = charbuf[2];
3953 if (preferred_charset_id >= 0
3954 && NILP (Fmemq (make_number (preferred_charset_id),
3955 charset_list)))
3956 preferred_charset_id = -1;
3957 break;
3958 default:
3959 abort ();
3960 }
3961 charbuf += -c - 1;
3962 continue;
3963 }
3964
3965 /* Now encode the character C. */
3966 if (c < 0x20 || c == 0x7F)
3967 {
3968 if (c == '\n'
3969 || (c == '\r' && EQ (eol_type, Qmac)))
3970 {
3971 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3972 ENCODE_RESET_PLANE_AND_REGISTER ();
3973 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3974 {
3975 int i;
3976
3977 for (i = 0; i < 4; i++)
3978 CODING_ISO_DESIGNATION (coding, i)
3979 = CODING_ISO_INITIAL (coding, i);
3980 }
3981 bol_designation
3982 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3983 }
3984 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3985 ENCODE_RESET_PLANE_AND_REGISTER ();
3986 EMIT_ONE_ASCII_BYTE (c);
3987 }
3988 else if (ASCII_CHAR_P (c))
3989 {
3990 if (ascii_compatible)
3991 EMIT_ONE_ASCII_BYTE (c);
3992 else
3993 {
3994 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3995 ENCODE_ISO_CHARACTER (charset, c);
3996 }
3997 }
3998 else if (CHAR_BYTE8_P (c))
3999 {
4000 c = CHAR_TO_BYTE8 (c);
4001 EMIT_ONE_BYTE (c);
4002 }
4003 else
4004 {
4005 struct charset *charset;
4006
4007 if (preferred_charset_id >= 0)
4008 {
4009 charset = CHARSET_FROM_ID (preferred_charset_id);
4010 if (! CHAR_CHARSET_P (c, charset))
4011 charset = char_charset (c, charset_list, NULL);
4012 }
4013 else
4014 charset = char_charset (c, charset_list, NULL);
4015 if (!charset)
4016 {
4017 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4018 {
4019 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4020 charset = CHARSET_FROM_ID (charset_ascii);
4021 }
4022 else
4023 {
4024 c = coding->default_char;
4025 charset = char_charset (c, charset_list, NULL);
4026 }
4027 }
4028 ENCODE_ISO_CHARACTER (charset, c);
4029 }
4030 }
4031
4032 if (coding->mode & CODING_MODE_LAST_BLOCK
4033 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4034 {
4035 ASSURE_DESTINATION (safe_room);
4036 ENCODE_RESET_PLANE_AND_REGISTER ();
4037 }
4038 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4039 CODING_ISO_BOL (coding) = bol_designation;
4040 coding->produced_char += produced_chars;
4041 coding->produced = dst - coding->destination;
4042 return 0;
4043 }
4044
4045 \f
4046 /*** 8,9. SJIS and BIG5 handlers ***/
4047
4048 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4049 quite widely. So, for the moment, Emacs supports them in the bare
4050 C code. But, in the future, they may be supported only by CCL. */
4051
4052 /* SJIS is a coding system encoding three character sets: ASCII, right
4053 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4054 as is. A character of charset katakana-jisx0201 is encoded by
4055 "position-code + 0x80". A character of charset japanese-jisx0208
4056 is encoded in 2-byte but two position-codes are divided and shifted
4057 so that it fit in the range below.
4058
4059 --- CODE RANGE of SJIS ---
4060 (character set) (range)
4061 ASCII 0x00 .. 0x7F
4062 KATAKANA-JISX0201 0xA0 .. 0xDF
4063 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
4064 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4065 -------------------------------
4066
4067 */
4068
4069 /* BIG5 is a coding system encoding two character sets: ASCII and
4070 Big5. An ASCII character is encoded as is. Big5 is a two-byte
4071 character set and is encoded in two-byte.
4072
4073 --- CODE RANGE of BIG5 ---
4074 (character set) (range)
4075 ASCII 0x00 .. 0x7F
4076 Big5 (1st byte) 0xA1 .. 0xFE
4077 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4078 --------------------------
4079
4080 */
4081
4082 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4083 Check if a text is encoded in SJIS. If it is, return
4084 CATEGORY_MASK_SJIS, else return 0. */
4085
4086 static int
4087 detect_coding_sjis (coding, detect_info)
4088 struct coding_system *coding;
4089 struct coding_detection_info *detect_info;
4090 {
4091 const unsigned char *src = coding->source, *src_base;
4092 const unsigned char *src_end = coding->source + coding->src_bytes;
4093 int multibytep = coding->src_multibyte;
4094 int consumed_chars = 0;
4095 int found = 0;
4096 int c;
4097
4098 detect_info->checked |= CATEGORY_MASK_SJIS;
4099 /* A coding system of this category is always ASCII compatible. */
4100 src += coding->head_ascii;
4101
4102 while (1)
4103 {
4104 src_base = src;
4105 ONE_MORE_BYTE (c);
4106 if (c < 0x80)
4107 continue;
4108 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4109 {
4110 ONE_MORE_BYTE (c);
4111 if (c < 0x40 || c == 0x7F || c > 0xFC)
4112 break;
4113 found = CATEGORY_MASK_SJIS;
4114 }
4115 else if (c >= 0xA0 && c < 0xE0)
4116 found = CATEGORY_MASK_SJIS;
4117 else
4118 break;
4119 }
4120 detect_info->rejected |= CATEGORY_MASK_SJIS;
4121 return 0;
4122
4123 no_more_source:
4124 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4125 {
4126 detect_info->rejected |= CATEGORY_MASK_SJIS;
4127 return 0;
4128 }
4129 detect_info->found |= found;
4130 return 1;
4131 }
4132
4133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4134 Check if a text is encoded in BIG5. If it is, return
4135 CATEGORY_MASK_BIG5, else return 0. */
4136
4137 static int
4138 detect_coding_big5 (coding, detect_info)
4139 struct coding_system *coding;
4140 struct coding_detection_info *detect_info;
4141 {
4142 const unsigned char *src = coding->source, *src_base;
4143 const unsigned char *src_end = coding->source + coding->src_bytes;
4144 int multibytep = coding->src_multibyte;
4145 int consumed_chars = 0;
4146 int found = 0;
4147 int c;
4148
4149 detect_info->checked |= CATEGORY_MASK_BIG5;
4150 /* A coding system of this category is always ASCII compatible. */
4151 src += coding->head_ascii;
4152
4153 while (1)
4154 {
4155 src_base = src;
4156 ONE_MORE_BYTE (c);
4157 if (c < 0x80)
4158 continue;
4159 if (c >= 0xA1)
4160 {
4161 ONE_MORE_BYTE (c);
4162 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4163 return 0;
4164 found = CATEGORY_MASK_BIG5;
4165 }
4166 else
4167 break;
4168 }
4169 detect_info->rejected |= CATEGORY_MASK_BIG5;
4170 return 0;
4171
4172 no_more_source:
4173 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4174 {
4175 detect_info->rejected |= CATEGORY_MASK_BIG5;
4176 return 0;
4177 }
4178 detect_info->found |= found;
4179 return 1;
4180 }
4181
4182 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4183 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4184
4185 static void
4186 decode_coding_sjis (coding)
4187 struct coding_system *coding;
4188 {
4189 const unsigned char *src = coding->source + coding->consumed;
4190 const unsigned char *src_end = coding->source + coding->src_bytes;
4191 const unsigned char *src_base;
4192 int *charbuf = coding->charbuf + coding->charbuf_used;
4193 int *charbuf_end
4194 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4195 int consumed_chars = 0, consumed_chars_base;
4196 int multibytep = coding->src_multibyte;
4197 struct charset *charset_roman, *charset_kanji, *charset_kana;
4198 struct charset *charset_kanji2;
4199 Lisp_Object attrs, charset_list, val;
4200 int char_offset = coding->produced_char;
4201 int last_offset = char_offset;
4202 int last_id = charset_ascii;
4203 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4204 int byte_after_cr = -1;
4205
4206 CODING_GET_INFO (coding, attrs, charset_list);
4207
4208 val = charset_list;
4209 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4210 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4211 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4212 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4213
4214 while (1)
4215 {
4216 int c, c1;
4217 struct charset *charset;
4218
4219 src_base = src;
4220 consumed_chars_base = consumed_chars;
4221
4222 if (charbuf >= charbuf_end)
4223 break;
4224
4225 if (byte_after_cr >= 0)
4226 c = byte_after_cr, byte_after_cr = -1;
4227 else
4228 ONE_MORE_BYTE (c);
4229 if (c < 0)
4230 goto invalid_code;
4231 if (c < 0x80)
4232 {
4233 if (eol_crlf && c == '\r')
4234 ONE_MORE_BYTE (byte_after_cr);
4235 charset = charset_roman;
4236 }
4237 else if (c == 0x80 || c == 0xA0)
4238 goto invalid_code;
4239 else if (c >= 0xA1 && c <= 0xDF)
4240 {
4241 /* SJIS -> JISX0201-Kana */
4242 c &= 0x7F;
4243 charset = charset_kana;
4244 }
4245 else if (c <= 0xEF)
4246 {
4247 /* SJIS -> JISX0208 */
4248 ONE_MORE_BYTE (c1);
4249 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4250 goto invalid_code;
4251 c = (c << 8) | c1;
4252 SJIS_TO_JIS (c);
4253 charset = charset_kanji;
4254 }
4255 else if (c <= 0xFC && charset_kanji2)
4256 {
4257 /* SJIS -> JISX0213-2 */
4258 ONE_MORE_BYTE (c1);
4259 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4260 goto invalid_code;
4261 c = (c << 8) | c1;
4262 SJIS_TO_JIS2 (c);
4263 charset = charset_kanji2;
4264 }
4265 else
4266 goto invalid_code;
4267 if (charset->id != charset_ascii
4268 && last_id != charset->id)
4269 {
4270 if (last_id != charset_ascii)
4271 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4272 last_id = charset->id;
4273 last_offset = char_offset;
4274 }
4275 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4276 *charbuf++ = c;
4277 char_offset++;
4278 continue;
4279
4280 invalid_code:
4281 src = src_base;
4282 consumed_chars = consumed_chars_base;
4283 ONE_MORE_BYTE (c);
4284 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4285 char_offset++;
4286 coding->errors++;
4287 }
4288
4289 no_more_source:
4290 if (last_id != charset_ascii)
4291 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4292 coding->consumed_char += consumed_chars_base;
4293 coding->consumed = src_base - coding->source;
4294 coding->charbuf_used = charbuf - coding->charbuf;
4295 }
4296
4297 static void
4298 decode_coding_big5 (coding)
4299 struct coding_system *coding;
4300 {
4301 const unsigned char *src = coding->source + coding->consumed;
4302 const unsigned char *src_end = coding->source + coding->src_bytes;
4303 const unsigned char *src_base;
4304 int *charbuf = coding->charbuf + coding->charbuf_used;
4305 int *charbuf_end
4306 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4307 int consumed_chars = 0, consumed_chars_base;
4308 int multibytep = coding->src_multibyte;
4309 struct charset *charset_roman, *charset_big5;
4310 Lisp_Object attrs, charset_list, val;
4311 int char_offset = coding->produced_char;
4312 int last_offset = char_offset;
4313 int last_id = charset_ascii;
4314 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4315 int byte_after_cr = -1;
4316
4317 CODING_GET_INFO (coding, attrs, charset_list);
4318 val = charset_list;
4319 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4320 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4321
4322 while (1)
4323 {
4324 int c, c1;
4325 struct charset *charset;
4326
4327 src_base = src;
4328 consumed_chars_base = consumed_chars;
4329
4330 if (charbuf >= charbuf_end)
4331 break;
4332
4333 if (byte_after_cr >= 0)
4334 c = byte_after_cr, byte_after_cr = -1;
4335 else
4336 ONE_MORE_BYTE (c);
4337
4338 if (c < 0)
4339 goto invalid_code;
4340 if (c < 0x80)
4341 {
4342 if (eol_crlf && c == '\r')
4343 ONE_MORE_BYTE (byte_after_cr);
4344 charset = charset_roman;
4345 }
4346 else
4347 {
4348 /* BIG5 -> Big5 */
4349 if (c < 0xA1 || c > 0xFE)
4350 goto invalid_code;
4351 ONE_MORE_BYTE (c1);
4352 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4353 goto invalid_code;
4354 c = c << 8 | c1;
4355 charset = charset_big5;
4356 }
4357 if (charset->id != charset_ascii
4358 && last_id != charset->id)
4359 {
4360 if (last_id != charset_ascii)
4361 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4362 last_id = charset->id;
4363 last_offset = char_offset;
4364 }
4365 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4366 *charbuf++ = c;
4367 char_offset++;
4368 continue;
4369
4370 invalid_code:
4371 src = src_base;
4372 consumed_chars = consumed_chars_base;
4373 ONE_MORE_BYTE (c);
4374 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4375 char_offset++;
4376 coding->errors++;
4377 }
4378
4379 no_more_source:
4380 if (last_id != charset_ascii)
4381 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4382 coding->consumed_char += consumed_chars_base;
4383 coding->consumed = src_base - coding->source;
4384 coding->charbuf_used = charbuf - coding->charbuf;
4385 }
4386
4387 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4388 This function can encode charsets `ascii', `katakana-jisx0201',
4389 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4390 are sure that all these charsets are registered as official charset
4391 (i.e. do not have extended leading-codes). Characters of other
4392 charsets are produced without any encoding. If SJIS_P is 1, encode
4393 SJIS text, else encode BIG5 text. */
4394
4395 static int
4396 encode_coding_sjis (coding)
4397 struct coding_system *coding;
4398 {
4399 int multibytep = coding->dst_multibyte;
4400 int *charbuf = coding->charbuf;
4401 int *charbuf_end = charbuf + coding->charbuf_used;
4402 unsigned char *dst = coding->destination + coding->produced;
4403 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4404 int safe_room = 4;
4405 int produced_chars = 0;
4406 Lisp_Object attrs, charset_list, val;
4407 int ascii_compatible;
4408 struct charset *charset_roman, *charset_kanji, *charset_kana;
4409 struct charset *charset_kanji2;
4410 int c;
4411
4412 CODING_GET_INFO (coding, attrs, charset_list);
4413 val = charset_list;
4414 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4415 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4416 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4417 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4418
4419 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4420
4421 while (charbuf < charbuf_end)
4422 {
4423 ASSURE_DESTINATION (safe_room);
4424 c = *charbuf++;
4425 /* Now encode the character C. */
4426 if (ASCII_CHAR_P (c) && ascii_compatible)
4427 EMIT_ONE_ASCII_BYTE (c);
4428 else if (CHAR_BYTE8_P (c))
4429 {
4430 c = CHAR_TO_BYTE8 (c);
4431 EMIT_ONE_BYTE (c);
4432 }
4433 else
4434 {
4435 unsigned code;
4436 struct charset *charset = char_charset (c, charset_list, &code);
4437
4438 if (!charset)
4439 {
4440 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4441 {
4442 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4443 charset = CHARSET_FROM_ID (charset_ascii);
4444 }
4445 else
4446 {
4447 c = coding->default_char;
4448 charset = char_charset (c, charset_list, &code);
4449 }
4450 }
4451 if (code == CHARSET_INVALID_CODE (charset))
4452 abort ();
4453 if (charset == charset_kanji)
4454 {
4455 int c1, c2;
4456 JIS_TO_SJIS (code);
4457 c1 = code >> 8, c2 = code & 0xFF;
4458 EMIT_TWO_BYTES (c1, c2);
4459 }
4460 else if (charset == charset_kana)
4461 EMIT_ONE_BYTE (code | 0x80);
4462 else if (charset_kanji2 && charset == charset_kanji2)
4463 {
4464 int c1, c2;
4465
4466 c1 = code >> 8;
4467 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4468 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4469 {
4470 JIS_TO_SJIS2 (code);
4471 c1 = code >> 8, c2 = code & 0xFF;
4472 EMIT_TWO_BYTES (c1, c2);
4473 }
4474 else
4475 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4476 }
4477 else
4478 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4479 }
4480 }
4481 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4482 coding->produced_char += produced_chars;
4483 coding->produced = dst - coding->destination;
4484 return 0;
4485 }
4486
4487 static int
4488 encode_coding_big5 (coding)
4489 struct coding_system *coding;
4490 {
4491 int multibytep = coding->dst_multibyte;
4492 int *charbuf = coding->charbuf;
4493 int *charbuf_end = charbuf + coding->charbuf_used;
4494 unsigned char *dst = coding->destination + coding->produced;
4495 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4496 int safe_room = 4;
4497 int produced_chars = 0;
4498 Lisp_Object attrs, charset_list, val;
4499 int ascii_compatible;
4500 struct charset *charset_roman, *charset_big5;
4501 int c;
4502
4503 CODING_GET_INFO (coding, attrs, charset_list);
4504 val = charset_list;
4505 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4506 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4507 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4508
4509 while (charbuf < charbuf_end)
4510 {
4511 ASSURE_DESTINATION (safe_room);
4512 c = *charbuf++;
4513 /* Now encode the character C. */
4514 if (ASCII_CHAR_P (c) && ascii_compatible)
4515 EMIT_ONE_ASCII_BYTE (c);
4516 else if (CHAR_BYTE8_P (c))
4517 {
4518 c = CHAR_TO_BYTE8 (c);
4519 EMIT_ONE_BYTE (c);
4520 }
4521 else
4522 {
4523 unsigned code;
4524 struct charset *charset = char_charset (c, charset_list, &code);
4525
4526 if (! charset)
4527 {
4528 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4529 {
4530 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4531 charset = CHARSET_FROM_ID (charset_ascii);
4532 }
4533 else
4534 {
4535 c = coding->default_char;
4536 charset = char_charset (c, charset_list, &code);
4537 }
4538 }
4539 if (code == CHARSET_INVALID_CODE (charset))
4540 abort ();
4541 if (charset == charset_big5)
4542 {
4543 int c1, c2;
4544
4545 c1 = code >> 8, c2 = code & 0xFF;
4546 EMIT_TWO_BYTES (c1, c2);
4547 }
4548 else
4549 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4550 }
4551 }
4552 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4553 coding->produced_char += produced_chars;
4554 coding->produced = dst - coding->destination;
4555 return 0;
4556 }
4557
4558 \f
4559 /*** 10. CCL handlers ***/
4560
4561 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4562 Check if a text is encoded in a coding system of which
4563 encoder/decoder are written in CCL program. If it is, return
4564 CATEGORY_MASK_CCL, else return 0. */
4565
4566 static int
4567 detect_coding_ccl (coding, detect_info)
4568 struct coding_system *coding;
4569 struct coding_detection_info *detect_info;
4570 {
4571 const unsigned char *src = coding->source, *src_base;
4572 const unsigned char *src_end = coding->source + coding->src_bytes;
4573 int multibytep = coding->src_multibyte;
4574 int consumed_chars = 0;
4575 int found = 0;
4576 unsigned char *valids;
4577 int head_ascii = coding->head_ascii;
4578 Lisp_Object attrs;
4579
4580 detect_info->checked |= CATEGORY_MASK_CCL;
4581
4582 coding = &coding_categories[coding_category_ccl];
4583 valids = CODING_CCL_VALIDS (coding);
4584 attrs = CODING_ID_ATTRS (coding->id);
4585 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4586 src += head_ascii;
4587
4588 while (1)
4589 {
4590 int c;
4591
4592 src_base = src;
4593 ONE_MORE_BYTE (c);
4594 if (c < 0 || ! valids[c])
4595 break;
4596 if ((valids[c] > 1))
4597 found = CATEGORY_MASK_CCL;
4598 }
4599 detect_info->rejected |= CATEGORY_MASK_CCL;
4600 return 0;
4601
4602 no_more_source:
4603 detect_info->found |= found;
4604 return 1;
4605 }
4606
4607 static void
4608 decode_coding_ccl (coding)
4609 struct coding_system *coding;
4610 {
4611 const unsigned char *src = coding->source + coding->consumed;
4612 const unsigned char *src_end = coding->source + coding->src_bytes;
4613 int *charbuf = coding->charbuf + coding->charbuf_used;
4614 int *charbuf_end = coding->charbuf + coding->charbuf_size;
4615 int consumed_chars = 0;
4616 int multibytep = coding->src_multibyte;
4617 struct ccl_program ccl;
4618 int source_charbuf[1024];
4619 int source_byteidx[1024];
4620 Lisp_Object attrs, charset_list;
4621
4622 CODING_GET_INFO (coding, attrs, charset_list);
4623 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4624
4625 while (src < src_end)
4626 {
4627 const unsigned char *p = src;
4628 int *source, *source_end;
4629 int i = 0;
4630
4631 if (multibytep)
4632 while (i < 1024 && p < src_end)
4633 {
4634 source_byteidx[i] = p - src;
4635 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4636 }
4637 else
4638 while (i < 1024 && p < src_end)
4639 source_charbuf[i++] = *p++;
4640
4641 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4642 ccl.last_block = 1;
4643
4644 source = source_charbuf;
4645 source_end = source + i;
4646 while (source < source_end)
4647 {
4648 ccl_driver (&ccl, source, charbuf,
4649 source_end - source, charbuf_end - charbuf,
4650 charset_list);
4651 source += ccl.consumed;
4652 charbuf += ccl.produced;
4653 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4654 break;
4655 }
4656 if (source < source_end)
4657 src += source_byteidx[source - source_charbuf];
4658 else
4659 src = p;
4660 consumed_chars += source - source_charbuf;
4661
4662 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4663 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4664 break;
4665 }
4666
4667 switch (ccl.status)
4668 {
4669 case CCL_STAT_SUSPEND_BY_SRC:
4670 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4671 break;
4672 case CCL_STAT_SUSPEND_BY_DST:
4673 break;
4674 case CCL_STAT_QUIT:
4675 case CCL_STAT_INVALID_CMD:
4676 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4677 break;
4678 default:
4679 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4680 break;
4681 }
4682 coding->consumed_char += consumed_chars;
4683 coding->consumed = src - coding->source;
4684 coding->charbuf_used = charbuf - coding->charbuf;
4685 }
4686
4687 static int
4688 encode_coding_ccl (coding)
4689 struct coding_system *coding;
4690 {
4691 struct ccl_program ccl;
4692 int multibytep = coding->dst_multibyte;
4693 int *charbuf = coding->charbuf;
4694 int *charbuf_end = charbuf + coding->charbuf_used;
4695 unsigned char *dst = coding->destination + coding->produced;
4696 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4697 int destination_charbuf[1024];
4698 int i, produced_chars = 0;
4699 Lisp_Object attrs, charset_list;
4700
4701 CODING_GET_INFO (coding, attrs, charset_list);
4702 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4703
4704 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4705 ccl.dst_multibyte = coding->dst_multibyte;
4706
4707 while (charbuf < charbuf_end)
4708 {
4709 ccl_driver (&ccl, charbuf, destination_charbuf,
4710 charbuf_end - charbuf, 1024, charset_list);
4711 if (multibytep)
4712 {
4713 ASSURE_DESTINATION (ccl.produced * 2);
4714 for (i = 0; i < ccl.produced; i++)
4715 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4716 }
4717 else
4718 {
4719 ASSURE_DESTINATION (ccl.produced);
4720 for (i = 0; i < ccl.produced; i++)
4721 *dst++ = destination_charbuf[i] & 0xFF;
4722 produced_chars += ccl.produced;
4723 }
4724 charbuf += ccl.consumed;
4725 if (ccl.status == CCL_STAT_QUIT
4726 || ccl.status == CCL_STAT_INVALID_CMD)
4727 break;
4728 }
4729
4730 switch (ccl.status)
4731 {
4732 case CCL_STAT_SUSPEND_BY_SRC:
4733 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4734 break;
4735 case CCL_STAT_SUSPEND_BY_DST:
4736 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4737 break;
4738 case CCL_STAT_QUIT:
4739 case CCL_STAT_INVALID_CMD:
4740 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4741 break;
4742 default:
4743 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4744 break;
4745 }
4746
4747 coding->produced_char += produced_chars;
4748 coding->produced = dst - coding->destination;
4749 return 0;
4750 }
4751
4752
4753 \f
4754 /*** 10, 11. no-conversion handlers ***/
4755
4756 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4757
4758 static void
4759 decode_coding_raw_text (coding)
4760 struct coding_system *coding;
4761 {
4762 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4763
4764 coding->chars_at_source = 1;
4765 coding->consumed_char = coding->src_chars;
4766 coding->consumed = coding->src_bytes;
4767 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4768 {
4769 coding->consumed_char--;
4770 coding->consumed--;
4771 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4772 }
4773 else
4774 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4775 }
4776
4777 static int
4778 encode_coding_raw_text (coding)
4779 struct coding_system *coding;
4780 {
4781 int multibytep = coding->dst_multibyte;
4782 int *charbuf = coding->charbuf;
4783 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4784 unsigned char *dst = coding->destination + coding->produced;
4785 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4786 int produced_chars = 0;
4787 int c;
4788
4789 if (multibytep)
4790 {
4791 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4792
4793 if (coding->src_multibyte)
4794 while (charbuf < charbuf_end)
4795 {
4796 ASSURE_DESTINATION (safe_room);
4797 c = *charbuf++;
4798 if (ASCII_CHAR_P (c))
4799 EMIT_ONE_ASCII_BYTE (c);
4800 else if (CHAR_BYTE8_P (c))
4801 {
4802 c = CHAR_TO_BYTE8 (c);
4803 EMIT_ONE_BYTE (c);
4804 }
4805 else
4806 {
4807 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4808
4809 CHAR_STRING_ADVANCE (c, p1);
4810 while (p0 < p1)
4811 {
4812 EMIT_ONE_BYTE (*p0);
4813 p0++;
4814 }
4815 }
4816 }
4817 else
4818 while (charbuf < charbuf_end)
4819 {
4820 ASSURE_DESTINATION (safe_room);
4821 c = *charbuf++;
4822 EMIT_ONE_BYTE (c);
4823 }
4824 }
4825 else
4826 {
4827 if (coding->src_multibyte)
4828 {
4829 int safe_room = MAX_MULTIBYTE_LENGTH;
4830
4831 while (charbuf < charbuf_end)
4832 {
4833 ASSURE_DESTINATION (safe_room);
4834 c = *charbuf++;
4835 if (ASCII_CHAR_P (c))
4836 *dst++ = c;
4837 else if (CHAR_BYTE8_P (c))
4838 *dst++ = CHAR_TO_BYTE8 (c);
4839 else
4840 CHAR_STRING_ADVANCE (c, dst);
4841 produced_chars++;
4842 }
4843 }
4844 else
4845 {
4846 ASSURE_DESTINATION (charbuf_end - charbuf);
4847 while (charbuf < charbuf_end && dst < dst_end)
4848 *dst++ = *charbuf++;
4849 produced_chars = dst - (coding->destination + coding->dst_bytes);
4850 }
4851 }
4852 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4853 coding->produced_char += produced_chars;
4854 coding->produced = dst - coding->destination;
4855 return 0;
4856 }
4857
4858 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4859 Check if a text is encoded in a charset-based coding system. If it
4860 is, return 1, else return 0. */
4861
4862 static int
4863 detect_coding_charset (coding, detect_info)
4864 struct coding_system *coding;
4865 struct coding_detection_info *detect_info;
4866 {
4867 const unsigned char *src = coding->source, *src_base;
4868 const unsigned char *src_end = coding->source + coding->src_bytes;
4869 int multibytep = coding->src_multibyte;
4870 int consumed_chars = 0;
4871 Lisp_Object attrs, valids;
4872 int found = 0;
4873 int head_ascii = coding->head_ascii;
4874
4875 detect_info->checked |= CATEGORY_MASK_CHARSET;
4876
4877 coding = &coding_categories[coding_category_charset];
4878 attrs = CODING_ID_ATTRS (coding->id);
4879 valids = AREF (attrs, coding_attr_charset_valids);
4880
4881 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4882 src += head_ascii;
4883
4884 while (1)
4885 {
4886 int c;
4887 Lisp_Object val;
4888 struct charset *charset;
4889 int dim, idx;
4890
4891 src_base = src;
4892 ONE_MORE_BYTE (c);
4893 if (c < 0)
4894 continue;
4895 val = AREF (valids, c);
4896 if (NILP (val))
4897 break;
4898 if (c >= 0x80)
4899 found = CATEGORY_MASK_CHARSET;
4900 if (INTEGERP (val))
4901 {
4902 charset = CHARSET_FROM_ID (XFASTINT (val));
4903 dim = CHARSET_DIMENSION (charset);
4904 for (idx = 1; idx < dim; idx++)
4905 {
4906 if (src == src_end)
4907 goto too_short;
4908 ONE_MORE_BYTE (c);
4909 if (c < charset->code_space[(dim - 1 - idx) * 2]
4910 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
4911 break;
4912 }
4913 if (idx < dim)
4914 break;
4915 }
4916 else
4917 {
4918 idx = 1;
4919 for (; CONSP (val); val = XCDR (val))
4920 {
4921 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4922 dim = CHARSET_DIMENSION (charset);
4923 while (idx < dim)
4924 {
4925 if (src == src_end)
4926 goto too_short;
4927 ONE_MORE_BYTE (c);
4928 if (c < charset->code_space[(dim - 1 - idx) * 4]
4929 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
4930 break;
4931 idx++;
4932 }
4933 if (idx == dim)
4934 {
4935 val = Qnil;
4936 break;
4937 }
4938 }
4939 if (CONSP (val))
4940 break;
4941 }
4942 }
4943 too_short:
4944 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4945 return 0;
4946
4947 no_more_source:
4948 detect_info->found |= found;
4949 return 1;
4950 }
4951
4952 static void
4953 decode_coding_charset (coding)
4954 struct coding_system *coding;
4955 {
4956 const unsigned char *src = coding->source + coding->consumed;
4957 const unsigned char *src_end = coding->source + coding->src_bytes;
4958 const unsigned char *src_base;
4959 int *charbuf = coding->charbuf + coding->charbuf_used;
4960 int *charbuf_end
4961 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4962 int consumed_chars = 0, consumed_chars_base;
4963 int multibytep = coding->src_multibyte;
4964 Lisp_Object attrs, charset_list, valids;
4965 int char_offset = coding->produced_char;
4966 int last_offset = char_offset;
4967 int last_id = charset_ascii;
4968 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4969 int byte_after_cr = -1;
4970
4971 CODING_GET_INFO (coding, attrs, charset_list);
4972 valids = AREF (attrs, coding_attr_charset_valids);
4973
4974 while (1)
4975 {
4976 int c;
4977 Lisp_Object val;
4978 struct charset *charset;
4979 int dim;
4980 int len = 1;
4981 unsigned code;
4982
4983 src_base = src;
4984 consumed_chars_base = consumed_chars;
4985
4986 if (charbuf >= charbuf_end)
4987 break;
4988
4989 if (byte_after_cr >= 0)
4990 {
4991 c = byte_after_cr;
4992 byte_after_cr = -1;
4993 }
4994 else
4995 {
4996 ONE_MORE_BYTE (c);
4997 if (eol_crlf && c == '\r')
4998 ONE_MORE_BYTE (byte_after_cr);
4999 }
5000 if (c < 0)
5001 goto invalid_code;
5002 code = c;
5003
5004 val = AREF (valids, c);
5005 if (NILP (val))
5006 goto invalid_code;
5007 if (INTEGERP (val))
5008 {
5009 charset = CHARSET_FROM_ID (XFASTINT (val));
5010 dim = CHARSET_DIMENSION (charset);
5011 while (len < dim)
5012 {
5013 ONE_MORE_BYTE (c);
5014 code = (code << 8) | c;
5015 len++;
5016 }
5017 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5018 charset, code, c);
5019 }
5020 else
5021 {
5022 /* VAL is a list of charset IDs. It is assured that the
5023 list is sorted by charset dimensions (smaller one
5024 comes first). */
5025 while (CONSP (val))
5026 {
5027 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5028 dim = CHARSET_DIMENSION (charset);
5029 while (len < dim)
5030 {
5031 ONE_MORE_BYTE (c);
5032 code = (code << 8) | c;
5033 len++;
5034 }
5035 CODING_DECODE_CHAR (coding, src, src_base,
5036 src_end, charset, code, c);
5037 if (c >= 0)
5038 break;
5039 val = XCDR (val);
5040 }
5041 }
5042 if (c < 0)
5043 goto invalid_code;
5044 if (charset->id != charset_ascii
5045 && last_id != charset->id)
5046 {
5047 if (last_id != charset_ascii)
5048 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5049 last_id = charset->id;
5050 last_offset = char_offset;
5051 }
5052
5053 *charbuf++ = c;
5054 char_offset++;
5055 continue;
5056
5057 invalid_code:
5058 src = src_base;
5059 consumed_chars = consumed_chars_base;
5060 ONE_MORE_BYTE (c);
5061 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5062 char_offset++;
5063 coding->errors++;
5064 }
5065
5066 no_more_source:
5067 if (last_id != charset_ascii)
5068 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5069 coding->consumed_char += consumed_chars_base;
5070 coding->consumed = src_base - coding->source;
5071 coding->charbuf_used = charbuf - coding->charbuf;
5072 }
5073
5074 static int
5075 encode_coding_charset (coding)
5076 struct coding_system *coding;
5077 {
5078 int multibytep = coding->dst_multibyte;
5079 int *charbuf = coding->charbuf;
5080 int *charbuf_end = charbuf + coding->charbuf_used;
5081 unsigned char *dst = coding->destination + coding->produced;
5082 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5083 int safe_room = MAX_MULTIBYTE_LENGTH;
5084 int produced_chars = 0;
5085 Lisp_Object attrs, charset_list;
5086 int ascii_compatible;
5087 int c;
5088
5089 CODING_GET_INFO (coding, attrs, charset_list);
5090 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5091
5092 while (charbuf < charbuf_end)
5093 {
5094 struct charset *charset;
5095 unsigned code;
5096
5097 ASSURE_DESTINATION (safe_room);
5098 c = *charbuf++;
5099 if (ascii_compatible && ASCII_CHAR_P (c))
5100 EMIT_ONE_ASCII_BYTE (c);
5101 else if (CHAR_BYTE8_P (c))
5102 {
5103 c = CHAR_TO_BYTE8 (c);
5104 EMIT_ONE_BYTE (c);
5105 }
5106 else
5107 {
5108 charset = char_charset (c, charset_list, &code);
5109 if (charset)
5110 {
5111 if (CHARSET_DIMENSION (charset) == 1)
5112 EMIT_ONE_BYTE (code);
5113 else if (CHARSET_DIMENSION (charset) == 2)
5114 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5115 else if (CHARSET_DIMENSION (charset) == 3)
5116 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5117 else
5118 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5119 (code >> 8) & 0xFF, code & 0xFF);
5120 }
5121 else
5122 {
5123 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5124 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5125 else
5126 c = coding->default_char;
5127 EMIT_ONE_BYTE (c);
5128 }
5129 }
5130 }
5131
5132 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5133 coding->produced_char += produced_chars;
5134 coding->produced = dst - coding->destination;
5135 return 0;
5136 }
5137
5138 \f
5139 /*** 7. C library functions ***/
5140
5141 /* Setup coding context CODING from information about CODING_SYSTEM.
5142 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5143 CODING_SYSTEM is invalid, signal an error. */
5144
5145 void
5146 setup_coding_system (coding_system, coding)
5147 Lisp_Object coding_system;
5148 struct coding_system *coding;
5149 {
5150 Lisp_Object attrs;
5151 Lisp_Object eol_type;
5152 Lisp_Object coding_type;
5153 Lisp_Object val;
5154
5155 if (NILP (coding_system))
5156 coding_system = Qundecided;
5157
5158 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5159
5160 attrs = CODING_ID_ATTRS (coding->id);
5161 eol_type = CODING_ID_EOL_TYPE (coding->id);
5162
5163 coding->mode = 0;
5164 coding->head_ascii = -1;
5165 if (VECTORP (eol_type))
5166 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5167 | CODING_REQUIRE_DETECTION_MASK);
5168 else if (! EQ (eol_type, Qunix))
5169 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5170 | CODING_REQUIRE_ENCODING_MASK);
5171 else
5172 coding->common_flags = 0;
5173 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5174 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5175 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5176 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5177 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5178 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5179
5180 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5181 coding->max_charset_id = SCHARS (val) - 1;
5182 coding->safe_charsets = (char *) SDATA (val);
5183 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5184
5185 coding_type = CODING_ATTR_TYPE (attrs);
5186 if (EQ (coding_type, Qundecided))
5187 {
5188 coding->detector = NULL;
5189 coding->decoder = decode_coding_raw_text;
5190 coding->encoder = encode_coding_raw_text;
5191 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5192 }
5193 else if (EQ (coding_type, Qiso_2022))
5194 {
5195 int i;
5196 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5197
5198 /* Invoke graphic register 0 to plane 0. */
5199 CODING_ISO_INVOCATION (coding, 0) = 0;
5200 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5201 CODING_ISO_INVOCATION (coding, 1)
5202 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5203 /* Setup the initial status of designation. */
5204 for (i = 0; i < 4; i++)
5205 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5206 /* Not single shifting initially. */
5207 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5208 /* Beginning of buffer should also be regarded as bol. */
5209 CODING_ISO_BOL (coding) = 1;
5210 coding->detector = detect_coding_iso_2022;
5211 coding->decoder = decode_coding_iso_2022;
5212 coding->encoder = encode_coding_iso_2022;
5213 if (flags & CODING_ISO_FLAG_SAFE)
5214 coding->mode |= CODING_MODE_SAFE_ENCODING;
5215 coding->common_flags
5216 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5217 | CODING_REQUIRE_FLUSHING_MASK);
5218 if (flags & CODING_ISO_FLAG_COMPOSITION)
5219 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5220 if (flags & CODING_ISO_FLAG_DESIGNATION)
5221 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5222 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5223 {
5224 setup_iso_safe_charsets (attrs);
5225 val = CODING_ATTR_SAFE_CHARSETS (attrs);
5226 coding->max_charset_id = SCHARS (val) - 1;
5227 coding->safe_charsets = (char *) SDATA (val);
5228 }
5229 CODING_ISO_FLAGS (coding) = flags;
5230 }
5231 else if (EQ (coding_type, Qcharset))
5232 {
5233 coding->detector = detect_coding_charset;
5234 coding->decoder = decode_coding_charset;
5235 coding->encoder = encode_coding_charset;
5236 coding->common_flags
5237 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5238 }
5239 else if (EQ (coding_type, Qutf_8))
5240 {
5241 coding->detector = detect_coding_utf_8;
5242 coding->decoder = decode_coding_utf_8;
5243 coding->encoder = encode_coding_utf_8;
5244 coding->common_flags
5245 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5246 }
5247 else if (EQ (coding_type, Qutf_16))
5248 {
5249 val = AREF (attrs, coding_attr_utf_16_bom);
5250 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5251 : EQ (val, Qt) ? utf_16_with_bom
5252 : utf_16_without_bom);
5253 val = AREF (attrs, coding_attr_utf_16_endian);
5254 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5255 : utf_16_little_endian);
5256 CODING_UTF_16_SURROGATE (coding) = 0;
5257 coding->detector = detect_coding_utf_16;
5258 coding->decoder = decode_coding_utf_16;
5259 coding->encoder = encode_coding_utf_16;
5260 coding->common_flags
5261 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5262 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5263 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5264 }
5265 else if (EQ (coding_type, Qccl))
5266 {
5267 coding->detector = detect_coding_ccl;
5268 coding->decoder = decode_coding_ccl;
5269 coding->encoder = encode_coding_ccl;
5270 coding->common_flags
5271 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5272 | CODING_REQUIRE_FLUSHING_MASK);
5273 }
5274 else if (EQ (coding_type, Qemacs_mule))
5275 {
5276 coding->detector = detect_coding_emacs_mule;
5277 coding->decoder = decode_coding_emacs_mule;
5278 coding->encoder = encode_coding_emacs_mule;
5279 coding->common_flags
5280 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5281 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5282 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5283 {
5284 Lisp_Object tail, safe_charsets;
5285 int max_charset_id = 0;
5286
5287 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5288 tail = XCDR (tail))
5289 if (max_charset_id < XFASTINT (XCAR (tail)))
5290 max_charset_id = XFASTINT (XCAR (tail));
5291 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5292 make_number (255));
5293 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5294 tail = XCDR (tail))
5295 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5296 coding->max_charset_id = max_charset_id;
5297 coding->safe_charsets = (char *) SDATA (safe_charsets);
5298 }
5299 }
5300 else if (EQ (coding_type, Qshift_jis))
5301 {
5302 coding->detector = detect_coding_sjis;
5303 coding->decoder = decode_coding_sjis;
5304 coding->encoder = encode_coding_sjis;
5305 coding->common_flags
5306 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5307 }
5308 else if (EQ (coding_type, Qbig5))
5309 {
5310 coding->detector = detect_coding_big5;
5311 coding->decoder = decode_coding_big5;
5312 coding->encoder = encode_coding_big5;
5313 coding->common_flags
5314 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5315 }
5316 else /* EQ (coding_type, Qraw_text) */
5317 {
5318 coding->detector = NULL;
5319 coding->decoder = decode_coding_raw_text;
5320 coding->encoder = encode_coding_raw_text;
5321 if (! EQ (eol_type, Qunix))
5322 {
5323 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5324 if (! VECTORP (eol_type))
5325 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5326 }
5327
5328 }
5329
5330 return;
5331 }
5332
5333 /* Return a list of charsets supported by CODING. */
5334
5335 Lisp_Object
5336 coding_charset_list (coding)
5337 struct coding_system *coding;
5338 {
5339 Lisp_Object attrs, charset_list;
5340
5341 CODING_GET_INFO (coding, attrs, charset_list);
5342 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5343 {
5344 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5345
5346 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5347 charset_list = Viso_2022_charset_list;
5348 }
5349 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5350 {
5351 charset_list = Vemacs_mule_charset_list;
5352 }
5353 return charset_list;
5354 }
5355
5356
5357 /* Return raw-text or one of its subsidiaries that has the same
5358 eol_type as CODING-SYSTEM. */
5359
5360 Lisp_Object
5361 raw_text_coding_system (coding_system)
5362 Lisp_Object coding_system;
5363 {
5364 Lisp_Object spec, attrs;
5365 Lisp_Object eol_type, raw_text_eol_type;
5366
5367 if (NILP (coding_system))
5368 return Qraw_text;
5369 spec = CODING_SYSTEM_SPEC (coding_system);
5370 attrs = AREF (spec, 0);
5371
5372 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5373 return coding_system;
5374
5375 eol_type = AREF (spec, 2);
5376 if (VECTORP (eol_type))
5377 return Qraw_text;
5378 spec = CODING_SYSTEM_SPEC (Qraw_text);
5379 raw_text_eol_type = AREF (spec, 2);
5380 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5381 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5382 : AREF (raw_text_eol_type, 2));
5383 }
5384
5385
5386 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5387 does, return one of the subsidiary that has the same eol-spec as
5388 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5389 inherit end-of-line format from the system's setting
5390 (system_eol_type). */
5391
5392 Lisp_Object
5393 coding_inherit_eol_type (coding_system, parent)
5394 Lisp_Object coding_system, parent;
5395 {
5396 Lisp_Object spec, eol_type;
5397
5398 if (NILP (coding_system))
5399 coding_system = Qraw_text;
5400 spec = CODING_SYSTEM_SPEC (coding_system);
5401 eol_type = AREF (spec, 2);
5402 if (VECTORP (eol_type))
5403 {
5404 Lisp_Object parent_eol_type;
5405
5406 if (! NILP (parent))
5407 {
5408 Lisp_Object parent_spec;
5409
5410 parent_spec = CODING_SYSTEM_SPEC (parent);
5411 parent_eol_type = AREF (parent_spec, 2);
5412 }
5413 else
5414 parent_eol_type = system_eol_type;
5415 if (EQ (parent_eol_type, Qunix))
5416 coding_system = AREF (eol_type, 0);
5417 else if (EQ (parent_eol_type, Qdos))
5418 coding_system = AREF (eol_type, 1);
5419 else if (EQ (parent_eol_type, Qmac))
5420 coding_system = AREF (eol_type, 2);
5421 }
5422 return coding_system;
5423 }
5424
5425 /* Emacs has a mechanism to automatically detect a coding system if it
5426 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5427 it's impossible to distinguish some coding systems accurately
5428 because they use the same range of codes. So, at first, coding
5429 systems are categorized into 7, those are:
5430
5431 o coding-category-emacs-mule
5432
5433 The category for a coding system which has the same code range
5434 as Emacs' internal format. Assigned the coding-system (Lisp
5435 symbol) `emacs-mule' by default.
5436
5437 o coding-category-sjis
5438
5439 The category for a coding system which has the same code range
5440 as SJIS. Assigned the coding-system (Lisp
5441 symbol) `japanese-shift-jis' by default.
5442
5443 o coding-category-iso-7
5444
5445 The category for a coding system which has the same code range
5446 as ISO2022 of 7-bit environment. This doesn't use any locking
5447 shift and single shift functions. This can encode/decode all
5448 charsets. Assigned the coding-system (Lisp symbol)
5449 `iso-2022-7bit' by default.
5450
5451 o coding-category-iso-7-tight
5452
5453 Same as coding-category-iso-7 except that this can
5454 encode/decode only the specified charsets.
5455
5456 o coding-category-iso-8-1
5457
5458 The category for a coding system which has the same code range
5459 as ISO2022 of 8-bit environment and graphic plane 1 used only
5460 for DIMENSION1 charset. This doesn't use any locking shift
5461 and single shift functions. Assigned the coding-system (Lisp
5462 symbol) `iso-latin-1' by default.
5463
5464 o coding-category-iso-8-2
5465
5466 The category for a coding system which has the same code range
5467 as ISO2022 of 8-bit environment and graphic plane 1 used only
5468 for DIMENSION2 charset. This doesn't use any locking shift
5469 and single shift functions. Assigned the coding-system (Lisp
5470 symbol) `japanese-iso-8bit' by default.
5471
5472 o coding-category-iso-7-else
5473
5474 The category for a coding system which has the same code range
5475 as ISO2022 of 7-bit environemnt but uses locking shift or
5476 single shift functions. Assigned the coding-system (Lisp
5477 symbol) `iso-2022-7bit-lock' by default.
5478
5479 o coding-category-iso-8-else
5480
5481 The category for a coding system which has the same code range
5482 as ISO2022 of 8-bit environemnt but uses locking shift or
5483 single shift functions. Assigned the coding-system (Lisp
5484 symbol) `iso-2022-8bit-ss2' by default.
5485
5486 o coding-category-big5
5487
5488 The category for a coding system which has the same code range
5489 as BIG5. Assigned the coding-system (Lisp symbol)
5490 `cn-big5' by default.
5491
5492 o coding-category-utf-8
5493
5494 The category for a coding system which has the same code range
5495 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5496 symbol) `utf-8' by default.
5497
5498 o coding-category-utf-16-be
5499
5500 The category for a coding system in which a text has an
5501 Unicode signature (cf. Unicode Standard) in the order of BIG
5502 endian at the head. Assigned the coding-system (Lisp symbol)
5503 `utf-16-be' by default.
5504
5505 o coding-category-utf-16-le
5506
5507 The category for a coding system in which a text has an
5508 Unicode signature (cf. Unicode Standard) in the order of
5509 LITTLE endian at the head. Assigned the coding-system (Lisp
5510 symbol) `utf-16-le' by default.
5511
5512 o coding-category-ccl
5513
5514 The category for a coding system of which encoder/decoder is
5515 written in CCL programs. The default value is nil, i.e., no
5516 coding system is assigned.
5517
5518 o coding-category-binary
5519
5520 The category for a coding system not categorized in any of the
5521 above. Assigned the coding-system (Lisp symbol)
5522 `no-conversion' by default.
5523
5524 Each of them is a Lisp symbol and the value is an actual
5525 `coding-system's (this is also a Lisp symbol) assigned by a user.
5526 What Emacs does actually is to detect a category of coding system.
5527 Then, it uses a `coding-system' assigned to it. If Emacs can't
5528 decide only one possible category, it selects a category of the
5529 highest priority. Priorities of categories are also specified by a
5530 user in a Lisp variable `coding-category-list'.
5531
5532 */
5533
5534 #define EOL_SEEN_NONE 0
5535 #define EOL_SEEN_LF 1
5536 #define EOL_SEEN_CR 2
5537 #define EOL_SEEN_CRLF 4
5538
5539 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5540 SOURCE is encoded. If CATEGORY is one of
5541 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5542 two-byte, else they are encoded by one-byte.
5543
5544 Return one of EOL_SEEN_XXX. */
5545
5546 #define MAX_EOL_CHECK_COUNT 3
5547
5548 static int
5549 detect_eol (source, src_bytes, category)
5550 const unsigned char *source;
5551 EMACS_INT src_bytes;
5552 enum coding_category category;
5553 {
5554 const unsigned char *src = source, *src_end = src + src_bytes;
5555 unsigned char c;
5556 int total = 0;
5557 int eol_seen = EOL_SEEN_NONE;
5558
5559 if ((1 << category) & CATEGORY_MASK_UTF_16)
5560 {
5561 int msb, lsb;
5562
5563 msb = category == (coding_category_utf_16_le
5564 | coding_category_utf_16_le_nosig);
5565 lsb = 1 - msb;
5566
5567 while (src + 1 < src_end)
5568 {
5569 c = src[lsb];
5570 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5571 {
5572 int this_eol;
5573
5574 if (c == '\n')
5575 this_eol = EOL_SEEN_LF;
5576 else if (src + 3 >= src_end
5577 || src[msb + 2] != 0
5578 || src[lsb + 2] != '\n')
5579 this_eol = EOL_SEEN_CR;
5580 else
5581 this_eol = EOL_SEEN_CRLF;
5582
5583 if (eol_seen == EOL_SEEN_NONE)
5584 /* This is the first end-of-line. */
5585 eol_seen = this_eol;
5586 else if (eol_seen != this_eol)
5587 {
5588 /* The found type is different from what found before. */
5589 eol_seen = EOL_SEEN_LF;
5590 break;
5591 }
5592 if (++total == MAX_EOL_CHECK_COUNT)
5593 break;
5594 }
5595 src += 2;
5596 }
5597 }
5598 else
5599 {
5600 while (src < src_end)
5601 {
5602 c = *src++;
5603 if (c == '\n' || c == '\r')
5604 {
5605 int this_eol;
5606
5607 if (c == '\n')
5608 this_eol = EOL_SEEN_LF;
5609 else if (src >= src_end || *src != '\n')
5610 this_eol = EOL_SEEN_CR;
5611 else
5612 this_eol = EOL_SEEN_CRLF, src++;
5613
5614 if (eol_seen == EOL_SEEN_NONE)
5615 /* This is the first end-of-line. */
5616 eol_seen = this_eol;
5617 else if (eol_seen != this_eol)
5618 {
5619 /* The found type is different from what found before. */
5620 eol_seen = EOL_SEEN_LF;
5621 break;
5622 }
5623 if (++total == MAX_EOL_CHECK_COUNT)
5624 break;
5625 }
5626 }
5627 }
5628 return eol_seen;
5629 }
5630
5631
5632 static Lisp_Object
5633 adjust_coding_eol_type (coding, eol_seen)
5634 struct coding_system *coding;
5635 int eol_seen;
5636 {
5637 Lisp_Object eol_type;
5638
5639 eol_type = CODING_ID_EOL_TYPE (coding->id);
5640 if (eol_seen & EOL_SEEN_LF)
5641 {
5642 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5643 eol_type = Qunix;
5644 }
5645 else if (eol_seen & EOL_SEEN_CRLF)
5646 {
5647 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5648 eol_type = Qdos;
5649 }
5650 else if (eol_seen & EOL_SEEN_CR)
5651 {
5652 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5653 eol_type = Qmac;
5654 }
5655 return eol_type;
5656 }
5657
5658 /* Detect how a text specified in CODING is encoded. If a coding
5659 system is detected, update fields of CODING by the detected coding
5660 system. */
5661
5662 void
5663 detect_coding (coding)
5664 struct coding_system *coding;
5665 {
5666 const unsigned char *src, *src_end;
5667
5668 coding->consumed = coding->consumed_char = 0;
5669 coding->produced = coding->produced_char = 0;
5670 coding_set_source (coding);
5671
5672 src_end = coding->source + coding->src_bytes;
5673
5674 /* If we have not yet decided the text encoding type, detect it
5675 now. */
5676 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5677 {
5678 int c, i;
5679 struct coding_detection_info detect_info;
5680
5681 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5682 for (i = 0, src = coding->source; src < src_end; i++, src++)
5683 {
5684 c = *src;
5685 if (c & 0x80)
5686 break;
5687 if (c < 0x20
5688 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5689 && ! inhibit_iso_escape_detection
5690 && ! detect_info.checked)
5691 {
5692 coding->head_ascii = src - (coding->source + coding->consumed);
5693 if (detect_coding_iso_2022 (coding, &detect_info))
5694 {
5695 /* We have scanned the whole data. */
5696 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5697 /* We didn't find an 8-bit code. */
5698 src = src_end;
5699 break;
5700 }
5701 }
5702 }
5703 coding->head_ascii = src - (coding->source + coding->consumed);
5704
5705 if (coding->head_ascii < coding->src_bytes
5706 || detect_info.found)
5707 {
5708 enum coding_category category;
5709 struct coding_system *this;
5710
5711 if (coding->head_ascii == coding->src_bytes)
5712 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5713 for (i = 0; i < coding_category_raw_text; i++)
5714 {
5715 category = coding_priorities[i];
5716 this = coding_categories + category;
5717 if (detect_info.found & (1 << category))
5718 break;
5719 }
5720 else
5721 for (i = 0; i < coding_category_raw_text; i++)
5722 {
5723 category = coding_priorities[i];
5724 this = coding_categories + category;
5725 if (this->id < 0)
5726 {
5727 /* No coding system of this category is defined. */
5728 detect_info.rejected |= (1 << category);
5729 }
5730 else if (category >= coding_category_raw_text)
5731 continue;
5732 else if (detect_info.checked & (1 << category))
5733 {
5734 if (detect_info.found & (1 << category))
5735 break;
5736 }
5737 else if ((*(this->detector)) (coding, &detect_info)
5738 && detect_info.found & (1 << category))
5739 {
5740 if (category == coding_category_utf_16_auto)
5741 {
5742 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5743 category = coding_category_utf_16_le;
5744 else
5745 category = coding_category_utf_16_be;
5746 }
5747 break;
5748 }
5749 }
5750
5751 if (i < coding_category_raw_text)
5752 setup_coding_system (CODING_ID_NAME (this->id), coding);
5753 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5754 setup_coding_system (Qraw_text, coding);
5755 else if (detect_info.rejected)
5756 for (i = 0; i < coding_category_raw_text; i++)
5757 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5758 {
5759 this = coding_categories + coding_priorities[i];
5760 setup_coding_system (CODING_ID_NAME (this->id), coding);
5761 break;
5762 }
5763 }
5764 }
5765 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5766 == coding_category_utf_16_auto)
5767 {
5768 Lisp_Object coding_systems;
5769 struct coding_detection_info detect_info;
5770
5771 coding_systems
5772 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5773 detect_info.found = detect_info.rejected = 0;
5774 if (CONSP (coding_systems)
5775 && detect_coding_utf_16 (coding, &detect_info))
5776 {
5777 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5778 setup_coding_system (XCAR (coding_systems), coding);
5779 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5780 setup_coding_system (XCDR (coding_systems), coding);
5781 }
5782 }
5783 }
5784
5785
5786 static void
5787 decode_eol (coding)
5788 struct coding_system *coding;
5789 {
5790 Lisp_Object eol_type;
5791 unsigned char *p, *pbeg, *pend;
5792
5793 eol_type = CODING_ID_EOL_TYPE (coding->id);
5794 if (EQ (eol_type, Qunix))
5795 return;
5796
5797 if (NILP (coding->dst_object))
5798 pbeg = coding->destination;
5799 else
5800 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5801 pend = pbeg + coding->produced;
5802
5803 if (VECTORP (eol_type))
5804 {
5805 int eol_seen = EOL_SEEN_NONE;
5806
5807 for (p = pbeg; p < pend; p++)
5808 {
5809 if (*p == '\n')
5810 eol_seen |= EOL_SEEN_LF;
5811 else if (*p == '\r')
5812 {
5813 if (p + 1 < pend && *(p + 1) == '\n')
5814 {
5815 eol_seen |= EOL_SEEN_CRLF;
5816 p++;
5817 }
5818 else
5819 eol_seen |= EOL_SEEN_CR;
5820 }
5821 }
5822 if (eol_seen != EOL_SEEN_NONE
5823 && eol_seen != EOL_SEEN_LF
5824 && eol_seen != EOL_SEEN_CRLF
5825 && eol_seen != EOL_SEEN_CR)
5826 eol_seen = EOL_SEEN_LF;
5827 if (eol_seen != EOL_SEEN_NONE)
5828 eol_type = adjust_coding_eol_type (coding, eol_seen);
5829 }
5830
5831 if (EQ (eol_type, Qmac))
5832 {
5833 for (p = pbeg; p < pend; p++)
5834 if (*p == '\r')
5835 *p = '\n';
5836 }
5837 else if (EQ (eol_type, Qdos))
5838 {
5839 int n = 0;
5840
5841 if (NILP (coding->dst_object))
5842 {
5843 /* Start deleting '\r' from the tail to minimize the memory
5844 movement. */
5845 for (p = pend - 2; p >= pbeg; p--)
5846 if (*p == '\r')
5847 {
5848 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5849 n++;
5850 }
5851 }
5852 else
5853 {
5854 int pos_byte = coding->dst_pos_byte;
5855 int pos = coding->dst_pos;
5856 int pos_end = pos + coding->produced_char - 1;
5857
5858 while (pos < pos_end)
5859 {
5860 p = BYTE_POS_ADDR (pos_byte);
5861 if (*p == '\r' && p[1] == '\n')
5862 {
5863 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5864 n++;
5865 pos_end--;
5866 }
5867 pos++;
5868 if (coding->dst_multibyte)
5869 pos_byte += BYTES_BY_CHAR_HEAD (*p);
5870 else
5871 pos_byte++;
5872 }
5873 }
5874 coding->produced -= n;
5875 coding->produced_char -= n;
5876 }
5877 }
5878
5879
5880 /* Return a translation table (or list of them) from coding system
5881 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5882 decoding (ENCODEP is zero). */
5883
5884 static Lisp_Object
5885 get_translation_table (attrs, encodep, max_lookup)
5886 Lisp_Object attrs;
5887 int encodep, *max_lookup;
5888 {
5889 Lisp_Object standard, translation_table;
5890 Lisp_Object val;
5891
5892 if (encodep)
5893 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5894 standard = Vstandard_translation_table_for_encode;
5895 else
5896 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5897 standard = Vstandard_translation_table_for_decode;
5898 if (NILP (translation_table))
5899 translation_table = standard;
5900 else
5901 {
5902 if (SYMBOLP (translation_table))
5903 translation_table = Fget (translation_table, Qtranslation_table);
5904 else if (CONSP (translation_table))
5905 {
5906 translation_table = Fcopy_sequence (translation_table);
5907 for (val = translation_table; CONSP (val); val = XCDR (val))
5908 if (SYMBOLP (XCAR (val)))
5909 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5910 }
5911 if (CHAR_TABLE_P (standard))
5912 {
5913 if (CONSP (translation_table))
5914 translation_table = nconc2 (translation_table,
5915 Fcons (standard, Qnil));
5916 else
5917 translation_table = Fcons (translation_table,
5918 Fcons (standard, Qnil));
5919 }
5920 }
5921
5922 if (max_lookup)
5923 {
5924 *max_lookup = 1;
5925 if (CHAR_TABLE_P (translation_table)
5926 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5927 {
5928 val = XCHAR_TABLE (translation_table)->extras[1];
5929 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5930 *max_lookup = XFASTINT (val);
5931 }
5932 else if (CONSP (translation_table))
5933 {
5934 Lisp_Object tail, val;
5935
5936 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5937 if (CHAR_TABLE_P (XCAR (tail))
5938 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5939 {
5940 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5941 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5942 *max_lookup = XFASTINT (val);
5943 }
5944 }
5945 }
5946 return translation_table;
5947 }
5948
5949 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
5950 do { \
5951 trans = Qnil; \
5952 if (CHAR_TABLE_P (table)) \
5953 { \
5954 trans = CHAR_TABLE_REF (table, c); \
5955 if (CHARACTERP (trans)) \
5956 c = XFASTINT (trans), trans = Qnil; \
5957 } \
5958 else if (CONSP (table)) \
5959 { \
5960 Lisp_Object tail; \
5961 \
5962 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
5963 if (CHAR_TABLE_P (XCAR (tail))) \
5964 { \
5965 trans = CHAR_TABLE_REF (XCAR (tail), c); \
5966 if (CHARACTERP (trans)) \
5967 c = XFASTINT (trans), trans = Qnil; \
5968 else if (! NILP (trans)) \
5969 break; \
5970 } \
5971 } \
5972 } while (0)
5973
5974
5975 static Lisp_Object
5976 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5977 Lisp_Object val;
5978 int *buf, *buf_end;
5979 int last_block;
5980 int *from_nchars, *to_nchars;
5981 {
5982 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
5983 [TO-CHAR ...]. */
5984 if (CONSP (val))
5985 {
5986 Lisp_Object from, tail;
5987 int i, len;
5988
5989 for (tail = val; CONSP (tail); tail = XCDR (tail))
5990 {
5991 val = XCAR (tail);
5992 from = XCAR (val);
5993 len = ASIZE (from);
5994 for (i = 0; i < len; i++)
5995 {
5996 if (buf + i == buf_end)
5997 {
5998 if (! last_block)
5999 return Qt;
6000 break;
6001 }
6002 if (XINT (AREF (from, i)) != buf[i])
6003 break;
6004 }
6005 if (i == len)
6006 {
6007 val = XCDR (val);
6008 *from_nchars = len;
6009 break;
6010 }
6011 }
6012 if (! CONSP (tail))
6013 return Qnil;
6014 }
6015 if (VECTORP (val))
6016 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
6017 else
6018 *buf = XINT (val);
6019 return val;
6020 }
6021
6022
6023 static int
6024 produce_chars (coding, translation_table, last_block)
6025 struct coding_system *coding;
6026 Lisp_Object translation_table;
6027 int last_block;
6028 {
6029 unsigned char *dst = coding->destination + coding->produced;
6030 unsigned char *dst_end = coding->destination + coding->dst_bytes;
6031 EMACS_INT produced;
6032 EMACS_INT produced_chars = 0;
6033 int carryover = 0;
6034
6035 if (! coding->chars_at_source)
6036 {
6037 /* Source characters are in coding->charbuf. */
6038 int *buf = coding->charbuf;
6039 int *buf_end = buf + coding->charbuf_used;
6040
6041 if (EQ (coding->src_object, coding->dst_object))
6042 {
6043 coding_set_source (coding);
6044 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6045 }
6046
6047 while (buf < buf_end)
6048 {
6049 int c = *buf, i;
6050
6051 if (c >= 0)
6052 {
6053 int from_nchars = 1, to_nchars = 1;
6054 Lisp_Object trans = Qnil;
6055
6056 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6057 if (! NILP (trans))
6058 {
6059 trans = get_translation (trans, buf, buf_end, last_block,
6060 &from_nchars, &to_nchars);
6061 if (EQ (trans, Qt))
6062 break;
6063 c = *buf;
6064 }
6065
6066 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6067 {
6068 dst = alloc_destination (coding,
6069 buf_end - buf
6070 + MAX_MULTIBYTE_LENGTH * to_nchars,
6071 dst);
6072 if (EQ (coding->src_object, coding->dst_object))
6073 {
6074 coding_set_source (coding);
6075 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6076 }
6077 else
6078 dst_end = coding->destination + coding->dst_bytes;
6079 }
6080
6081 for (i = 0; i < to_nchars; i++)
6082 {
6083 if (i > 0)
6084 c = XINT (AREF (trans, i));
6085 if (coding->dst_multibyte
6086 || ! CHAR_BYTE8_P (c))
6087 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6088 else
6089 *dst++ = CHAR_TO_BYTE8 (c);
6090 }
6091 produced_chars += to_nchars;
6092 *buf++ = to_nchars;
6093 while (--from_nchars > 0)
6094 *buf++ = 0;
6095 }
6096 else
6097 /* This is an annotation datum. (-C) is the length. */
6098 buf += -c;
6099 }
6100 carryover = buf_end - buf;
6101 }
6102 else
6103 {
6104 /* Source characters are at coding->source. */
6105 const unsigned char *src = coding->source;
6106 const unsigned char *src_end = src + coding->consumed;
6107
6108 if (EQ (coding->dst_object, coding->src_object))
6109 dst_end = (unsigned char *) src;
6110 if (coding->src_multibyte != coding->dst_multibyte)
6111 {
6112 if (coding->src_multibyte)
6113 {
6114 int multibytep = 1;
6115 EMACS_INT consumed_chars;
6116
6117 while (1)
6118 {
6119 const unsigned char *src_base = src;
6120 int c;
6121
6122 ONE_MORE_BYTE (c);
6123 if (dst == dst_end)
6124 {
6125 if (EQ (coding->src_object, coding->dst_object))
6126 dst_end = (unsigned char *) src;
6127 if (dst == dst_end)
6128 {
6129 EMACS_INT offset = src - coding->source;
6130
6131 dst = alloc_destination (coding, src_end - src + 1,
6132 dst);
6133 dst_end = coding->destination + coding->dst_bytes;
6134 coding_set_source (coding);
6135 src = coding->source + offset;
6136 src_end = coding->source + coding->src_bytes;
6137 if (EQ (coding->src_object, coding->dst_object))
6138 dst_end = (unsigned char *) src;
6139 }
6140 }
6141 *dst++ = c;
6142 produced_chars++;
6143 }
6144 no_more_source:
6145 ;
6146 }
6147 else
6148 while (src < src_end)
6149 {
6150 int multibytep = 1;
6151 int c = *src++;
6152
6153 if (dst >= dst_end - 1)
6154 {
6155 if (EQ (coding->src_object, coding->dst_object))
6156 dst_end = (unsigned char *) src;
6157 if (dst >= dst_end - 1)
6158 {
6159 EMACS_INT offset = src - coding->source;
6160 EMACS_INT more_bytes;
6161
6162 if (EQ (coding->src_object, coding->dst_object))
6163 more_bytes = ((src_end - src) / 2) + 2;
6164 else
6165 more_bytes = src_end - src + 2;
6166 dst = alloc_destination (coding, more_bytes, dst);
6167 dst_end = coding->destination + coding->dst_bytes;
6168 coding_set_source (coding);
6169 src = coding->source + offset;
6170 src_end = coding->source + coding->src_bytes;
6171 if (EQ (coding->src_object, coding->dst_object))
6172 dst_end = (unsigned char *) src;
6173 }
6174 }
6175 EMIT_ONE_BYTE (c);
6176 }
6177 }
6178 else
6179 {
6180 if (!EQ (coding->src_object, coding->dst_object))
6181 {
6182 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6183
6184 if (require > 0)
6185 {
6186 EMACS_INT offset = src - coding->source;
6187
6188 dst = alloc_destination (coding, require, dst);
6189 coding_set_source (coding);
6190 src = coding->source + offset;
6191 src_end = coding->source + coding->src_bytes;
6192 }
6193 }
6194 produced_chars = coding->consumed_char;
6195 while (src < src_end)
6196 *dst++ = *src++;
6197 }
6198 }
6199
6200 produced = dst - (coding->destination + coding->produced);
6201 if (BUFFERP (coding->dst_object) && produced_chars > 0)
6202 insert_from_gap (produced_chars, produced);
6203 coding->produced += produced;
6204 coding->produced_char += produced_chars;
6205 return carryover;
6206 }
6207
6208 /* Compose text in CODING->object according to the annotation data at
6209 CHARBUF. CHARBUF is an array:
6210 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6211 */
6212
6213 static INLINE void
6214 produce_composition (coding, charbuf, pos)
6215 struct coding_system *coding;
6216 int *charbuf;
6217 EMACS_INT pos;
6218 {
6219 int len;
6220 EMACS_INT to;
6221 enum composition_method method;
6222 Lisp_Object components;
6223
6224 len = -charbuf[0];
6225 to = pos + charbuf[2];
6226 if (to <= pos)
6227 return;
6228 method = (enum composition_method) (charbuf[3]);
6229
6230 if (method == COMPOSITION_RELATIVE)
6231 components = Qnil;
6232 else if (method >= COMPOSITION_WITH_RULE
6233 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6234 {
6235 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6236 int i;
6237
6238 len -= 4;
6239 charbuf += 4;
6240 for (i = 0; i < len; i++)
6241 {
6242 args[i] = make_number (charbuf[i]);
6243 if (charbuf[i] < 0)
6244 return;
6245 }
6246 components = (method == COMPOSITION_WITH_ALTCHARS
6247 ? Fstring (len, args) : Fvector (len, args));
6248 }
6249 else
6250 return;
6251 compose_text (pos, to, components, Qnil, coding->dst_object);
6252 }
6253
6254
6255 /* Put `charset' property on text in CODING->object according to
6256 the annotation data at CHARBUF. CHARBUF is an array:
6257 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6258 */
6259
6260 static INLINE void
6261 produce_charset (coding, charbuf, pos)
6262 struct coding_system *coding;
6263 int *charbuf;
6264 EMACS_INT pos;
6265 {
6266 EMACS_INT from = pos - charbuf[2];
6267 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6268
6269 Fput_text_property (make_number (from), make_number (pos),
6270 Qcharset, CHARSET_NAME (charset),
6271 coding->dst_object);
6272 }
6273
6274
6275 #define CHARBUF_SIZE 0x4000
6276
6277 #define ALLOC_CONVERSION_WORK_AREA(coding) \
6278 do { \
6279 int size = CHARBUF_SIZE;; \
6280 \
6281 coding->charbuf = NULL; \
6282 while (size > 1024) \
6283 { \
6284 coding->charbuf = (int *) alloca (sizeof (int) * size); \
6285 if (coding->charbuf) \
6286 break; \
6287 size >>= 1; \
6288 } \
6289 if (! coding->charbuf) \
6290 { \
6291 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6292 return coding->result; \
6293 } \
6294 coding->charbuf_size = size; \
6295 } while (0)
6296
6297
6298 static void
6299 produce_annotation (coding, pos)
6300 struct coding_system *coding;
6301 EMACS_INT pos;
6302 {
6303 int *charbuf = coding->charbuf;
6304 int *charbuf_end = charbuf + coding->charbuf_used;
6305
6306 if (NILP (coding->dst_object))
6307 return;
6308
6309 while (charbuf < charbuf_end)
6310 {
6311 if (*charbuf >= 0)
6312 pos += *charbuf++;
6313 else
6314 {
6315 int len = -*charbuf;
6316 switch (charbuf[1])
6317 {
6318 case CODING_ANNOTATE_COMPOSITION_MASK:
6319 produce_composition (coding, charbuf, pos);
6320 break;
6321 case CODING_ANNOTATE_CHARSET_MASK:
6322 produce_charset (coding, charbuf, pos);
6323 break;
6324 default:
6325 abort ();
6326 }
6327 charbuf += len;
6328 }
6329 }
6330 }
6331
6332 /* Decode the data at CODING->src_object into CODING->dst_object.
6333 CODING->src_object is a buffer, a string, or nil.
6334 CODING->dst_object is a buffer.
6335
6336 If CODING->src_object is a buffer, it must be the current buffer.
6337 In this case, if CODING->src_pos is positive, it is a position of
6338 the source text in the buffer, otherwise, the source text is in the
6339 gap area of the buffer, and CODING->src_pos specifies the offset of
6340 the text from GPT (which must be the same as PT). If this is the
6341 same buffer as CODING->dst_object, CODING->src_pos must be
6342 negative.
6343
6344 If CODING->src_object is a string, CODING->src_pos is an index to
6345 that string.
6346
6347 If CODING->src_object is nil, CODING->source must already point to
6348 the non-relocatable memory area. In this case, CODING->src_pos is
6349 an offset from CODING->source.
6350
6351 The decoded data is inserted at the current point of the buffer
6352 CODING->dst_object.
6353 */
6354
6355 static int
6356 decode_coding (coding)
6357 struct coding_system *coding;
6358 {
6359 Lisp_Object attrs;
6360 Lisp_Object undo_list;
6361 Lisp_Object translation_table;
6362 int carryover;
6363 int i;
6364
6365 if (BUFFERP (coding->src_object)
6366 && coding->src_pos > 0
6367 && coding->src_pos < GPT
6368 && coding->src_pos + coding->src_chars > GPT)
6369 move_gap_both (coding->src_pos, coding->src_pos_byte);
6370
6371 undo_list = Qt;
6372 if (BUFFERP (coding->dst_object))
6373 {
6374 if (current_buffer != XBUFFER (coding->dst_object))
6375 set_buffer_internal (XBUFFER (coding->dst_object));
6376 if (GPT != PT)
6377 move_gap_both (PT, PT_BYTE);
6378 undo_list = current_buffer->undo_list;
6379 current_buffer->undo_list = Qt;
6380 }
6381
6382 coding->consumed = coding->consumed_char = 0;
6383 coding->produced = coding->produced_char = 0;
6384 coding->chars_at_source = 0;
6385 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6386 coding->errors = 0;
6387
6388 ALLOC_CONVERSION_WORK_AREA (coding);
6389
6390 attrs = CODING_ID_ATTRS (coding->id);
6391 translation_table = get_translation_table (attrs, 0, NULL);
6392
6393 carryover = 0;
6394 do
6395 {
6396 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6397
6398 coding_set_source (coding);
6399 coding->annotated = 0;
6400 coding->charbuf_used = carryover;
6401 (*(coding->decoder)) (coding);
6402 coding_set_destination (coding);
6403 carryover = produce_chars (coding, translation_table, 0);
6404 if (coding->annotated)
6405 produce_annotation (coding, pos);
6406 for (i = 0; i < carryover; i++)
6407 coding->charbuf[i]
6408 = coding->charbuf[coding->charbuf_used - carryover + i];
6409 }
6410 while (coding->consumed < coding->src_bytes
6411 && (coding->result == CODING_RESULT_SUCCESS
6412 || coding->result == CODING_RESULT_INVALID_SRC));
6413
6414 if (carryover > 0)
6415 {
6416 coding_set_destination (coding);
6417 coding->charbuf_used = carryover;
6418 produce_chars (coding, translation_table, 1);
6419 }
6420
6421 coding->carryover_bytes = 0;
6422 if (coding->consumed < coding->src_bytes)
6423 {
6424 int nbytes = coding->src_bytes - coding->consumed;
6425 const unsigned char *src;
6426
6427 coding_set_source (coding);
6428 coding_set_destination (coding);
6429 src = coding->source + coding->consumed;
6430
6431 if (coding->mode & CODING_MODE_LAST_BLOCK)
6432 {
6433 /* Flush out unprocessed data as binary chars. We are sure
6434 that the number of data is less than the size of
6435 coding->charbuf. */
6436 coding->charbuf_used = 0;
6437 while (nbytes-- > 0)
6438 {
6439 int c = *src++;
6440
6441 if (c & 0x80)
6442 c = BYTE8_TO_CHAR (c);
6443 coding->charbuf[coding->charbuf_used++] = c;
6444 }
6445 produce_chars (coding, Qnil, 1);
6446 }
6447 else
6448 {
6449 /* Record unprocessed bytes in coding->carryover. We are
6450 sure that the number of data is less than the size of
6451 coding->carryover. */
6452 unsigned char *p = coding->carryover;
6453
6454 coding->carryover_bytes = nbytes;
6455 while (nbytes-- > 0)
6456 *p++ = *src++;
6457 }
6458 coding->consumed = coding->src_bytes;
6459 }
6460
6461 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6462 decode_eol (coding);
6463 if (BUFFERP (coding->dst_object))
6464 {
6465 current_buffer->undo_list = undo_list;
6466 record_insert (coding->dst_pos, coding->produced_char);
6467 }
6468 return coding->result;
6469 }
6470
6471
6472 /* Extract an annotation datum from a composition starting at POS and
6473 ending before LIMIT of CODING->src_object (buffer or string), store
6474 the data in BUF, set *STOP to a starting position of the next
6475 composition (if any) or to LIMIT, and return the address of the
6476 next element of BUF.
6477
6478 If such an annotation is not found, set *STOP to a starting
6479 position of a composition after POS (if any) or to LIMIT, and
6480 return BUF. */
6481
6482 static INLINE int *
6483 handle_composition_annotation (pos, limit, coding, buf, stop)
6484 EMACS_INT pos, limit;
6485 struct coding_system *coding;
6486 int *buf;
6487 EMACS_INT *stop;
6488 {
6489 EMACS_INT start, end;
6490 Lisp_Object prop;
6491
6492 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6493 || end > limit)
6494 *stop = limit;
6495 else if (start > pos)
6496 *stop = start;
6497 else
6498 {
6499 if (start == pos)
6500 {
6501 /* We found a composition. Store the corresponding
6502 annotation data in BUF. */
6503 int *head = buf;
6504 enum composition_method method = COMPOSITION_METHOD (prop);
6505 int nchars = COMPOSITION_LENGTH (prop);
6506
6507 ADD_COMPOSITION_DATA (buf, nchars, method);
6508 if (method != COMPOSITION_RELATIVE)
6509 {
6510 Lisp_Object components;
6511 int len, i, i_byte;
6512
6513 components = COMPOSITION_COMPONENTS (prop);
6514 if (VECTORP (components))
6515 {
6516 len = XVECTOR (components)->size;
6517 for (i = 0; i < len; i++)
6518 *buf++ = XINT (AREF (components, i));
6519 }
6520 else if (STRINGP (components))
6521 {
6522 len = SCHARS (components);
6523 i = i_byte = 0;
6524 while (i < len)
6525 {
6526 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6527 buf++;
6528 }
6529 }
6530 else if (INTEGERP (components))
6531 {
6532 len = 1;
6533 *buf++ = XINT (components);
6534 }
6535 else if (CONSP (components))
6536 {
6537 for (len = 0; CONSP (components);
6538 len++, components = XCDR (components))
6539 *buf++ = XINT (XCAR (components));
6540 }
6541 else
6542 abort ();
6543 *head -= len;
6544 }
6545 }
6546
6547 if (find_composition (end, limit, &start, &end, &prop,
6548 coding->src_object)
6549 && end <= limit)
6550 *stop = start;
6551 else
6552 *stop = limit;
6553 }
6554 return buf;
6555 }
6556
6557
6558 /* Extract an annotation datum from a text property `charset' at POS of
6559 CODING->src_object (buffer of string), store the data in BUF, set
6560 *STOP to the position where the value of `charset' property changes
6561 (limiting by LIMIT), and return the address of the next element of
6562 BUF.
6563
6564 If the property value is nil, set *STOP to the position where the
6565 property value is non-nil (limiting by LIMIT), and return BUF. */
6566
6567 static INLINE int *
6568 handle_charset_annotation (pos, limit, coding, buf, stop)
6569 EMACS_INT pos, limit;
6570 struct coding_system *coding;
6571 int *buf;
6572 EMACS_INT *stop;
6573 {
6574 Lisp_Object val, next;
6575 int id;
6576
6577 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6578 if (! NILP (val) && CHARSETP (val))
6579 id = XINT (CHARSET_SYMBOL_ID (val));
6580 else
6581 id = -1;
6582 ADD_CHARSET_DATA (buf, 0, id);
6583 next = Fnext_single_property_change (make_number (pos), Qcharset,
6584 coding->src_object,
6585 make_number (limit));
6586 *stop = XINT (next);
6587 return buf;
6588 }
6589
6590
6591 static void
6592 consume_chars (coding, translation_table, max_lookup)
6593 struct coding_system *coding;
6594 Lisp_Object translation_table;
6595 int max_lookup;
6596 {
6597 int *buf = coding->charbuf;
6598 int *buf_end = coding->charbuf + coding->charbuf_size;
6599 const unsigned char *src = coding->source + coding->consumed;
6600 const unsigned char *src_end = coding->source + coding->src_bytes;
6601 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6602 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6603 int multibytep = coding->src_multibyte;
6604 Lisp_Object eol_type;
6605 int c;
6606 EMACS_INT stop, stop_composition, stop_charset;
6607 int *lookup_buf = NULL;
6608
6609 if (! NILP (translation_table))
6610 lookup_buf = alloca (sizeof (int) * max_lookup);
6611
6612 eol_type = CODING_ID_EOL_TYPE (coding->id);
6613 if (VECTORP (eol_type))
6614 eol_type = Qunix;
6615
6616 /* Note: composition handling is not yet implemented. */
6617 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6618
6619 if (NILP (coding->src_object))
6620 stop = stop_composition = stop_charset = end_pos;
6621 else
6622 {
6623 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6624 stop = stop_composition = pos;
6625 else
6626 stop = stop_composition = end_pos;
6627 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6628 stop = stop_charset = pos;
6629 else
6630 stop_charset = end_pos;
6631 }
6632
6633 /* Compensate for CRLF and conversion. */
6634 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6635 while (buf < buf_end)
6636 {
6637 Lisp_Object trans;
6638
6639 if (pos == stop)
6640 {
6641 if (pos == end_pos)
6642 break;
6643 if (pos == stop_composition)
6644 buf = handle_composition_annotation (pos, end_pos, coding,
6645 buf, &stop_composition);
6646 if (pos == stop_charset)
6647 buf = handle_charset_annotation (pos, end_pos, coding,
6648 buf, &stop_charset);
6649 stop = (stop_composition < stop_charset
6650 ? stop_composition : stop_charset);
6651 }
6652
6653 if (! multibytep)
6654 {
6655 EMACS_INT bytes;
6656
6657 if (coding->encoder == encode_coding_raw_text)
6658 c = *src++, pos++;
6659 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6660 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
6661 else
6662 c = BYTE8_TO_CHAR (*src), src++, pos++;
6663 }
6664 else
6665 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
6666 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6667 c = '\n';
6668 if (! EQ (eol_type, Qunix))
6669 {
6670 if (c == '\n')
6671 {
6672 if (EQ (eol_type, Qdos))
6673 *buf++ = '\r';
6674 else
6675 c = '\r';
6676 }
6677 }
6678
6679 trans = Qnil;
6680 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6681 if (NILP (trans))
6682 *buf++ = c;
6683 else
6684 {
6685 int from_nchars = 1, to_nchars = 1;
6686 int *lookup_buf_end;
6687 const unsigned char *p = src;
6688 int i;
6689
6690 lookup_buf[0] = c;
6691 for (i = 1; i < max_lookup && p < src_end; i++)
6692 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6693 lookup_buf_end = lookup_buf + i;
6694 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6695 &from_nchars, &to_nchars);
6696 if (EQ (trans, Qt)
6697 || buf + to_nchars > buf_end)
6698 break;
6699 *buf++ = *lookup_buf;
6700 for (i = 1; i < to_nchars; i++)
6701 *buf++ = XINT (AREF (trans, i));
6702 for (i = 1; i < from_nchars; i++, pos++)
6703 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6704 }
6705 }
6706
6707 coding->consumed = src - coding->source;
6708 coding->consumed_char = pos - coding->src_pos;
6709 coding->charbuf_used = buf - coding->charbuf;
6710 coding->chars_at_source = 0;
6711 }
6712
6713
6714 /* Encode the text at CODING->src_object into CODING->dst_object.
6715 CODING->src_object is a buffer or a string.
6716 CODING->dst_object is a buffer or nil.
6717
6718 If CODING->src_object is a buffer, it must be the current buffer.
6719 In this case, if CODING->src_pos is positive, it is a position of
6720 the source text in the buffer, otherwise. the source text is in the
6721 gap area of the buffer, and coding->src_pos specifies the offset of
6722 the text from GPT (which must be the same as PT). If this is the
6723 same buffer as CODING->dst_object, CODING->src_pos must be
6724 negative and CODING should not have `pre-write-conversion'.
6725
6726 If CODING->src_object is a string, CODING should not have
6727 `pre-write-conversion'.
6728
6729 If CODING->dst_object is a buffer, the encoded data is inserted at
6730 the current point of that buffer.
6731
6732 If CODING->dst_object is nil, the encoded data is placed at the
6733 memory area specified by CODING->destination. */
6734
6735 static int
6736 encode_coding (coding)
6737 struct coding_system *coding;
6738 {
6739 Lisp_Object attrs;
6740 Lisp_Object translation_table;
6741 int max_lookup;
6742
6743 attrs = CODING_ID_ATTRS (coding->id);
6744 if (coding->encoder == encode_coding_raw_text)
6745 translation_table = Qnil, max_lookup = 0;
6746 else
6747 translation_table = get_translation_table (attrs, 1, &max_lookup);
6748
6749 if (BUFFERP (coding->dst_object))
6750 {
6751 set_buffer_internal (XBUFFER (coding->dst_object));
6752 coding->dst_multibyte
6753 = ! NILP (current_buffer->enable_multibyte_characters);
6754 }
6755
6756 coding->consumed = coding->consumed_char = 0;
6757 coding->produced = coding->produced_char = 0;
6758 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6759 coding->errors = 0;
6760
6761 ALLOC_CONVERSION_WORK_AREA (coding);
6762
6763 do {
6764 coding_set_source (coding);
6765 consume_chars (coding, translation_table, max_lookup);
6766 coding_set_destination (coding);
6767 (*(coding->encoder)) (coding);
6768 } while (coding->consumed_char < coding->src_chars);
6769
6770 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
6771 insert_from_gap (coding->produced_char, coding->produced);
6772
6773 return (coding->result);
6774 }
6775
6776
6777 /* Name (or base name) of work buffer for code conversion. */
6778 static Lisp_Object Vcode_conversion_workbuf_name;
6779
6780 /* A working buffer used by the top level conversion. Once it is
6781 created, it is never destroyed. It has the name
6782 Vcode_conversion_workbuf_name. The other working buffers are
6783 destroyed after the use is finished, and their names are modified
6784 versions of Vcode_conversion_workbuf_name. */
6785 static Lisp_Object Vcode_conversion_reused_workbuf;
6786
6787 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6788 static int reused_workbuf_in_use;
6789
6790
6791 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6792 multibyteness of returning buffer. */
6793
6794 static Lisp_Object
6795 make_conversion_work_buffer (multibyte)
6796 int multibyte;
6797 {
6798 Lisp_Object name, workbuf;
6799 struct buffer *current;
6800
6801 if (reused_workbuf_in_use++)
6802 {
6803 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6804 workbuf = Fget_buffer_create (name);
6805 }
6806 else
6807 {
6808 name = Vcode_conversion_workbuf_name;
6809 workbuf = Fget_buffer_create (name);
6810 if (NILP (Vcode_conversion_reused_workbuf))
6811 Vcode_conversion_reused_workbuf = workbuf;
6812 }
6813 current = current_buffer;
6814 set_buffer_internal (XBUFFER (workbuf));
6815 Ferase_buffer ();
6816 current_buffer->undo_list = Qt;
6817 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6818 set_buffer_internal (current);
6819 return workbuf;
6820 }
6821
6822
6823 static Lisp_Object
6824 code_conversion_restore (arg)
6825 Lisp_Object arg;
6826 {
6827 Lisp_Object current, workbuf;
6828 struct gcpro gcpro1;
6829
6830 GCPRO1 (arg);
6831 current = XCAR (arg);
6832 workbuf = XCDR (arg);
6833 if (! NILP (workbuf))
6834 {
6835 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6836 reused_workbuf_in_use = 0;
6837 else if (! NILP (Fbuffer_live_p (workbuf)))
6838 Fkill_buffer (workbuf);
6839 }
6840 set_buffer_internal (XBUFFER (current));
6841 UNGCPRO;
6842 return Qnil;
6843 }
6844
6845 Lisp_Object
6846 code_conversion_save (with_work_buf, multibyte)
6847 int with_work_buf, multibyte;
6848 {
6849 Lisp_Object workbuf = Qnil;
6850
6851 if (with_work_buf)
6852 workbuf = make_conversion_work_buffer (multibyte);
6853 record_unwind_protect (code_conversion_restore,
6854 Fcons (Fcurrent_buffer (), workbuf));
6855 return workbuf;
6856 }
6857
6858 int
6859 decode_coding_gap (coding, chars, bytes)
6860 struct coding_system *coding;
6861 EMACS_INT chars, bytes;
6862 {
6863 int count = specpdl_ptr - specpdl;
6864 Lisp_Object attrs;
6865
6866 code_conversion_save (0, 0);
6867
6868 coding->src_object = Fcurrent_buffer ();
6869 coding->src_chars = chars;
6870 coding->src_bytes = bytes;
6871 coding->src_pos = -chars;
6872 coding->src_pos_byte = -bytes;
6873 coding->src_multibyte = chars < bytes;
6874 coding->dst_object = coding->src_object;
6875 coding->dst_pos = PT;
6876 coding->dst_pos_byte = PT_BYTE;
6877 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6878
6879 if (CODING_REQUIRE_DETECTION (coding))
6880 detect_coding (coding);
6881
6882 coding->mode |= CODING_MODE_LAST_BLOCK;
6883 current_buffer->text->inhibit_shrinking = 1;
6884 decode_coding (coding);
6885 current_buffer->text->inhibit_shrinking = 0;
6886
6887 attrs = CODING_ID_ATTRS (coding->id);
6888 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6889 {
6890 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6891 Lisp_Object val;
6892
6893 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6894 val = call1 (CODING_ATTR_POST_READ (attrs),
6895 make_number (coding->produced_char));
6896 CHECK_NATNUM (val);
6897 coding->produced_char += Z - prev_Z;
6898 coding->produced += Z_BYTE - prev_Z_BYTE;
6899 }
6900
6901 unbind_to (count, Qnil);
6902 return coding->result;
6903 }
6904
6905 int
6906 encode_coding_gap (coding, chars, bytes)
6907 struct coding_system *coding;
6908 EMACS_INT chars, bytes;
6909 {
6910 int count = specpdl_ptr - specpdl;
6911
6912 code_conversion_save (0, 0);
6913
6914 coding->src_object = Fcurrent_buffer ();
6915 coding->src_chars = chars;
6916 coding->src_bytes = bytes;
6917 coding->src_pos = -chars;
6918 coding->src_pos_byte = -bytes;
6919 coding->src_multibyte = chars < bytes;
6920 coding->dst_object = coding->src_object;
6921 coding->dst_pos = PT;
6922 coding->dst_pos_byte = PT_BYTE;
6923
6924 encode_coding (coding);
6925
6926 unbind_to (count, Qnil);
6927 return coding->result;
6928 }
6929
6930
6931 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6932 SRC_OBJECT into DST_OBJECT by coding context CODING.
6933
6934 SRC_OBJECT is a buffer, a string, or Qnil.
6935
6936 If it is a buffer, the text is at point of the buffer. FROM and TO
6937 are positions in the buffer.
6938
6939 If it is a string, the text is at the beginning of the string.
6940 FROM and TO are indices to the string.
6941
6942 If it is nil, the text is at coding->source. FROM and TO are
6943 indices to coding->source.
6944
6945 DST_OBJECT is a buffer, Qt, or Qnil.
6946
6947 If it is a buffer, the decoded text is inserted at point of the
6948 buffer. If the buffer is the same as SRC_OBJECT, the source text
6949 is deleted.
6950
6951 If it is Qt, a string is made from the decoded text, and
6952 set in CODING->dst_object.
6953
6954 If it is Qnil, the decoded text is stored at CODING->destination.
6955 The caller must allocate CODING->dst_bytes bytes at
6956 CODING->destination by xmalloc. If the decoded text is longer than
6957 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6958 */
6959
6960 void
6961 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6962 dst_object)
6963 struct coding_system *coding;
6964 Lisp_Object src_object;
6965 EMACS_INT from, from_byte, to, to_byte;
6966 Lisp_Object dst_object;
6967 {
6968 int count = specpdl_ptr - specpdl;
6969 unsigned char *destination;
6970 EMACS_INT dst_bytes;
6971 EMACS_INT chars = to - from;
6972 EMACS_INT bytes = to_byte - from_byte;
6973 Lisp_Object attrs;
6974 int saved_pt = -1, saved_pt_byte;
6975 int need_marker_adjustment = 0;
6976 Lisp_Object old_deactivate_mark;
6977
6978 old_deactivate_mark = Vdeactivate_mark;
6979
6980 if (NILP (dst_object))
6981 {
6982 destination = coding->destination;
6983 dst_bytes = coding->dst_bytes;
6984 }
6985
6986 coding->src_object = src_object;
6987 coding->src_chars = chars;
6988 coding->src_bytes = bytes;
6989 coding->src_multibyte = chars < bytes;
6990
6991 if (STRINGP (src_object))
6992 {
6993 coding->src_pos = from;
6994 coding->src_pos_byte = from_byte;
6995 }
6996 else if (BUFFERP (src_object))
6997 {
6998 set_buffer_internal (XBUFFER (src_object));
6999 if (from != GPT)
7000 move_gap_both (from, from_byte);
7001 if (EQ (src_object, dst_object))
7002 {
7003 struct Lisp_Marker *tail;
7004
7005 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7006 {
7007 tail->need_adjustment
7008 = tail->charpos == (tail->insertion_type ? from : to);
7009 need_marker_adjustment |= tail->need_adjustment;
7010 }
7011 saved_pt = PT, saved_pt_byte = PT_BYTE;
7012 TEMP_SET_PT_BOTH (from, from_byte);
7013 current_buffer->text->inhibit_shrinking = 1;
7014 del_range_both (from, from_byte, to, to_byte, 1);
7015 coding->src_pos = -chars;
7016 coding->src_pos_byte = -bytes;
7017 }
7018 else
7019 {
7020 coding->src_pos = from;
7021 coding->src_pos_byte = from_byte;
7022 }
7023 }
7024
7025 if (CODING_REQUIRE_DETECTION (coding))
7026 detect_coding (coding);
7027 attrs = CODING_ID_ATTRS (coding->id);
7028
7029 if (EQ (dst_object, Qt)
7030 || (! NILP (CODING_ATTR_POST_READ (attrs))
7031 && NILP (dst_object)))
7032 {
7033 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7034 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7035 coding->dst_pos = BEG;
7036 coding->dst_pos_byte = BEG_BYTE;
7037 }
7038 else if (BUFFERP (dst_object))
7039 {
7040 code_conversion_save (0, 0);
7041 coding->dst_object = dst_object;
7042 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7043 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7044 coding->dst_multibyte
7045 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7046 }
7047 else
7048 {
7049 code_conversion_save (0, 0);
7050 coding->dst_object = Qnil;
7051 /* Most callers presume this will return a multibyte result, and they
7052 won't use `binary' or `raw-text' anyway, so let's not worry about
7053 CODING_FOR_UNIBYTE. */
7054 coding->dst_multibyte = Qt;
7055 }
7056
7057 decode_coding (coding);
7058
7059 if (BUFFERP (coding->dst_object))
7060 set_buffer_internal (XBUFFER (coding->dst_object));
7061
7062 if (! NILP (CODING_ATTR_POST_READ (attrs)))
7063 {
7064 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7065 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7066 Lisp_Object val;
7067
7068 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7069 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7070 old_deactivate_mark);
7071 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7072 make_number (coding->produced_char));
7073 UNGCPRO;
7074 CHECK_NATNUM (val);
7075 coding->produced_char += Z - prev_Z;
7076 coding->produced += Z_BYTE - prev_Z_BYTE;
7077 }
7078
7079 if (EQ (dst_object, Qt))
7080 {
7081 coding->dst_object = Fbuffer_string ();
7082 }
7083 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7084 {
7085 set_buffer_internal (XBUFFER (coding->dst_object));
7086 if (dst_bytes < coding->produced)
7087 {
7088 destination = xrealloc (destination, coding->produced);
7089 if (! destination)
7090 {
7091 record_conversion_result (coding,
7092 CODING_RESULT_INSUFFICIENT_DST);
7093 unbind_to (count, Qnil);
7094 return;
7095 }
7096 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7097 move_gap_both (BEGV, BEGV_BYTE);
7098 bcopy (BEGV_ADDR, destination, coding->produced);
7099 coding->destination = destination;
7100 }
7101 }
7102
7103 if (saved_pt >= 0)
7104 {
7105 /* This is the case of:
7106 (BUFFERP (src_object) && EQ (src_object, dst_object))
7107 As we have moved PT while replacing the original buffer
7108 contents, we must recover it now. */
7109 set_buffer_internal (XBUFFER (src_object));
7110 current_buffer->text->inhibit_shrinking = 0;
7111 if (saved_pt < from)
7112 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7113 else if (saved_pt < from + chars)
7114 TEMP_SET_PT_BOTH (from, from_byte);
7115 else if (! NILP (current_buffer->enable_multibyte_characters))
7116 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7117 saved_pt_byte + (coding->produced - bytes));
7118 else
7119 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7120 saved_pt_byte + (coding->produced - bytes));
7121
7122 if (need_marker_adjustment)
7123 {
7124 struct Lisp_Marker *tail;
7125
7126 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7127 if (tail->need_adjustment)
7128 {
7129 tail->need_adjustment = 0;
7130 if (tail->insertion_type)
7131 {
7132 tail->bytepos = from_byte;
7133 tail->charpos = from;
7134 }
7135 else
7136 {
7137 tail->bytepos = from_byte + coding->produced;
7138 tail->charpos
7139 = (NILP (current_buffer->enable_multibyte_characters)
7140 ? tail->bytepos : from + coding->produced_char);
7141 }
7142 }
7143 }
7144 }
7145
7146 Vdeactivate_mark = old_deactivate_mark;
7147 unbind_to (count, coding->dst_object);
7148 }
7149
7150
7151 void
7152 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7153 dst_object)
7154 struct coding_system *coding;
7155 Lisp_Object src_object;
7156 EMACS_INT from, from_byte, to, to_byte;
7157 Lisp_Object dst_object;
7158 {
7159 int count = specpdl_ptr - specpdl;
7160 EMACS_INT chars = to - from;
7161 EMACS_INT bytes = to_byte - from_byte;
7162 Lisp_Object attrs;
7163 int saved_pt = -1, saved_pt_byte;
7164 int need_marker_adjustment = 0;
7165 int kill_src_buffer = 0;
7166 Lisp_Object old_deactivate_mark;
7167
7168 old_deactivate_mark = Vdeactivate_mark;
7169
7170 coding->src_object = src_object;
7171 coding->src_chars = chars;
7172 coding->src_bytes = bytes;
7173 coding->src_multibyte = chars < bytes;
7174
7175 attrs = CODING_ID_ATTRS (coding->id);
7176
7177 if (EQ (src_object, dst_object))
7178 {
7179 struct Lisp_Marker *tail;
7180
7181 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7182 {
7183 tail->need_adjustment
7184 = tail->charpos == (tail->insertion_type ? from : to);
7185 need_marker_adjustment |= tail->need_adjustment;
7186 }
7187 }
7188
7189 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7190 {
7191 coding->src_object = code_conversion_save (1, coding->src_multibyte);
7192 set_buffer_internal (XBUFFER (coding->src_object));
7193 if (STRINGP (src_object))
7194 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7195 else if (BUFFERP (src_object))
7196 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7197 else
7198 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7199
7200 if (EQ (src_object, dst_object))
7201 {
7202 set_buffer_internal (XBUFFER (src_object));
7203 saved_pt = PT, saved_pt_byte = PT_BYTE;
7204 del_range_both (from, from_byte, to, to_byte, 1);
7205 set_buffer_internal (XBUFFER (coding->src_object));
7206 }
7207
7208 {
7209 Lisp_Object args[3];
7210 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7211
7212 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7213 old_deactivate_mark);
7214 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7215 args[1] = make_number (BEG);
7216 args[2] = make_number (Z);
7217 safe_call (3, args);
7218 UNGCPRO;
7219 }
7220 if (XBUFFER (coding->src_object) != current_buffer)
7221 kill_src_buffer = 1;
7222 coding->src_object = Fcurrent_buffer ();
7223 if (BEG != GPT)
7224 move_gap_both (BEG, BEG_BYTE);
7225 coding->src_chars = Z - BEG;
7226 coding->src_bytes = Z_BYTE - BEG_BYTE;
7227 coding->src_pos = BEG;
7228 coding->src_pos_byte = BEG_BYTE;
7229 coding->src_multibyte = Z < Z_BYTE;
7230 }
7231 else if (STRINGP (src_object))
7232 {
7233 code_conversion_save (0, 0);
7234 coding->src_pos = from;
7235 coding->src_pos_byte = from_byte;
7236 }
7237 else if (BUFFERP (src_object))
7238 {
7239 code_conversion_save (0, 0);
7240 set_buffer_internal (XBUFFER (src_object));
7241 if (EQ (src_object, dst_object))
7242 {
7243 saved_pt = PT, saved_pt_byte = PT_BYTE;
7244 coding->src_object = del_range_1 (from, to, 1, 1);
7245 coding->src_pos = 0;
7246 coding->src_pos_byte = 0;
7247 }
7248 else
7249 {
7250 if (from < GPT && to >= GPT)
7251 move_gap_both (from, from_byte);
7252 coding->src_pos = from;
7253 coding->src_pos_byte = from_byte;
7254 }
7255 }
7256 else
7257 code_conversion_save (0, 0);
7258
7259 if (BUFFERP (dst_object))
7260 {
7261 coding->dst_object = dst_object;
7262 if (EQ (src_object, dst_object))
7263 {
7264 coding->dst_pos = from;
7265 coding->dst_pos_byte = from_byte;
7266 }
7267 else
7268 {
7269 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7270 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7271 }
7272 coding->dst_multibyte
7273 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7274 }
7275 else if (EQ (dst_object, Qt))
7276 {
7277 coding->dst_object = Qnil;
7278 coding->dst_bytes = coding->src_chars;
7279 if (coding->dst_bytes == 0)
7280 coding->dst_bytes = 1;
7281 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7282 coding->dst_multibyte = 0;
7283 }
7284 else
7285 {
7286 coding->dst_object = Qnil;
7287 coding->dst_multibyte = 0;
7288 }
7289
7290 encode_coding (coding);
7291
7292 if (EQ (dst_object, Qt))
7293 {
7294 if (BUFFERP (coding->dst_object))
7295 coding->dst_object = Fbuffer_string ();
7296 else
7297 {
7298 coding->dst_object
7299 = make_unibyte_string ((char *) coding->destination,
7300 coding->produced);
7301 xfree (coding->destination);
7302 }
7303 }
7304
7305 if (saved_pt >= 0)
7306 {
7307 /* This is the case of:
7308 (BUFFERP (src_object) && EQ (src_object, dst_object))
7309 As we have moved PT while replacing the original buffer
7310 contents, we must recover it now. */
7311 set_buffer_internal (XBUFFER (src_object));
7312 if (saved_pt < from)
7313 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7314 else if (saved_pt < from + chars)
7315 TEMP_SET_PT_BOTH (from, from_byte);
7316 else if (! NILP (current_buffer->enable_multibyte_characters))
7317 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7318 saved_pt_byte + (coding->produced - bytes));
7319 else
7320 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7321 saved_pt_byte + (coding->produced - bytes));
7322
7323 if (need_marker_adjustment)
7324 {
7325 struct Lisp_Marker *tail;
7326
7327 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7328 if (tail->need_adjustment)
7329 {
7330 tail->need_adjustment = 0;
7331 if (tail->insertion_type)
7332 {
7333 tail->bytepos = from_byte;
7334 tail->charpos = from;
7335 }
7336 else
7337 {
7338 tail->bytepos = from_byte + coding->produced;
7339 tail->charpos
7340 = (NILP (current_buffer->enable_multibyte_characters)
7341 ? tail->bytepos : from + coding->produced_char);
7342 }
7343 }
7344 }
7345 }
7346
7347 if (kill_src_buffer)
7348 Fkill_buffer (coding->src_object);
7349
7350 Vdeactivate_mark = old_deactivate_mark;
7351 unbind_to (count, Qnil);
7352 }
7353
7354
7355 Lisp_Object
7356 preferred_coding_system ()
7357 {
7358 int id = coding_categories[coding_priorities[0]].id;
7359
7360 return CODING_ID_NAME (id);
7361 }
7362
7363 \f
7364 #ifdef emacs
7365 /*** 8. Emacs Lisp library functions ***/
7366
7367 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7368 doc: /* Return t if OBJECT is nil or a coding-system.
7369 See the documentation of `define-coding-system' for information
7370 about coding-system objects. */)
7371 (obj)
7372 Lisp_Object obj;
7373 {
7374 if (NILP (obj)
7375 || CODING_SYSTEM_ID (obj) >= 0)
7376 return Qt;
7377 if (! SYMBOLP (obj)
7378 || NILP (Fget (obj, Qcoding_system_define_form)))
7379 return Qnil;
7380 return Qt;
7381 }
7382
7383 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7384 Sread_non_nil_coding_system, 1, 1, 0,
7385 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7386 (prompt)
7387 Lisp_Object prompt;
7388 {
7389 Lisp_Object val;
7390 do
7391 {
7392 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7393 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7394 }
7395 while (SCHARS (val) == 0);
7396 return (Fintern (val, Qnil));
7397 }
7398
7399 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7400 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7401 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7402 Ignores case when completing coding systems (all Emacs coding systems
7403 are lower-case). */)
7404 (prompt, default_coding_system)
7405 Lisp_Object prompt, default_coding_system;
7406 {
7407 Lisp_Object val;
7408 int count = SPECPDL_INDEX ();
7409
7410 if (SYMBOLP (default_coding_system))
7411 default_coding_system = SYMBOL_NAME (default_coding_system);
7412 specbind (Qcompletion_ignore_case, Qt);
7413 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7414 Qt, Qnil, Qcoding_system_history,
7415 default_coding_system, Qnil);
7416 unbind_to (count, Qnil);
7417 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7418 }
7419
7420 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7421 1, 1, 0,
7422 doc: /* Check validity of CODING-SYSTEM.
7423 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7424 It is valid if it is nil or a symbol defined as a coding system by the
7425 function `define-coding-system'. */)
7426 (coding_system)
7427 Lisp_Object coding_system;
7428 {
7429 Lisp_Object define_form;
7430
7431 define_form = Fget (coding_system, Qcoding_system_define_form);
7432 if (! NILP (define_form))
7433 {
7434 Fput (coding_system, Qcoding_system_define_form, Qnil);
7435 safe_eval (define_form);
7436 }
7437 if (!NILP (Fcoding_system_p (coding_system)))
7438 return coding_system;
7439 xsignal1 (Qcoding_system_error, coding_system);
7440 }
7441
7442 \f
7443 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7444 HIGHEST is nonzero, return the coding system of the highest
7445 priority among the detected coding systems. Otherwize return a
7446 list of detected coding systems sorted by their priorities. If
7447 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7448 multibyte form but contains only ASCII and eight-bit chars.
7449 Otherwise, the bytes are raw bytes.
7450
7451 CODING-SYSTEM controls the detection as below:
7452
7453 If it is nil, detect both text-format and eol-format. If the
7454 text-format part of CODING-SYSTEM is already specified
7455 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7456 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7457 detect only text-format. */
7458
7459 Lisp_Object
7460 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7461 coding_system)
7462 const unsigned char *src;
7463 EMACS_INT src_chars, src_bytes;
7464 int highest;
7465 int multibytep;
7466 Lisp_Object coding_system;
7467 {
7468 const unsigned char *src_end = src + src_bytes;
7469 Lisp_Object attrs, eol_type;
7470 Lisp_Object val;
7471 struct coding_system coding;
7472 int id;
7473 struct coding_detection_info detect_info;
7474 enum coding_category base_category;
7475
7476 if (NILP (coding_system))
7477 coding_system = Qundecided;
7478 setup_coding_system (coding_system, &coding);
7479 attrs = CODING_ID_ATTRS (coding.id);
7480 eol_type = CODING_ID_EOL_TYPE (coding.id);
7481 coding_system = CODING_ATTR_BASE_NAME (attrs);
7482
7483 coding.source = src;
7484 coding.src_chars = src_chars;
7485 coding.src_bytes = src_bytes;
7486 coding.src_multibyte = multibytep;
7487 coding.consumed = 0;
7488 coding.mode |= CODING_MODE_LAST_BLOCK;
7489
7490 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7491
7492 /* At first, detect text-format if necessary. */
7493 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7494 if (base_category == coding_category_undecided)
7495 {
7496 enum coding_category category;
7497 struct coding_system *this;
7498 int c, i;
7499
7500 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7501 for (i = 0; src < src_end; i++, src++)
7502 {
7503 c = *src;
7504 if (c & 0x80)
7505 break;
7506 if (c < 0x20
7507 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7508 && ! inhibit_iso_escape_detection)
7509 {
7510 coding.head_ascii = src - coding.source;
7511 if (detect_coding_iso_2022 (&coding, &detect_info))
7512 {
7513 /* We have scanned the whole data. */
7514 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7515 /* We didn't find an 8-bit code. */
7516 src = src_end;
7517 break;
7518 }
7519 }
7520 }
7521 coding.head_ascii = src - coding.source;
7522
7523 if (src < src_end
7524 || detect_info.found)
7525 {
7526 if (src == src_end)
7527 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7528 for (i = 0; i < coding_category_raw_text; i++)
7529 {
7530 category = coding_priorities[i];
7531 this = coding_categories + category;
7532 if (detect_info.found & (1 << category))
7533 break;
7534 }
7535 else
7536 for (i = 0; i < coding_category_raw_text; i++)
7537 {
7538 category = coding_priorities[i];
7539 this = coding_categories + category;
7540
7541 if (this->id < 0)
7542 {
7543 /* No coding system of this category is defined. */
7544 detect_info.rejected |= (1 << category);
7545 }
7546 else if (category >= coding_category_raw_text)
7547 continue;
7548 else if (detect_info.checked & (1 << category))
7549 {
7550 if (highest
7551 && (detect_info.found & (1 << category)))
7552 break;
7553 }
7554 else
7555 {
7556 if ((*(this->detector)) (&coding, &detect_info)
7557 && highest
7558 && (detect_info.found & (1 << category)))
7559 {
7560 if (category == coding_category_utf_16_auto)
7561 {
7562 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7563 category = coding_category_utf_16_le;
7564 else
7565 category = coding_category_utf_16_be;
7566 }
7567 break;
7568 }
7569 }
7570 }
7571 }
7572
7573 if (detect_info.rejected == CATEGORY_MASK_ANY)
7574 {
7575 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7576 id = coding_categories[coding_category_raw_text].id;
7577 val = Fcons (make_number (id), Qnil);
7578 }
7579 else if (! detect_info.rejected && ! detect_info.found)
7580 {
7581 detect_info.found = CATEGORY_MASK_ANY;
7582 id = coding_categories[coding_category_undecided].id;
7583 val = Fcons (make_number (id), Qnil);
7584 }
7585 else if (highest)
7586 {
7587 if (detect_info.found)
7588 {
7589 detect_info.found = 1 << category;
7590 val = Fcons (make_number (this->id), Qnil);
7591 }
7592 else
7593 for (i = 0; i < coding_category_raw_text; i++)
7594 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7595 {
7596 detect_info.found = 1 << coding_priorities[i];
7597 id = coding_categories[coding_priorities[i]].id;
7598 val = Fcons (make_number (id), Qnil);
7599 break;
7600 }
7601 }
7602 else
7603 {
7604 int mask = detect_info.rejected | detect_info.found;
7605 int found = 0;
7606 val = Qnil;
7607
7608 for (i = coding_category_raw_text - 1; i >= 0; i--)
7609 {
7610 category = coding_priorities[i];
7611 if (! (mask & (1 << category)))
7612 {
7613 found |= 1 << category;
7614 id = coding_categories[category].id;
7615 if (id >= 0)
7616 val = Fcons (make_number (id), val);
7617 }
7618 }
7619 for (i = coding_category_raw_text - 1; i >= 0; i--)
7620 {
7621 category = coding_priorities[i];
7622 if (detect_info.found & (1 << category))
7623 {
7624 id = coding_categories[category].id;
7625 val = Fcons (make_number (id), val);
7626 }
7627 }
7628 detect_info.found |= found;
7629 }
7630 }
7631 else if (base_category == coding_category_utf_16_auto)
7632 {
7633 if (detect_coding_utf_16 (&coding, &detect_info))
7634 {
7635 struct coding_system *this;
7636
7637 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7638 this = coding_categories + coding_category_utf_16_le;
7639 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7640 this = coding_categories + coding_category_utf_16_be;
7641 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7642 this = coding_categories + coding_category_utf_16_be_nosig;
7643 else
7644 this = coding_categories + coding_category_utf_16_le_nosig;
7645 val = Fcons (make_number (this->id), Qnil);
7646 }
7647 }
7648 else
7649 {
7650 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7651 val = Fcons (make_number (coding.id), Qnil);
7652 }
7653
7654 /* Then, detect eol-format if necessary. */
7655 {
7656 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7657 Lisp_Object tail;
7658
7659 if (VECTORP (eol_type))
7660 {
7661 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7662 normal_eol = detect_eol (coding.source, src_bytes,
7663 coding_category_raw_text);
7664 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7665 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7666 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7667 coding_category_utf_16_be);
7668 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7669 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7670 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7671 coding_category_utf_16_le);
7672 }
7673 else
7674 {
7675 if (EQ (eol_type, Qunix))
7676 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7677 else if (EQ (eol_type, Qdos))
7678 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7679 else
7680 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7681 }
7682
7683 for (tail = val; CONSP (tail); tail = XCDR (tail))
7684 {
7685 enum coding_category category;
7686 int this_eol;
7687
7688 id = XINT (XCAR (tail));
7689 attrs = CODING_ID_ATTRS (id);
7690 category = XINT (CODING_ATTR_CATEGORY (attrs));
7691 eol_type = CODING_ID_EOL_TYPE (id);
7692 if (VECTORP (eol_type))
7693 {
7694 if (category == coding_category_utf_16_be
7695 || category == coding_category_utf_16_be_nosig)
7696 this_eol = utf_16_be_eol;
7697 else if (category == coding_category_utf_16_le
7698 || category == coding_category_utf_16_le_nosig)
7699 this_eol = utf_16_le_eol;
7700 else
7701 this_eol = normal_eol;
7702
7703 if (this_eol == EOL_SEEN_LF)
7704 XSETCAR (tail, AREF (eol_type, 0));
7705 else if (this_eol == EOL_SEEN_CRLF)
7706 XSETCAR (tail, AREF (eol_type, 1));
7707 else if (this_eol == EOL_SEEN_CR)
7708 XSETCAR (tail, AREF (eol_type, 2));
7709 else
7710 XSETCAR (tail, CODING_ID_NAME (id));
7711 }
7712 else
7713 XSETCAR (tail, CODING_ID_NAME (id));
7714 }
7715 }
7716
7717 return (highest ? XCAR (val) : val);
7718 }
7719
7720
7721 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7722 2, 3, 0,
7723 doc: /* Detect coding system of the text in the region between START and END.
7724 Return a list of possible coding systems ordered by priority.
7725
7726 If only ASCII characters are found (except for such ISO-2022 control
7727 characters ISO-2022 as ESC), it returns a list of single element
7728 `undecided' or its subsidiary coding system according to a detected
7729 end-of-line format.
7730
7731 If optional argument HIGHEST is non-nil, return the coding system of
7732 highest priority. */)
7733 (start, end, highest)
7734 Lisp_Object start, end, highest;
7735 {
7736 int from, to;
7737 int from_byte, to_byte;
7738
7739 CHECK_NUMBER_COERCE_MARKER (start);
7740 CHECK_NUMBER_COERCE_MARKER (end);
7741
7742 validate_region (&start, &end);
7743 from = XINT (start), to = XINT (end);
7744 from_byte = CHAR_TO_BYTE (from);
7745 to_byte = CHAR_TO_BYTE (to);
7746
7747 if (from < GPT && to >= GPT)
7748 move_gap_both (to, to_byte);
7749
7750 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7751 to - from, to_byte - from_byte,
7752 !NILP (highest),
7753 !NILP (current_buffer
7754 ->enable_multibyte_characters),
7755 Qnil);
7756 }
7757
7758 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7759 1, 2, 0,
7760 doc: /* Detect coding system of the text in STRING.
7761 Return a list of possible coding systems ordered by priority.
7762
7763 If only ASCII characters are found (except for such ISO-2022 control
7764 characters ISO-2022 as ESC), it returns a list of single element
7765 `undecided' or its subsidiary coding system according to a detected
7766 end-of-line format.
7767
7768 If optional argument HIGHEST is non-nil, return the coding system of
7769 highest priority. */)
7770 (string, highest)
7771 Lisp_Object string, highest;
7772 {
7773 CHECK_STRING (string);
7774
7775 return detect_coding_system (SDATA (string),
7776 SCHARS (string), SBYTES (string),
7777 !NILP (highest), STRING_MULTIBYTE (string),
7778 Qnil);
7779 }
7780
7781
7782 static INLINE int
7783 char_encodable_p (c, attrs)
7784 int c;
7785 Lisp_Object attrs;
7786 {
7787 Lisp_Object tail;
7788 struct charset *charset;
7789 Lisp_Object translation_table;
7790
7791 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7792 if (! NILP (translation_table))
7793 c = translate_char (translation_table, c);
7794 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7795 CONSP (tail); tail = XCDR (tail))
7796 {
7797 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7798 if (CHAR_CHARSET_P (c, charset))
7799 break;
7800 }
7801 return (! NILP (tail));
7802 }
7803
7804
7805 /* Return a list of coding systems that safely encode the text between
7806 START and END. If EXCLUDE is non-nil, it is a list of coding
7807 systems not to check. The returned list doesn't contain any such
7808 coding systems. In any case, if the text contains only ASCII or is
7809 unibyte, return t. */
7810
7811 DEFUN ("find-coding-systems-region-internal",
7812 Ffind_coding_systems_region_internal,
7813 Sfind_coding_systems_region_internal, 2, 3, 0,
7814 doc: /* Internal use only. */)
7815 (start, end, exclude)
7816 Lisp_Object start, end, exclude;
7817 {
7818 Lisp_Object coding_attrs_list, safe_codings;
7819 EMACS_INT start_byte, end_byte;
7820 const unsigned char *p, *pbeg, *pend;
7821 int c;
7822 Lisp_Object tail, elt;
7823
7824 if (STRINGP (start))
7825 {
7826 if (!STRING_MULTIBYTE (start)
7827 || SCHARS (start) == SBYTES (start))
7828 return Qt;
7829 start_byte = 0;
7830 end_byte = SBYTES (start);
7831 }
7832 else
7833 {
7834 CHECK_NUMBER_COERCE_MARKER (start);
7835 CHECK_NUMBER_COERCE_MARKER (end);
7836 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7837 args_out_of_range (start, end);
7838 if (NILP (current_buffer->enable_multibyte_characters))
7839 return Qt;
7840 start_byte = CHAR_TO_BYTE (XINT (start));
7841 end_byte = CHAR_TO_BYTE (XINT (end));
7842 if (XINT (end) - XINT (start) == end_byte - start_byte)
7843 return Qt;
7844
7845 if (XINT (start) < GPT && XINT (end) > GPT)
7846 {
7847 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7848 move_gap_both (XINT (start), start_byte);
7849 else
7850 move_gap_both (XINT (end), end_byte);
7851 }
7852 }
7853
7854 coding_attrs_list = Qnil;
7855 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7856 if (NILP (exclude)
7857 || NILP (Fmemq (XCAR (tail), exclude)))
7858 {
7859 Lisp_Object attrs;
7860
7861 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7862 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7863 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7864 {
7865 ASET (attrs, coding_attr_trans_tbl,
7866 get_translation_table (attrs, 1, NULL));
7867 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7868 }
7869 }
7870
7871 if (STRINGP (start))
7872 p = pbeg = SDATA (start);
7873 else
7874 p = pbeg = BYTE_POS_ADDR (start_byte);
7875 pend = p + (end_byte - start_byte);
7876
7877 while (p < pend && ASCII_BYTE_P (*p)) p++;
7878 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7879
7880 while (p < pend)
7881 {
7882 if (ASCII_BYTE_P (*p))
7883 p++;
7884 else
7885 {
7886 c = STRING_CHAR_ADVANCE (p);
7887
7888 charset_map_loaded = 0;
7889 for (tail = coding_attrs_list; CONSP (tail);)
7890 {
7891 elt = XCAR (tail);
7892 if (NILP (elt))
7893 tail = XCDR (tail);
7894 else if (char_encodable_p (c, elt))
7895 tail = XCDR (tail);
7896 else if (CONSP (XCDR (tail)))
7897 {
7898 XSETCAR (tail, XCAR (XCDR (tail)));
7899 XSETCDR (tail, XCDR (XCDR (tail)));
7900 }
7901 else
7902 {
7903 XSETCAR (tail, Qnil);
7904 tail = XCDR (tail);
7905 }
7906 }
7907 if (charset_map_loaded)
7908 {
7909 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7910
7911 if (STRINGP (start))
7912 pbeg = SDATA (start);
7913 else
7914 pbeg = BYTE_POS_ADDR (start_byte);
7915 p = pbeg + p_offset;
7916 pend = pbeg + pend_offset;
7917 }
7918 }
7919 }
7920
7921 safe_codings = list2 (Qraw_text, Qno_conversion);
7922 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7923 if (! NILP (XCAR (tail)))
7924 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7925
7926 return safe_codings;
7927 }
7928
7929
7930 DEFUN ("unencodable-char-position", Funencodable_char_position,
7931 Sunencodable_char_position, 3, 5, 0,
7932 doc: /*
7933 Return position of first un-encodable character in a region.
7934 START and END specfiy the region and CODING-SYSTEM specifies the
7935 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7936
7937 If optional 4th argument COUNT is non-nil, it specifies at most how
7938 many un-encodable characters to search. In this case, the value is a
7939 list of positions.
7940
7941 If optional 5th argument STRING is non-nil, it is a string to search
7942 for un-encodable characters. In that case, START and END are indexes
7943 to the string. */)
7944 (start, end, coding_system, count, string)
7945 Lisp_Object start, end, coding_system, count, string;
7946 {
7947 int n;
7948 struct coding_system coding;
7949 Lisp_Object attrs, charset_list, translation_table;
7950 Lisp_Object positions;
7951 int from, to;
7952 const unsigned char *p, *stop, *pend;
7953 int ascii_compatible;
7954
7955 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7956 attrs = CODING_ID_ATTRS (coding.id);
7957 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7958 return Qnil;
7959 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7960 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7961 translation_table = get_translation_table (attrs, 1, NULL);
7962
7963 if (NILP (string))
7964 {
7965 validate_region (&start, &end);
7966 from = XINT (start);
7967 to = XINT (end);
7968 if (NILP (current_buffer->enable_multibyte_characters)
7969 || (ascii_compatible
7970 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7971 return Qnil;
7972 p = CHAR_POS_ADDR (from);
7973 pend = CHAR_POS_ADDR (to);
7974 if (from < GPT && to >= GPT)
7975 stop = GPT_ADDR;
7976 else
7977 stop = pend;
7978 }
7979 else
7980 {
7981 CHECK_STRING (string);
7982 CHECK_NATNUM (start);
7983 CHECK_NATNUM (end);
7984 from = XINT (start);
7985 to = XINT (end);
7986 if (from > to
7987 || to > SCHARS (string))
7988 args_out_of_range_3 (string, start, end);
7989 if (! STRING_MULTIBYTE (string))
7990 return Qnil;
7991 p = SDATA (string) + string_char_to_byte (string, from);
7992 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7993 if (ascii_compatible && (to - from) == (pend - p))
7994 return Qnil;
7995 }
7996
7997 if (NILP (count))
7998 n = 1;
7999 else
8000 {
8001 CHECK_NATNUM (count);
8002 n = XINT (count);
8003 }
8004
8005 positions = Qnil;
8006 while (1)
8007 {
8008 int c;
8009
8010 if (ascii_compatible)
8011 while (p < stop && ASCII_BYTE_P (*p))
8012 p++, from++;
8013 if (p >= stop)
8014 {
8015 if (p >= pend)
8016 break;
8017 stop = pend;
8018 p = GAP_END_ADDR;
8019 }
8020
8021 c = STRING_CHAR_ADVANCE (p);
8022 if (! (ASCII_CHAR_P (c) && ascii_compatible)
8023 && ! char_charset (translate_char (translation_table, c),
8024 charset_list, NULL))
8025 {
8026 positions = Fcons (make_number (from), positions);
8027 n--;
8028 if (n == 0)
8029 break;
8030 }
8031
8032 from++;
8033 }
8034
8035 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8036 }
8037
8038
8039 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8040 Scheck_coding_systems_region, 3, 3, 0,
8041 doc: /* Check if the region is encodable by coding systems.
8042
8043 START and END are buffer positions specifying the region.
8044 CODING-SYSTEM-LIST is a list of coding systems to check.
8045
8046 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8047 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
8048 whole region, POS0, POS1, ... are buffer positions where non-encodable
8049 characters are found.
8050
8051 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8052 value is nil.
8053
8054 START may be a string. In that case, check if the string is
8055 encodable, and the value contains indices to the string instead of
8056 buffer positions. END is ignored. */)
8057 (start, end, coding_system_list)
8058 Lisp_Object start, end, coding_system_list;
8059 {
8060 Lisp_Object list;
8061 EMACS_INT start_byte, end_byte;
8062 int pos;
8063 const unsigned char *p, *pbeg, *pend;
8064 int c;
8065 Lisp_Object tail, elt, attrs;
8066
8067 if (STRINGP (start))
8068 {
8069 if (!STRING_MULTIBYTE (start)
8070 && SCHARS (start) != SBYTES (start))
8071 return Qnil;
8072 start_byte = 0;
8073 end_byte = SBYTES (start);
8074 pos = 0;
8075 }
8076 else
8077 {
8078 CHECK_NUMBER_COERCE_MARKER (start);
8079 CHECK_NUMBER_COERCE_MARKER (end);
8080 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8081 args_out_of_range (start, end);
8082 if (NILP (current_buffer->enable_multibyte_characters))
8083 return Qnil;
8084 start_byte = CHAR_TO_BYTE (XINT (start));
8085 end_byte = CHAR_TO_BYTE (XINT (end));
8086 if (XINT (end) - XINT (start) == end_byte - start_byte)
8087 return Qt;
8088
8089 if (XINT (start) < GPT && XINT (end) > GPT)
8090 {
8091 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8092 move_gap_both (XINT (start), start_byte);
8093 else
8094 move_gap_both (XINT (end), end_byte);
8095 }
8096 pos = XINT (start);
8097 }
8098
8099 list = Qnil;
8100 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8101 {
8102 elt = XCAR (tail);
8103 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8104 ASET (attrs, coding_attr_trans_tbl,
8105 get_translation_table (attrs, 1, NULL));
8106 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8107 }
8108
8109 if (STRINGP (start))
8110 p = pbeg = SDATA (start);
8111 else
8112 p = pbeg = BYTE_POS_ADDR (start_byte);
8113 pend = p + (end_byte - start_byte);
8114
8115 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8116 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8117
8118 while (p < pend)
8119 {
8120 if (ASCII_BYTE_P (*p))
8121 p++;
8122 else
8123 {
8124 c = STRING_CHAR_ADVANCE (p);
8125
8126 charset_map_loaded = 0;
8127 for (tail = list; CONSP (tail); tail = XCDR (tail))
8128 {
8129 elt = XCDR (XCAR (tail));
8130 if (! char_encodable_p (c, XCAR (elt)))
8131 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8132 }
8133 if (charset_map_loaded)
8134 {
8135 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8136
8137 if (STRINGP (start))
8138 pbeg = SDATA (start);
8139 else
8140 pbeg = BYTE_POS_ADDR (start_byte);
8141 p = pbeg + p_offset;
8142 pend = pbeg + pend_offset;
8143 }
8144 }
8145 pos++;
8146 }
8147
8148 tail = list;
8149 list = Qnil;
8150 for (; CONSP (tail); tail = XCDR (tail))
8151 {
8152 elt = XCAR (tail);
8153 if (CONSP (XCDR (XCDR (elt))))
8154 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8155 list);
8156 }
8157
8158 return list;
8159 }
8160
8161
8162 Lisp_Object
8163 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8164 Lisp_Object start, end, coding_system, dst_object;
8165 int encodep, norecord;
8166 {
8167 struct coding_system coding;
8168 EMACS_INT from, from_byte, to, to_byte;
8169 Lisp_Object src_object;
8170
8171 CHECK_NUMBER_COERCE_MARKER (start);
8172 CHECK_NUMBER_COERCE_MARKER (end);
8173 if (NILP (coding_system))
8174 coding_system = Qno_conversion;
8175 else
8176 CHECK_CODING_SYSTEM (coding_system);
8177 src_object = Fcurrent_buffer ();
8178 if (NILP (dst_object))
8179 dst_object = src_object;
8180 else if (! EQ (dst_object, Qt))
8181 CHECK_BUFFER (dst_object);
8182
8183 validate_region (&start, &end);
8184 from = XFASTINT (start);
8185 from_byte = CHAR_TO_BYTE (from);
8186 to = XFASTINT (end);
8187 to_byte = CHAR_TO_BYTE (to);
8188
8189 setup_coding_system (coding_system, &coding);
8190 coding.mode |= CODING_MODE_LAST_BLOCK;
8191
8192 if (encodep)
8193 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8194 dst_object);
8195 else
8196 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8197 dst_object);
8198 if (! norecord)
8199 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8200
8201 return (BUFFERP (dst_object)
8202 ? make_number (coding.produced_char)
8203 : coding.dst_object);
8204 }
8205
8206
8207 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8208 3, 4, "r\nzCoding system: ",
8209 doc: /* Decode the current region from the specified coding system.
8210 When called from a program, takes four arguments:
8211 START, END, CODING-SYSTEM, and DESTINATION.
8212 START and END are buffer positions.
8213
8214 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8215 If nil, the region between START and END is replaced by the decoded text.
8216 If buffer, the decoded text is inserted in the buffer.
8217 If t, the decoded text is returned.
8218
8219 This function sets `last-coding-system-used' to the precise coding system
8220 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8221 not fully specified.)
8222 It returns the length of the decoded text. */)
8223 (start, end, coding_system, destination)
8224 Lisp_Object start, end, coding_system, destination;
8225 {
8226 return code_convert_region (start, end, coding_system, destination, 0, 0);
8227 }
8228
8229 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8230 3, 4, "r\nzCoding system: ",
8231 doc: /* Encode the current region by specified coding system.
8232 When called from a program, takes three arguments:
8233 START, END, and CODING-SYSTEM. START and END are buffer positions.
8234
8235 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8236 If nil, the region between START and END is replace by the encoded text.
8237 If buffer, the encoded text is inserted in the buffer.
8238 If t, the encoded text is returned.
8239
8240 This function sets `last-coding-system-used' to the precise coding system
8241 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8242 not fully specified.)
8243 It returns the length of the encoded text. */)
8244 (start, end, coding_system, destination)
8245 Lisp_Object start, end, coding_system, destination;
8246 {
8247 return code_convert_region (start, end, coding_system, destination, 1, 0);
8248 }
8249
8250 Lisp_Object
8251 code_convert_string (string, coding_system, dst_object,
8252 encodep, nocopy, norecord)
8253 Lisp_Object string, coding_system, dst_object;
8254 int encodep, nocopy, norecord;
8255 {
8256 struct coding_system coding;
8257 EMACS_INT chars, bytes;
8258
8259 CHECK_STRING (string);
8260 if (NILP (coding_system))
8261 {
8262 if (! norecord)
8263 Vlast_coding_system_used = Qno_conversion;
8264 if (NILP (dst_object))
8265 return (nocopy ? Fcopy_sequence (string) : string);
8266 }
8267
8268 if (NILP (coding_system))
8269 coding_system = Qno_conversion;
8270 else
8271 CHECK_CODING_SYSTEM (coding_system);
8272 if (NILP (dst_object))
8273 dst_object = Qt;
8274 else if (! EQ (dst_object, Qt))
8275 CHECK_BUFFER (dst_object);
8276
8277 setup_coding_system (coding_system, &coding);
8278 coding.mode |= CODING_MODE_LAST_BLOCK;
8279 chars = SCHARS (string);
8280 bytes = SBYTES (string);
8281 if (encodep)
8282 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8283 else
8284 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8285 if (! norecord)
8286 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8287
8288 return (BUFFERP (dst_object)
8289 ? make_number (coding.produced_char)
8290 : coding.dst_object);
8291 }
8292
8293
8294 /* Encode or decode STRING according to CODING_SYSTEM.
8295 Do not set Vlast_coding_system_used.
8296
8297 This function is called only from macros DECODE_FILE and
8298 ENCODE_FILE, thus we ignore character composition. */
8299
8300 Lisp_Object
8301 code_convert_string_norecord (string, coding_system, encodep)
8302 Lisp_Object string, coding_system;
8303 int encodep;
8304 {
8305 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8306 }
8307
8308
8309 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8310 2, 4, 0,
8311 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8312
8313 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8314 if the decoding operation is trivial.
8315
8316 Optional fourth arg BUFFER non-nil meant that the decoded text is
8317 inserted in BUFFER instead of returned as a string. In this case,
8318 the return value is BUFFER.
8319
8320 This function sets `last-coding-system-used' to the precise coding system
8321 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8322 not fully specified. */)
8323 (string, coding_system, nocopy, buffer)
8324 Lisp_Object string, coding_system, nocopy, buffer;
8325 {
8326 return code_convert_string (string, coding_system, buffer,
8327 0, ! NILP (nocopy), 0);
8328 }
8329
8330 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8331 2, 4, 0,
8332 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8333
8334 Optional third arg NOCOPY non-nil means it is OK to return STRING
8335 itself if the encoding operation is trivial.
8336
8337 Optional fourth arg BUFFER non-nil meant that the encoded text is
8338 inserted in BUFFER instead of returned as a string. In this case,
8339 the return value is BUFFER.
8340
8341 This function sets `last-coding-system-used' to the precise coding system
8342 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8343 not fully specified.) */)
8344 (string, coding_system, nocopy, buffer)
8345 Lisp_Object string, coding_system, nocopy, buffer;
8346 {
8347 return code_convert_string (string, coding_system, buffer,
8348 1, ! NILP (nocopy), 1);
8349 }
8350
8351 \f
8352 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8353 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8354 Return the corresponding character. */)
8355 (code)
8356 Lisp_Object code;
8357 {
8358 Lisp_Object spec, attrs, val;
8359 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8360 int c;
8361
8362 CHECK_NATNUM (code);
8363 c = XFASTINT (code);
8364 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8365 attrs = AREF (spec, 0);
8366
8367 if (ASCII_BYTE_P (c)
8368 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8369 return code;
8370
8371 val = CODING_ATTR_CHARSET_LIST (attrs);
8372 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8373 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8374 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8375
8376 if (c <= 0x7F)
8377 charset = charset_roman;
8378 else if (c >= 0xA0 && c < 0xDF)
8379 {
8380 charset = charset_kana;
8381 c -= 0x80;
8382 }
8383 else
8384 {
8385 int s1 = c >> 8, s2 = c & 0xFF;
8386
8387 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8388 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8389 error ("Invalid code: %d", code);
8390 SJIS_TO_JIS (c);
8391 charset = charset_kanji;
8392 }
8393 c = DECODE_CHAR (charset, c);
8394 if (c < 0)
8395 error ("Invalid code: %d", code);
8396 return make_number (c);
8397 }
8398
8399
8400 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8401 doc: /* Encode a Japanese character CH to shift_jis encoding.
8402 Return the corresponding code in SJIS. */)
8403 (ch)
8404 Lisp_Object ch;
8405 {
8406 Lisp_Object spec, attrs, charset_list;
8407 int c;
8408 struct charset *charset;
8409 unsigned code;
8410
8411 CHECK_CHARACTER (ch);
8412 c = XFASTINT (ch);
8413 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8414 attrs = AREF (spec, 0);
8415
8416 if (ASCII_CHAR_P (c)
8417 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8418 return ch;
8419
8420 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8421 charset = char_charset (c, charset_list, &code);
8422 if (code == CHARSET_INVALID_CODE (charset))
8423 error ("Can't encode by shift_jis encoding: %d", c);
8424 JIS_TO_SJIS (code);
8425
8426 return make_number (code);
8427 }
8428
8429 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8430 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8431 Return the corresponding character. */)
8432 (code)
8433 Lisp_Object code;
8434 {
8435 Lisp_Object spec, attrs, val;
8436 struct charset *charset_roman, *charset_big5, *charset;
8437 int c;
8438
8439 CHECK_NATNUM (code);
8440 c = XFASTINT (code);
8441 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8442 attrs = AREF (spec, 0);
8443
8444 if (ASCII_BYTE_P (c)
8445 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8446 return code;
8447
8448 val = CODING_ATTR_CHARSET_LIST (attrs);
8449 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8450 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8451
8452 if (c <= 0x7F)
8453 charset = charset_roman;
8454 else
8455 {
8456 int b1 = c >> 8, b2 = c & 0x7F;
8457 if (b1 < 0xA1 || b1 > 0xFE
8458 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8459 error ("Invalid code: %d", code);
8460 charset = charset_big5;
8461 }
8462 c = DECODE_CHAR (charset, (unsigned )c);
8463 if (c < 0)
8464 error ("Invalid code: %d", code);
8465 return make_number (c);
8466 }
8467
8468 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8469 doc: /* Encode the Big5 character CH to BIG5 coding system.
8470 Return the corresponding character code in Big5. */)
8471 (ch)
8472 Lisp_Object ch;
8473 {
8474 Lisp_Object spec, attrs, charset_list;
8475 struct charset *charset;
8476 int c;
8477 unsigned code;
8478
8479 CHECK_CHARACTER (ch);
8480 c = XFASTINT (ch);
8481 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8482 attrs = AREF (spec, 0);
8483 if (ASCII_CHAR_P (c)
8484 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8485 return ch;
8486
8487 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8488 charset = char_charset (c, charset_list, &code);
8489 if (code == CHARSET_INVALID_CODE (charset))
8490 error ("Can't encode by Big5 encoding: %d", c);
8491
8492 return make_number (code);
8493 }
8494
8495 \f
8496 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
8497 Sset_terminal_coding_system_internal, 1, 2, 0,
8498 doc: /* Internal use only. */)
8499 (coding_system, terminal)
8500 Lisp_Object coding_system;
8501 Lisp_Object terminal;
8502 {
8503 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8504 CHECK_SYMBOL (coding_system);
8505 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
8506 /* We had better not send unsafe characters to terminal. */
8507 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
8508 /* Characer composition should be disabled. */
8509 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8510 terminal_coding->src_multibyte = 1;
8511 terminal_coding->dst_multibyte = 0;
8512 return Qnil;
8513 }
8514
8515 DEFUN ("set-safe-terminal-coding-system-internal",
8516 Fset_safe_terminal_coding_system_internal,
8517 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8518 doc: /* Internal use only. */)
8519 (coding_system)
8520 Lisp_Object coding_system;
8521 {
8522 CHECK_SYMBOL (coding_system);
8523 setup_coding_system (Fcheck_coding_system (coding_system),
8524 &safe_terminal_coding);
8525 /* Characer composition should be disabled. */
8526 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8527 safe_terminal_coding.src_multibyte = 1;
8528 safe_terminal_coding.dst_multibyte = 0;
8529 return Qnil;
8530 }
8531
8532 DEFUN ("terminal-coding-system", Fterminal_coding_system,
8533 Sterminal_coding_system, 0, 1, 0,
8534 doc: /* Return coding system specified for terminal output on the given terminal.
8535 TERMINAL may be a terminal id, a frame, or nil for the selected
8536 frame's terminal device. */)
8537 (terminal)
8538 Lisp_Object terminal;
8539 {
8540 struct coding_system *terminal_coding
8541 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8542 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
8543
8544 /* For backward compatibility, return nil if it is `undecided'. */
8545 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8546 }
8547
8548 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
8549 Sset_keyboard_coding_system_internal, 1, 2, 0,
8550 doc: /* Internal use only. */)
8551 (coding_system, terminal)
8552 Lisp_Object coding_system;
8553 Lisp_Object terminal;
8554 {
8555 struct terminal *t = get_terminal (terminal, 1);
8556 CHECK_SYMBOL (coding_system);
8557 setup_coding_system (Fcheck_coding_system (coding_system),
8558 TERMINAL_KEYBOARD_CODING (t));
8559 /* Characer composition should be disabled. */
8560 TERMINAL_KEYBOARD_CODING (t)->common_flags
8561 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8562 return Qnil;
8563 }
8564
8565 DEFUN ("keyboard-coding-system",
8566 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
8567 doc: /* Return coding system specified for decoding keyboard input. */)
8568 (terminal)
8569 Lisp_Object terminal;
8570 {
8571 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8572 (get_terminal (terminal, 1))->id);
8573 }
8574
8575 \f
8576 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8577 Sfind_operation_coding_system, 1, MANY, 0,
8578 doc: /* Choose a coding system for an operation based on the target name.
8579 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8580 DECODING-SYSTEM is the coding system to use for decoding
8581 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8582 for encoding (in case OPERATION does encoding).
8583
8584 The first argument OPERATION specifies an I/O primitive:
8585 For file I/O, `insert-file-contents' or `write-region'.
8586 For process I/O, `call-process', `call-process-region', or `start-process'.
8587 For network I/O, `open-network-stream'.
8588
8589 The remaining arguments should be the same arguments that were passed
8590 to the primitive. Depending on which primitive, one of those arguments
8591 is selected as the TARGET. For example, if OPERATION does file I/O,
8592 whichever argument specifies the file name is TARGET.
8593
8594 TARGET has a meaning which depends on OPERATION:
8595 For file I/O, TARGET is a file name (except for the special case below).
8596 For process I/O, TARGET is a process name.
8597 For network I/O, TARGET is a service name or a port number
8598
8599 This function looks up what specified for TARGET in,
8600 `file-coding-system-alist', `process-coding-system-alist',
8601 or `network-coding-system-alist' depending on OPERATION.
8602 They may specify a coding system, a cons of coding systems,
8603 or a function symbol to call.
8604 In the last case, we call the function with one argument,
8605 which is a list of all the arguments given to this function.
8606 If the function can't decide a coding system, it can return
8607 `undecided' so that the normal code-detection is performed.
8608
8609 If OPERATION is `insert-file-contents', the argument corresponding to
8610 TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8611 file name to look up, and BUFFER is a buffer that contains the file's
8612 contents (not yet decoded). If `file-coding-system-alist' specifies a
8613 function to call for FILENAME, that function should examine the
8614 contents of BUFFER instead of reading the file.
8615
8616 usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
8617 (nargs, args)
8618 int nargs;
8619 Lisp_Object *args;
8620 {
8621 Lisp_Object operation, target_idx, target, val;
8622 register Lisp_Object chain;
8623
8624 if (nargs < 2)
8625 error ("Too few arguments");
8626 operation = args[0];
8627 if (!SYMBOLP (operation)
8628 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8629 error ("Invalid first arguement");
8630 if (nargs < 1 + XINT (target_idx))
8631 error ("Too few arguments for operation: %s",
8632 SDATA (SYMBOL_NAME (operation)));
8633 target = args[XINT (target_idx) + 1];
8634 if (!(STRINGP (target)
8635 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8636 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
8637 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8638 error ("Invalid %dth argument", XINT (target_idx) + 1);
8639 if (CONSP (target))
8640 target = XCAR (target);
8641
8642 chain = ((EQ (operation, Qinsert_file_contents)
8643 || EQ (operation, Qwrite_region))
8644 ? Vfile_coding_system_alist
8645 : (EQ (operation, Qopen_network_stream)
8646 ? Vnetwork_coding_system_alist
8647 : Vprocess_coding_system_alist));
8648 if (NILP (chain))
8649 return Qnil;
8650
8651 for (; CONSP (chain); chain = XCDR (chain))
8652 {
8653 Lisp_Object elt;
8654
8655 elt = XCAR (chain);
8656 if (CONSP (elt)
8657 && ((STRINGP (target)
8658 && STRINGP (XCAR (elt))
8659 && fast_string_match (XCAR (elt), target) >= 0)
8660 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8661 {
8662 val = XCDR (elt);
8663 /* Here, if VAL is both a valid coding system and a valid
8664 function symbol, we return VAL as a coding system. */
8665 if (CONSP (val))
8666 return val;
8667 if (! SYMBOLP (val))
8668 return Qnil;
8669 if (! NILP (Fcoding_system_p (val)))
8670 return Fcons (val, val);
8671 if (! NILP (Ffboundp (val)))
8672 {
8673 /* We use call1 rather than safe_call1
8674 so as to get bug reports about functions called here
8675 which don't handle the current interface. */
8676 val = call1 (val, Flist (nargs, args));
8677 if (CONSP (val))
8678 return val;
8679 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8680 return Fcons (val, val);
8681 }
8682 return Qnil;
8683 }
8684 }
8685 return Qnil;
8686 }
8687
8688 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8689 Sset_coding_system_priority, 0, MANY, 0,
8690 doc: /* Assign higher priority to the coding systems given as arguments.
8691 If multiple coding systems belongs to the same category,
8692 all but the first one are ignored.
8693
8694 usage: (set-coding-system-priority ...) */)
8695 (nargs, args)
8696 int nargs;
8697 Lisp_Object *args;
8698 {
8699 int i, j;
8700 int changed[coding_category_max];
8701 enum coding_category priorities[coding_category_max];
8702
8703 bzero (changed, sizeof changed);
8704
8705 for (i = j = 0; i < nargs; i++)
8706 {
8707 enum coding_category category;
8708 Lisp_Object spec, attrs;
8709
8710 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8711 attrs = AREF (spec, 0);
8712 category = XINT (CODING_ATTR_CATEGORY (attrs));
8713 if (changed[category])
8714 /* Ignore this coding system because a coding system of the
8715 same category already had a higher priority. */
8716 continue;
8717 changed[category] = 1;
8718 priorities[j++] = category;
8719 if (coding_categories[category].id >= 0
8720 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8721 setup_coding_system (args[i], &coding_categories[category]);
8722 Fset (AREF (Vcoding_category_table, category), args[i]);
8723 }
8724
8725 /* Now we have decided top J priorities. Reflect the order of the
8726 original priorities to the remaining priorities. */
8727
8728 for (i = j, j = 0; i < coding_category_max; i++, j++)
8729 {
8730 while (j < coding_category_max
8731 && changed[coding_priorities[j]])
8732 j++;
8733 if (j == coding_category_max)
8734 abort ();
8735 priorities[i] = coding_priorities[j];
8736 }
8737
8738 bcopy (priorities, coding_priorities, sizeof priorities);
8739
8740 /* Update `coding-category-list'. */
8741 Vcoding_category_list = Qnil;
8742 for (i = coding_category_max - 1; i >= 0; i--)
8743 Vcoding_category_list
8744 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8745 Vcoding_category_list);
8746
8747 return Qnil;
8748 }
8749
8750 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8751 Scoding_system_priority_list, 0, 1, 0,
8752 doc: /* Return a list of coding systems ordered by their priorities.
8753 HIGHESTP non-nil means just return the highest priority one. */)
8754 (highestp)
8755 Lisp_Object highestp;
8756 {
8757 int i;
8758 Lisp_Object val;
8759
8760 for (i = 0, val = Qnil; i < coding_category_max; i++)
8761 {
8762 enum coding_category category = coding_priorities[i];
8763 int id = coding_categories[category].id;
8764 Lisp_Object attrs;
8765
8766 if (id < 0)
8767 continue;
8768 attrs = CODING_ID_ATTRS (id);
8769 if (! NILP (highestp))
8770 return CODING_ATTR_BASE_NAME (attrs);
8771 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8772 }
8773 return Fnreverse (val);
8774 }
8775
8776 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8777
8778 static Lisp_Object
8779 make_subsidiaries (base)
8780 Lisp_Object base;
8781 {
8782 Lisp_Object subsidiaries;
8783 int base_name_len = SBYTES (SYMBOL_NAME (base));
8784 char *buf = (char *) alloca (base_name_len + 6);
8785 int i;
8786
8787 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8788 subsidiaries = Fmake_vector (make_number (3), Qnil);
8789 for (i = 0; i < 3; i++)
8790 {
8791 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8792 ASET (subsidiaries, i, intern (buf));
8793 }
8794 return subsidiaries;
8795 }
8796
8797
8798 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8799 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8800 doc: /* For internal use only.
8801 usage: (define-coding-system-internal ...) */)
8802 (nargs, args)
8803 int nargs;
8804 Lisp_Object *args;
8805 {
8806 Lisp_Object name;
8807 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8808 Lisp_Object attrs; /* Vector of attributes. */
8809 Lisp_Object eol_type;
8810 Lisp_Object aliases;
8811 Lisp_Object coding_type, charset_list, safe_charsets;
8812 enum coding_category category;
8813 Lisp_Object tail, val;
8814 int max_charset_id = 0;
8815 int i;
8816
8817 if (nargs < coding_arg_max)
8818 goto short_args;
8819
8820 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8821
8822 name = args[coding_arg_name];
8823 CHECK_SYMBOL (name);
8824 CODING_ATTR_BASE_NAME (attrs) = name;
8825
8826 val = args[coding_arg_mnemonic];
8827 if (! STRINGP (val))
8828 CHECK_CHARACTER (val);
8829 CODING_ATTR_MNEMONIC (attrs) = val;
8830
8831 coding_type = args[coding_arg_coding_type];
8832 CHECK_SYMBOL (coding_type);
8833 CODING_ATTR_TYPE (attrs) = coding_type;
8834
8835 charset_list = args[coding_arg_charset_list];
8836 if (SYMBOLP (charset_list))
8837 {
8838 if (EQ (charset_list, Qiso_2022))
8839 {
8840 if (! EQ (coding_type, Qiso_2022))
8841 error ("Invalid charset-list");
8842 charset_list = Viso_2022_charset_list;
8843 }
8844 else if (EQ (charset_list, Qemacs_mule))
8845 {
8846 if (! EQ (coding_type, Qemacs_mule))
8847 error ("Invalid charset-list");
8848 charset_list = Vemacs_mule_charset_list;
8849 }
8850 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8851 if (max_charset_id < XFASTINT (XCAR (tail)))
8852 max_charset_id = XFASTINT (XCAR (tail));
8853 }
8854 else
8855 {
8856 charset_list = Fcopy_sequence (charset_list);
8857 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8858 {
8859 struct charset *charset;
8860
8861 val = XCAR (tail);
8862 CHECK_CHARSET_GET_CHARSET (val, charset);
8863 if (EQ (coding_type, Qiso_2022)
8864 ? CHARSET_ISO_FINAL (charset) < 0
8865 : EQ (coding_type, Qemacs_mule)
8866 ? CHARSET_EMACS_MULE_ID (charset) < 0
8867 : 0)
8868 error ("Can't handle charset `%s'",
8869 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8870
8871 XSETCAR (tail, make_number (charset->id));
8872 if (max_charset_id < charset->id)
8873 max_charset_id = charset->id;
8874 }
8875 }
8876 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8877
8878 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8879 make_number (255));
8880 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8881 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8882 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8883
8884 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8885
8886 val = args[coding_arg_decode_translation_table];
8887 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8888 CHECK_SYMBOL (val);
8889 CODING_ATTR_DECODE_TBL (attrs) = val;
8890
8891 val = args[coding_arg_encode_translation_table];
8892 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8893 CHECK_SYMBOL (val);
8894 CODING_ATTR_ENCODE_TBL (attrs) = val;
8895
8896 val = args[coding_arg_post_read_conversion];
8897 CHECK_SYMBOL (val);
8898 CODING_ATTR_POST_READ (attrs) = val;
8899
8900 val = args[coding_arg_pre_write_conversion];
8901 CHECK_SYMBOL (val);
8902 CODING_ATTR_PRE_WRITE (attrs) = val;
8903
8904 val = args[coding_arg_default_char];
8905 if (NILP (val))
8906 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8907 else
8908 {
8909 CHECK_CHARACTER (val);
8910 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8911 }
8912
8913 val = args[coding_arg_for_unibyte];
8914 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8915
8916 val = args[coding_arg_plist];
8917 CHECK_LIST (val);
8918 CODING_ATTR_PLIST (attrs) = val;
8919
8920 if (EQ (coding_type, Qcharset))
8921 {
8922 /* Generate a lisp vector of 256 elements. Each element is nil,
8923 integer, or a list of charset IDs.
8924
8925 If Nth element is nil, the byte code N is invalid in this
8926 coding system.
8927
8928 If Nth element is a number NUM, N is the first byte of a
8929 charset whose ID is NUM.
8930
8931 If Nth element is a list of charset IDs, N is the first byte
8932 of one of them. The list is sorted by dimensions of the
8933 charsets. A charset of smaller dimension comes firtst. */
8934 val = Fmake_vector (make_number (256), Qnil);
8935
8936 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8937 {
8938 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8939 int dim = CHARSET_DIMENSION (charset);
8940 int idx = (dim - 1) * 4;
8941
8942 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8943 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8944
8945 for (i = charset->code_space[idx];
8946 i <= charset->code_space[idx + 1]; i++)
8947 {
8948 Lisp_Object tmp, tmp2;
8949 int dim2;
8950
8951 tmp = AREF (val, i);
8952 if (NILP (tmp))
8953 tmp = XCAR (tail);
8954 else if (NUMBERP (tmp))
8955 {
8956 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8957 if (dim < dim2)
8958 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8959 else
8960 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8961 }
8962 else
8963 {
8964 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8965 {
8966 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8967 if (dim < dim2)
8968 break;
8969 }
8970 if (NILP (tmp2))
8971 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8972 else
8973 {
8974 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8975 XSETCAR (tmp2, XCAR (tail));
8976 }
8977 }
8978 ASET (val, i, tmp);
8979 }
8980 }
8981 ASET (attrs, coding_attr_charset_valids, val);
8982 category = coding_category_charset;
8983 }
8984 else if (EQ (coding_type, Qccl))
8985 {
8986 Lisp_Object valids;
8987
8988 if (nargs < coding_arg_ccl_max)
8989 goto short_args;
8990
8991 val = args[coding_arg_ccl_decoder];
8992 CHECK_CCL_PROGRAM (val);
8993 if (VECTORP (val))
8994 val = Fcopy_sequence (val);
8995 ASET (attrs, coding_attr_ccl_decoder, val);
8996
8997 val = args[coding_arg_ccl_encoder];
8998 CHECK_CCL_PROGRAM (val);
8999 if (VECTORP (val))
9000 val = Fcopy_sequence (val);
9001 ASET (attrs, coding_attr_ccl_encoder, val);
9002
9003 val = args[coding_arg_ccl_valids];
9004 valids = Fmake_string (make_number (256), make_number (0));
9005 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9006 {
9007 int from, to;
9008
9009 val = Fcar (tail);
9010 if (INTEGERP (val))
9011 {
9012 from = to = XINT (val);
9013 if (from < 0 || from > 255)
9014 args_out_of_range_3 (val, make_number (0), make_number (255));
9015 }
9016 else
9017 {
9018 CHECK_CONS (val);
9019 CHECK_NATNUM_CAR (val);
9020 CHECK_NATNUM_CDR (val);
9021 from = XINT (XCAR (val));
9022 if (from > 255)
9023 args_out_of_range_3 (XCAR (val),
9024 make_number (0), make_number (255));
9025 to = XINT (XCDR (val));
9026 if (to < from || to > 255)
9027 args_out_of_range_3 (XCDR (val),
9028 XCAR (val), make_number (255));
9029 }
9030 for (i = from; i <= to; i++)
9031 SSET (valids, i, 1);
9032 }
9033 ASET (attrs, coding_attr_ccl_valids, valids);
9034
9035 category = coding_category_ccl;
9036 }
9037 else if (EQ (coding_type, Qutf_16))
9038 {
9039 Lisp_Object bom, endian;
9040
9041 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9042
9043 if (nargs < coding_arg_utf16_max)
9044 goto short_args;
9045
9046 bom = args[coding_arg_utf16_bom];
9047 if (! NILP (bom) && ! EQ (bom, Qt))
9048 {
9049 CHECK_CONS (bom);
9050 val = XCAR (bom);
9051 CHECK_CODING_SYSTEM (val);
9052 val = XCDR (bom);
9053 CHECK_CODING_SYSTEM (val);
9054 }
9055 ASET (attrs, coding_attr_utf_16_bom, bom);
9056
9057 endian = args[coding_arg_utf16_endian];
9058 CHECK_SYMBOL (endian);
9059 if (NILP (endian))
9060 endian = Qbig;
9061 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9062 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9063 ASET (attrs, coding_attr_utf_16_endian, endian);
9064
9065 category = (CONSP (bom)
9066 ? coding_category_utf_16_auto
9067 : NILP (bom)
9068 ? (EQ (endian, Qbig)
9069 ? coding_category_utf_16_be_nosig
9070 : coding_category_utf_16_le_nosig)
9071 : (EQ (endian, Qbig)
9072 ? coding_category_utf_16_be
9073 : coding_category_utf_16_le));
9074 }
9075 else if (EQ (coding_type, Qiso_2022))
9076 {
9077 Lisp_Object initial, reg_usage, request, flags;
9078 int i;
9079
9080 if (nargs < coding_arg_iso2022_max)
9081 goto short_args;
9082
9083 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9084 CHECK_VECTOR (initial);
9085 for (i = 0; i < 4; i++)
9086 {
9087 val = Faref (initial, make_number (i));
9088 if (! NILP (val))
9089 {
9090 struct charset *charset;
9091
9092 CHECK_CHARSET_GET_CHARSET (val, charset);
9093 ASET (initial, i, make_number (CHARSET_ID (charset)));
9094 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9095 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9096 }
9097 else
9098 ASET (initial, i, make_number (-1));
9099 }
9100
9101 reg_usage = args[coding_arg_iso2022_reg_usage];
9102 CHECK_CONS (reg_usage);
9103 CHECK_NUMBER_CAR (reg_usage);
9104 CHECK_NUMBER_CDR (reg_usage);
9105
9106 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9107 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9108 {
9109 int id;
9110 Lisp_Object tmp;
9111
9112 val = Fcar (tail);
9113 CHECK_CONS (val);
9114 tmp = XCAR (val);
9115 CHECK_CHARSET_GET_ID (tmp, id);
9116 CHECK_NATNUM_CDR (val);
9117 if (XINT (XCDR (val)) >= 4)
9118 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9119 XSETCAR (val, make_number (id));
9120 }
9121
9122 flags = args[coding_arg_iso2022_flags];
9123 CHECK_NATNUM (flags);
9124 i = XINT (flags);
9125 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9126 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9127
9128 ASET (attrs, coding_attr_iso_initial, initial);
9129 ASET (attrs, coding_attr_iso_usage, reg_usage);
9130 ASET (attrs, coding_attr_iso_request, request);
9131 ASET (attrs, coding_attr_iso_flags, flags);
9132 setup_iso_safe_charsets (attrs);
9133
9134 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9135 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9136 | CODING_ISO_FLAG_SINGLE_SHIFT))
9137 ? coding_category_iso_7_else
9138 : EQ (args[coding_arg_charset_list], Qiso_2022)
9139 ? coding_category_iso_7
9140 : coding_category_iso_7_tight);
9141 else
9142 {
9143 int id = XINT (AREF (initial, 1));
9144
9145 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9146 || EQ (args[coding_arg_charset_list], Qiso_2022)
9147 || id < 0)
9148 ? coding_category_iso_8_else
9149 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9150 ? coding_category_iso_8_1
9151 : coding_category_iso_8_2);
9152 }
9153 if (category != coding_category_iso_8_1
9154 && category != coding_category_iso_8_2)
9155 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9156 }
9157 else if (EQ (coding_type, Qemacs_mule))
9158 {
9159 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9160 ASET (attrs, coding_attr_emacs_mule_full, Qt);
9161 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9162 category = coding_category_emacs_mule;
9163 }
9164 else if (EQ (coding_type, Qshift_jis))
9165 {
9166
9167 struct charset *charset;
9168
9169 if (XINT (Flength (charset_list)) != 3
9170 && XINT (Flength (charset_list)) != 4)
9171 error ("There should be three or four charsets");
9172
9173 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9174 if (CHARSET_DIMENSION (charset) != 1)
9175 error ("Dimension of charset %s is not one",
9176 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9177 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9178 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9179
9180 charset_list = XCDR (charset_list);
9181 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9182 if (CHARSET_DIMENSION (charset) != 1)
9183 error ("Dimension of charset %s is not one",
9184 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9185
9186 charset_list = XCDR (charset_list);
9187 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9188 if (CHARSET_DIMENSION (charset) != 2)
9189 error ("Dimension of charset %s is not two",
9190 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9191
9192 charset_list = XCDR (charset_list);
9193 if (! NILP (charset_list))
9194 {
9195 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9196 if (CHARSET_DIMENSION (charset) != 2)
9197 error ("Dimension of charset %s is not two",
9198 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9199 }
9200
9201 category = coding_category_sjis;
9202 Vsjis_coding_system = name;
9203 }
9204 else if (EQ (coding_type, Qbig5))
9205 {
9206 struct charset *charset;
9207
9208 if (XINT (Flength (charset_list)) != 2)
9209 error ("There should be just two charsets");
9210
9211 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9212 if (CHARSET_DIMENSION (charset) != 1)
9213 error ("Dimension of charset %s is not one",
9214 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9215 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9216 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9217
9218 charset_list = XCDR (charset_list);
9219 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9220 if (CHARSET_DIMENSION (charset) != 2)
9221 error ("Dimension of charset %s is not two",
9222 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9223
9224 category = coding_category_big5;
9225 Vbig5_coding_system = name;
9226 }
9227 else if (EQ (coding_type, Qraw_text))
9228 {
9229 category = coding_category_raw_text;
9230 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9231 }
9232 else if (EQ (coding_type, Qutf_8))
9233 {
9234 category = coding_category_utf_8;
9235 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9236 }
9237 else if (EQ (coding_type, Qundecided))
9238 category = coding_category_undecided;
9239 else
9240 error ("Invalid coding system type: %s",
9241 SDATA (SYMBOL_NAME (coding_type)));
9242
9243 CODING_ATTR_CATEGORY (attrs) = make_number (category);
9244 CODING_ATTR_PLIST (attrs)
9245 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9246 CODING_ATTR_PLIST (attrs)));
9247 CODING_ATTR_PLIST (attrs)
9248 = Fcons (QCascii_compatible_p,
9249 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9250 CODING_ATTR_PLIST (attrs)));
9251
9252 eol_type = args[coding_arg_eol_type];
9253 if (! NILP (eol_type)
9254 && ! EQ (eol_type, Qunix)
9255 && ! EQ (eol_type, Qdos)
9256 && ! EQ (eol_type, Qmac))
9257 error ("Invalid eol-type");
9258
9259 aliases = Fcons (name, Qnil);
9260
9261 if (NILP (eol_type))
9262 {
9263 eol_type = make_subsidiaries (name);
9264 for (i = 0; i < 3; i++)
9265 {
9266 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9267
9268 this_name = AREF (eol_type, i);
9269 this_aliases = Fcons (this_name, Qnil);
9270 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9271 this_spec = Fmake_vector (make_number (3), attrs);
9272 ASET (this_spec, 1, this_aliases);
9273 ASET (this_spec, 2, this_eol_type);
9274 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9275 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9276 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9277 if (NILP (val))
9278 Vcoding_system_alist
9279 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9280 Vcoding_system_alist);
9281 }
9282 }
9283
9284 spec_vec = Fmake_vector (make_number (3), attrs);
9285 ASET (spec_vec, 1, aliases);
9286 ASET (spec_vec, 2, eol_type);
9287
9288 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9289 Vcoding_system_list = Fcons (name, Vcoding_system_list);
9290 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9291 if (NILP (val))
9292 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9293 Vcoding_system_alist);
9294
9295 {
9296 int id = coding_categories[category].id;
9297
9298 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9299 setup_coding_system (name, &coding_categories[category]);
9300 }
9301
9302 return Qnil;
9303
9304 short_args:
9305 return Fsignal (Qwrong_number_of_arguments,
9306 Fcons (intern ("define-coding-system-internal"),
9307 make_number (nargs)));
9308 }
9309
9310
9311 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9312 3, 3, 0,
9313 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9314 (coding_system, prop, val)
9315 Lisp_Object coding_system, prop, val;
9316 {
9317 Lisp_Object spec, attrs;
9318
9319 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9320 attrs = AREF (spec, 0);
9321 if (EQ (prop, QCmnemonic))
9322 {
9323 if (! STRINGP (val))
9324 CHECK_CHARACTER (val);
9325 CODING_ATTR_MNEMONIC (attrs) = val;
9326 }
9327 else if (EQ (prop, QCdefalut_char))
9328 {
9329 if (NILP (val))
9330 val = make_number (' ');
9331 else
9332 CHECK_CHARACTER (val);
9333 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9334 }
9335 else if (EQ (prop, QCdecode_translation_table))
9336 {
9337 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9338 CHECK_SYMBOL (val);
9339 CODING_ATTR_DECODE_TBL (attrs) = val;
9340 }
9341 else if (EQ (prop, QCencode_translation_table))
9342 {
9343 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9344 CHECK_SYMBOL (val);
9345 CODING_ATTR_ENCODE_TBL (attrs) = val;
9346 }
9347 else if (EQ (prop, QCpost_read_conversion))
9348 {
9349 CHECK_SYMBOL (val);
9350 CODING_ATTR_POST_READ (attrs) = val;
9351 }
9352 else if (EQ (prop, QCpre_write_conversion))
9353 {
9354 CHECK_SYMBOL (val);
9355 CODING_ATTR_PRE_WRITE (attrs) = val;
9356 }
9357 else if (EQ (prop, QCascii_compatible_p))
9358 {
9359 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9360 }
9361
9362 CODING_ATTR_PLIST (attrs)
9363 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9364 return val;
9365 }
9366
9367
9368 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9369 Sdefine_coding_system_alias, 2, 2, 0,
9370 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9371 (alias, coding_system)
9372 Lisp_Object alias, coding_system;
9373 {
9374 Lisp_Object spec, aliases, eol_type, val;
9375
9376 CHECK_SYMBOL (alias);
9377 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9378 aliases = AREF (spec, 1);
9379 /* ALISES should be a list of length more than zero, and the first
9380 element is a base coding system. Append ALIAS at the tail of the
9381 list. */
9382 while (!NILP (XCDR (aliases)))
9383 aliases = XCDR (aliases);
9384 XSETCDR (aliases, Fcons (alias, Qnil));
9385
9386 eol_type = AREF (spec, 2);
9387 if (VECTORP (eol_type))
9388 {
9389 Lisp_Object subsidiaries;
9390 int i;
9391
9392 subsidiaries = make_subsidiaries (alias);
9393 for (i = 0; i < 3; i++)
9394 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9395 AREF (eol_type, i));
9396 }
9397
9398 Fputhash (alias, spec, Vcoding_system_hash_table);
9399 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9400 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9401 if (NILP (val))
9402 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9403 Vcoding_system_alist);
9404
9405 return Qnil;
9406 }
9407
9408 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9409 1, 1, 0,
9410 doc: /* Return the base of CODING-SYSTEM.
9411 Any alias or subsidiary coding system is not a base coding system. */)
9412 (coding_system)
9413 Lisp_Object coding_system;
9414 {
9415 Lisp_Object spec, attrs;
9416
9417 if (NILP (coding_system))
9418 return (Qno_conversion);
9419 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9420 attrs = AREF (spec, 0);
9421 return CODING_ATTR_BASE_NAME (attrs);
9422 }
9423
9424 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9425 1, 1, 0,
9426 doc: "Return the property list of CODING-SYSTEM.")
9427 (coding_system)
9428 Lisp_Object coding_system;
9429 {
9430 Lisp_Object spec, attrs;
9431
9432 if (NILP (coding_system))
9433 coding_system = Qno_conversion;
9434 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9435 attrs = AREF (spec, 0);
9436 return CODING_ATTR_PLIST (attrs);
9437 }
9438
9439
9440 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9441 1, 1, 0,
9442 doc: /* Return the list of aliases of CODING-SYSTEM. */)
9443 (coding_system)
9444 Lisp_Object coding_system;
9445 {
9446 Lisp_Object spec;
9447
9448 if (NILP (coding_system))
9449 coding_system = Qno_conversion;
9450 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9451 return AREF (spec, 1);
9452 }
9453
9454 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9455 Scoding_system_eol_type, 1, 1, 0,
9456 doc: /* Return eol-type of CODING-SYSTEM.
9457 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9458
9459 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9460 and CR respectively.
9461
9462 A vector value indicates that a format of end-of-line should be
9463 detected automatically. Nth element of the vector is the subsidiary
9464 coding system whose eol-type is N. */)
9465 (coding_system)
9466 Lisp_Object coding_system;
9467 {
9468 Lisp_Object spec, eol_type;
9469 int n;
9470
9471 if (NILP (coding_system))
9472 coding_system = Qno_conversion;
9473 if (! CODING_SYSTEM_P (coding_system))
9474 return Qnil;
9475 spec = CODING_SYSTEM_SPEC (coding_system);
9476 eol_type = AREF (spec, 2);
9477 if (VECTORP (eol_type))
9478 return Fcopy_sequence (eol_type);
9479 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9480 return make_number (n);
9481 }
9482
9483 #endif /* emacs */
9484
9485 \f
9486 /*** 9. Post-amble ***/
9487
9488 void
9489 init_coding_once ()
9490 {
9491 int i;
9492
9493 for (i = 0; i < coding_category_max; i++)
9494 {
9495 coding_categories[i].id = -1;
9496 coding_priorities[i] = i;
9497 }
9498
9499 /* ISO2022 specific initialize routine. */
9500 for (i = 0; i < 0x20; i++)
9501 iso_code_class[i] = ISO_control_0;
9502 for (i = 0x21; i < 0x7F; i++)
9503 iso_code_class[i] = ISO_graphic_plane_0;
9504 for (i = 0x80; i < 0xA0; i++)
9505 iso_code_class[i] = ISO_control_1;
9506 for (i = 0xA1; i < 0xFF; i++)
9507 iso_code_class[i] = ISO_graphic_plane_1;
9508 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9509 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9510 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9511 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9512 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9513 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9514 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9515 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9516 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9517
9518 for (i = 0; i < 256; i++)
9519 {
9520 emacs_mule_bytes[i] = 1;
9521 }
9522 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9523 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9524 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9525 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9526 }
9527
9528 #ifdef emacs
9529
9530 void
9531 syms_of_coding ()
9532 {
9533 staticpro (&Vcoding_system_hash_table);
9534 {
9535 Lisp_Object args[2];
9536 args[0] = QCtest;
9537 args[1] = Qeq;
9538 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9539 }
9540
9541 staticpro (&Vsjis_coding_system);
9542 Vsjis_coding_system = Qnil;
9543
9544 staticpro (&Vbig5_coding_system);
9545 Vbig5_coding_system = Qnil;
9546
9547 staticpro (&Vcode_conversion_reused_workbuf);
9548 Vcode_conversion_reused_workbuf = Qnil;
9549
9550 staticpro (&Vcode_conversion_workbuf_name);
9551 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9552
9553 reused_workbuf_in_use = 0;
9554
9555 DEFSYM (Qcharset, "charset");
9556 DEFSYM (Qtarget_idx, "target-idx");
9557 DEFSYM (Qcoding_system_history, "coding-system-history");
9558 Fset (Qcoding_system_history, Qnil);
9559
9560 /* Target FILENAME is the first argument. */
9561 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9562 /* Target FILENAME is the third argument. */
9563 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9564
9565 DEFSYM (Qcall_process, "call-process");
9566 /* Target PROGRAM is the first argument. */
9567 Fput (Qcall_process, Qtarget_idx, make_number (0));
9568
9569 DEFSYM (Qcall_process_region, "call-process-region");
9570 /* Target PROGRAM is the third argument. */
9571 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9572
9573 DEFSYM (Qstart_process, "start-process");
9574 /* Target PROGRAM is the third argument. */
9575 Fput (Qstart_process, Qtarget_idx, make_number (2));
9576
9577 DEFSYM (Qopen_network_stream, "open-network-stream");
9578 /* Target SERVICE is the fourth argument. */
9579 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9580
9581 DEFSYM (Qcoding_system, "coding-system");
9582 DEFSYM (Qcoding_aliases, "coding-aliases");
9583
9584 DEFSYM (Qeol_type, "eol-type");
9585 DEFSYM (Qunix, "unix");
9586 DEFSYM (Qdos, "dos");
9587
9588 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9589 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9590 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9591 DEFSYM (Qdefault_char, "default-char");
9592 DEFSYM (Qundecided, "undecided");
9593 DEFSYM (Qno_conversion, "no-conversion");
9594 DEFSYM (Qraw_text, "raw-text");
9595
9596 DEFSYM (Qiso_2022, "iso-2022");
9597
9598 DEFSYM (Qutf_8, "utf-8");
9599 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9600
9601 DEFSYM (Qutf_16, "utf-16");
9602 DEFSYM (Qbig, "big");
9603 DEFSYM (Qlittle, "little");
9604
9605 DEFSYM (Qshift_jis, "shift-jis");
9606 DEFSYM (Qbig5, "big5");
9607
9608 DEFSYM (Qcoding_system_p, "coding-system-p");
9609
9610 DEFSYM (Qcoding_system_error, "coding-system-error");
9611 Fput (Qcoding_system_error, Qerror_conditions,
9612 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9613 Fput (Qcoding_system_error, Qerror_message,
9614 build_string ("Invalid coding system"));
9615
9616 /* Intern this now in case it isn't already done.
9617 Setting this variable twice is harmless.
9618 But don't staticpro it here--that is done in alloc.c. */
9619 Qchar_table_extra_slots = intern ("char-table-extra-slots");
9620
9621 DEFSYM (Qtranslation_table, "translation-table");
9622 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9623 DEFSYM (Qtranslation_table_id, "translation-table-id");
9624 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9625 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9626
9627 DEFSYM (Qvalid_codes, "valid-codes");
9628
9629 DEFSYM (Qemacs_mule, "emacs-mule");
9630
9631 DEFSYM (QCcategory, ":category");
9632 DEFSYM (QCmnemonic, ":mnemonic");
9633 DEFSYM (QCdefalut_char, ":default-char");
9634 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9635 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9636 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9637 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9638 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9639
9640 Vcoding_category_table
9641 = Fmake_vector (make_number (coding_category_max), Qnil);
9642 staticpro (&Vcoding_category_table);
9643 /* Followings are target of code detection. */
9644 ASET (Vcoding_category_table, coding_category_iso_7,
9645 intern ("coding-category-iso-7"));
9646 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9647 intern ("coding-category-iso-7-tight"));
9648 ASET (Vcoding_category_table, coding_category_iso_8_1,
9649 intern ("coding-category-iso-8-1"));
9650 ASET (Vcoding_category_table, coding_category_iso_8_2,
9651 intern ("coding-category-iso-8-2"));
9652 ASET (Vcoding_category_table, coding_category_iso_7_else,
9653 intern ("coding-category-iso-7-else"));
9654 ASET (Vcoding_category_table, coding_category_iso_8_else,
9655 intern ("coding-category-iso-8-else"));
9656 ASET (Vcoding_category_table, coding_category_utf_8,
9657 intern ("coding-category-utf-8"));
9658 ASET (Vcoding_category_table, coding_category_utf_16_be,
9659 intern ("coding-category-utf-16-be"));
9660 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9661 intern ("coding-category-utf-16-auto"));
9662 ASET (Vcoding_category_table, coding_category_utf_16_le,
9663 intern ("coding-category-utf-16-le"));
9664 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9665 intern ("coding-category-utf-16-be-nosig"));
9666 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9667 intern ("coding-category-utf-16-le-nosig"));
9668 ASET (Vcoding_category_table, coding_category_charset,
9669 intern ("coding-category-charset"));
9670 ASET (Vcoding_category_table, coding_category_sjis,
9671 intern ("coding-category-sjis"));
9672 ASET (Vcoding_category_table, coding_category_big5,
9673 intern ("coding-category-big5"));
9674 ASET (Vcoding_category_table, coding_category_ccl,
9675 intern ("coding-category-ccl"));
9676 ASET (Vcoding_category_table, coding_category_emacs_mule,
9677 intern ("coding-category-emacs-mule"));
9678 /* Followings are NOT target of code detection. */
9679 ASET (Vcoding_category_table, coding_category_raw_text,
9680 intern ("coding-category-raw-text"));
9681 ASET (Vcoding_category_table, coding_category_undecided,
9682 intern ("coding-category-undecided"));
9683
9684 DEFSYM (Qinsufficient_source, "insufficient-source");
9685 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9686 DEFSYM (Qinvalid_source, "invalid-source");
9687 DEFSYM (Qinterrupted, "interrupted");
9688 DEFSYM (Qinsufficient_memory, "insufficient-memory");
9689 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9690
9691 defsubr (&Scoding_system_p);
9692 defsubr (&Sread_coding_system);
9693 defsubr (&Sread_non_nil_coding_system);
9694 defsubr (&Scheck_coding_system);
9695 defsubr (&Sdetect_coding_region);
9696 defsubr (&Sdetect_coding_string);
9697 defsubr (&Sfind_coding_systems_region_internal);
9698 defsubr (&Sunencodable_char_position);
9699 defsubr (&Scheck_coding_systems_region);
9700 defsubr (&Sdecode_coding_region);
9701 defsubr (&Sencode_coding_region);
9702 defsubr (&Sdecode_coding_string);
9703 defsubr (&Sencode_coding_string);
9704 defsubr (&Sdecode_sjis_char);
9705 defsubr (&Sencode_sjis_char);
9706 defsubr (&Sdecode_big5_char);
9707 defsubr (&Sencode_big5_char);
9708 defsubr (&Sset_terminal_coding_system_internal);
9709 defsubr (&Sset_safe_terminal_coding_system_internal);
9710 defsubr (&Sterminal_coding_system);
9711 defsubr (&Sset_keyboard_coding_system_internal);
9712 defsubr (&Skeyboard_coding_system);
9713 defsubr (&Sfind_operation_coding_system);
9714 defsubr (&Sset_coding_system_priority);
9715 defsubr (&Sdefine_coding_system_internal);
9716 defsubr (&Sdefine_coding_system_alias);
9717 defsubr (&Scoding_system_put);
9718 defsubr (&Scoding_system_base);
9719 defsubr (&Scoding_system_plist);
9720 defsubr (&Scoding_system_aliases);
9721 defsubr (&Scoding_system_eol_type);
9722 defsubr (&Scoding_system_priority_list);
9723
9724 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9725 doc: /* List of coding systems.
9726
9727 Do not alter the value of this variable manually. This variable should be
9728 updated by the functions `define-coding-system' and
9729 `define-coding-system-alias'. */);
9730 Vcoding_system_list = Qnil;
9731
9732 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9733 doc: /* Alist of coding system names.
9734 Each element is one element list of coding system name.
9735 This variable is given to `completing-read' as TABLE argument.
9736
9737 Do not alter the value of this variable manually. This variable should be
9738 updated by the functions `make-coding-system' and
9739 `define-coding-system-alias'. */);
9740 Vcoding_system_alist = Qnil;
9741
9742 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9743 doc: /* List of coding-categories (symbols) ordered by priority.
9744
9745 On detecting a coding system, Emacs tries code detection algorithms
9746 associated with each coding-category one by one in this order. When
9747 one algorithm agrees with a byte sequence of source text, the coding
9748 system bound to the corresponding coding-category is selected.
9749
9750 Don't modify this variable directly, but use `set-coding-priority'. */);
9751 {
9752 int i;
9753
9754 Vcoding_category_list = Qnil;
9755 for (i = coding_category_max - 1; i >= 0; i--)
9756 Vcoding_category_list
9757 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9758 Vcoding_category_list);
9759 }
9760
9761 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9762 doc: /* Specify the coding system for read operations.
9763 It is useful to bind this variable with `let', but do not set it globally.
9764 If the value is a coding system, it is used for decoding on read operation.
9765 If not, an appropriate element is used from one of the coding system alists:
9766 There are three such tables, `file-coding-system-alist',
9767 `process-coding-system-alist', and `network-coding-system-alist'. */);
9768 Vcoding_system_for_read = Qnil;
9769
9770 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9771 doc: /* Specify the coding system for write operations.
9772 Programs bind this variable with `let', but you should not set it globally.
9773 If the value is a coding system, it is used for encoding of output,
9774 when writing it to a file and when sending it to a file or subprocess.
9775
9776 If this does not specify a coding system, an appropriate element
9777 is used from one of the coding system alists:
9778 There are three such tables, `file-coding-system-alist',
9779 `process-coding-system-alist', and `network-coding-system-alist'.
9780 For output to files, if the above procedure does not specify a coding system,
9781 the value of `buffer-file-coding-system' is used. */);
9782 Vcoding_system_for_write = Qnil;
9783
9784 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9785 doc: /*
9786 Coding system used in the latest file or process I/O. */);
9787 Vlast_coding_system_used = Qnil;
9788
9789 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9790 doc: /*
9791 Error status of the last code conversion.
9792
9793 When an error was detected in the last code conversion, this variable
9794 is set to one of the following symbols.
9795 `insufficient-source'
9796 `inconsistent-eol'
9797 `invalid-source'
9798 `interrupted'
9799 `insufficient-memory'
9800 When no error was detected, the value doesn't change. So, to check
9801 the error status of a code conversion by this variable, you must
9802 explicitly set this variable to nil before performing code
9803 conversion. */);
9804 Vlast_code_conversion_error = Qnil;
9805
9806 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9807 doc: /*
9808 *Non-nil means always inhibit code conversion of end-of-line format.
9809 See info node `Coding Systems' and info node `Text and Binary' concerning
9810 such conversion. */);
9811 inhibit_eol_conversion = 0;
9812
9813 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9814 doc: /*
9815 Non-nil means process buffer inherits coding system of process output.
9816 Bind it to t if the process output is to be treated as if it were a file
9817 read from some filesystem. */);
9818 inherit_process_coding_system = 0;
9819
9820 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9821 doc: /*
9822 Alist to decide a coding system to use for a file I/O operation.
9823 The format is ((PATTERN . VAL) ...),
9824 where PATTERN is a regular expression matching a file name,
9825 VAL is a coding system, a cons of coding systems, or a function symbol.
9826 If VAL is a coding system, it is used for both decoding and encoding
9827 the file contents.
9828 If VAL is a cons of coding systems, the car part is used for decoding,
9829 and the cdr part is used for encoding.
9830 If VAL is a function symbol, the function must return a coding system
9831 or a cons of coding systems which are used as above. The function is
9832 called with an argument that is a list of the arguments with which
9833 `find-operation-coding-system' was called. If the function can't decide
9834 a coding system, it can return `undecided' so that the normal
9835 code-detection is performed.
9836
9837 See also the function `find-operation-coding-system'
9838 and the variable `auto-coding-alist'. */);
9839 Vfile_coding_system_alist = Qnil;
9840
9841 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9842 doc: /*
9843 Alist to decide a coding system to use for a process I/O operation.
9844 The format is ((PATTERN . VAL) ...),
9845 where PATTERN is a regular expression matching a program name,
9846 VAL is a coding system, a cons of coding systems, or a function symbol.
9847 If VAL is a coding system, it is used for both decoding what received
9848 from the program and encoding what sent to the program.
9849 If VAL is a cons of coding systems, the car part is used for decoding,
9850 and the cdr part is used for encoding.
9851 If VAL is a function symbol, the function must return a coding system
9852 or a cons of coding systems which are used as above.
9853
9854 See also the function `find-operation-coding-system'. */);
9855 Vprocess_coding_system_alist = Qnil;
9856
9857 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9858 doc: /*
9859 Alist to decide a coding system to use for a network I/O operation.
9860 The format is ((PATTERN . VAL) ...),
9861 where PATTERN is a regular expression matching a network service name
9862 or is a port number to connect to,
9863 VAL is a coding system, a cons of coding systems, or a function symbol.
9864 If VAL is a coding system, it is used for both decoding what received
9865 from the network stream and encoding what sent to the network stream.
9866 If VAL is a cons of coding systems, the car part is used for decoding,
9867 and the cdr part is used for encoding.
9868 If VAL is a function symbol, the function must return a coding system
9869 or a cons of coding systems which are used as above.
9870
9871 See also the function `find-operation-coding-system'. */);
9872 Vnetwork_coding_system_alist = Qnil;
9873
9874 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9875 doc: /* Coding system to use with system messages.
9876 Also used for decoding keyboard input on X Window system. */);
9877 Vlocale_coding_system = Qnil;
9878
9879 /* The eol mnemonics are reset in startup.el system-dependently. */
9880 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9881 doc: /*
9882 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9883 eol_mnemonic_unix = build_string (":");
9884
9885 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9886 doc: /*
9887 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9888 eol_mnemonic_dos = build_string ("\\");
9889
9890 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9891 doc: /*
9892 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9893 eol_mnemonic_mac = build_string ("/");
9894
9895 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9896 doc: /*
9897 *String displayed in mode line when end-of-line format is not yet determined. */);
9898 eol_mnemonic_undecided = build_string (":");
9899
9900 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9901 doc: /*
9902 *Non-nil enables character translation while encoding and decoding. */);
9903 Venable_character_translation = Qt;
9904
9905 DEFVAR_LISP ("standard-translation-table-for-decode",
9906 &Vstandard_translation_table_for_decode,
9907 doc: /* Table for translating characters while decoding. */);
9908 Vstandard_translation_table_for_decode = Qnil;
9909
9910 DEFVAR_LISP ("standard-translation-table-for-encode",
9911 &Vstandard_translation_table_for_encode,
9912 doc: /* Table for translating characters while encoding. */);
9913 Vstandard_translation_table_for_encode = Qnil;
9914
9915 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9916 doc: /* Alist of charsets vs revision numbers.
9917 While encoding, if a charset (car part of an element) is found,
9918 designate it with the escape sequence identifying revision (cdr part
9919 of the element). */);
9920 Vcharset_revision_table = Qnil;
9921
9922 DEFVAR_LISP ("default-process-coding-system",
9923 &Vdefault_process_coding_system,
9924 doc: /* Cons of coding systems used for process I/O by default.
9925 The car part is used for decoding a process output,
9926 the cdr part is used for encoding a text to be sent to a process. */);
9927 Vdefault_process_coding_system = Qnil;
9928
9929 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9930 doc: /*
9931 Table of extra Latin codes in the range 128..159 (inclusive).
9932 This is a vector of length 256.
9933 If Nth element is non-nil, the existence of code N in a file
9934 \(or output of subprocess) doesn't prevent it to be detected as
9935 a coding system of ISO 2022 variant which has a flag
9936 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9937 or reading output of a subprocess.
9938 Only 128th through 159th elements has a meaning. */);
9939 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9940
9941 DEFVAR_LISP ("select-safe-coding-system-function",
9942 &Vselect_safe_coding_system_function,
9943 doc: /*
9944 Function to call to select safe coding system for encoding a text.
9945
9946 If set, this function is called to force a user to select a proper
9947 coding system which can encode the text in the case that a default
9948 coding system used in each operation can't encode the text. The
9949 function should take care that the buffer is not modified while
9950 the coding system is being selected.
9951
9952 The default value is `select-safe-coding-system' (which see). */);
9953 Vselect_safe_coding_system_function = Qnil;
9954
9955 DEFVAR_BOOL ("coding-system-require-warning",
9956 &coding_system_require_warning,
9957 doc: /* Internal use only.
9958 If non-nil, on writing a file, `select-safe-coding-system-function' is
9959 called even if `coding-system-for-write' is non-nil. The command
9960 `universal-coding-system-argument' binds this variable to t temporarily. */);
9961 coding_system_require_warning = 0;
9962
9963
9964 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9965 &inhibit_iso_escape_detection,
9966 doc: /*
9967 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9968
9969 By default, on reading a file, Emacs tries to detect how the text is
9970 encoded. This code detection is sensitive to escape sequences. If
9971 the sequence is valid as ISO2022, the code is determined as one of
9972 the ISO2022 encodings, and the file is decoded by the corresponding
9973 coding system (e.g. `iso-2022-7bit').
9974
9975 However, there may be a case that you want to read escape sequences in
9976 a file as is. In such a case, you can set this variable to non-nil.
9977 Then, as the code detection ignores any escape sequences, no file is
9978 detected as encoded in some ISO2022 encoding. The result is that all
9979 escape sequences become visible in a buffer.
9980
9981 The default value is nil, and it is strongly recommended not to change
9982 it. That is because many Emacs Lisp source files that contain
9983 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9984 in Emacs's distribution, and they won't be decoded correctly on
9985 reading if you suppress escape sequence detection.
9986
9987 The other way to read escape sequences in a file without decoding is
9988 to explicitly specify some coding system that doesn't use ISO2022's
9989 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9990 inhibit_iso_escape_detection = 0;
9991
9992 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9993 doc: /* Char table for translating self-inserting characters.
9994 This is applied to the result of input methods, not their input. See also
9995 `keyboard-translate-table'. */);
9996 Vtranslation_table_for_input = Qnil;
9997
9998 {
9999 Lisp_Object args[coding_arg_max];
10000 Lisp_Object plist[16];
10001 int i;
10002
10003 for (i = 0; i < coding_arg_max; i++)
10004 args[i] = Qnil;
10005
10006 plist[0] = intern (":name");
10007 plist[1] = args[coding_arg_name] = Qno_conversion;
10008 plist[2] = intern (":mnemonic");
10009 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10010 plist[4] = intern (":coding-type");
10011 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10012 plist[6] = intern (":ascii-compatible-p");
10013 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10014 plist[8] = intern (":default-char");
10015 plist[9] = args[coding_arg_default_char] = make_number (0);
10016 plist[10] = intern (":for-unibyte");
10017 plist[11] = args[coding_arg_for_unibyte] = Qt;
10018 plist[12] = intern (":docstring");
10019 plist[13] = build_string ("Do no conversion.\n\
10020 \n\
10021 When you visit a file with this coding, the file is read into a\n\
10022 unibyte buffer as is, thus each byte of a file is treated as a\n\
10023 character.");
10024 plist[14] = intern (":eol-type");
10025 plist[15] = args[coding_arg_eol_type] = Qunix;
10026 args[coding_arg_plist] = Flist (16, plist);
10027 Fdefine_coding_system_internal (coding_arg_max, args);
10028
10029 plist[1] = args[coding_arg_name] = Qundecided;
10030 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10031 plist[5] = args[coding_arg_coding_type] = Qundecided;
10032 /* This is already set.
10033 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10034 plist[8] = intern (":charset-list");
10035 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10036 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10037 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10038 plist[15] = args[coding_arg_eol_type] = Qnil;
10039 args[coding_arg_plist] = Flist (16, plist);
10040 Fdefine_coding_system_internal (coding_arg_max, args);
10041 }
10042
10043 setup_coding_system (Qno_conversion, &safe_terminal_coding);
10044
10045 {
10046 int i;
10047
10048 for (i = 0; i < coding_category_max; i++)
10049 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10050 }
10051 #if defined (MSDOS) || defined (WINDOWSNT)
10052 system_eol_type = Qdos;
10053 #else
10054 system_eol_type = Qunix;
10055 #endif
10056 staticpro (&system_eol_type);
10057 }
10058
10059 char *
10060 emacs_strerror (error_number)
10061 int error_number;
10062 {
10063 char *str;
10064
10065 synchronize_system_messages_locale ();
10066 str = strerror (error_number);
10067
10068 if (! NILP (Vlocale_coding_system))
10069 {
10070 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10071 Vlocale_coding_system,
10072 0);
10073 str = (char *) SDATA (dec);
10074 }
10075
10076 return str;
10077 }
10078
10079 #endif /* emacs */
10080
10081 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10082 (do not change this comment) */