(Fdefine_coding_system_internal): Fix category setting
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwize (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
151
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specifies each bit of this integer.
154
155 Below is the template of these functions. */
156
157 #if 0
158 static int
159 detect_coding_XXX (coding, mask)
160 struct coding_system *coding;
161 int *mask;
162 {
163 unsigned char *src = coding->source;
164 unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int c;
167 int found = 0;
168 ...;
169
170 while (1)
171 {
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
175 /* Check if it conforms to XXX. If not, break the loop. */
176 }
177 /* As the data is invalid for XXX, reset a proper bits. */
178 *mask &= ~CODING_CATEGORY_XXX;
179 return 0;
180 no_more_source:
181 /* The source exausted. */
182 if (!found)
183 /* ASCII characters only. */
184 return 0;
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask &= CODING_CATEGORY_XXX;
187 return 1;
188 }
189 #endif
190
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
197
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
202
203 Below is the template of these functions. */
204
205 #if 0
206 static void
207 decode_coding_XXXX (coding)
208 struct coding_system *coding;
209 {
210 unsigned char *src = coding->source + coding->consumed;
211 unsigned char *src_end = coding->source + coding->src_bytes;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base;
216 /* A buffer to produce decoded characters. */
217 int *charbuf = coding->charbuf;
218 int *charbuf_end = charbuf + coding->charbuf_size;
219 int multibytep = coding->src_multibyte;
220
221 while (1)
222 {
223 src_base = src;
224 if (charbuf < charbuf_end)
225 /* No more room to produce a decoded character. */
226 break;
227 ONE_MORE_BYTE (c);
228 /* Decode it. */
229 }
230
231 no_more_source:
232 if (src_base < src_end
233 && coding->mode & CODING_MODE_LAST_BLOCK)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base < src_end && charbuf < charbuf_end)
237 *charbuf++ = *src_base++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding->consumed = coding->consumed_char = src_base - coding->source;
241 /* Remember how many characters we produced. */
242 coding->charbuf_used = charbuf - coding->charbuf;
243 }
244 #endif
245
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
252
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
257
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
261
262 Below is a template of these functions. */
263 #if 0
264 static void
265 encode_coding_XXX (coding)
266 struct coding_system *coding;
267 {
268 int multibytep = coding->dst_multibyte;
269 int *charbuf = coding->charbuf;
270 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
271 unsigned char *dst = coding->destination + coding->produced;
272 unsigned char *dst_end = coding->destination + coding->dst_bytes;
273 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
274 int produced_chars = 0;
275
276 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 {
278 int c = *charbuf;
279 /* Encode C into DST, and increment DST. */
280 }
281 label_no_more_destination:
282 /* How many chars and bytes we produced. */
283 coding->produced_char += produced_chars;
284 coding->produced = dst - coding->destination;
285 }
286 #endif
287
288 \f
289 /*** 1. Preamble ***/
290
291 #include <config.h>
292 #include <stdio.h>
293
294 #include "lisp.h"
295 #include "buffer.h"
296 #include "character.h"
297 #include "charset.h"
298 #include "ccl.h"
299 #include "composite.h"
300 #include "coding.h"
301 #include "window.h"
302
303 Lisp_Object Vcoding_system_hash_table;
304
305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
306 Lisp_Object Qunix, Qdos;
307 extern Lisp_Object Qmac; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system;
309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
310 Lisp_Object Qdefault_char;
311 Lisp_Object Qno_conversion, Qundecided;
312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
313 Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
314 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
315 Lisp_Object Qcoding_system_history;
316 Lisp_Object Qvalid_codes;
317
318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
320 Lisp_Object Qstart_process, Qopen_network_stream;
321 Lisp_Object Qtarget_idx;
322
323 Lisp_Object Vselect_safe_coding_system_function;
324
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
327 /* Mnemonic string to indicate format of end-of-line is not yet
328 decided. */
329 Lisp_Object eol_mnemonic_undecided;
330
331 #ifdef emacs
332
333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
334
335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
336
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule, Qraw_text;
340
341 /* Coding-systems are handed between Emacs Lisp programs and C internal
342 routines by the following three variables. */
343 /* Coding-system for reading files and receiving data from process. */
344 Lisp_Object Vcoding_system_for_read;
345 /* Coding-system for writing files and sending data to process. */
346 Lisp_Object Vcoding_system_for_write;
347 /* Coding-system actually used in the latest I/O. */
348 Lisp_Object Vlast_coding_system_used;
349
350 /* A vector of length 256 which contains information about special
351 Latin codes (especially for dealing with Microsoft codes). */
352 Lisp_Object Vlatin_extra_code_table;
353
354 /* Flag to inhibit code conversion of end-of-line format. */
355 int inhibit_eol_conversion;
356
357 /* Flag to inhibit ISO2022 escape sequence detection. */
358 int inhibit_iso_escape_detection;
359
360 /* Flag to make buffer-file-coding-system inherit from process-coding. */
361 int inherit_process_coding_system;
362
363 /* Coding system to be used to encode text for terminal display. */
364 struct coding_system terminal_coding;
365
366 /* Coding system to be used to encode text for terminal display when
367 terminal coding system is nil. */
368 struct coding_system safe_terminal_coding;
369
370 /* Coding system of what is sent from terminal keyboard. */
371 struct coding_system keyboard_coding;
372
373 Lisp_Object Vfile_coding_system_alist;
374 Lisp_Object Vprocess_coding_system_alist;
375 Lisp_Object Vnetwork_coding_system_alist;
376
377 Lisp_Object Vlocale_coding_system;
378
379 #endif /* emacs */
380
381 /* Flag to tell if we look up translation table on character code
382 conversion. */
383 Lisp_Object Venable_character_translation;
384 /* Standard translation table to look up on decoding (reading). */
385 Lisp_Object Vstandard_translation_table_for_decode;
386 /* Standard translation table to look up on encoding (writing). */
387 Lisp_Object Vstandard_translation_table_for_encode;
388
389 Lisp_Object Qtranslation_table;
390 Lisp_Object Qtranslation_table_id;
391 Lisp_Object Qtranslation_table_for_decode;
392 Lisp_Object Qtranslation_table_for_encode;
393
394 /* Alist of charsets vs revision number. */
395 static Lisp_Object Vcharset_revision_table;
396
397 /* Default coding systems used for process I/O. */
398 Lisp_Object Vdefault_process_coding_system;
399
400 /* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404 static int inhibit_pre_post_conversion;
405
406 /* Two special coding systems. */
407 Lisp_Object Vsjis_coding_system;
408 Lisp_Object Vbig5_coding_system;
409
410
411 static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
412 static void decode_coding_utf_8 P_ ((struct coding_system *));
413 static int encode_coding_utf_8 P_ ((struct coding_system *));
414
415 static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
416 static void decode_coding_utf_16 P_ ((struct coding_system *));
417 static int encode_coding_utf_16 P_ ((struct coding_system *));
418
419 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
420 static void decode_coding_iso_2022 P_ ((struct coding_system *));
421 static int encode_coding_iso_2022 P_ ((struct coding_system *));
422
423 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
424 static void decode_coding_emacs_mule P_ ((struct coding_system *));
425 static int encode_coding_emacs_mule P_ ((struct coding_system *));
426
427 static int detect_coding_sjis P_ ((struct coding_system *, int *));
428 static void decode_coding_sjis P_ ((struct coding_system *));
429 static int encode_coding_sjis P_ ((struct coding_system *));
430
431 static int detect_coding_big5 P_ ((struct coding_system *, int *));
432 static void decode_coding_big5 P_ ((struct coding_system *));
433 static int encode_coding_big5 P_ ((struct coding_system *));
434
435 static int detect_coding_ccl P_ ((struct coding_system *, int *));
436 static void decode_coding_ccl P_ ((struct coding_system *));
437 static int encode_coding_ccl P_ ((struct coding_system *));
438
439 static void decode_coding_raw_text P_ ((struct coding_system *));
440 static int encode_coding_raw_text P_ ((struct coding_system *));
441
442
443 /* ISO2022 section */
444
445 #define CODING_ISO_INITIAL(coding, reg) \
446 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
447 coding_attr_iso_initial), \
448 reg)))
449
450
451 #define CODING_ISO_REQUEST(coding, charset_id) \
452 ((charset_id <= (coding)->max_charset_id \
453 ? (coding)->safe_charsets[charset_id] \
454 : -1))
455
456
457 #define CODING_ISO_FLAGS(coding) \
458 ((coding)->spec.iso_2022.flags)
459 #define CODING_ISO_DESIGNATION(coding, reg) \
460 ((coding)->spec.iso_2022.current_designation[reg])
461 #define CODING_ISO_INVOCATION(coding, plane) \
462 ((coding)->spec.iso_2022.current_invocation[plane])
463 #define CODING_ISO_SINGLE_SHIFTING(coding) \
464 ((coding)->spec.iso_2022.single_shifting)
465 #define CODING_ISO_BOL(coding) \
466 ((coding)->spec.iso_2022.bol)
467 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
468 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
469
470 /* Control characters of ISO2022. */
471 /* code */ /* function */
472 #define ISO_CODE_LF 0x0A /* line-feed */
473 #define ISO_CODE_CR 0x0D /* carriage-return */
474 #define ISO_CODE_SO 0x0E /* shift-out */
475 #define ISO_CODE_SI 0x0F /* shift-in */
476 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
477 #define ISO_CODE_ESC 0x1B /* escape */
478 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
479 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
480 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
481
482 /* All code (1-byte) of ISO2022 is classified into one of the
483 followings. */
484 enum iso_code_class_type
485 {
486 ISO_control_0, /* Control codes in the range
487 0x00..0x1F and 0x7F, except for the
488 following 5 codes. */
489 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
490 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
491 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
492 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
493 ISO_escape, /* ISO_CODE_SO (0x1B) */
494 ISO_control_1, /* Control codes in the range
495 0x80..0x9F, except for the
496 following 3 codes. */
497 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
498 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
499 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
500 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
501 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
502 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
503 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
504 };
505
506 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
507 `iso-flags' attribute of an iso2022 coding system. */
508
509 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
510 instead of the correct short-form sequence (e.g. ESC $ A). */
511 #define CODING_ISO_FLAG_LONG_FORM 0x0001
512
513 /* If set, reset graphic planes and registers at end-of-line to the
514 initial state. */
515 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
516
517 /* If set, reset graphic planes and registers before any control
518 characters to the initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
520
521 /* If set, encode by 7-bit environment. */
522 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
523
524 /* If set, use locking-shift function. */
525 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
526
527 /* If set, use single-shift function. Overwrite
528 CODING_ISO_FLAG_LOCKING_SHIFT. */
529 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
530
531 /* If set, use designation escape sequence. */
532 #define CODING_ISO_FLAG_DESIGNATION 0x0040
533
534 /* If set, produce revision number sequence. */
535 #define CODING_ISO_FLAG_REVISION 0x0080
536
537 /* If set, produce ISO6429's direction specifying sequence. */
538 #define CODING_ISO_FLAG_DIRECTION 0x0100
539
540 /* If set, assume designation states are reset at beginning of line on
541 output. */
542 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
543
544 /* If set, designation sequence should be placed at beginning of line
545 on output. */
546 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
547
548 /* If set, do not encode unsafe charactes on output. */
549 #define CODING_ISO_FLAG_SAFE 0x0800
550
551 /* If set, extra latin codes (128..159) are accepted as a valid code
552 on input. */
553 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
554
555 #define CODING_ISO_FLAG_COMPOSITION 0x2000
556
557 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
558
559 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
560
561 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
562
563 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
564
565 /* A character to be produced on output if encoding of the original
566 character is prohibited by CODING_ISO_FLAG_SAFE. */
567 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
568
569
570 /* UTF-16 section */
571 #define CODING_UTF_16_BOM(coding) \
572 ((coding)->spec.utf_16.bom)
573
574 #define CODING_UTF_16_ENDIAN(coding) \
575 ((coding)->spec.utf_16.endian)
576
577 #define CODING_UTF_16_SURROGATE(coding) \
578 ((coding)->spec.utf_16.surrogate)
579
580
581 /* CCL section */
582 #define CODING_CCL_DECODER(coding) \
583 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
584 #define CODING_CCL_ENCODER(coding) \
585 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
586 #define CODING_CCL_VALIDS(coding) \
587 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
588 ->data)
589
590 /* Index for each coding category in `coding_categories' */
591
592 enum coding_category
593 {
594 coding_category_iso_7,
595 coding_category_iso_7_tight,
596 coding_category_iso_8_1,
597 coding_category_iso_8_2,
598 coding_category_iso_7_else,
599 coding_category_iso_8_else,
600 coding_category_utf_8,
601 coding_category_utf_16_auto,
602 coding_category_utf_16_be,
603 coding_category_utf_16_le,
604 coding_category_utf_16_be_nosig,
605 coding_category_utf_16_le_nosig,
606 coding_category_charset,
607 coding_category_sjis,
608 coding_category_big5,
609 coding_category_ccl,
610 coding_category_emacs_mule,
611 /* All above are targets of code detection. */
612 coding_category_raw_text,
613 coding_category_undecided,
614 coding_category_max
615 };
616
617 /* Definitions of flag bits used in detect_coding_XXXX. */
618 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
619 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
620 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
621 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
622 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
623 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
624 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
625 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
626 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
627 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
628 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
629 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
630 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
631 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
632 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
633 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
634
635 /* This value is returned if detect_coding_mask () find nothing other
636 than ASCII characters. */
637 #define CATEGORY_MASK_ANY \
638 (CATEGORY_MASK_ISO_7 \
639 | CATEGORY_MASK_ISO_7_TIGHT \
640 | CATEGORY_MASK_ISO_8_1 \
641 | CATEGORY_MASK_ISO_8_2 \
642 | CATEGORY_MASK_ISO_7_ELSE \
643 | CATEGORY_MASK_ISO_8_ELSE \
644 | CATEGORY_MASK_UTF_8 \
645 | CATEGORY_MASK_UTF_16_BE \
646 | CATEGORY_MASK_UTF_16_LE \
647 | CATEGORY_MASK_UTF_16_BE_NOSIG \
648 | CATEGORY_MASK_UTF_16_LE_NOSIG \
649 | CATEGORY_MASK_CHARSET \
650 | CATEGORY_MASK_SJIS \
651 | CATEGORY_MASK_BIG5 \
652 | CATEGORY_MASK_CCL \
653 | CATEGORY_MASK_EMACS_MULE)
654
655
656 #define CATEGORY_MASK_ISO_7BIT \
657 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
658
659 #define CATEGORY_MASK_ISO_8BIT \
660 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
661
662 #define CATEGORY_MASK_ISO_ELSE \
663 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
664
665 #define CATEGORY_MASK_ISO_ESCAPE \
666 (CATEGORY_MASK_ISO_7 \
667 | CATEGORY_MASK_ISO_7_TIGHT \
668 | CATEGORY_MASK_ISO_7_ELSE \
669 | CATEGORY_MASK_ISO_8_ELSE)
670
671 #define CATEGORY_MASK_ISO \
672 ( CATEGORY_MASK_ISO_7BIT \
673 | CATEGORY_MASK_ISO_8BIT \
674 | CATEGORY_MASK_ISO_ELSE)
675
676 #define CATEGORY_MASK_UTF_16 \
677 (CATEGORY_MASK_UTF_16_BE \
678 | CATEGORY_MASK_UTF_16_LE \
679 | CATEGORY_MASK_UTF_16_BE_NOSIG \
680 | CATEGORY_MASK_UTF_16_LE_NOSIG)
681
682
683 /* List of symbols `coding-category-xxx' ordered by priority. This
684 variable is exposed to Emacs Lisp. */
685 static Lisp_Object Vcoding_category_list;
686
687 /* Table of coding categories (Lisp symbols). This variable is for
688 internal use oly. */
689 static Lisp_Object Vcoding_category_table;
690
691 /* Table of coding-categories ordered by priority. */
692 static enum coding_category coding_priorities[coding_category_max];
693
694 /* Nth element is a coding context for the coding system bound to the
695 Nth coding category. */
696 static struct coding_system coding_categories[coding_category_max];
697
698 static int detected_mask[coding_category_raw_text] =
699 { CATEGORY_MASK_ISO,
700 CATEGORY_MASK_ISO,
701 CATEGORY_MASK_ISO,
702 CATEGORY_MASK_ISO,
703 CATEGORY_MASK_ISO,
704 CATEGORY_MASK_ISO,
705 CATEGORY_MASK_UTF_8,
706 CATEGORY_MASK_UTF_16,
707 CATEGORY_MASK_UTF_16,
708 CATEGORY_MASK_UTF_16,
709 CATEGORY_MASK_UTF_16,
710 CATEGORY_MASK_UTF_16,
711 CATEGORY_MASK_CHARSET,
712 CATEGORY_MASK_SJIS,
713 CATEGORY_MASK_BIG5,
714 CATEGORY_MASK_CCL,
715 CATEGORY_MASK_EMACS_MULE
716 };
717
718 /*** Commonly used macros and functions ***/
719
720 #ifndef min
721 #define min(a, b) ((a) < (b) ? (a) : (b))
722 #endif
723 #ifndef max
724 #define max(a, b) ((a) > (b) ? (a) : (b))
725 #endif
726
727 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
728 do { \
729 attrs = CODING_ID_ATTRS (coding->id); \
730 eol_type = CODING_ID_EOL_TYPE (coding->id); \
731 if (VECTORP (eol_type)) \
732 eol_type = Qunix; \
733 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
734 } while (0)
735
736
737 /* Safely get one byte from the source text pointed by SRC which ends
738 at SRC_END, and set C to that byte. If there are not enough bytes
739 in the source, it jumps to `no_more_source'. The caller
740 should declare and set these variables appropriately in advance:
741 src, src_end, multibytep
742 */
743
744 #define ONE_MORE_BYTE(c) \
745 do { \
746 if (src == src_end) \
747 { \
748 if (src_base < src) \
749 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
750 goto no_more_source; \
751 } \
752 c = *src++; \
753 if (multibytep && (c & 0x80)) \
754 { \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
758 } \
759 consumed_chars++; \
760 } while (0)
761
762
763 #define ONE_MORE_BYTE_NO_CHECK(c) \
764 do { \
765 c = *src++; \
766 if (multibytep && (c & 0x80)) \
767 { \
768 if ((c & 0xFE) != 0xC0) \
769 error ("Undecodable char found"); \
770 c = ((c & 1) << 6) | *src++; \
771 } \
772 consumed_chars++; \
773 } while (0)
774
775
776 /* Store a byte C in the place pointed by DST and increment DST to the
777 next free point, and increment PRODUCED_CHARS. The caller should
778 assure that C is 0..127, and declare and set the variable `dst'
779 appropriately in advance.
780 */
781
782
783 #define EMIT_ONE_ASCII_BYTE(c) \
784 do { \
785 produced_chars++; \
786 *dst++ = (c); \
787 } while (0)
788
789
790 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
791
792 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
793 do { \
794 produced_chars += 2; \
795 *dst++ = (c1), *dst++ = (c2); \
796 } while (0)
797
798
799 /* Store a byte C in the place pointed by DST and increment DST to the
800 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
801 nonzero, store in an appropriate multibyte from. The caller should
802 declare and set the variables `dst' and `multibytep' appropriately
803 in advance. */
804
805 #define EMIT_ONE_BYTE(c) \
806 do { \
807 produced_chars++; \
808 if (multibytep) \
809 { \
810 int ch = (c); \
811 if (ch >= 0x80) \
812 ch = BYTE8_TO_CHAR (ch); \
813 CHAR_STRING_ADVANCE (ch, dst); \
814 } \
815 else \
816 *dst++ = (c); \
817 } while (0)
818
819
820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
821
822 #define EMIT_TWO_BYTES(c1, c2) \
823 do { \
824 produced_chars += 2; \
825 if (multibytep) \
826 { \
827 int ch; \
828 \
829 ch = (c1); \
830 if (ch >= 0x80) \
831 ch = BYTE8_TO_CHAR (ch); \
832 CHAR_STRING_ADVANCE (ch, dst); \
833 ch = (c2); \
834 if (ch >= 0x80) \
835 ch = BYTE8_TO_CHAR (ch); \
836 CHAR_STRING_ADVANCE (ch, dst); \
837 } \
838 else \
839 { \
840 *dst++ = (c1); \
841 *dst++ = (c2); \
842 } \
843 } while (0)
844
845
846 #define EMIT_THREE_BYTES(c1, c2, c3) \
847 do { \
848 EMIT_ONE_BYTE (c1); \
849 EMIT_TWO_BYTES (c2, c3); \
850 } while (0)
851
852
853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
854 do { \
855 EMIT_TWO_BYTES (c1, c2); \
856 EMIT_TWO_BYTES (c3, c4); \
857 } while (0)
858
859
860 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
861 do { \
862 charset_map_loaded = 0; \
863 c = DECODE_CHAR (charset, code); \
864 if (charset_map_loaded) \
865 { \
866 unsigned char *orig = coding->source; \
867 EMACS_INT offset; \
868 \
869 coding_set_source (coding); \
870 offset = coding->source - orig; \
871 src += offset; \
872 src_base += offset; \
873 src_end += offset; \
874 } \
875 } while (0)
876
877
878 #define ASSURE_DESTINATION(bytes) \
879 do { \
880 if (dst + (bytes) >= dst_end) \
881 { \
882 int more_bytes = charbuf_end - charbuf + (bytes); \
883 \
884 dst = alloc_destination (coding, more_bytes, dst); \
885 dst_end = coding->destination + coding->dst_bytes; \
886 } \
887 } while (0)
888
889
890
891 static void
892 coding_set_source (coding)
893 struct coding_system *coding;
894 {
895 if (BUFFERP (coding->src_object))
896 {
897 if (coding->src_pos < 0)
898 coding->source = GAP_END_ADDR + coding->src_pos_byte;
899 else
900 {
901 struct buffer *buf = XBUFFER (coding->src_object);
902 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
903 unsigned char *beg_addr = BUF_BEG_ADDR (buf);
904
905 coding->source = beg_addr + coding->src_pos_byte - 1;
906 if (coding->src_pos_byte >= gpt_byte)
907 coding->source += BUF_GAP_SIZE (buf);
908 }
909 }
910 else if (STRINGP (coding->src_object))
911 {
912 coding->source = (XSTRING (coding->src_object)->data
913 + coding->src_pos_byte);
914 }
915 else
916 /* Otherwise, the source is C string and is never relocated
917 automatically. Thus we don't have to update anything. */
918 ;
919 }
920
921 static void
922 coding_set_destination (coding)
923 struct coding_system *coding;
924 {
925 if (BUFFERP (coding->dst_object))
926 {
927 /* We are sure that coding->dst_pos_byte is before the gap of the
928 buffer. */
929 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
930 + coding->dst_pos_byte - 1);
931 if (coding->src_pos < 0)
932 coding->dst_bytes = (GAP_END_ADDR
933 - (coding->src_bytes - coding->consumed)
934 - coding->destination);
935 else
936 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
937 - coding->destination);
938 }
939 else
940 /* Otherwise, the destination is C string and is never relocated
941 automatically. Thus we don't have to update anything. */
942 ;
943 }
944
945
946 static void
947 coding_alloc_by_realloc (coding, bytes)
948 struct coding_system *coding;
949 EMACS_INT bytes;
950 {
951 coding->destination = (unsigned char *) xrealloc (coding->destination,
952 coding->dst_bytes + bytes);
953 coding->dst_bytes += bytes;
954 }
955
956 static void
957 coding_alloc_by_making_gap (coding, bytes)
958 struct coding_system *coding;
959 EMACS_INT bytes;
960 {
961 if (BUFFERP (coding->dst_object)
962 && EQ (coding->src_object, coding->dst_object))
963 {
964 EMACS_INT add = coding->src_bytes - coding->consumed;
965
966 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
967 make_gap (bytes);
968 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
969 }
970 else
971 {
972 Lisp_Object this_buffer;
973
974 this_buffer = Fcurrent_buffer ();
975 set_buffer_internal (XBUFFER (coding->dst_object));
976 make_gap (bytes);
977 set_buffer_internal (XBUFFER (this_buffer));
978 }
979 }
980
981
982 static unsigned char *
983 alloc_destination (coding, nbytes, dst)
984 struct coding_system *coding;
985 int nbytes;
986 unsigned char *dst;
987 {
988 EMACS_INT offset = dst - coding->destination;
989
990 if (BUFFERP (coding->dst_object))
991 coding_alloc_by_making_gap (coding, nbytes);
992 else
993 coding_alloc_by_realloc (coding, nbytes);
994 coding->result = CODING_RESULT_SUCCESS;
995 coding_set_destination (coding);
996 dst = coding->destination + offset;
997 return dst;
998 }
999
1000 \f
1001 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1002
1003
1004
1005 \f
1006 /*** 3. UTF-8 ***/
1007
1008 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1009 Check if a text is encoded in UTF-8. If it is, return
1010 CATEGORY_MASK_UTF_8, else return 0. */
1011
1012 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1013 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1014 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1015 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1016 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1017 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1018
1019 static int
1020 detect_coding_utf_8 (coding, mask)
1021 struct coding_system *coding;
1022 int *mask;
1023 {
1024 unsigned char *src = coding->source, *src_base = src;
1025 unsigned char *src_end = coding->source + coding->src_bytes;
1026 int multibytep = coding->src_multibyte;
1027 int consumed_chars = 0;
1028 int found = 0;
1029
1030 /* A coding system of this category is always ASCII compatible. */
1031 src += coding->head_ascii;
1032
1033 while (1)
1034 {
1035 int c, c1, c2, c3, c4;
1036
1037 ONE_MORE_BYTE (c);
1038 if (UTF_8_1_OCTET_P (c))
1039 continue;
1040 ONE_MORE_BYTE (c1);
1041 if (! UTF_8_EXTRA_OCTET_P (c1))
1042 break;
1043 if (UTF_8_2_OCTET_LEADING_P (c))
1044 {
1045 found++;
1046 continue;
1047 }
1048 ONE_MORE_BYTE (c2);
1049 if (! UTF_8_EXTRA_OCTET_P (c2))
1050 break;
1051 if (UTF_8_3_OCTET_LEADING_P (c))
1052 {
1053 found++;
1054 continue;
1055 }
1056 ONE_MORE_BYTE (c3);
1057 if (! UTF_8_EXTRA_OCTET_P (c3))
1058 break;
1059 if (UTF_8_4_OCTET_LEADING_P (c))
1060 {
1061 found++;
1062 continue;
1063 }
1064 ONE_MORE_BYTE (c4);
1065 if (! UTF_8_EXTRA_OCTET_P (c4))
1066 break;
1067 if (UTF_8_5_OCTET_LEADING_P (c))
1068 {
1069 found++;
1070 continue;
1071 }
1072 break;
1073 }
1074 *mask &= ~CATEGORY_MASK_UTF_8;
1075 return 0;
1076
1077 no_more_source:
1078 if (! found)
1079 return 0;
1080 *mask &= CATEGORY_MASK_UTF_8;
1081 return 1;
1082 }
1083
1084
1085 /* Fixme: deal with surrogates? */
1086 static void
1087 decode_coding_utf_8 (coding)
1088 struct coding_system *coding;
1089 {
1090 unsigned char *src = coding->source + coding->consumed;
1091 unsigned char *src_end = coding->source + coding->src_bytes;
1092 unsigned char *src_base;
1093 int *charbuf = coding->charbuf;
1094 int *charbuf_end = charbuf + coding->charbuf_size;
1095 int consumed_chars = 0, consumed_chars_base;
1096 int multibytep = coding->src_multibyte;
1097 Lisp_Object attr, eol_type, charset_list;
1098
1099 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1100
1101 while (1)
1102 {
1103 int c, c1, c2, c3, c4, c5;
1104
1105 src_base = src;
1106 consumed_chars_base = consumed_chars;
1107
1108 if (charbuf >= charbuf_end)
1109 break;
1110
1111 ONE_MORE_BYTE (c1);
1112 if (UTF_8_1_OCTET_P(c1))
1113 {
1114 c = c1;
1115 if (c == '\r')
1116 {
1117 if (EQ (eol_type, Qdos))
1118 {
1119 if (src == src_end)
1120 goto no_more_source;
1121 if (*src == '\n')
1122 ONE_MORE_BYTE (c);
1123 }
1124 else if (EQ (eol_type, Qmac))
1125 c = '\n';
1126 }
1127 }
1128 else
1129 {
1130 ONE_MORE_BYTE (c2);
1131 if (! UTF_8_EXTRA_OCTET_P (c2))
1132 goto invalid_code;
1133 if (UTF_8_2_OCTET_LEADING_P (c1))
1134 {
1135 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1136 /* Reject overlong sequences here and below. Encoders
1137 producing them are incorrect, they can be misleading,
1138 and they mess up read/write invariance. */
1139 if (c < 128)
1140 goto invalid_code;
1141 }
1142 else
1143 {
1144 ONE_MORE_BYTE (c3);
1145 if (! UTF_8_EXTRA_OCTET_P (c3))
1146 goto invalid_code;
1147 if (UTF_8_3_OCTET_LEADING_P (c1))
1148 {
1149 c = (((c1 & 0xF) << 12)
1150 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1151 if (c < 0x800)
1152 goto invalid_code;
1153 }
1154 else
1155 {
1156 ONE_MORE_BYTE (c4);
1157 if (! UTF_8_EXTRA_OCTET_P (c4))
1158 goto invalid_code;
1159 if (UTF_8_4_OCTET_LEADING_P (c1))
1160 {
1161 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1162 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1163 if (c < 0x10000)
1164 goto invalid_code;
1165 }
1166 else
1167 {
1168 ONE_MORE_BYTE (c5);
1169 if (! UTF_8_EXTRA_OCTET_P (c5))
1170 goto invalid_code;
1171 if (UTF_8_5_OCTET_LEADING_P (c1))
1172 {
1173 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1174 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1175 | (c5 & 0x3F));
1176 if ((c > MAX_CHAR) || (c < 0x200000))
1177 goto invalid_code;
1178 }
1179 else
1180 goto invalid_code;
1181 }
1182 }
1183 }
1184 }
1185
1186 *charbuf++ = c;
1187 continue;
1188
1189 invalid_code:
1190 src = src_base;
1191 consumed_chars = consumed_chars_base;
1192 ONE_MORE_BYTE (c);
1193 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1194 coding->errors++;
1195 }
1196
1197 no_more_source:
1198 coding->consumed_char += consumed_chars_base;
1199 coding->consumed = src_base - coding->source;
1200 coding->charbuf_used = charbuf - coding->charbuf;
1201 }
1202
1203
1204 static int
1205 encode_coding_utf_8 (coding)
1206 struct coding_system *coding;
1207 {
1208 int multibytep = coding->dst_multibyte;
1209 int *charbuf = coding->charbuf;
1210 int *charbuf_end = charbuf + coding->charbuf_used;
1211 unsigned char *dst = coding->destination + coding->produced;
1212 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1213 int produced_chars = 0;
1214 int c;
1215
1216 if (multibytep)
1217 {
1218 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1219
1220 while (charbuf < charbuf_end)
1221 {
1222 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1223
1224 ASSURE_DESTINATION (safe_room);
1225 c = *charbuf++;
1226 CHAR_STRING_ADVANCE (c, pend);
1227 for (p = str; p < pend; p++)
1228 EMIT_ONE_BYTE (*p);
1229 }
1230 }
1231 else
1232 {
1233 int safe_room = MAX_MULTIBYTE_LENGTH;
1234
1235 while (charbuf < charbuf_end)
1236 {
1237 ASSURE_DESTINATION (safe_room);
1238 c = *charbuf++;
1239 dst += CHAR_STRING (c, dst);
1240 produced_chars++;
1241 }
1242 }
1243 coding->result = CODING_RESULT_SUCCESS;
1244 coding->produced_char += produced_chars;
1245 coding->produced = dst - coding->destination;
1246 return 0;
1247 }
1248
1249
1250 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1251 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1252 Little Endian (otherwise). If it is, return
1253 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1254 else return 0. */
1255
1256 #define UTF_16_HIGH_SURROGATE_P(val) \
1257 (((val) & 0xFC00) == 0xD800)
1258
1259 #define UTF_16_LOW_SURROGATE_P(val) \
1260 (((val) & 0xFC00) == 0xDC00)
1261
1262 #define UTF_16_INVALID_P(val) \
1263 (((val) == 0xFFFE) \
1264 || ((val) == 0xFFFF) \
1265 || UTF_16_LOW_SURROGATE_P (val))
1266
1267
1268 static int
1269 detect_coding_utf_16 (coding, mask)
1270 struct coding_system *coding;
1271 int *mask;
1272 {
1273 unsigned char *src = coding->source, *src_base = src;
1274 unsigned char *src_end = coding->source + coding->src_bytes;
1275 int multibytep = coding->src_multibyte;
1276 int consumed_chars = 0;
1277 int c1, c2;
1278
1279 ONE_MORE_BYTE (c1);
1280 ONE_MORE_BYTE (c2);
1281
1282 if ((c1 == 0xFF) && (c2 == 0xFE))
1283 {
1284 *mask &= CATEGORY_MASK_UTF_16_LE;
1285 return 1;
1286 }
1287 else if ((c1 == 0xFE) && (c2 == 0xFF))
1288 {
1289 *mask &= CATEGORY_MASK_UTF_16_BE;
1290 return 1;
1291 }
1292 no_more_source:
1293 return 0;
1294 }
1295
1296 static void
1297 decode_coding_utf_16 (coding)
1298 struct coding_system *coding;
1299 {
1300 unsigned char *src = coding->source + coding->consumed;
1301 unsigned char *src_end = coding->source + coding->src_bytes;
1302 unsigned char *src_base;
1303 int *charbuf = coding->charbuf;
1304 int *charbuf_end = charbuf + coding->charbuf_size;
1305 int consumed_chars = 0, consumed_chars_base;
1306 int multibytep = coding->src_multibyte;
1307 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1308 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1309 int surrogate = CODING_UTF_16_SURROGATE (coding);
1310 Lisp_Object attr, eol_type, charset_list;
1311
1312 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1313
1314 if (bom != utf_16_without_bom)
1315 {
1316 int c, c1, c2;
1317
1318 src_base = src;
1319 ONE_MORE_BYTE (c1);
1320 ONE_MORE_BYTE (c2);
1321 c = (c1 << 8) | c2;
1322 if (bom == utf_16_with_bom)
1323 {
1324 if (endian == utf_16_big_endian
1325 ? c != 0xFFFE : c != 0xFEFF)
1326 {
1327 /* We are sure that there's enouph room at CHARBUF. */
1328 *charbuf++ = c1;
1329 *charbuf++ = c2;
1330 coding->errors++;
1331 }
1332 }
1333 else
1334 {
1335 if (c == 0xFFFE)
1336 CODING_UTF_16_ENDIAN (coding)
1337 = endian = utf_16_big_endian;
1338 else if (c == 0xFEFF)
1339 CODING_UTF_16_ENDIAN (coding)
1340 = endian = utf_16_little_endian;
1341 else
1342 {
1343 CODING_UTF_16_ENDIAN (coding)
1344 = endian = utf_16_big_endian;
1345 src = src_base;
1346 }
1347 }
1348 CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1349 }
1350
1351 while (1)
1352 {
1353 int c, c1, c2;
1354
1355 src_base = src;
1356 consumed_chars_base = consumed_chars;
1357
1358 if (charbuf + 2 >= charbuf_end)
1359 break;
1360
1361 ONE_MORE_BYTE (c1);
1362 ONE_MORE_BYTE (c2);
1363 c = (endian == utf_16_big_endian
1364 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1365 if (surrogate)
1366 {
1367 if (! UTF_16_LOW_SURROGATE_P (c))
1368 {
1369 if (endian == utf_16_big_endian)
1370 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1371 else
1372 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1373 *charbuf++ = c1;
1374 *charbuf++ = c2;
1375 coding->errors++;
1376 if (UTF_16_HIGH_SURROGATE_P (c))
1377 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1378 else
1379 *charbuf++ = c;
1380 }
1381 else
1382 {
1383 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1384 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1385 *charbuf++ = c;
1386 }
1387 }
1388 else
1389 {
1390 if (UTF_16_HIGH_SURROGATE_P (c))
1391 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1392 else
1393 *charbuf++ = c;
1394 }
1395 }
1396
1397 no_more_source:
1398 coding->consumed_char += consumed_chars_base;
1399 coding->consumed = src_base - coding->source;
1400 coding->charbuf_used = charbuf - coding->charbuf;
1401 }
1402
1403 static int
1404 encode_coding_utf_16 (coding)
1405 struct coding_system *coding;
1406 {
1407 int multibytep = coding->dst_multibyte;
1408 int *charbuf = coding->charbuf;
1409 int *charbuf_end = charbuf + coding->charbuf_used;
1410 unsigned char *dst = coding->destination + coding->produced;
1411 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1412 int safe_room = 8;
1413 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1414 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1415 int produced_chars = 0;
1416 Lisp_Object attrs, eol_type, charset_list;
1417 int c;
1418
1419 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1420
1421 if (bom == utf_16_with_bom)
1422 {
1423 ASSURE_DESTINATION (safe_room);
1424 if (big_endian)
1425 EMIT_TWO_BYTES (0xFF, 0xFE);
1426 else
1427 EMIT_TWO_BYTES (0xFE, 0xFF);
1428 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1429 }
1430
1431 while (charbuf < charbuf_end)
1432 {
1433 ASSURE_DESTINATION (safe_room);
1434 c = *charbuf++;
1435 if (c >= MAX_UNICODE_CHAR)
1436 c = coding->default_char;
1437
1438 if (c < 0x10000)
1439 {
1440 if (big_endian)
1441 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1442 else
1443 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1444 }
1445 else
1446 {
1447 int c1, c2;
1448
1449 c -= 0x10000;
1450 c1 = (c >> 10) + 0xD800;
1451 c2 = (c & 0x3FF) + 0xDC00;
1452 if (big_endian)
1453 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1454 else
1455 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1456 }
1457 }
1458 coding->result = CODING_RESULT_SUCCESS;
1459 coding->produced = dst - coding->destination;
1460 coding->produced_char += produced_chars;
1461 return 0;
1462 }
1463
1464 \f
1465 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1466
1467 /* Emacs' internal format for representation of multiple character
1468 sets is a kind of multi-byte encoding, i.e. characters are
1469 represented by variable-length sequences of one-byte codes.
1470
1471 ASCII characters and control characters (e.g. `tab', `newline') are
1472 represented by one-byte sequences which are their ASCII codes, in
1473 the range 0x00 through 0x7F.
1474
1475 8-bit characters of the range 0x80..0x9F are represented by
1476 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1477 code + 0x20).
1478
1479 8-bit characters of the range 0xA0..0xFF are represented by
1480 one-byte sequences which are their 8-bit code.
1481
1482 The other characters are represented by a sequence of `base
1483 leading-code', optional `extended leading-code', and one or two
1484 `position-code's. The length of the sequence is determined by the
1485 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1486 whereas extended leading-code and position-code take the range 0xA0
1487 through 0xFF. See `charset.h' for more details about leading-code
1488 and position-code.
1489
1490 --- CODE RANGE of Emacs' internal format ---
1491 character set range
1492 ------------- -----
1493 ascii 0x00..0x7F
1494 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1495 eight-bit-graphic 0xA0..0xBF
1496 ELSE 0x81..0x9D + [0xA0..0xFF]+
1497 ---------------------------------------------
1498
1499 As this is the internal character representation, the format is
1500 usually not used externally (i.e. in a file or in a data sent to a
1501 process). But, it is possible to have a text externally in this
1502 format (i.e. by encoding by the coding system `emacs-mule').
1503
1504 In that case, a sequence of one-byte codes has a slightly different
1505 form.
1506
1507 At first, all characters in eight-bit-control are represented by
1508 one-byte sequences which are their 8-bit code.
1509
1510 Next, character composition data are represented by the byte
1511 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1512 where,
1513 METHOD is 0xF0 plus one of composition method (enum
1514 composition_method),
1515
1516 BYTES is 0xA0 plus a byte length of this composition data,
1517
1518 CHARS is 0x20 plus a number of characters composed by this
1519 data,
1520
1521 COMPONENTs are characters of multibye form or composition
1522 rules encoded by two-byte of ASCII codes.
1523
1524 In addition, for backward compatibility, the following formats are
1525 also recognized as composition data on decoding.
1526
1527 0x80 MSEQ ...
1528 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1529
1530 Here,
1531 MSEQ is a multibyte form but in these special format:
1532 ASCII: 0xA0 ASCII_CODE+0x80,
1533 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1534 RULE is a one byte code of the range 0xA0..0xF0 that
1535 represents a composition rule.
1536 */
1537
1538 char emacs_mule_bytes[256];
1539
1540 int
1541 emacs_mule_char (coding, src, nbytes, nchars)
1542 struct coding_system *coding;
1543 unsigned char *src;
1544 int *nbytes, *nchars;
1545 {
1546 unsigned char *src_end = coding->source + coding->src_bytes;
1547 int multibytep = coding->src_multibyte;
1548 unsigned char *src_base = src;
1549 struct charset *charset;
1550 unsigned code;
1551 int c;
1552 int consumed_chars = 0;
1553
1554 ONE_MORE_BYTE (c);
1555 switch (emacs_mule_bytes[c])
1556 {
1557 case 2:
1558 if (! (charset = emacs_mule_charset[c]))
1559 goto invalid_code;
1560 ONE_MORE_BYTE (c);
1561 code = c & 0x7F;
1562 break;
1563
1564 case 3:
1565 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1566 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1567 {
1568 ONE_MORE_BYTE (c);
1569 if (! (charset = emacs_mule_charset[c]))
1570 goto invalid_code;
1571 ONE_MORE_BYTE (c);
1572 code = c & 0x7F;
1573 }
1574 else
1575 {
1576 if (! (charset = emacs_mule_charset[c]))
1577 goto invalid_code;
1578 ONE_MORE_BYTE (c);
1579 code = (c & 0x7F) << 8;
1580 ONE_MORE_BYTE (c);
1581 code |= c & 0x7F;
1582 }
1583 break;
1584
1585 case 4:
1586 ONE_MORE_BYTE (c);
1587 if (! (charset = emacs_mule_charset[c]))
1588 goto invalid_code;
1589 ONE_MORE_BYTE (c);
1590 code = (c & 0x7F) << 8;
1591 ONE_MORE_BYTE (c);
1592 code |= c & 0x7F;
1593 break;
1594
1595 case 1:
1596 code = c;
1597 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1598 ? charset_ascii : charset_eight_bit);
1599 break;
1600
1601 default:
1602 abort ();
1603 }
1604 c = DECODE_CHAR (charset, code);
1605 if (c < 0)
1606 goto invalid_code;
1607 *nbytes = src - src_base;
1608 *nchars = consumed_chars;
1609 return c;
1610
1611 no_more_source:
1612 return -2;
1613
1614 invalid_code:
1615 return -1;
1616 }
1617
1618
1619 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1620 Check if a text is encoded in `emacs-mule'. */
1621
1622 static int
1623 detect_coding_emacs_mule (coding, mask)
1624 struct coding_system *coding;
1625 int *mask;
1626 {
1627 unsigned char *src = coding->source, *src_base = src;
1628 unsigned char *src_end = coding->source + coding->src_bytes;
1629 int multibytep = coding->src_multibyte;
1630 int consumed_chars = 0;
1631 int c;
1632 int found = 0;
1633
1634 /* A coding system of this category is always ASCII compatible. */
1635 src += coding->head_ascii;
1636
1637 while (1)
1638 {
1639 ONE_MORE_BYTE (c);
1640
1641 if (c == 0x80)
1642 {
1643 /* Perhaps the start of composite character. We simple skip
1644 it because analyzing it is too heavy for detecting. But,
1645 at least, we check that the composite character
1646 constitues of more than 4 bytes. */
1647 unsigned char *src_base;
1648
1649 repeat:
1650 src_base = src;
1651 do
1652 {
1653 ONE_MORE_BYTE (c);
1654 }
1655 while (c >= 0xA0);
1656
1657 if (src - src_base <= 4)
1658 break;
1659 found = 1;
1660 if (c == 0x80)
1661 goto repeat;
1662 }
1663
1664 if (c < 0x80)
1665 {
1666 if (c < 0x20
1667 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1668 break;
1669 }
1670 else
1671 {
1672 unsigned char *src_base = src - 1;
1673
1674 do
1675 {
1676 ONE_MORE_BYTE (c);
1677 }
1678 while (c >= 0xA0);
1679 if (src - src_base != emacs_mule_bytes[*src_base])
1680 break;
1681 found = 1;
1682 }
1683 }
1684 *mask &= ~CATEGORY_MASK_EMACS_MULE;
1685 return 0;
1686
1687 no_more_source:
1688 if (!found)
1689 return 0;
1690 *mask &= CATEGORY_MASK_EMACS_MULE;
1691 return 1;
1692 }
1693
1694
1695 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1696
1697 /* Decode a character represented as a component of composition
1698 sequence of Emacs 20/21 style at SRC. Set C to that character and
1699 update SRC to the head of next character (or an encoded composition
1700 rule). If SRC doesn't points a composition component, set C to -1.
1701 If SRC points an invalid byte sequence, global exit by a return
1702 value 0. */
1703
1704 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1705 if (1) \
1706 { \
1707 int c; \
1708 int nbytes, nchars; \
1709 \
1710 if (src == src_end) \
1711 break; \
1712 c = emacs_mule_char (coding, src, &nbytes, &nchars); \
1713 if (c < 0) \
1714 { \
1715 if (c == -2) \
1716 break; \
1717 goto invalid_code; \
1718 } \
1719 *buf++ = c; \
1720 src += nbytes; \
1721 consumed_chars += nchars; \
1722 } \
1723 else
1724
1725
1726 /* Decode a composition rule represented as a component of composition
1727 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1728 and increment BUF. If SRC points an invalid byte sequence, set C
1729 to -1. */
1730
1731 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1732 do { \
1733 int c, gref, nref; \
1734 \
1735 if (src >= src_end) \
1736 goto invalid_code; \
1737 ONE_MORE_BYTE_NO_CHECK (c); \
1738 c -= 0x20; \
1739 if (c < 0 || c >= 81) \
1740 goto invalid_code; \
1741 \
1742 gref = c / 9, nref = c % 9; \
1743 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1744 } while (0)
1745
1746
1747 /* Decode a composition rule represented as a component of composition
1748 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1749 and increment BUF. If SRC points an invalid byte sequence, set C
1750 to -1. */
1751
1752 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1753 do { \
1754 int gref, nref; \
1755 \
1756 if (src + 1>= src_end) \
1757 goto invalid_code; \
1758 ONE_MORE_BYTE_NO_CHECK (gref); \
1759 gref -= 0x20; \
1760 ONE_MORE_BYTE_NO_CHECK (nref); \
1761 nref -= 0x20; \
1762 if (gref < 0 || gref >= 81 \
1763 || nref < 0 || nref >= 81) \
1764 goto invalid_code; \
1765 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1766 } while (0)
1767
1768
1769 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1770 do { \
1771 *buf++ = -5; \
1772 *buf++ = coding->produced_char + char_offset; \
1773 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1774 *buf++ = method; \
1775 *buf++ = nchars; \
1776 } while (0)
1777
1778
1779 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1780 do { \
1781 /* Emacs 21 style format. The first three bytes at SRC are \
1782 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1783 the byte length of this composition information, CHARS is the \
1784 number of characters composed by this composition. */ \
1785 enum composition_method method = c - 0xF2; \
1786 int *charbuf_base = charbuf; \
1787 int consumed_chars_limit; \
1788 int nbytes, nchars; \
1789 \
1790 ONE_MORE_BYTE (c); \
1791 nbytes = c - 0xA0; \
1792 if (nbytes < 3) \
1793 goto invalid_code; \
1794 ONE_MORE_BYTE (c); \
1795 nchars = c - 0xA0; \
1796 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1797 consumed_chars_limit = consumed_chars_base + nbytes; \
1798 if (method != COMPOSITION_RELATIVE) \
1799 { \
1800 int i = 0; \
1801 while (consumed_chars < consumed_chars_limit) \
1802 { \
1803 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1804 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1805 else \
1806 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1807 i++; \
1808 } \
1809 if (consumed_chars < consumed_chars_limit) \
1810 goto invalid_code; \
1811 charbuf_base[0] -= i; \
1812 } \
1813 } while (0)
1814
1815
1816 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1817 do { \
1818 /* Emacs 20 style format for relative composition. */ \
1819 /* Store multibyte form of characters to be composed. */ \
1820 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1821 int *buf = components; \
1822 int i, j; \
1823 \
1824 src = src_base; \
1825 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1826 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1827 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1828 if (i < 2) \
1829 goto invalid_code; \
1830 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1831 for (j = 0; j < i; j++) \
1832 *charbuf++ = components[j]; \
1833 } while (0)
1834
1835
1836 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1837 do { \
1838 /* Emacs 20 style format for rule-base composition. */ \
1839 /* Store multibyte form of characters to be composed. */ \
1840 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1841 int *buf = components; \
1842 int i, j; \
1843 \
1844 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1845 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1846 { \
1847 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1848 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1849 } \
1850 if (i < 1 || (buf - components) % 2 == 0) \
1851 goto invalid_code; \
1852 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1853 goto no_more_source; \
1854 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1855 for (j = 0; j < i; j++) \
1856 *charbuf++ = components[j]; \
1857 for (j = 0; j < i; j += 2) \
1858 *charbuf++ = components[j]; \
1859 } while (0)
1860
1861
1862 static void
1863 decode_coding_emacs_mule (coding)
1864 struct coding_system *coding;
1865 {
1866 unsigned char *src = coding->source + coding->consumed;
1867 unsigned char *src_end = coding->source + coding->src_bytes;
1868 unsigned char *src_base;
1869 int *charbuf = coding->charbuf;
1870 int *charbuf_end = charbuf + coding->charbuf_size;
1871 int consumed_chars = 0, consumed_chars_base;
1872 int char_offset = 0;
1873 int multibytep = coding->src_multibyte;
1874 Lisp_Object attrs, eol_type, charset_list;
1875
1876 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1877
1878 while (1)
1879 {
1880 int c;
1881
1882 src_base = src;
1883 consumed_chars_base = consumed_chars;
1884
1885 if (charbuf >= charbuf_end)
1886 break;
1887
1888 ONE_MORE_BYTE (c);
1889
1890 if (c < 0x80)
1891 {
1892 if (c == '\r')
1893 {
1894 if (EQ (eol_type, Qdos))
1895 {
1896 if (src == src_end)
1897 goto no_more_source;
1898 if (*src == '\n')
1899 ONE_MORE_BYTE (c);
1900 }
1901 else if (EQ (eol_type, Qmac))
1902 c = '\n';
1903 }
1904 *charbuf++ = c;
1905 char_offset++;
1906 }
1907 else if (c == 0x80)
1908 {
1909 if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1910 break;
1911 ONE_MORE_BYTE (c);
1912 if (c - 0xF2 >= COMPOSITION_RELATIVE
1913 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1914 DECODE_EMACS_MULE_21_COMPOSITION (c);
1915 else if (c < 0xC0)
1916 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1917 else if (c == 0xFF)
1918 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1919 else
1920 goto invalid_code;
1921 coding->annotated = 1;
1922 }
1923 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1924 {
1925 int nbytes, nchars;
1926 src = src_base;
1927 consumed_chars = consumed_chars_base;
1928 c = emacs_mule_char (coding, src, &nbytes, &nchars);
1929 if (c < 0)
1930 {
1931 if (c == -2)
1932 break;
1933 goto invalid_code;
1934 }
1935 *charbuf++ = c;
1936 src += nbytes;
1937 consumed_chars += nchars;
1938 char_offset++;
1939 }
1940 continue;
1941
1942 invalid_code:
1943 src = src_base;
1944 consumed_chars = consumed_chars_base;
1945 ONE_MORE_BYTE (c);
1946 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1947 coding->errors++;
1948 }
1949
1950 no_more_source:
1951 coding->consumed_char += consumed_chars_base;
1952 coding->consumed = src_base - coding->source;
1953 coding->charbuf_used = charbuf - coding->charbuf;
1954 }
1955
1956
1957 #define EMACS_MULE_LEADING_CODES(id, codes) \
1958 do { \
1959 if (id < 0xA0) \
1960 codes[0] = id, codes[1] = 0; \
1961 else if (id < 0xE0) \
1962 codes[0] = 0x9A, codes[1] = id; \
1963 else if (id < 0xF0) \
1964 codes[0] = 0x9B, codes[1] = id; \
1965 else if (id < 0xF5) \
1966 codes[0] = 0x9C, codes[1] = id; \
1967 else \
1968 codes[0] = 0x9D, codes[1] = id; \
1969 } while (0);
1970
1971
1972 static int
1973 encode_coding_emacs_mule (coding)
1974 struct coding_system *coding;
1975 {
1976 int multibytep = coding->dst_multibyte;
1977 int *charbuf = coding->charbuf;
1978 int *charbuf_end = charbuf + coding->charbuf_used;
1979 unsigned char *dst = coding->destination + coding->produced;
1980 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1981 int safe_room = 8;
1982 int produced_chars = 0;
1983 Lisp_Object attrs, eol_type, charset_list;
1984 int c;
1985
1986 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1987
1988 while (charbuf < charbuf_end)
1989 {
1990 ASSURE_DESTINATION (safe_room);
1991 c = *charbuf++;
1992 if (ASCII_CHAR_P (c))
1993 EMIT_ONE_ASCII_BYTE (c);
1994 else if (CHAR_BYTE8_P (c))
1995 {
1996 c = CHAR_TO_BYTE8 (c);
1997 EMIT_ONE_BYTE (c);
1998 }
1999 else
2000 {
2001 struct charset *charset;
2002 unsigned code;
2003 int dimension;
2004 int emacs_mule_id;
2005 unsigned char leading_codes[2];
2006
2007 charset = char_charset (c, charset_list, &code);
2008 if (! charset)
2009 {
2010 c = coding->default_char;
2011 if (ASCII_CHAR_P (c))
2012 {
2013 EMIT_ONE_ASCII_BYTE (c);
2014 continue;
2015 }
2016 charset = char_charset (c, charset_list, &code);
2017 }
2018 dimension = CHARSET_DIMENSION (charset);
2019 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2020 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2021 EMIT_ONE_BYTE (leading_codes[0]);
2022 if (leading_codes[1])
2023 EMIT_ONE_BYTE (leading_codes[1]);
2024 if (dimension == 1)
2025 EMIT_ONE_BYTE (code);
2026 else
2027 {
2028 EMIT_ONE_BYTE (code >> 8);
2029 EMIT_ONE_BYTE (code & 0xFF);
2030 }
2031 }
2032 }
2033 coding->result = CODING_RESULT_SUCCESS;
2034 coding->produced_char += produced_chars;
2035 coding->produced = dst - coding->destination;
2036 return 0;
2037 }
2038
2039 \f
2040 /*** 7. ISO2022 handlers ***/
2041
2042 /* The following note describes the coding system ISO2022 briefly.
2043 Since the intention of this note is to help understand the
2044 functions in this file, some parts are NOT ACCURATE or are OVERLY
2045 SIMPLIFIED. For thorough understanding, please refer to the
2046 original document of ISO2022. This is equivalent to the standard
2047 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2048
2049 ISO2022 provides many mechanisms to encode several character sets
2050 in 7-bit and 8-bit environments. For 7-bit environments, all text
2051 is encoded using bytes less than 128. This may make the encoded
2052 text a little bit longer, but the text passes more easily through
2053 several types of gateway, some of which strip off the MSB (Most
2054 Significant Bit).
2055
2056 There are two kinds of character sets: control character sets and
2057 graphic character sets. The former contain control characters such
2058 as `newline' and `escape' to provide control functions (control
2059 functions are also provided by escape sequences). The latter
2060 contain graphic characters such as 'A' and '-'. Emacs recognizes
2061 two control character sets and many graphic character sets.
2062
2063 Graphic character sets are classified into one of the following
2064 four classes, according to the number of bytes (DIMENSION) and
2065 number of characters in one dimension (CHARS) of the set:
2066 - DIMENSION1_CHARS94
2067 - DIMENSION1_CHARS96
2068 - DIMENSION2_CHARS94
2069 - DIMENSION2_CHARS96
2070
2071 In addition, each character set is assigned an identification tag,
2072 unique for each set, called the "final character" (denoted as <F>
2073 hereafter). The <F> of each character set is decided by ECMA(*)
2074 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2075 (0x30..0x3F are for private use only).
2076
2077 Note (*): ECMA = European Computer Manufacturers Association
2078
2079 Here are examples of graphic character sets [NAME(<F>)]:
2080 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2081 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2082 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2083 o DIMENSION2_CHARS96 -- none for the moment
2084
2085 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2086 C0 [0x00..0x1F] -- control character plane 0
2087 GL [0x20..0x7F] -- graphic character plane 0
2088 C1 [0x80..0x9F] -- control character plane 1
2089 GR [0xA0..0xFF] -- graphic character plane 1
2090
2091 A control character set is directly designated and invoked to C0 or
2092 C1 by an escape sequence. The most common case is that:
2093 - ISO646's control character set is designated/invoked to C0, and
2094 - ISO6429's control character set is designated/invoked to C1,
2095 and usually these designations/invocations are omitted in encoded
2096 text. In a 7-bit environment, only C0 can be used, and a control
2097 character for C1 is encoded by an appropriate escape sequence to
2098 fit into the environment. All control characters for C1 are
2099 defined to have corresponding escape sequences.
2100
2101 A graphic character set is at first designated to one of four
2102 graphic registers (G0 through G3), then these graphic registers are
2103 invoked to GL or GR. These designations and invocations can be
2104 done independently. The most common case is that G0 is invoked to
2105 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2106 these invocations and designations are omitted in encoded text.
2107 In a 7-bit environment, only GL can be used.
2108
2109 When a graphic character set of CHARS94 is invoked to GL, codes
2110 0x20 and 0x7F of the GL area work as control characters SPACE and
2111 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2112 be used.
2113
2114 There are two ways of invocation: locking-shift and single-shift.
2115 With locking-shift, the invocation lasts until the next different
2116 invocation, whereas with single-shift, the invocation affects the
2117 following character only and doesn't affect the locking-shift
2118 state. Invocations are done by the following control characters or
2119 escape sequences:
2120
2121 ----------------------------------------------------------------------
2122 abbrev function cntrl escape seq description
2123 ----------------------------------------------------------------------
2124 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2125 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2126 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2127 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2128 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2129 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2130 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2131 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2132 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2133 ----------------------------------------------------------------------
2134 (*) These are not used by any known coding system.
2135
2136 Control characters for these functions are defined by macros
2137 ISO_CODE_XXX in `coding.h'.
2138
2139 Designations are done by the following escape sequences:
2140 ----------------------------------------------------------------------
2141 escape sequence description
2142 ----------------------------------------------------------------------
2143 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2144 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2145 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2146 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2147 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2148 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2149 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2150 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2151 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2152 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2153 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2154 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2155 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2156 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2157 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2158 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2159 ----------------------------------------------------------------------
2160
2161 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2162 of dimension 1, chars 94, and final character <F>, etc...
2163
2164 Note (*): Although these designations are not allowed in ISO2022,
2165 Emacs accepts them on decoding, and produces them on encoding
2166 CHARS96 character sets in a coding system which is characterized as
2167 7-bit environment, non-locking-shift, and non-single-shift.
2168
2169 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2170 '(' must be omitted. We refer to this as "short-form" hereafter.
2171
2172 Now you may notice that there are a lot of ways of encoding the
2173 same multilingual text in ISO2022. Actually, there exist many
2174 coding systems such as Compound Text (used in X11's inter client
2175 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2176 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2177 localized platforms), and all of these are variants of ISO2022.
2178
2179 In addition to the above, Emacs handles two more kinds of escape
2180 sequences: ISO6429's direction specification and Emacs' private
2181 sequence for specifying character composition.
2182
2183 ISO6429's direction specification takes the following form:
2184 o CSI ']' -- end of the current direction
2185 o CSI '0' ']' -- end of the current direction
2186 o CSI '1' ']' -- start of left-to-right text
2187 o CSI '2' ']' -- start of right-to-left text
2188 The control character CSI (0x9B: control sequence introducer) is
2189 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2190
2191 Character composition specification takes the following form:
2192 o ESC '0' -- start relative composition
2193 o ESC '1' -- end composition
2194 o ESC '2' -- start rule-base composition (*)
2195 o ESC '3' -- start relative composition with alternate chars (**)
2196 o ESC '4' -- start rule-base composition with alternate chars (**)
2197 Since these are not standard escape sequences of any ISO standard,
2198 the use of them with these meanings is restricted to Emacs only.
2199
2200 (*) This form is used only in Emacs 20.7 and older versions,
2201 but newer versions can safely decode it.
2202 (**) This form is used only in Emacs 21.1 and newer versions,
2203 and older versions can't decode it.
2204
2205 Here's a list of example usages of these composition escape
2206 sequences (categorized by `enum composition_method').
2207
2208 COMPOSITION_RELATIVE:
2209 ESC 0 CHAR [ CHAR ] ESC 1
2210 COMPOSITION_WITH_RULE:
2211 ESC 2 CHAR [ RULE CHAR ] ESC 1
2212 COMPOSITION_WITH_ALTCHARS:
2213 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2214 COMPOSITION_WITH_RULE_ALTCHARS:
2215 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2216
2217 enum iso_code_class_type iso_code_class[256];
2218
2219 #define SAFE_CHARSET_P(coding, id) \
2220 ((id) <= (coding)->max_charset_id \
2221 && (coding)->safe_charsets[id] >= 0)
2222
2223
2224 #define SHIFT_OUT_OK(category) \
2225 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2226
2227 static void
2228 setup_iso_safe_charsets (attrs)
2229 Lisp_Object attrs;
2230 {
2231 Lisp_Object charset_list, safe_charsets;
2232 Lisp_Object request;
2233 Lisp_Object reg_usage;
2234 Lisp_Object tail;
2235 int reg94, reg96;
2236 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2237 int max_charset_id;
2238
2239 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2240 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2241 && ! EQ (charset_list, Viso_2022_charset_list))
2242 {
2243 CODING_ATTR_CHARSET_LIST (attrs)
2244 = charset_list = Viso_2022_charset_list;
2245 ASET (attrs, coding_attr_safe_charsets, Qnil);
2246 }
2247
2248 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2249 return;
2250
2251 max_charset_id = 0;
2252 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2253 {
2254 int id = XINT (XCAR (tail));
2255 if (max_charset_id < id)
2256 max_charset_id = id;
2257 }
2258
2259 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2260 make_number (255));
2261 request = AREF (attrs, coding_attr_iso_request);
2262 reg_usage = AREF (attrs, coding_attr_iso_usage);
2263 reg94 = XINT (XCAR (reg_usage));
2264 reg96 = XINT (XCDR (reg_usage));
2265
2266 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2267 {
2268 Lisp_Object id;
2269 Lisp_Object reg;
2270 struct charset *charset;
2271
2272 id = XCAR (tail);
2273 charset = CHARSET_FROM_ID (XINT (id));
2274 reg = Fcdr (Fassq (id, request));
2275 if (! NILP (reg))
2276 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2277 else if (charset->iso_chars_96)
2278 {
2279 if (reg96 < 4)
2280 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2281 }
2282 else
2283 {
2284 if (reg94 < 4)
2285 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2286 }
2287 }
2288 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2289 }
2290
2291
2292 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2293 Check if a text is encoded in ISO2022. If it is, returns an
2294 integer in which appropriate flag bits any of:
2295 CATEGORY_MASK_ISO_7
2296 CATEGORY_MASK_ISO_7_TIGHT
2297 CATEGORY_MASK_ISO_8_1
2298 CATEGORY_MASK_ISO_8_2
2299 CATEGORY_MASK_ISO_7_ELSE
2300 CATEGORY_MASK_ISO_8_ELSE
2301 are set. If a code which should never appear in ISO2022 is found,
2302 returns 0. */
2303
2304 static int
2305 detect_coding_iso_2022 (coding, mask)
2306 struct coding_system *coding;
2307 int *mask;
2308 {
2309 unsigned char *src = coding->source, *src_base = src;
2310 unsigned char *src_end = coding->source + coding->src_bytes;
2311 int multibytep = coding->src_multibyte;
2312 int mask_iso = CATEGORY_MASK_ISO;
2313 int mask_found = 0, mask_8bit_found = 0;
2314 int reg[4], shift_out = 0, single_shifting = 0;
2315 int id;
2316 int c, c1;
2317 int consumed_chars = 0;
2318 int i;
2319
2320 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2321 {
2322 struct coding_system *this = &(coding_categories[i]);
2323 Lisp_Object attrs, val;
2324
2325 attrs = CODING_ID_ATTRS (this->id);
2326 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2327 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2328 setup_iso_safe_charsets (attrs);
2329 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2330 this->max_charset_id = XSTRING (val)->size - 1;
2331 this->safe_charsets = (char *) XSTRING (val)->data;
2332 }
2333
2334 /* A coding system of this category is always ASCII compatible. */
2335 src += coding->head_ascii;
2336
2337 reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2338 while (mask_iso && src < src_end)
2339 {
2340 ONE_MORE_BYTE (c);
2341 switch (c)
2342 {
2343 case ISO_CODE_ESC:
2344 if (inhibit_iso_escape_detection)
2345 break;
2346 single_shifting = 0;
2347 ONE_MORE_BYTE (c);
2348 if (c >= '(' && c <= '/')
2349 {
2350 /* Designation sequence for a charset of dimension 1. */
2351 ONE_MORE_BYTE (c1);
2352 if (c1 < ' ' || c1 >= 0x80
2353 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2354 /* Invalid designation sequence. Just ignore. */
2355 break;
2356 reg[(c - '(') % 4] = id;
2357 }
2358 else if (c == '$')
2359 {
2360 /* Designation sequence for a charset of dimension 2. */
2361 ONE_MORE_BYTE (c);
2362 if (c >= '@' && c <= 'B')
2363 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2364 reg[0] = id = iso_charset_table[1][0][c];
2365 else if (c >= '(' && c <= '/')
2366 {
2367 ONE_MORE_BYTE (c1);
2368 if (c1 < ' ' || c1 >= 0x80
2369 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2370 /* Invalid designation sequence. Just ignore. */
2371 break;
2372 reg[(c - '(') % 4] = id;
2373 }
2374 else
2375 /* Invalid designation sequence. Just ignore. */
2376 break;
2377 }
2378 else if (c == 'N' || c == 'O')
2379 {
2380 /* ESC <Fe> for SS2 or SS3. */
2381 mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
2382 break;
2383 }
2384 else if (c >= '0' && c <= '4')
2385 {
2386 /* ESC <Fp> for start/end composition. */
2387 mask_found |= CATEGORY_MASK_ISO;
2388 break;
2389 }
2390 else
2391 {
2392 /* Invalid escape sequence. */
2393 mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2394 break;
2395 }
2396
2397 /* We found a valid designation sequence for CHARSET. */
2398 mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2399 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2400 id))
2401 mask_found |= CATEGORY_MASK_ISO_7;
2402 else
2403 mask_iso &= ~CATEGORY_MASK_ISO_7;
2404 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2405 id))
2406 mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
2407 else
2408 mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2409 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2410 id))
2411 mask_found |= CATEGORY_MASK_ISO_7_ELSE;
2412 else
2413 mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2414 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2415 id))
2416 mask_found |= CATEGORY_MASK_ISO_8_ELSE;
2417 else
2418 mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
2419 break;
2420
2421 case ISO_CODE_SO:
2422 if (inhibit_iso_escape_detection)
2423 break;
2424 single_shifting = 0;
2425 if (shift_out == 0
2426 && (reg[1] >= 0
2427 || SHIFT_OUT_OK (coding_category_iso_7_else)
2428 || SHIFT_OUT_OK (coding_category_iso_8_else)))
2429 {
2430 /* Locking shift out. */
2431 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2432 mask_found |= CATEGORY_MASK_ISO_ELSE;
2433 }
2434 break;
2435
2436 case ISO_CODE_SI:
2437 if (inhibit_iso_escape_detection)
2438 break;
2439 single_shifting = 0;
2440 if (shift_out == 1)
2441 {
2442 /* Locking shift in. */
2443 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2444 mask_found |= CATEGORY_MASK_ISO_ELSE;
2445 }
2446 break;
2447
2448 case ISO_CODE_CSI:
2449 single_shifting = 0;
2450 case ISO_CODE_SS2:
2451 case ISO_CODE_SS3:
2452 {
2453 int newmask = CATEGORY_MASK_ISO_8_ELSE;
2454
2455 if (inhibit_iso_escape_detection)
2456 break;
2457 if (c != ISO_CODE_CSI)
2458 {
2459 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2460 & CODING_ISO_FLAG_SINGLE_SHIFT)
2461 newmask |= CATEGORY_MASK_ISO_8_1;
2462 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2463 & CODING_ISO_FLAG_SINGLE_SHIFT)
2464 newmask |= CATEGORY_MASK_ISO_8_2;
2465 single_shifting = 1;
2466 }
2467 if (VECTORP (Vlatin_extra_code_table)
2468 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2469 {
2470 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2471 & CODING_ISO_FLAG_LATIN_EXTRA)
2472 newmask |= CATEGORY_MASK_ISO_8_1;
2473 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2474 & CODING_ISO_FLAG_LATIN_EXTRA)
2475 newmask |= CATEGORY_MASK_ISO_8_2;
2476 }
2477 mask_iso &= newmask;
2478 mask_found |= newmask;
2479 }
2480 break;
2481
2482 default:
2483 if (c < 0x80)
2484 {
2485 single_shifting = 0;
2486 break;
2487 }
2488 else if (c < 0xA0)
2489 {
2490 single_shifting = 0;
2491 mask_8bit_found = 1;
2492 if (VECTORP (Vlatin_extra_code_table)
2493 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2494 {
2495 int newmask = 0;
2496
2497 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2498 & CODING_ISO_FLAG_LATIN_EXTRA)
2499 newmask |= CATEGORY_MASK_ISO_8_1;
2500 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2501 & CODING_ISO_FLAG_LATIN_EXTRA)
2502 newmask |= CATEGORY_MASK_ISO_8_2;
2503 mask_iso &= newmask;
2504 mask_found |= newmask;
2505 }
2506 else
2507 return 0;
2508 }
2509 else
2510 {
2511 mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2512 | CATEGORY_MASK_ISO_7_ELSE);
2513 mask_found |= CATEGORY_MASK_ISO_8_1;
2514 mask_8bit_found = 1;
2515 /* Check the length of succeeding codes of the range
2516 0xA0..0FF. If the byte length is odd, we exclude
2517 CATEGORY_MASK_ISO_8_2. We can check this only
2518 when we are not single shifting. */
2519 if (!single_shifting
2520 && mask_iso & CATEGORY_MASK_ISO_8_2)
2521 {
2522 int i = 1;
2523 while (src < src_end)
2524 {
2525 ONE_MORE_BYTE (c);
2526 if (c < 0xA0)
2527 break;
2528 i++;
2529 }
2530
2531 if (i & 1 && src < src_end)
2532 mask_iso &= ~CATEGORY_MASK_ISO_8_2;
2533 else
2534 mask_found |= CATEGORY_MASK_ISO_8_2;
2535 }
2536 }
2537 break;
2538 }
2539 }
2540 no_more_source:
2541 if (!mask_iso)
2542 {
2543 *mask &= ~CATEGORY_MASK_ISO;
2544 return 0;
2545 }
2546 if (!mask_found)
2547 return 0;
2548 *mask &= mask_iso & mask_found;
2549 if (! mask_8bit_found)
2550 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2551 return 1;
2552 }
2553
2554
2555 /* Set designation state into CODING. */
2556 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2557 do { \
2558 int id, prev; \
2559 \
2560 if (final < '0' || final >= 128 \
2561 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2562 || !SAFE_CHARSET_P (coding, id)) \
2563 { \
2564 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2565 goto invalid_code; \
2566 } \
2567 prev = CODING_ISO_DESIGNATION (coding, reg); \
2568 if (id == charset_jisx0201_roman) \
2569 { \
2570 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2571 id = charset_ascii; \
2572 } \
2573 else if (id == charset_jisx0208_1978) \
2574 { \
2575 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2576 id = charset_jisx0208; \
2577 } \
2578 CODING_ISO_DESIGNATION (coding, reg) = id; \
2579 /* If there was an invalid designation to REG previously, and this \
2580 designation is ASCII to REG, we should keep this designation \
2581 sequence. */ \
2582 if (prev == -2 && id == charset_ascii) \
2583 goto invalid_code; \
2584 } while (0)
2585
2586
2587 #define MAYBE_FINISH_COMPOSITION() \
2588 do { \
2589 int i; \
2590 if (composition_state == COMPOSING_NO) \
2591 break; \
2592 /* It is assured that we have enough room for producing \
2593 characters stored in the table `components'. */ \
2594 if (charbuf + component_idx > charbuf_end) \
2595 goto no_more_source; \
2596 composition_state = COMPOSING_NO; \
2597 if (method == COMPOSITION_RELATIVE \
2598 || method == COMPOSITION_WITH_ALTCHARS) \
2599 { \
2600 for (i = 0; i < component_idx; i++) \
2601 *charbuf++ = components[i]; \
2602 char_offset += component_idx; \
2603 } \
2604 else \
2605 { \
2606 for (i = 0; i < component_idx; i += 2) \
2607 *charbuf++ = components[i]; \
2608 char_offset += (component_idx / 2) + 1; \
2609 } \
2610 } while (0)
2611
2612
2613 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2614 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2615 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2616 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2617 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2618 */
2619
2620 #define DECODE_COMPOSITION_START(c1) \
2621 do { \
2622 if (c1 == '0' \
2623 && composition_state == COMPOSING_COMPONENT_RULE) \
2624 { \
2625 component_len = component_idx; \
2626 composition_state = COMPOSING_CHAR; \
2627 } \
2628 else \
2629 { \
2630 unsigned char *p; \
2631 \
2632 MAYBE_FINISH_COMPOSITION (); \
2633 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2634 goto no_more_source; \
2635 for (p = src; p < src_end - 1; p++) \
2636 if (*p == ISO_CODE_ESC && p[1] == '1') \
2637 break; \
2638 if (p == src_end - 1) \
2639 { \
2640 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2641 goto invalid_code; \
2642 goto no_more_source; \
2643 } \
2644 \
2645 /* This is surely the start of a composition. */ \
2646 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2647 : c1 == '2' ? COMPOSITION_WITH_RULE \
2648 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2649 : COMPOSITION_WITH_RULE_ALTCHARS); \
2650 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2651 : COMPOSING_COMPONENT_CHAR); \
2652 component_idx = component_len = 0; \
2653 } \
2654 } while (0)
2655
2656
2657 /* Handle compositoin end sequence ESC 1. */
2658
2659 #define DECODE_COMPOSITION_END() \
2660 do { \
2661 int nchars = (component_len > 0 ? component_idx - component_len \
2662 : method == COMPOSITION_RELATIVE ? component_idx \
2663 : (component_idx + 1) / 2); \
2664 int i; \
2665 int *saved_charbuf = charbuf; \
2666 \
2667 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2668 if (method != COMPOSITION_RELATIVE) \
2669 { \
2670 if (component_len == 0) \
2671 for (i = 0; i < component_idx; i++) \
2672 *charbuf++ = components[i]; \
2673 else \
2674 for (i = 0; i < component_len; i++) \
2675 *charbuf++ = components[i]; \
2676 *saved_charbuf = saved_charbuf - charbuf; \
2677 } \
2678 if (method == COMPOSITION_WITH_RULE) \
2679 for (i = 0; i < component_idx; i += 2, char_offset++) \
2680 *charbuf++ = components[i]; \
2681 else \
2682 for (i = component_len; i < component_idx; i++, char_offset++) \
2683 *charbuf++ = components[i]; \
2684 coding->annotated = 1; \
2685 composition_state = COMPOSING_NO; \
2686 } while (0)
2687
2688
2689 /* Decode a composition rule from the byte C1 (and maybe one more byte
2690 from SRC) and store one encoded composition rule in
2691 coding->cmp_data. */
2692
2693 #define DECODE_COMPOSITION_RULE(c1) \
2694 do { \
2695 (c1) -= 32; \
2696 if (c1 < 81) /* old format (before ver.21) */ \
2697 { \
2698 int gref = (c1) / 9; \
2699 int nref = (c1) % 9; \
2700 if (gref == 4) gref = 10; \
2701 if (nref == 4) nref = 10; \
2702 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2703 } \
2704 else if (c1 < 93) /* new format (after ver.21) */ \
2705 { \
2706 ONE_MORE_BYTE (c2); \
2707 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2708 } \
2709 else \
2710 c1 = 0; \
2711 } while (0)
2712
2713
2714 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2715
2716 static void
2717 decode_coding_iso_2022 (coding)
2718 struct coding_system *coding;
2719 {
2720 unsigned char *src = coding->source + coding->consumed;
2721 unsigned char *src_end = coding->source + coding->src_bytes;
2722 unsigned char *src_base;
2723 int *charbuf = coding->charbuf;
2724 int *charbuf_end = charbuf + coding->charbuf_size - 4;
2725 int consumed_chars = 0, consumed_chars_base;
2726 int char_offset = 0;
2727 int multibytep = coding->src_multibyte;
2728 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2729 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2730 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2731 struct charset *charset;
2732 int c;
2733 /* For handling composition sequence. */
2734 #define COMPOSING_NO 0
2735 #define COMPOSING_CHAR 1
2736 #define COMPOSING_RULE 2
2737 #define COMPOSING_COMPONENT_CHAR 3
2738 #define COMPOSING_COMPONENT_RULE 4
2739
2740 int composition_state = COMPOSING_NO;
2741 enum composition_method method;
2742 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2743 int component_idx;
2744 int component_len;
2745 Lisp_Object attrs, eol_type, charset_list;
2746
2747 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2748 setup_iso_safe_charsets (attrs);
2749
2750 while (1)
2751 {
2752 int c1, c2;
2753
2754 src_base = src;
2755 consumed_chars_base = consumed_chars;
2756
2757 if (charbuf >= charbuf_end)
2758 break;
2759
2760 ONE_MORE_BYTE (c1);
2761
2762 /* We produce no character or one character. */
2763 switch (iso_code_class [c1])
2764 {
2765 case ISO_0x20_or_0x7F:
2766 if (composition_state != COMPOSING_NO)
2767 {
2768 if (composition_state == COMPOSING_RULE
2769 || composition_state == COMPOSING_COMPONENT_RULE)
2770 {
2771 DECODE_COMPOSITION_RULE (c1);
2772 components[component_idx++] = c1;
2773 composition_state--;
2774 continue;
2775 }
2776 }
2777 if (charset_id_0 < 0
2778 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2779 /* This is SPACE or DEL. */
2780 charset = CHARSET_FROM_ID (charset_ascii);
2781 else
2782 charset = CHARSET_FROM_ID (charset_id_0);
2783 break;
2784
2785 case ISO_graphic_plane_0:
2786 if (composition_state != COMPOSING_NO)
2787 {
2788 if (composition_state == COMPOSING_RULE
2789 || composition_state == COMPOSING_COMPONENT_RULE)
2790 {
2791 DECODE_COMPOSITION_RULE (c1);
2792 components[component_idx++] = c1;
2793 composition_state--;
2794 continue;
2795 }
2796 }
2797 charset = CHARSET_FROM_ID (charset_id_0);
2798 break;
2799
2800 case ISO_0xA0_or_0xFF:
2801 if (charset_id_1 < 0
2802 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2803 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2804 goto invalid_code;
2805 /* This is a graphic character, we fall down ... */
2806
2807 case ISO_graphic_plane_1:
2808 if (charset_id_1 < 0)
2809 goto invalid_code;
2810 charset = CHARSET_FROM_ID (charset_id_1);
2811 break;
2812
2813 case ISO_carriage_return:
2814 if (c1 == '\r')
2815 {
2816 if (EQ (eol_type, Qdos))
2817 {
2818 if (src == src_end)
2819 goto no_more_source;
2820 if (*src == '\n')
2821 ONE_MORE_BYTE (c1);
2822 }
2823 else if (EQ (eol_type, Qmac))
2824 c1 = '\n';
2825 }
2826 /* fall through */
2827
2828 case ISO_control_0:
2829 MAYBE_FINISH_COMPOSITION ();
2830 charset = CHARSET_FROM_ID (charset_ascii);
2831 break;
2832
2833 case ISO_control_1:
2834 MAYBE_FINISH_COMPOSITION ();
2835 goto invalid_code;
2836
2837 case ISO_shift_out:
2838 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2839 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2840 goto invalid_code;
2841 CODING_ISO_INVOCATION (coding, 0) = 1;
2842 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2843 continue;
2844
2845 case ISO_shift_in:
2846 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2847 goto invalid_code;
2848 CODING_ISO_INVOCATION (coding, 0) = 0;
2849 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2850 continue;
2851
2852 case ISO_single_shift_2_7:
2853 case ISO_single_shift_2:
2854 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2855 goto invalid_code;
2856 /* SS2 is handled as an escape sequence of ESC 'N' */
2857 c1 = 'N';
2858 goto label_escape_sequence;
2859
2860 case ISO_single_shift_3:
2861 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2862 goto invalid_code;
2863 /* SS2 is handled as an escape sequence of ESC 'O' */
2864 c1 = 'O';
2865 goto label_escape_sequence;
2866
2867 case ISO_control_sequence_introducer:
2868 /* CSI is handled as an escape sequence of ESC '[' ... */
2869 c1 = '[';
2870 goto label_escape_sequence;
2871
2872 case ISO_escape:
2873 ONE_MORE_BYTE (c1);
2874 label_escape_sequence:
2875 /* Escape sequences handled here are invocation,
2876 designation, direction specification, and character
2877 composition specification. */
2878 switch (c1)
2879 {
2880 case '&': /* revision of following character set */
2881 ONE_MORE_BYTE (c1);
2882 if (!(c1 >= '@' && c1 <= '~'))
2883 goto invalid_code;
2884 ONE_MORE_BYTE (c1);
2885 if (c1 != ISO_CODE_ESC)
2886 goto invalid_code;
2887 ONE_MORE_BYTE (c1);
2888 goto label_escape_sequence;
2889
2890 case '$': /* designation of 2-byte character set */
2891 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2892 goto invalid_code;
2893 ONE_MORE_BYTE (c1);
2894 if (c1 >= '@' && c1 <= 'B')
2895 { /* designation of JISX0208.1978, GB2312.1980,
2896 or JISX0208.1980 */
2897 DECODE_DESIGNATION (0, 2, 0, c1);
2898 }
2899 else if (c1 >= 0x28 && c1 <= 0x2B)
2900 { /* designation of DIMENSION2_CHARS94 character set */
2901 ONE_MORE_BYTE (c2);
2902 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2903 }
2904 else if (c1 >= 0x2C && c1 <= 0x2F)
2905 { /* designation of DIMENSION2_CHARS96 character set */
2906 ONE_MORE_BYTE (c2);
2907 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2908 }
2909 else
2910 goto invalid_code;
2911 /* We must update these variables now. */
2912 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2913 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2914 continue;
2915
2916 case 'n': /* invocation of locking-shift-2 */
2917 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2918 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2919 goto invalid_code;
2920 CODING_ISO_INVOCATION (coding, 0) = 2;
2921 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2922 continue;
2923
2924 case 'o': /* invocation of locking-shift-3 */
2925 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2926 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2927 goto invalid_code;
2928 CODING_ISO_INVOCATION (coding, 0) = 3;
2929 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2930 continue;
2931
2932 case 'N': /* invocation of single-shift-2 */
2933 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2934 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2935 goto invalid_code;
2936 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2937 ONE_MORE_BYTE (c1);
2938 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2939 goto invalid_code;
2940 break;
2941
2942 case 'O': /* invocation of single-shift-3 */
2943 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2944 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2945 goto invalid_code;
2946 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2947 ONE_MORE_BYTE (c1);
2948 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2949 goto invalid_code;
2950 break;
2951
2952 case '0': case '2': case '3': case '4': /* start composition */
2953 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2954 goto invalid_code;
2955 DECODE_COMPOSITION_START (c1);
2956 continue;
2957
2958 case '1': /* end composition */
2959 if (composition_state == COMPOSING_NO)
2960 goto invalid_code;
2961 DECODE_COMPOSITION_END ();
2962 continue;
2963
2964 case '[': /* specification of direction */
2965 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2966 goto invalid_code;
2967 /* For the moment, nested direction is not supported.
2968 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2969 left-to-right, and nozero means right-to-left. */
2970 ONE_MORE_BYTE (c1);
2971 switch (c1)
2972 {
2973 case ']': /* end of the current direction */
2974 coding->mode &= ~CODING_MODE_DIRECTION;
2975
2976 case '0': /* end of the current direction */
2977 case '1': /* start of left-to-right direction */
2978 ONE_MORE_BYTE (c1);
2979 if (c1 == ']')
2980 coding->mode &= ~CODING_MODE_DIRECTION;
2981 else
2982 goto invalid_code;
2983 break;
2984
2985 case '2': /* start of right-to-left direction */
2986 ONE_MORE_BYTE (c1);
2987 if (c1 == ']')
2988 coding->mode |= CODING_MODE_DIRECTION;
2989 else
2990 goto invalid_code;
2991 break;
2992
2993 default:
2994 goto invalid_code;
2995 }
2996 continue;
2997
2998 default:
2999 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3000 goto invalid_code;
3001 if (c1 >= 0x28 && c1 <= 0x2B)
3002 { /* designation of DIMENSION1_CHARS94 character set */
3003 ONE_MORE_BYTE (c2);
3004 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3005 }
3006 else if (c1 >= 0x2C && c1 <= 0x2F)
3007 { /* designation of DIMENSION1_CHARS96 character set */
3008 ONE_MORE_BYTE (c2);
3009 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3010 }
3011 else
3012 goto invalid_code;
3013 /* We must update these variables now. */
3014 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3015 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3016 continue;
3017 }
3018 }
3019
3020 /* Now we know CHARSET and 1st position code C1 of a character.
3021 Produce a decoded character while getting 2nd position code
3022 C2 if necessary. */
3023 c1 &= 0x7F;
3024 if (CHARSET_DIMENSION (charset) > 1)
3025 {
3026 ONE_MORE_BYTE (c2);
3027 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3028 /* C2 is not in a valid range. */
3029 goto invalid_code;
3030 c1 = (c1 << 8) | (c2 & 0x7F);
3031 if (CHARSET_DIMENSION (charset) > 2)
3032 {
3033 ONE_MORE_BYTE (c2);
3034 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3035 /* C2 is not in a valid range. */
3036 goto invalid_code;
3037 c1 = (c1 << 8) | (c2 & 0x7F);
3038 }
3039 }
3040
3041 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3042 if (c < 0)
3043 {
3044 MAYBE_FINISH_COMPOSITION ();
3045 for (; src_base < src; src_base++, char_offset++)
3046 {
3047 if (ASCII_BYTE_P (*src_base))
3048 *charbuf++ = *src_base;
3049 else
3050 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3051 }
3052 }
3053 else if (composition_state == COMPOSING_NO)
3054 {
3055 *charbuf++ = c;
3056 char_offset++;
3057 }
3058 else
3059 {
3060 components[component_idx++] = c;
3061 if (method == COMPOSITION_WITH_RULE
3062 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3063 && composition_state == COMPOSING_COMPONENT_CHAR))
3064 composition_state++;
3065 }
3066 continue;
3067
3068 invalid_code:
3069 MAYBE_FINISH_COMPOSITION ();
3070 src = src_base;
3071 consumed_chars = consumed_chars_base;
3072 ONE_MORE_BYTE (c);
3073 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3074 coding->errors++;
3075 }
3076
3077 no_more_source:
3078 coding->consumed_char += consumed_chars_base;
3079 coding->consumed = src_base - coding->source;
3080 coding->charbuf_used = charbuf - coding->charbuf;
3081 }
3082
3083
3084 /* ISO2022 encoding stuff. */
3085
3086 /*
3087 It is not enough to say just "ISO2022" on encoding, we have to
3088 specify more details. In Emacs, each coding system of ISO2022
3089 variant has the following specifications:
3090 1. Initial designation to G0 thru G3.
3091 2. Allows short-form designation?
3092 3. ASCII should be designated to G0 before control characters?
3093 4. ASCII should be designated to G0 at end of line?
3094 5. 7-bit environment or 8-bit environment?
3095 6. Use locking-shift?
3096 7. Use Single-shift?
3097 And the following two are only for Japanese:
3098 8. Use ASCII in place of JIS0201-1976-Roman?
3099 9. Use JISX0208-1983 in place of JISX0208-1978?
3100 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3101 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3102 details.
3103 */
3104
3105 /* Produce codes (escape sequence) for designating CHARSET to graphic
3106 register REG at DST, and increment DST. If <final-char> of CHARSET is
3107 '@', 'A', or 'B' and the coding system CODING allows, produce
3108 designation sequence of short-form. */
3109
3110 #define ENCODE_DESIGNATION(charset, reg, coding) \
3111 do { \
3112 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3113 char *intermediate_char_94 = "()*+"; \
3114 char *intermediate_char_96 = ",-./"; \
3115 int revision = -1; \
3116 int c; \
3117 \
3118 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3119 revision = CHARSET_ISO_REVISION (charset); \
3120 \
3121 if (revision >= 0) \
3122 { \
3123 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3124 EMIT_ONE_BYTE ('@' + revision); \
3125 } \
3126 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3127 if (CHARSET_DIMENSION (charset) == 1) \
3128 { \
3129 if (! CHARSET_ISO_CHARS_96 (charset)) \
3130 c = intermediate_char_94[reg]; \
3131 else \
3132 c = intermediate_char_96[reg]; \
3133 EMIT_ONE_ASCII_BYTE (c); \
3134 } \
3135 else \
3136 { \
3137 EMIT_ONE_ASCII_BYTE ('$'); \
3138 if (! CHARSET_ISO_CHARS_96 (charset)) \
3139 { \
3140 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3141 || reg != 0 \
3142 || final_char < '@' || final_char > 'B') \
3143 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3144 } \
3145 else \
3146 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3147 } \
3148 EMIT_ONE_ASCII_BYTE (final_char); \
3149 \
3150 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3151 } while (0)
3152
3153
3154 /* The following two macros produce codes (control character or escape
3155 sequence) for ISO2022 single-shift functions (single-shift-2 and
3156 single-shift-3). */
3157
3158 #define ENCODE_SINGLE_SHIFT_2 \
3159 do { \
3160 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3161 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3162 else \
3163 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3164 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3165 } while (0)
3166
3167
3168 #define ENCODE_SINGLE_SHIFT_3 \
3169 do { \
3170 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3171 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3172 else \
3173 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3174 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3175 } while (0)
3176
3177
3178 /* The following four macros produce codes (control character or
3179 escape sequence) for ISO2022 locking-shift functions (shift-in,
3180 shift-out, locking-shift-2, and locking-shift-3). */
3181
3182 #define ENCODE_SHIFT_IN \
3183 do { \
3184 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3185 CODING_ISO_INVOCATION (coding, 0) = 0; \
3186 } while (0)
3187
3188
3189 #define ENCODE_SHIFT_OUT \
3190 do { \
3191 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3192 CODING_ISO_INVOCATION (coding, 0) = 1; \
3193 } while (0)
3194
3195
3196 #define ENCODE_LOCKING_SHIFT_2 \
3197 do { \
3198 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3199 CODING_ISO_INVOCATION (coding, 0) = 2; \
3200 } while (0)
3201
3202
3203 #define ENCODE_LOCKING_SHIFT_3 \
3204 do { \
3205 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3206 CODING_ISO_INVOCATION (coding, 0) = 3; \
3207 } while (0)
3208
3209
3210 /* Produce codes for a DIMENSION1 character whose character set is
3211 CHARSET and whose position-code is C1. Designation and invocation
3212 sequences are also produced in advance if necessary. */
3213
3214 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3215 do { \
3216 int id = CHARSET_ID (charset); \
3217 \
3218 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3219 && id == charset_ascii) \
3220 { \
3221 id = charset_jisx0201_roman; \
3222 charset = CHARSET_FROM_ID (id); \
3223 } \
3224 \
3225 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3226 { \
3227 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3228 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3229 else \
3230 EMIT_ONE_BYTE (c1 | 0x80); \
3231 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3232 break; \
3233 } \
3234 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3235 { \
3236 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3237 break; \
3238 } \
3239 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3240 { \
3241 EMIT_ONE_BYTE (c1 | 0x80); \
3242 break; \
3243 } \
3244 else \
3245 /* Since CHARSET is not yet invoked to any graphic planes, we \
3246 must invoke it, or, at first, designate it to some graphic \
3247 register. Then repeat the loop to actually produce the \
3248 character. */ \
3249 dst = encode_invocation_designation (charset, coding, dst, \
3250 &produced_chars); \
3251 } while (1)
3252
3253
3254 /* Produce codes for a DIMENSION2 character whose character set is
3255 CHARSET and whose position-codes are C1 and C2. Designation and
3256 invocation codes are also produced in advance if necessary. */
3257
3258 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3259 do { \
3260 int id = CHARSET_ID (charset); \
3261 \
3262 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3263 && id == charset_jisx0208) \
3264 { \
3265 id = charset_jisx0208_1978; \
3266 charset = CHARSET_FROM_ID (id); \
3267 } \
3268 \
3269 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3270 { \
3271 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3272 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3273 else \
3274 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3275 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3276 break; \
3277 } \
3278 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3279 { \
3280 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3281 break; \
3282 } \
3283 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3284 { \
3285 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3286 break; \
3287 } \
3288 else \
3289 /* Since CHARSET is not yet invoked to any graphic planes, we \
3290 must invoke it, or, at first, designate it to some graphic \
3291 register. Then repeat the loop to actually produce the \
3292 character. */ \
3293 dst = encode_invocation_designation (charset, coding, dst, \
3294 &produced_chars); \
3295 } while (1)
3296
3297
3298 #define ENCODE_ISO_CHARACTER(charset, c) \
3299 do { \
3300 int code = ENCODE_CHAR ((charset),(c)); \
3301 \
3302 if (CHARSET_DIMENSION (charset) == 1) \
3303 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3304 else \
3305 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3306 } while (0)
3307
3308
3309 /* Produce designation and invocation codes at a place pointed by DST
3310 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3311 Return new DST. */
3312
3313 unsigned char *
3314 encode_invocation_designation (charset, coding, dst, p_nchars)
3315 struct charset *charset;
3316 struct coding_system *coding;
3317 unsigned char *dst;
3318 int *p_nchars;
3319 {
3320 int multibytep = coding->dst_multibyte;
3321 int produced_chars = *p_nchars;
3322 int reg; /* graphic register number */
3323 int id = CHARSET_ID (charset);
3324
3325 /* At first, check designations. */
3326 for (reg = 0; reg < 4; reg++)
3327 if (id == CODING_ISO_DESIGNATION (coding, reg))
3328 break;
3329
3330 if (reg >= 4)
3331 {
3332 /* CHARSET is not yet designated to any graphic registers. */
3333 /* At first check the requested designation. */
3334 reg = CODING_ISO_REQUEST (coding, id);
3335 if (reg < 0)
3336 /* Since CHARSET requests no special designation, designate it
3337 to graphic register 0. */
3338 reg = 0;
3339
3340 ENCODE_DESIGNATION (charset, reg, coding);
3341 }
3342
3343 if (CODING_ISO_INVOCATION (coding, 0) != reg
3344 && CODING_ISO_INVOCATION (coding, 1) != reg)
3345 {
3346 /* Since the graphic register REG is not invoked to any graphic
3347 planes, invoke it to graphic plane 0. */
3348 switch (reg)
3349 {
3350 case 0: /* graphic register 0 */
3351 ENCODE_SHIFT_IN;
3352 break;
3353
3354 case 1: /* graphic register 1 */
3355 ENCODE_SHIFT_OUT;
3356 break;
3357
3358 case 2: /* graphic register 2 */
3359 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3360 ENCODE_SINGLE_SHIFT_2;
3361 else
3362 ENCODE_LOCKING_SHIFT_2;
3363 break;
3364
3365 case 3: /* graphic register 3 */
3366 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3367 ENCODE_SINGLE_SHIFT_3;
3368 else
3369 ENCODE_LOCKING_SHIFT_3;
3370 break;
3371 }
3372 }
3373
3374 *p_nchars = produced_chars;
3375 return dst;
3376 }
3377
3378 /* The following three macros produce codes for indicating direction
3379 of text. */
3380 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3381 do { \
3382 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3383 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3384 else \
3385 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3386 } while (0)
3387
3388
3389 #define ENCODE_DIRECTION_R2L() \
3390 do { \
3391 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3392 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3393 } while (0)
3394
3395
3396 #define ENCODE_DIRECTION_L2R() \
3397 do { \
3398 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3399 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3400 } while (0)
3401
3402
3403 /* Produce codes for designation and invocation to reset the graphic
3404 planes and registers to initial state. */
3405 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3406 do { \
3407 int reg; \
3408 struct charset *charset; \
3409 \
3410 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3411 ENCODE_SHIFT_IN; \
3412 for (reg = 0; reg < 4; reg++) \
3413 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3414 && (CODING_ISO_DESIGNATION (coding, reg) \
3415 != CODING_ISO_INITIAL (coding, reg))) \
3416 { \
3417 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3418 ENCODE_DESIGNATION (charset, reg, coding); \
3419 } \
3420 } while (0)
3421
3422
3423 /* Produce designation sequences of charsets in the line started from
3424 SRC to a place pointed by DST, and return updated DST.
3425
3426 If the current block ends before any end-of-line, we may fail to
3427 find all the necessary designations. */
3428
3429 static unsigned char *
3430 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3431 struct coding_system *coding;
3432 int *charbuf, *charbuf_end;
3433 unsigned char *dst;
3434 {
3435 struct charset *charset;
3436 /* Table of charsets to be designated to each graphic register. */
3437 int r[4];
3438 int c, found = 0, reg;
3439 int produced_chars = 0;
3440 int multibytep = coding->dst_multibyte;
3441 Lisp_Object attrs;
3442 Lisp_Object charset_list;
3443
3444 attrs = CODING_ID_ATTRS (coding->id);
3445 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3446 if (EQ (charset_list, Qiso_2022))
3447 charset_list = Viso_2022_charset_list;
3448
3449 for (reg = 0; reg < 4; reg++)
3450 r[reg] = -1;
3451
3452 while (found < 4)
3453 {
3454 int id;
3455
3456 c = *charbuf++;
3457 if (c == '\n')
3458 break;
3459 charset = char_charset (c, charset_list, NULL);
3460 id = CHARSET_ID (charset);
3461 reg = CODING_ISO_REQUEST (coding, id);
3462 if (reg >= 0 && r[reg] < 0)
3463 {
3464 found++;
3465 r[reg] = id;
3466 }
3467 }
3468
3469 if (found)
3470 {
3471 for (reg = 0; reg < 4; reg++)
3472 if (r[reg] >= 0
3473 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3474 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3475 }
3476
3477 return dst;
3478 }
3479
3480 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3481
3482 static int
3483 encode_coding_iso_2022 (coding)
3484 struct coding_system *coding;
3485 {
3486 int multibytep = coding->dst_multibyte;
3487 int *charbuf = coding->charbuf;
3488 int *charbuf_end = charbuf + coding->charbuf_used;
3489 unsigned char *dst = coding->destination + coding->produced;
3490 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3491 int safe_room = 16;
3492 int bol_designation
3493 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3494 && CODING_ISO_BOL (coding));
3495 int produced_chars = 0;
3496 Lisp_Object attrs, eol_type, charset_list;
3497 int ascii_compatible;
3498 int c;
3499
3500 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3501 setup_iso_safe_charsets (attrs);
3502 coding->safe_charsets
3503 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3504
3505 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3506
3507 while (charbuf < charbuf_end)
3508 {
3509 ASSURE_DESTINATION (safe_room);
3510
3511 if (bol_designation)
3512 {
3513 unsigned char *dst_prev = dst;
3514
3515 /* We have to produce designation sequences if any now. */
3516 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3517 bol_designation = 0;
3518 /* We are sure that designation sequences are all ASCII bytes. */
3519 produced_chars += dst - dst_prev;
3520 }
3521
3522 c = *charbuf++;
3523
3524 /* Now encode the character C. */
3525 if (c < 0x20 || c == 0x7F)
3526 {
3527 if (c == '\n'
3528 || (c == '\r' && EQ (eol_type, Qmac)))
3529 {
3530 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3531 ENCODE_RESET_PLANE_AND_REGISTER ();
3532 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3533 {
3534 int i;
3535
3536 for (i = 0; i < 4; i++)
3537 CODING_ISO_DESIGNATION (coding, i)
3538 = CODING_ISO_INITIAL (coding, i);
3539 }
3540 bol_designation
3541 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3542 }
3543 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3544 ENCODE_RESET_PLANE_AND_REGISTER ();
3545 EMIT_ONE_ASCII_BYTE (c);
3546 }
3547 else if (ASCII_CHAR_P (c))
3548 {
3549 if (ascii_compatible)
3550 EMIT_ONE_ASCII_BYTE (c);
3551 else
3552 {
3553 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3554 ENCODE_ISO_CHARACTER (charset, c);
3555 }
3556 }
3557 else if (CHAR_BYTE8_P (c))
3558 {
3559 c = CHAR_TO_BYTE8 (c);
3560 EMIT_ONE_BYTE (c);
3561 }
3562 else
3563 {
3564 struct charset *charset = char_charset (c, charset_list, NULL);
3565
3566 if (!charset)
3567 {
3568 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3569 {
3570 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3571 charset = CHARSET_FROM_ID (charset_ascii);
3572 }
3573 else
3574 {
3575 c = coding->default_char;
3576 charset = char_charset (c, charset_list, NULL);
3577 }
3578 }
3579 ENCODE_ISO_CHARACTER (charset, c);
3580 }
3581 }
3582
3583 if (coding->mode & CODING_MODE_LAST_BLOCK
3584 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3585 {
3586 ASSURE_DESTINATION (safe_room);
3587 ENCODE_RESET_PLANE_AND_REGISTER ();
3588 }
3589 coding->result = CODING_RESULT_SUCCESS;
3590 CODING_ISO_BOL (coding) = bol_designation;
3591 coding->produced_char += produced_chars;
3592 coding->produced = dst - coding->destination;
3593 return 0;
3594 }
3595
3596 \f
3597 /*** 8,9. SJIS and BIG5 handlers ***/
3598
3599 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3600 quite widely. So, for the moment, Emacs supports them in the bare
3601 C code. But, in the future, they may be supported only by CCL. */
3602
3603 /* SJIS is a coding system encoding three character sets: ASCII, right
3604 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3605 as is. A character of charset katakana-jisx0201 is encoded by
3606 "position-code + 0x80". A character of charset japanese-jisx0208
3607 is encoded in 2-byte but two position-codes are divided and shifted
3608 so that it fit in the range below.
3609
3610 --- CODE RANGE of SJIS ---
3611 (character set) (range)
3612 ASCII 0x00 .. 0x7F
3613 KATAKANA-JISX0201 0xA0 .. 0xDF
3614 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3615 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3616 -------------------------------
3617
3618 */
3619
3620 /* BIG5 is a coding system encoding two character sets: ASCII and
3621 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3622 character set and is encoded in two-byte.
3623
3624 --- CODE RANGE of BIG5 ---
3625 (character set) (range)
3626 ASCII 0x00 .. 0x7F
3627 Big5 (1st byte) 0xA1 .. 0xFE
3628 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3629 --------------------------
3630
3631 */
3632
3633 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3634 Check if a text is encoded in SJIS. If it is, return
3635 CATEGORY_MASK_SJIS, else return 0. */
3636
3637 static int
3638 detect_coding_sjis (coding, mask)
3639 struct coding_system *coding;
3640 int *mask;
3641 {
3642 unsigned char *src = coding->source, *src_base = src;
3643 unsigned char *src_end = coding->source + coding->src_bytes;
3644 int multibytep = coding->src_multibyte;
3645 int consumed_chars = 0;
3646 int found = 0;
3647 int c;
3648
3649 /* A coding system of this category is always ASCII compatible. */
3650 src += coding->head_ascii;
3651
3652 while (1)
3653 {
3654 ONE_MORE_BYTE (c);
3655 if (c < 0x80)
3656 continue;
3657 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3658 {
3659 ONE_MORE_BYTE (c);
3660 if (c < 0x40 || c == 0x7F || c > 0xFC)
3661 break;
3662 found = 1;
3663 }
3664 else if (c >= 0xA0 && c < 0xE0)
3665 found = 1;
3666 else
3667 break;
3668 }
3669 *mask &= ~CATEGORY_MASK_SJIS;
3670 return 0;
3671
3672 no_more_source:
3673 if (!found)
3674 return 0;
3675 *mask &= CATEGORY_MASK_SJIS;
3676 return 1;
3677 }
3678
3679 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3680 Check if a text is encoded in BIG5. If it is, return
3681 CATEGORY_MASK_BIG5, else return 0. */
3682
3683 static int
3684 detect_coding_big5 (coding, mask)
3685 struct coding_system *coding;
3686 int *mask;
3687 {
3688 unsigned char *src = coding->source, *src_base = src;
3689 unsigned char *src_end = coding->source + coding->src_bytes;
3690 int multibytep = coding->src_multibyte;
3691 int consumed_chars = 0;
3692 int found = 0;
3693 int c;
3694
3695 /* A coding system of this category is always ASCII compatible. */
3696 src += coding->head_ascii;
3697
3698 while (1)
3699 {
3700 ONE_MORE_BYTE (c);
3701 if (c < 0x80)
3702 continue;
3703 if (c >= 0xA1)
3704 {
3705 ONE_MORE_BYTE (c);
3706 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3707 return 0;
3708 found = 1;
3709 }
3710 else
3711 break;
3712 }
3713 *mask &= ~CATEGORY_MASK_BIG5;
3714 return 0;
3715
3716 no_more_source:
3717 if (!found)
3718 return 0;
3719 *mask &= CATEGORY_MASK_BIG5;
3720 return 1;
3721 }
3722
3723 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3724 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3725
3726 static void
3727 decode_coding_sjis (coding)
3728 struct coding_system *coding;
3729 {
3730 unsigned char *src = coding->source + coding->consumed;
3731 unsigned char *src_end = coding->source + coding->src_bytes;
3732 unsigned char *src_base;
3733 int *charbuf = coding->charbuf;
3734 int *charbuf_end = charbuf + coding->charbuf_size;
3735 int consumed_chars = 0, consumed_chars_base;
3736 int multibytep = coding->src_multibyte;
3737 struct charset *charset_roman, *charset_kanji, *charset_kana;
3738 Lisp_Object attrs, eol_type, charset_list, val;
3739
3740 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3741
3742 val = charset_list;
3743 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3744 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3745 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
3746
3747 while (1)
3748 {
3749 int c, c1;
3750
3751 src_base = src;
3752 consumed_chars_base = consumed_chars;
3753
3754 if (charbuf >= charbuf_end)
3755 break;
3756
3757 ONE_MORE_BYTE (c);
3758
3759 if (c == '\r')
3760 {
3761 if (EQ (eol_type, Qdos))
3762 {
3763 if (src == src_end)
3764 goto no_more_source;
3765 if (*src == '\n')
3766 ONE_MORE_BYTE (c);
3767 }
3768 else if (EQ (eol_type, Qmac))
3769 c = '\n';
3770 }
3771 else
3772 {
3773 struct charset *charset;
3774
3775 if (c < 0x80)
3776 charset = charset_roman;
3777 else
3778 {
3779 if (c >= 0xF0)
3780 goto invalid_code;
3781 if (c < 0xA0 || c >= 0xE0)
3782 {
3783 /* SJIS -> JISX0208 */
3784 ONE_MORE_BYTE (c1);
3785 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3786 goto invalid_code;
3787 c = (c << 8) | c1;
3788 SJIS_TO_JIS (c);
3789 charset = charset_kanji;
3790 }
3791 else
3792 /* SJIS -> JISX0201-Kana */
3793 charset = charset_kana;
3794 }
3795 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3796 }
3797 *charbuf++ = c;
3798 continue;
3799
3800 invalid_code:
3801 src = src_base;
3802 consumed_chars = consumed_chars_base;
3803 ONE_MORE_BYTE (c);
3804 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3805 coding->errors++;
3806 }
3807
3808 no_more_source:
3809 coding->consumed_char += consumed_chars_base;
3810 coding->consumed = src_base - coding->source;
3811 coding->charbuf_used = charbuf - coding->charbuf;
3812 }
3813
3814 static void
3815 decode_coding_big5 (coding)
3816 struct coding_system *coding;
3817 {
3818 unsigned char *src = coding->source + coding->consumed;
3819 unsigned char *src_end = coding->source + coding->src_bytes;
3820 unsigned char *src_base;
3821 int *charbuf = coding->charbuf;
3822 int *charbuf_end = charbuf + coding->charbuf_size;
3823 int consumed_chars = 0, consumed_chars_base;
3824 int multibytep = coding->src_multibyte;
3825 struct charset *charset_roman, *charset_big5;
3826 Lisp_Object attrs, eol_type, charset_list, val;
3827
3828 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3829 val = charset_list;
3830 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3831 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3832
3833 while (1)
3834 {
3835 int c, c1;
3836
3837 src_base = src;
3838 consumed_chars_base = consumed_chars;
3839
3840 if (charbuf >= charbuf_end)
3841 break;
3842
3843 ONE_MORE_BYTE (c);
3844
3845 if (c == '\r')
3846 {
3847 if (EQ (eol_type, Qdos))
3848 {
3849 if (src == src_end)
3850 goto no_more_source;
3851 if (*src == '\n')
3852 ONE_MORE_BYTE (c);
3853 }
3854 else if (EQ (eol_type, Qmac))
3855 c = '\n';
3856 }
3857 else
3858 {
3859 struct charset *charset;
3860 if (c < 0x80)
3861 charset = charset_roman;
3862 else
3863 {
3864 /* BIG5 -> Big5 */
3865 if (c < 0xA1 || c > 0xFE)
3866 goto invalid_code;
3867 ONE_MORE_BYTE (c1);
3868 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3869 goto invalid_code;
3870 c = c << 8 | c1;
3871 charset = charset_big5;
3872 }
3873 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3874 }
3875
3876 *charbuf++ = c;
3877 continue;
3878
3879 invalid_code:
3880 src = src_base;
3881 consumed_chars = consumed_chars_base;
3882 ONE_MORE_BYTE (c);
3883 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3884 coding->errors++;
3885 }
3886
3887 no_more_source:
3888 coding->consumed_char += consumed_chars_base;
3889 coding->consumed = src_base - coding->source;
3890 coding->charbuf_used = charbuf - coding->charbuf;
3891 }
3892
3893 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3894 This function can encode charsets `ascii', `katakana-jisx0201',
3895 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3896 are sure that all these charsets are registered as official charset
3897 (i.e. do not have extended leading-codes). Characters of other
3898 charsets are produced without any encoding. If SJIS_P is 1, encode
3899 SJIS text, else encode BIG5 text. */
3900
3901 static int
3902 encode_coding_sjis (coding)
3903 struct coding_system *coding;
3904 {
3905 int multibytep = coding->dst_multibyte;
3906 int *charbuf = coding->charbuf;
3907 int *charbuf_end = charbuf + coding->charbuf_used;
3908 unsigned char *dst = coding->destination + coding->produced;
3909 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3910 int safe_room = 4;
3911 int produced_chars = 0;
3912 Lisp_Object attrs, eol_type, charset_list, val;
3913 int ascii_compatible;
3914 struct charset *charset_roman, *charset_kanji, *charset_kana;
3915 int c;
3916
3917 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3918 val = charset_list;
3919 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3920 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3921 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3922
3923 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3924
3925 while (charbuf < charbuf_end)
3926 {
3927 ASSURE_DESTINATION (safe_room);
3928 c = *charbuf++;
3929 /* Now encode the character C. */
3930 if (ASCII_CHAR_P (c) && ascii_compatible)
3931 EMIT_ONE_ASCII_BYTE (c);
3932 else if (CHAR_BYTE8_P (c))
3933 {
3934 c = CHAR_TO_BYTE8 (c);
3935 EMIT_ONE_BYTE (c);
3936 }
3937 else
3938 {
3939 unsigned code;
3940 struct charset *charset = char_charset (c, charset_list, &code);
3941
3942 if (!charset)
3943 {
3944 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3945 {
3946 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3947 charset = CHARSET_FROM_ID (charset_ascii);
3948 }
3949 else
3950 {
3951 c = coding->default_char;
3952 charset = char_charset (c, charset_list, &code);
3953 }
3954 }
3955 if (code == CHARSET_INVALID_CODE (charset))
3956 abort ();
3957 if (charset == charset_kanji)
3958 {
3959 int c1, c2;
3960 JIS_TO_SJIS (code);
3961 c1 = code >> 8, c2 = code & 0xFF;
3962 EMIT_TWO_BYTES (c1, c2);
3963 }
3964 else if (charset == charset_kana)
3965 EMIT_ONE_BYTE (code | 0x80);
3966 else
3967 EMIT_ONE_ASCII_BYTE (code & 0x7F);
3968 }
3969 }
3970 coding->result = CODING_RESULT_SUCCESS;
3971 coding->produced_char += produced_chars;
3972 coding->produced = dst - coding->destination;
3973 return 0;
3974 }
3975
3976 static int
3977 encode_coding_big5 (coding)
3978 struct coding_system *coding;
3979 {
3980 int multibytep = coding->dst_multibyte;
3981 int *charbuf = coding->charbuf;
3982 int *charbuf_end = charbuf + coding->charbuf_used;
3983 unsigned char *dst = coding->destination + coding->produced;
3984 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3985 int safe_room = 4;
3986 int produced_chars = 0;
3987 Lisp_Object attrs, eol_type, charset_list, val;
3988 int ascii_compatible;
3989 struct charset *charset_roman, *charset_big5;
3990 int c;
3991
3992 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3993 val = charset_list;
3994 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3995 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3996 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3997
3998 while (charbuf < charbuf_end)
3999 {
4000 ASSURE_DESTINATION (safe_room);
4001 c = *charbuf++;
4002 /* Now encode the character C. */
4003 if (ASCII_CHAR_P (c) && ascii_compatible)
4004 EMIT_ONE_ASCII_BYTE (c);
4005 else if (CHAR_BYTE8_P (c))
4006 {
4007 c = CHAR_TO_BYTE8 (c);
4008 EMIT_ONE_BYTE (c);
4009 }
4010 else
4011 {
4012 unsigned code;
4013 struct charset *charset = char_charset (c, charset_list, &code);
4014
4015 if (! charset)
4016 {
4017 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4018 {
4019 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4020 charset = CHARSET_FROM_ID (charset_ascii);
4021 }
4022 else
4023 {
4024 c = coding->default_char;
4025 charset = char_charset (c, charset_list, &code);
4026 }
4027 }
4028 if (code == CHARSET_INVALID_CODE (charset))
4029 abort ();
4030 if (charset == charset_big5)
4031 {
4032 int c1, c2;
4033
4034 c1 = code >> 8, c2 = code & 0xFF;
4035 EMIT_TWO_BYTES (c1, c2);
4036 }
4037 else
4038 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4039 }
4040 }
4041 coding->result = CODING_RESULT_SUCCESS;
4042 coding->produced_char += produced_chars;
4043 coding->produced = dst - coding->destination;
4044 return 0;
4045 }
4046
4047 \f
4048 /*** 10. CCL handlers ***/
4049
4050 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4051 Check if a text is encoded in a coding system of which
4052 encoder/decoder are written in CCL program. If it is, return
4053 CATEGORY_MASK_CCL, else return 0. */
4054
4055 static int
4056 detect_coding_ccl (coding, mask)
4057 struct coding_system *coding;
4058 int *mask;
4059 {
4060 unsigned char *src = coding->source, *src_base = src;
4061 unsigned char *src_end = coding->source + coding->src_bytes;
4062 int multibytep = coding->src_multibyte;
4063 int consumed_chars = 0;
4064 int found = 0;
4065 unsigned char *valids = CODING_CCL_VALIDS (coding);
4066 int head_ascii = coding->head_ascii;
4067 Lisp_Object attrs;
4068
4069 coding = &coding_categories[coding_category_ccl];
4070 attrs = CODING_ID_ATTRS (coding->id);
4071 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4072 src += head_ascii;
4073
4074 while (1)
4075 {
4076 int c;
4077 ONE_MORE_BYTE (c);
4078 if (! valids[c])
4079 break;
4080 if (!found && valids[c] > 1)
4081 found = 1;
4082 }
4083 *mask &= ~CATEGORY_MASK_CCL;
4084 return 0;
4085
4086 no_more_source:
4087 if (!found)
4088 return 0;
4089 *mask &= CATEGORY_MASK_CCL;
4090 return 1;
4091 }
4092
4093 static void
4094 decode_coding_ccl (coding)
4095 struct coding_system *coding;
4096 {
4097 const unsigned char *src = coding->source + coding->consumed;
4098 unsigned char *src_end = coding->source + coding->src_bytes;
4099 int *charbuf = coding->charbuf;
4100 int *charbuf_end = charbuf + coding->charbuf_size;
4101 int consumed_chars = 0;
4102 int multibytep = coding->src_multibyte;
4103 struct ccl_program ccl;
4104 int source_charbuf[1024];
4105 int source_byteidx[1024];
4106
4107 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4108
4109 while (src < src_end)
4110 {
4111 const unsigned char *p = src;
4112 int *source, *source_end;
4113 int i = 0;
4114
4115 if (multibytep)
4116 while (i < 1024 && p < src_end)
4117 {
4118 source_byteidx[i] = p - src;
4119 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4120 }
4121 else
4122 while (i < 1024 && p < src_end)
4123 source_charbuf[i++] = *p++;
4124
4125 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4126 ccl.last_block = 1;
4127
4128 source = source_charbuf;
4129 source_end = source + i;
4130 while (source < source_end)
4131 {
4132 ccl_driver (&ccl, source, charbuf,
4133 source_end - source, charbuf_end - charbuf);
4134 source += ccl.consumed;
4135 charbuf += ccl.produced;
4136 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4137 break;
4138 }
4139 if (source < source_end)
4140 src += source_byteidx[source - source_charbuf];
4141 else
4142 src = p;
4143 consumed_chars += source - source_charbuf;
4144
4145 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4146 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4147 break;
4148 }
4149
4150 switch (ccl.status)
4151 {
4152 case CCL_STAT_SUSPEND_BY_SRC:
4153 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4154 break;
4155 case CCL_STAT_SUSPEND_BY_DST:
4156 break;
4157 case CCL_STAT_QUIT:
4158 case CCL_STAT_INVALID_CMD:
4159 coding->result = CODING_RESULT_INTERRUPT;
4160 break;
4161 default:
4162 coding->result = CODING_RESULT_SUCCESS;
4163 break;
4164 }
4165 coding->consumed_char += consumed_chars;
4166 coding->consumed = src - coding->source;
4167 coding->charbuf_used = charbuf - coding->charbuf;
4168 }
4169
4170 static int
4171 encode_coding_ccl (coding)
4172 struct coding_system *coding;
4173 {
4174 struct ccl_program ccl;
4175 int multibytep = coding->dst_multibyte;
4176 int *charbuf = coding->charbuf;
4177 int *charbuf_end = charbuf + coding->charbuf_used;
4178 unsigned char *dst = coding->destination + coding->produced;
4179 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4180 unsigned char *adjusted_dst_end = dst_end - 1;
4181 int destination_charbuf[1024];
4182 int i, produced_chars = 0;
4183
4184 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4185
4186 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4187 ccl.dst_multibyte = coding->dst_multibyte;
4188
4189 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4190 {
4191 int dst_bytes = dst_end - dst;
4192 if (dst_bytes > 1024)
4193 dst_bytes = 1024;
4194
4195 ccl_driver (&ccl, charbuf, destination_charbuf,
4196 charbuf_end - charbuf, dst_bytes);
4197 charbuf += ccl.consumed;
4198 if (multibytep)
4199 for (i = 0; i < ccl.produced; i++)
4200 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4201 else
4202 {
4203 for (i = 0; i < ccl.produced; i++)
4204 *dst++ = destination_charbuf[i] & 0xFF;
4205 produced_chars += ccl.produced;
4206 }
4207 }
4208
4209 switch (ccl.status)
4210 {
4211 case CCL_STAT_SUSPEND_BY_SRC:
4212 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4213 break;
4214 case CCL_STAT_SUSPEND_BY_DST:
4215 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4216 break;
4217 case CCL_STAT_QUIT:
4218 case CCL_STAT_INVALID_CMD:
4219 coding->result = CODING_RESULT_INTERRUPT;
4220 break;
4221 default:
4222 coding->result = CODING_RESULT_SUCCESS;
4223 break;
4224 }
4225
4226 coding->produced_char += produced_chars;
4227 coding->produced = dst - coding->destination;
4228 return 0;
4229 }
4230
4231
4232 \f
4233 /*** 10, 11. no-conversion handlers ***/
4234
4235 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4236
4237 static void
4238 decode_coding_raw_text (coding)
4239 struct coding_system *coding;
4240 {
4241 coding->chars_at_source = 1;
4242 coding->consumed_char = 0;
4243 coding->consumed = 0;
4244 coding->result = CODING_RESULT_SUCCESS;
4245 }
4246
4247 static int
4248 encode_coding_raw_text (coding)
4249 struct coding_system *coding;
4250 {
4251 int multibytep = coding->dst_multibyte;
4252 int *charbuf = coding->charbuf;
4253 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4254 unsigned char *dst = coding->destination + coding->produced;
4255 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4256 int produced_chars = 0;
4257 int c;
4258
4259 if (multibytep)
4260 {
4261 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4262
4263 if (coding->src_multibyte)
4264 while (charbuf < charbuf_end)
4265 {
4266 ASSURE_DESTINATION (safe_room);
4267 c = *charbuf++;
4268 if (ASCII_CHAR_P (c))
4269 EMIT_ONE_ASCII_BYTE (c);
4270 else if (CHAR_BYTE8_P (c))
4271 {
4272 c = CHAR_TO_BYTE8 (c);
4273 EMIT_ONE_BYTE (c);
4274 }
4275 else
4276 {
4277 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4278
4279 CHAR_STRING_ADVANCE (c, p1);
4280 while (p0 < p1)
4281 {
4282 EMIT_ONE_BYTE (*p0);
4283 p0++;
4284 }
4285 }
4286 }
4287 else
4288 while (charbuf < charbuf_end)
4289 {
4290 ASSURE_DESTINATION (safe_room);
4291 c = *charbuf++;
4292 EMIT_ONE_BYTE (c);
4293 }
4294 }
4295 else
4296 {
4297 if (coding->src_multibyte)
4298 {
4299 int safe_room = MAX_MULTIBYTE_LENGTH;
4300
4301 while (charbuf < charbuf_end)
4302 {
4303 ASSURE_DESTINATION (safe_room);
4304 c = *charbuf++;
4305 if (ASCII_CHAR_P (c))
4306 *dst++ = c;
4307 else if (CHAR_BYTE8_P (c))
4308 *dst++ = CHAR_TO_BYTE8 (c);
4309 else
4310 CHAR_STRING_ADVANCE (c, dst);
4311 produced_chars++;
4312 }
4313 }
4314 else
4315 {
4316 ASSURE_DESTINATION (charbuf_end - charbuf);
4317 while (charbuf < charbuf_end && dst < dst_end)
4318 *dst++ = *charbuf++;
4319 produced_chars = dst - (coding->destination + coding->dst_bytes);
4320 }
4321 }
4322 coding->result = CODING_RESULT_SUCCESS;
4323 coding->produced_char += produced_chars;
4324 coding->produced = dst - coding->destination;
4325 return 0;
4326 }
4327
4328 static int
4329 detect_coding_charset (coding, mask)
4330 struct coding_system *coding;
4331 int *mask;
4332 {
4333 unsigned char *src = coding->source, *src_base = src;
4334 unsigned char *src_end = coding->source + coding->src_bytes;
4335 int multibytep = coding->src_multibyte;
4336 int consumed_chars = 0;
4337 Lisp_Object attrs, valids;
4338
4339 coding = &coding_categories[coding_category_charset];
4340 attrs = CODING_ID_ATTRS (coding->id);
4341 valids = AREF (attrs, coding_attr_charset_valids);
4342
4343 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4344 src += coding->head_ascii;
4345
4346 while (1)
4347 {
4348 int c;
4349
4350 ONE_MORE_BYTE (c);
4351 if (NILP (AREF (valids, c)))
4352 break;
4353 }
4354 *mask &= ~CATEGORY_MASK_CHARSET;
4355 return 0;
4356
4357 no_more_source:
4358 *mask &= CATEGORY_MASK_CHARSET;
4359 return 1;
4360 }
4361
4362 static void
4363 decode_coding_charset (coding)
4364 struct coding_system *coding;
4365 {
4366 unsigned char *src = coding->source + coding->consumed;
4367 unsigned char *src_end = coding->source + coding->src_bytes;
4368 unsigned char *src_base;
4369 int *charbuf = coding->charbuf;
4370 int *charbuf_end = charbuf + coding->charbuf_size;
4371 int consumed_chars = 0, consumed_chars_base;
4372 int multibytep = coding->src_multibyte;
4373 Lisp_Object attrs, eol_type, charset_list, valids;
4374
4375 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4376 valids = AREF (attrs, coding_attr_charset_valids);
4377
4378 while (1)
4379 {
4380 int c;
4381
4382 src_base = src;
4383 consumed_chars_base = consumed_chars;
4384
4385 if (charbuf >= charbuf_end)
4386 break;
4387
4388 ONE_MORE_BYTE (c);
4389 if (c == '\r')
4390 {
4391 /* Here we assume that no charset maps '\r' to something
4392 else. */
4393 if (EQ (eol_type, Qdos))
4394 {
4395 if (src < src_end
4396 && *src == '\n')
4397 ONE_MORE_BYTE (c);
4398 }
4399 else if (EQ (eol_type, Qmac))
4400 c = '\n';
4401 }
4402 else
4403 {
4404 Lisp_Object val;
4405 struct charset *charset;
4406 int dim;
4407 int len = 1;
4408 unsigned code = c;
4409
4410 val = AREF (valids, c);
4411 if (NILP (val))
4412 goto invalid_code;
4413 if (INTEGERP (val))
4414 {
4415 charset = CHARSET_FROM_ID (XFASTINT (val));
4416 dim = CHARSET_DIMENSION (charset);
4417 while (len < dim)
4418 {
4419 ONE_MORE_BYTE (c);
4420 code = (code << 8) | c;
4421 len++;
4422 }
4423 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4424 charset, code, c);
4425 }
4426 else
4427 {
4428 /* VAL is a list of charset IDs. It is assured that the
4429 list is sorted by charset dimensions (smaller one
4430 comes first). */
4431 while (CONSP (val))
4432 {
4433 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4434 dim = CHARSET_DIMENSION (charset);
4435 while (len < dim)
4436 {
4437 ONE_MORE_BYTE (c);
4438 code = (code << 8) | c;
4439 len++;
4440 }
4441 CODING_DECODE_CHAR (coding, src, src_base,
4442 src_end, charset, code, c);
4443 if (c >= 0)
4444 break;
4445 val = XCDR (val);
4446 }
4447 }
4448 if (c < 0)
4449 goto invalid_code;
4450 }
4451 *charbuf++ = c;
4452 continue;
4453
4454 invalid_code:
4455 src = src_base;
4456 consumed_chars = consumed_chars_base;
4457 ONE_MORE_BYTE (c);
4458 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4459 coding->errors++;
4460 }
4461
4462 no_more_source:
4463 coding->consumed_char += consumed_chars_base;
4464 coding->consumed = src_base - coding->source;
4465 coding->charbuf_used = charbuf - coding->charbuf;
4466 }
4467
4468 static int
4469 encode_coding_charset (coding)
4470 struct coding_system *coding;
4471 {
4472 int multibytep = coding->dst_multibyte;
4473 int *charbuf = coding->charbuf;
4474 int *charbuf_end = charbuf + coding->charbuf_used;
4475 unsigned char *dst = coding->destination + coding->produced;
4476 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4477 int safe_room = MAX_MULTIBYTE_LENGTH;
4478 int produced_chars = 0;
4479 Lisp_Object attrs, eol_type, charset_list;
4480 int ascii_compatible;
4481 int c;
4482
4483 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4484 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4485
4486 while (charbuf < charbuf_end)
4487 {
4488 struct charset *charset;
4489 unsigned code;
4490
4491 ASSURE_DESTINATION (safe_room);
4492 c = *charbuf++;
4493 if (ascii_compatible && ASCII_CHAR_P (c))
4494 EMIT_ONE_ASCII_BYTE (c);
4495 else if (CHAR_BYTE8_P (c))
4496 {
4497 c = CHAR_TO_BYTE8 (c);
4498 EMIT_ONE_BYTE (c);
4499 }
4500 else
4501 {
4502 charset = char_charset (c, charset_list, &code);
4503 if (charset)
4504 {
4505 if (CHARSET_DIMENSION (charset) == 1)
4506 EMIT_ONE_BYTE (code);
4507 else if (CHARSET_DIMENSION (charset) == 2)
4508 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4509 else if (CHARSET_DIMENSION (charset) == 3)
4510 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4511 else
4512 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4513 (code >> 8) & 0xFF, code & 0xFF);
4514 }
4515 else
4516 {
4517 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4518 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4519 else
4520 c = coding->default_char;
4521 EMIT_ONE_BYTE (c);
4522 }
4523 }
4524 }
4525
4526 coding->result = CODING_RESULT_SUCCESS;
4527 coding->produced_char += produced_chars;
4528 coding->produced = dst - coding->destination;
4529 return 0;
4530 }
4531
4532 \f
4533 /*** 7. C library functions ***/
4534
4535 /* Setup coding context CODING from information about CODING_SYSTEM.
4536 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4537 CODING_SYSTEM is invalid, signal an error. */
4538
4539 void
4540 setup_coding_system (coding_system, coding)
4541 Lisp_Object coding_system;
4542 struct coding_system *coding;
4543 {
4544 Lisp_Object attrs;
4545 Lisp_Object eol_type;
4546 Lisp_Object coding_type;
4547 Lisp_Object val;
4548
4549 if (NILP (coding_system))
4550 coding_system = Qno_conversion;
4551
4552 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4553
4554 attrs = CODING_ID_ATTRS (coding->id);
4555 eol_type = CODING_ID_EOL_TYPE (coding->id);
4556
4557 coding->mode = 0;
4558 coding->head_ascii = -1;
4559 coding->common_flags
4560 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4561
4562 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4563 coding->max_charset_id = XSTRING (val)->size - 1;
4564 coding->safe_charsets = (char *) XSTRING (val)->data;
4565 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4566
4567 coding_type = CODING_ATTR_TYPE (attrs);
4568 if (EQ (coding_type, Qundecided))
4569 {
4570 coding->detector = NULL;
4571 coding->decoder = decode_coding_raw_text;
4572 coding->encoder = encode_coding_raw_text;
4573 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4574 }
4575 else if (EQ (coding_type, Qiso_2022))
4576 {
4577 int i;
4578 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4579
4580 /* Invoke graphic register 0 to plane 0. */
4581 CODING_ISO_INVOCATION (coding, 0) = 0;
4582 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4583 CODING_ISO_INVOCATION (coding, 1)
4584 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4585 /* Setup the initial status of designation. */
4586 for (i = 0; i < 4; i++)
4587 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4588 /* Not single shifting initially. */
4589 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4590 /* Beginning of buffer should also be regarded as bol. */
4591 CODING_ISO_BOL (coding) = 1;
4592 coding->detector = detect_coding_iso_2022;
4593 coding->decoder = decode_coding_iso_2022;
4594 coding->encoder = encode_coding_iso_2022;
4595 if (flags & CODING_ISO_FLAG_SAFE)
4596 coding->mode |= CODING_MODE_SAFE_ENCODING;
4597 coding->common_flags
4598 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4599 | CODING_REQUIRE_FLUSHING_MASK);
4600 if (flags & CODING_ISO_FLAG_COMPOSITION)
4601 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4602 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4603 {
4604 setup_iso_safe_charsets (attrs);
4605 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4606 coding->max_charset_id = XSTRING (val)->size - 1;
4607 coding->safe_charsets = (char *) XSTRING (val)->data;
4608 }
4609 CODING_ISO_FLAGS (coding) = flags;
4610 }
4611 else if (EQ (coding_type, Qcharset))
4612 {
4613 coding->detector = detect_coding_charset;
4614 coding->decoder = decode_coding_charset;
4615 coding->encoder = encode_coding_charset;
4616 coding->common_flags
4617 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4618 }
4619 else if (EQ (coding_type, Qutf_8))
4620 {
4621 coding->detector = detect_coding_utf_8;
4622 coding->decoder = decode_coding_utf_8;
4623 coding->encoder = encode_coding_utf_8;
4624 coding->common_flags
4625 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4626 }
4627 else if (EQ (coding_type, Qutf_16))
4628 {
4629 val = AREF (attrs, coding_attr_utf_16_bom);
4630 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4631 : EQ (val, Qt) ? utf_16_with_bom
4632 : utf_16_without_bom);
4633 val = AREF (attrs, coding_attr_utf_16_endian);
4634 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4635 : utf_16_little_endian);
4636 CODING_UTF_16_SURROGATE (coding) = 0;
4637 coding->detector = detect_coding_utf_16;
4638 coding->decoder = decode_coding_utf_16;
4639 coding->encoder = encode_coding_utf_16;
4640 coding->common_flags
4641 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4642 }
4643 else if (EQ (coding_type, Qccl))
4644 {
4645 coding->detector = detect_coding_ccl;
4646 coding->decoder = decode_coding_ccl;
4647 coding->encoder = encode_coding_ccl;
4648 coding->common_flags
4649 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4650 | CODING_REQUIRE_FLUSHING_MASK);
4651 }
4652 else if (EQ (coding_type, Qemacs_mule))
4653 {
4654 coding->detector = detect_coding_emacs_mule;
4655 coding->decoder = decode_coding_emacs_mule;
4656 coding->encoder = encode_coding_emacs_mule;
4657 coding->common_flags
4658 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4659 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4660 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4661 {
4662 Lisp_Object tail, safe_charsets;
4663 int max_charset_id = 0;
4664
4665 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4666 tail = XCDR (tail))
4667 if (max_charset_id < XFASTINT (XCAR (tail)))
4668 max_charset_id = XFASTINT (XCAR (tail));
4669 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4670 make_number (255));
4671 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4672 tail = XCDR (tail))
4673 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4674 coding->max_charset_id = max_charset_id;
4675 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4676 }
4677 }
4678 else if (EQ (coding_type, Qshift_jis))
4679 {
4680 coding->detector = detect_coding_sjis;
4681 coding->decoder = decode_coding_sjis;
4682 coding->encoder = encode_coding_sjis;
4683 coding->common_flags
4684 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4685 }
4686 else if (EQ (coding_type, Qbig5))
4687 {
4688 coding->detector = detect_coding_big5;
4689 coding->decoder = decode_coding_big5;
4690 coding->encoder = encode_coding_big5;
4691 coding->common_flags
4692 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4693 }
4694 else /* EQ (coding_type, Qraw_text) */
4695 {
4696 coding->detector = NULL;
4697 coding->decoder = decode_coding_raw_text;
4698 coding->encoder = encode_coding_raw_text;
4699 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4700 }
4701
4702 return;
4703 }
4704
4705 /* Return raw-text or one of its subsidiaries that has the same
4706 eol_type as CODING-SYSTEM. */
4707
4708 Lisp_Object
4709 raw_text_coding_system (coding_system)
4710 Lisp_Object coding_system;
4711 {
4712 Lisp_Object spec, attrs;
4713 Lisp_Object eol_type, raw_text_eol_type;
4714
4715 spec = CODING_SYSTEM_SPEC (coding_system);
4716 attrs = AREF (spec, 0);
4717
4718 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4719 return coding_system;
4720
4721 eol_type = AREF (spec, 2);
4722 if (VECTORP (eol_type))
4723 return Qraw_text;
4724 spec = CODING_SYSTEM_SPEC (Qraw_text);
4725 raw_text_eol_type = AREF (spec, 2);
4726 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4727 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4728 : AREF (raw_text_eol_type, 2));
4729 }
4730
4731
4732 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4733 does, return one of the subsidiary that has the same eol-spec as
4734 PARENT. Otherwise, return CODING_SYSTEM. */
4735
4736 Lisp_Object
4737 coding_inherit_eol_type (coding_system, parent)
4738 Lisp_Object coding_system, parent;
4739 {
4740 Lisp_Object spec, attrs, eol_type;
4741
4742 spec = CODING_SYSTEM_SPEC (coding_system);
4743 attrs = AREF (spec, 0);
4744 eol_type = AREF (spec, 2);
4745 if (VECTORP (eol_type))
4746 {
4747 Lisp_Object parent_spec;
4748 Lisp_Object parent_eol_type;
4749
4750 parent_spec
4751 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4752 parent_eol_type = AREF (parent_spec, 2);
4753 if (EQ (parent_eol_type, Qunix))
4754 coding_system = AREF (eol_type, 0);
4755 else if (EQ (parent_eol_type, Qdos))
4756 coding_system = AREF (eol_type, 1);
4757 else if (EQ (parent_eol_type, Qmac))
4758 coding_system = AREF (eol_type, 2);
4759 }
4760 return coding_system;
4761 }
4762
4763 /* Emacs has a mechanism to automatically detect a coding system if it
4764 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4765 it's impossible to distinguish some coding systems accurately
4766 because they use the same range of codes. So, at first, coding
4767 systems are categorized into 7, those are:
4768
4769 o coding-category-emacs-mule
4770
4771 The category for a coding system which has the same code range
4772 as Emacs' internal format. Assigned the coding-system (Lisp
4773 symbol) `emacs-mule' by default.
4774
4775 o coding-category-sjis
4776
4777 The category for a coding system which has the same code range
4778 as SJIS. Assigned the coding-system (Lisp
4779 symbol) `japanese-shift-jis' by default.
4780
4781 o coding-category-iso-7
4782
4783 The category for a coding system which has the same code range
4784 as ISO2022 of 7-bit environment. This doesn't use any locking
4785 shift and single shift functions. This can encode/decode all
4786 charsets. Assigned the coding-system (Lisp symbol)
4787 `iso-2022-7bit' by default.
4788
4789 o coding-category-iso-7-tight
4790
4791 Same as coding-category-iso-7 except that this can
4792 encode/decode only the specified charsets.
4793
4794 o coding-category-iso-8-1
4795
4796 The category for a coding system which has the same code range
4797 as ISO2022 of 8-bit environment and graphic plane 1 used only
4798 for DIMENSION1 charset. This doesn't use any locking shift
4799 and single shift functions. Assigned the coding-system (Lisp
4800 symbol) `iso-latin-1' by default.
4801
4802 o coding-category-iso-8-2
4803
4804 The category for a coding system which has the same code range
4805 as ISO2022 of 8-bit environment and graphic plane 1 used only
4806 for DIMENSION2 charset. This doesn't use any locking shift
4807 and single shift functions. Assigned the coding-system (Lisp
4808 symbol) `japanese-iso-8bit' by default.
4809
4810 o coding-category-iso-7-else
4811
4812 The category for a coding system which has the same code range
4813 as ISO2022 of 7-bit environemnt but uses locking shift or
4814 single shift functions. Assigned the coding-system (Lisp
4815 symbol) `iso-2022-7bit-lock' by default.
4816
4817 o coding-category-iso-8-else
4818
4819 The category for a coding system which has the same code range
4820 as ISO2022 of 8-bit environemnt but uses locking shift or
4821 single shift functions. Assigned the coding-system (Lisp
4822 symbol) `iso-2022-8bit-ss2' by default.
4823
4824 o coding-category-big5
4825
4826 The category for a coding system which has the same code range
4827 as BIG5. Assigned the coding-system (Lisp symbol)
4828 `cn-big5' by default.
4829
4830 o coding-category-utf-8
4831
4832 The category for a coding system which has the same code range
4833 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4834 symbol) `utf-8' by default.
4835
4836 o coding-category-utf-16-be
4837
4838 The category for a coding system in which a text has an
4839 Unicode signature (cf. Unicode Standard) in the order of BIG
4840 endian at the head. Assigned the coding-system (Lisp symbol)
4841 `utf-16-be' by default.
4842
4843 o coding-category-utf-16-le
4844
4845 The category for a coding system in which a text has an
4846 Unicode signature (cf. Unicode Standard) in the order of
4847 LITTLE endian at the head. Assigned the coding-system (Lisp
4848 symbol) `utf-16-le' by default.
4849
4850 o coding-category-ccl
4851
4852 The category for a coding system of which encoder/decoder is
4853 written in CCL programs. The default value is nil, i.e., no
4854 coding system is assigned.
4855
4856 o coding-category-binary
4857
4858 The category for a coding system not categorized in any of the
4859 above. Assigned the coding-system (Lisp symbol)
4860 `no-conversion' by default.
4861
4862 Each of them is a Lisp symbol and the value is an actual
4863 `coding-system's (this is also a Lisp symbol) assigned by a user.
4864 What Emacs does actually is to detect a category of coding system.
4865 Then, it uses a `coding-system' assigned to it. If Emacs can't
4866 decide only one possible category, it selects a category of the
4867 highest priority. Priorities of categories are also specified by a
4868 user in a Lisp variable `coding-category-list'.
4869
4870 */
4871
4872 #define EOL_SEEN_NONE 0
4873 #define EOL_SEEN_LF 1
4874 #define EOL_SEEN_CR 2
4875 #define EOL_SEEN_CRLF 4
4876
4877 /* Detect how end-of-line of a text of length CODING->src_bytes
4878 pointed by CODING->source is encoded. Return one of
4879 EOL_SEEN_XXX. */
4880
4881 #define MAX_EOL_CHECK_COUNT 3
4882
4883 static int
4884 detect_eol (coding, source, src_bytes)
4885 struct coding_system *coding;
4886 unsigned char *source;
4887 EMACS_INT src_bytes;
4888 {
4889 Lisp_Object attrs, coding_type;
4890 unsigned char *src = source, *src_end = src + src_bytes;
4891 unsigned char c;
4892 int total = 0;
4893 int eol_seen = EOL_SEEN_NONE;
4894
4895 attrs = CODING_ID_ATTRS (coding->id);
4896 coding_type = CODING_ATTR_TYPE (attrs);
4897
4898 if (EQ (coding_type, Qccl))
4899 {
4900 int msb, lsb;
4901
4902 msb = coding->spec.utf_16.endian == utf_16_little_endian;
4903 lsb = 1 - msb;
4904
4905 while (src + 1 < src_end)
4906 {
4907 c = src[lsb];
4908 if (src[msb] == 0 && (c == '\n' || c == '\r'))
4909 {
4910 int this_eol;
4911
4912 if (c == '\n')
4913 this_eol = EOL_SEEN_LF;
4914 else if (src + 3 >= src_end
4915 || src[msb + 2] != 0
4916 || src[lsb + 2] != '\n')
4917 this_eol = EOL_SEEN_CR;
4918 else
4919 this_eol = EOL_SEEN_CRLF;
4920
4921 if (eol_seen == EOL_SEEN_NONE)
4922 /* This is the first end-of-line. */
4923 eol_seen = this_eol;
4924 else if (eol_seen != this_eol)
4925 {
4926 /* The found type is different from what found before. */
4927 eol_seen = EOL_SEEN_LF;
4928 break;
4929 }
4930 if (++total == MAX_EOL_CHECK_COUNT)
4931 break;
4932 }
4933 src += 2;
4934 }
4935 }
4936 else
4937 {
4938 while (src < src_end)
4939 {
4940 c = *src++;
4941 if (c == '\n' || c == '\r')
4942 {
4943 int this_eol;
4944
4945 if (c == '\n')
4946 this_eol = EOL_SEEN_LF;
4947 else if (src >= src_end || *src != '\n')
4948 this_eol = EOL_SEEN_CR;
4949 else
4950 this_eol = EOL_SEEN_CRLF, src++;
4951
4952 if (eol_seen == EOL_SEEN_NONE)
4953 /* This is the first end-of-line. */
4954 eol_seen = this_eol;
4955 else if (eol_seen != this_eol)
4956 {
4957 /* The found type is different from what found before. */
4958 eol_seen = EOL_SEEN_LF;
4959 break;
4960 }
4961 if (++total == MAX_EOL_CHECK_COUNT)
4962 break;
4963 }
4964 }
4965 }
4966 return eol_seen;
4967 }
4968
4969
4970 static void
4971 adjust_coding_eol_type (coding, eol_seen)
4972 struct coding_system *coding;
4973 int eol_seen;
4974 {
4975 Lisp_Object eol_type;
4976
4977 eol_type = CODING_ID_EOL_TYPE (coding->id);
4978 if (eol_seen & EOL_SEEN_LF)
4979 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
4980 else if (eol_seen & EOL_SEEN_CRLF)
4981 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
4982 else if (eol_seen & EOL_SEEN_CR)
4983 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
4984 }
4985
4986 /* Detect how a text specified in CODING is encoded. If a coding
4987 system is detected, update fields of CODING by the detected coding
4988 system. */
4989
4990 void
4991 detect_coding (coding)
4992 struct coding_system *coding;
4993 {
4994 unsigned char *src, *src_end;
4995 Lisp_Object attrs, coding_type;
4996
4997 coding->consumed = coding->consumed_char = 0;
4998 coding->produced = coding->produced_char = 0;
4999 coding_set_source (coding);
5000
5001 src_end = coding->source + coding->src_bytes;
5002
5003 /* If we have not yet decided the text encoding type, detect it
5004 now. */
5005 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5006 {
5007 int mask = CATEGORY_MASK_ANY;
5008 int c, i;
5009
5010 for (src = coding->source; src < src_end; src++)
5011 {
5012 c = *src;
5013 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5014 || c == ISO_CODE_SI
5015 || c == ISO_CODE_SO)))
5016 break;
5017 }
5018 coding->head_ascii = src - (coding->source + coding->consumed);
5019
5020 if (coding->head_ascii < coding->src_bytes)
5021 {
5022 int detected = 0;
5023
5024 for (i = 0; i < coding_category_raw_text; i++)
5025 {
5026 enum coding_category category = coding_priorities[i];
5027 struct coding_system *this = coding_categories + category;
5028
5029 if (category >= coding_category_raw_text
5030 || detected & (1 << category))
5031 continue;
5032
5033 if (this->id < 0)
5034 {
5035 /* No coding system of this category is defined. */
5036 mask &= ~(1 << category);
5037 }
5038 else
5039 {
5040 detected |= detected_mask[category];
5041 if ((*(this->detector)) (coding, &mask))
5042 break;
5043 }
5044 }
5045 if (! mask)
5046 setup_coding_system (Qraw_text, coding);
5047 else if (mask != CATEGORY_MASK_ANY)
5048 for (i = 0; i < coding_category_raw_text; i++)
5049 {
5050 enum coding_category category = coding_priorities[i];
5051 struct coding_system *this = coding_categories + category;
5052
5053 if (mask & (1 << category))
5054 {
5055 setup_coding_system (CODING_ID_NAME (this->id), coding);
5056 break;
5057 }
5058 }
5059 }
5060 }
5061
5062 attrs = CODING_ID_ATTRS (coding->id);
5063 coding_type = CODING_ATTR_TYPE (attrs);
5064
5065 /* If we have not yet decided the EOL type, detect it now. But, the
5066 detection is impossible for a CCL based coding system, in which
5067 case, we detct the EOL type after decoding. */
5068 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5069 && ! EQ (coding_type, Qccl))
5070 {
5071 int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
5072
5073 if (eol_seen != EOL_SEEN_NONE)
5074 adjust_coding_eol_type (coding, eol_seen);
5075 }
5076 }
5077
5078
5079 static void
5080 decode_eol (coding)
5081 struct coding_system *coding;
5082 {
5083 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5084 {
5085 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5086 unsigned char *pend = p + coding->produced;
5087 int eol_seen = EOL_SEEN_NONE;
5088
5089 for (; p < pend; p++)
5090 {
5091 if (*p == '\n')
5092 eol_seen |= EOL_SEEN_LF;
5093 else if (*p == '\r')
5094 {
5095 if (p + 1 < pend && *(p + 1) == '\n')
5096 {
5097 eol_seen |= EOL_SEEN_CRLF;
5098 p++;
5099 }
5100 else
5101 eol_seen |= EOL_SEEN_CR;
5102 }
5103 }
5104 if (eol_seen != EOL_SEEN_NONE)
5105 adjust_coding_eol_type (coding, eol_seen);
5106 }
5107
5108 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5109 {
5110 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5111 unsigned char *pend = p + coding->produced;
5112
5113 for (; p < pend; p++)
5114 if (*p == '\r')
5115 *p = '\n';
5116 }
5117 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5118 {
5119 unsigned char *p, *pbeg, *pend;
5120 Lisp_Object undo_list;
5121
5122 move_gap_both (coding->dst_pos + coding->produced_char,
5123 coding->dst_pos_byte + coding->produced);
5124 undo_list = current_buffer->undo_list;
5125 current_buffer->undo_list = Qt;
5126 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5127 current_buffer->undo_list = undo_list;
5128 pbeg = GPT_ADDR;
5129 pend = pbeg + coding->produced;
5130
5131 for (p = pend - 1; p >= pbeg; p--)
5132 if (*p == '\r')
5133 {
5134 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5135 pend--;
5136 }
5137 coding->produced_char -= coding->produced - (pend - pbeg);
5138 coding->produced = pend - pbeg;
5139 insert_from_gap (coding->produced_char, coding->produced);
5140 }
5141 }
5142
5143 static void
5144 translate_chars (coding, table)
5145 struct coding_system *coding;
5146 Lisp_Object table;
5147 {
5148 int *charbuf = coding->charbuf;
5149 int *charbuf_end = charbuf + coding->charbuf_used;
5150 int c;
5151
5152 if (coding->chars_at_source)
5153 return;
5154
5155 while (charbuf < charbuf_end)
5156 {
5157 c = *charbuf;
5158 if (c < 0)
5159 charbuf += c;
5160 else
5161 *charbuf++ = translate_char (table, c);
5162 }
5163 }
5164
5165 static int
5166 produce_chars (coding)
5167 struct coding_system *coding;
5168 {
5169 unsigned char *dst = coding->destination + coding->produced;
5170 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5171 int produced;
5172 int produced_chars = 0;
5173
5174 if (! coding->chars_at_source)
5175 {
5176 /* Characters are in coding->charbuf. */
5177 int *buf = coding->charbuf;
5178 int *buf_end = buf + coding->charbuf_used;
5179 unsigned char *adjusted_dst_end;
5180
5181 if (BUFFERP (coding->src_object)
5182 && EQ (coding->src_object, coding->dst_object))
5183 dst_end = coding->source + coding->consumed;
5184 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5185
5186 while (buf < buf_end)
5187 {
5188 int c = *buf++;
5189
5190 if (dst >= adjusted_dst_end)
5191 {
5192 dst = alloc_destination (coding,
5193 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5194 dst);
5195 dst_end = coding->destination + coding->dst_bytes;
5196 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5197 }
5198 if (c >= 0)
5199 {
5200 if (coding->dst_multibyte
5201 || ! CHAR_BYTE8_P (c))
5202 CHAR_STRING_ADVANCE (c, dst);
5203 else
5204 *dst++ = CHAR_TO_BYTE8 (c);
5205 produced_chars++;
5206 }
5207 else
5208 /* This is an annotation data. */
5209 buf -= c + 1;
5210 }
5211 }
5212 else
5213 {
5214 unsigned char *src = coding->source;
5215 unsigned char *src_end = src + coding->src_bytes;
5216 Lisp_Object eol_type;
5217
5218 eol_type = CODING_ID_EOL_TYPE (coding->id);
5219
5220 if (coding->src_multibyte != coding->dst_multibyte)
5221 {
5222 if (coding->src_multibyte)
5223 {
5224 int multibytep = 1;
5225 int consumed_chars;
5226
5227 while (1)
5228 {
5229 unsigned char *src_base = src;
5230 int c;
5231
5232 ONE_MORE_BYTE (c);
5233 if (c == '\r')
5234 {
5235 if (EQ (eol_type, Qdos))
5236 {
5237 if (src < src_end
5238 && *src == '\n')
5239 c = *src++;
5240 }
5241 else if (EQ (eol_type, Qmac))
5242 c = '\n';
5243 }
5244 if (dst == dst_end)
5245 {
5246 coding->consumed = src - coding->source;
5247
5248 if (EQ (coding->src_object, coding->dst_object))
5249 dst_end = src;
5250 if (dst == dst_end)
5251 {
5252 dst = alloc_destination (coding, src_end - src + 1,
5253 dst);
5254 dst_end = coding->destination + coding->dst_bytes;
5255 coding_set_source (coding);
5256 src = coding->source + coding->consumed;
5257 src_end = coding->source + coding->src_bytes;
5258 }
5259 }
5260 *dst++ = c;
5261 produced_chars++;
5262 }
5263 no_more_source:
5264 ;
5265 }
5266 else
5267 while (src < src_end)
5268 {
5269 int multibytep = 1;
5270 int c = *src++;
5271
5272 if (c == '\r')
5273 {
5274 if (EQ (eol_type, Qdos))
5275 {
5276 if (src < src_end
5277 && *src == '\n')
5278 c = *src++;
5279 }
5280 else if (EQ (eol_type, Qmac))
5281 c = '\n';
5282 }
5283 if (dst >= dst_end - 1)
5284 {
5285 coding->consumed = src - coding->source;
5286
5287 if (EQ (coding->src_object, coding->dst_object))
5288 dst_end = src;
5289 if (dst >= dst_end - 1)
5290 {
5291 dst = alloc_destination (coding, src_end - src + 2,
5292 dst);
5293 dst_end = coding->destination + coding->dst_bytes;
5294 coding_set_source (coding);
5295 src = coding->source + coding->consumed;
5296 src_end = coding->source + coding->src_bytes;
5297 }
5298 }
5299 EMIT_ONE_BYTE (c);
5300 }
5301 }
5302 else
5303 {
5304 if (!EQ (coding->src_object, coding->dst_object))
5305 {
5306 int require = coding->src_bytes - coding->dst_bytes;
5307
5308 if (require > 0)
5309 {
5310 EMACS_INT offset = src - coding->source;
5311
5312 dst = alloc_destination (coding, require, dst);
5313 coding_set_source (coding);
5314 src = coding->source + offset;
5315 src_end = coding->source + coding->src_bytes;
5316 }
5317 }
5318 produced_chars = coding->src_chars;
5319 while (src < src_end)
5320 {
5321 int c = *src++;
5322
5323 if (c == '\r')
5324 {
5325 if (EQ (eol_type, Qdos))
5326 {
5327 if (src < src_end
5328 && *src == '\n')
5329 c = *src++;
5330 produced_chars--;
5331 }
5332 else if (EQ (eol_type, Qmac))
5333 c = '\n';
5334 }
5335 *dst++ = c;
5336 }
5337 }
5338 coding->consumed = coding->src_bytes;
5339 coding->consumed_char = coding->src_chars;
5340 }
5341
5342 produced = dst - (coding->destination + coding->produced);
5343 if (BUFFERP (coding->dst_object))
5344 insert_from_gap (produced_chars, produced);
5345 coding->produced += produced;
5346 coding->produced_char += produced_chars;
5347 return produced_chars;
5348 }
5349
5350 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5351 or
5352 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5353 */
5354
5355 static INLINE void
5356 produce_composition (coding, charbuf)
5357 struct coding_system *coding;
5358 int *charbuf;
5359 {
5360 Lisp_Object buffer;
5361 int len;
5362 EMACS_INT pos;
5363 enum composition_method method;
5364 int cmp_len;
5365 Lisp_Object components;
5366
5367 buffer = coding->dst_object;
5368 len = -charbuf[0];
5369 pos = coding->dst_pos + charbuf[1];
5370 method = (enum composition_method) (charbuf[3]);
5371 cmp_len = charbuf[4];
5372
5373 if (method == COMPOSITION_RELATIVE)
5374 components = Qnil;
5375 else
5376 {
5377 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5378 int i;
5379
5380 len -= 5;
5381 charbuf += 5;
5382 for (i = 0; i < len; i++)
5383 args[i] = make_number (charbuf[i]);
5384 components = (method == COMPOSITION_WITH_ALTCHARS
5385 ? Fstring (len, args) : Fvector (len, args));
5386 }
5387 compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5388 }
5389
5390 static int *
5391 save_composition_data (buf, buf_end, prop)
5392 int *buf, *buf_end;
5393 Lisp_Object prop;
5394 {
5395 enum composition_method method = COMPOSITION_METHOD (prop);
5396 int cmp_len = COMPOSITION_LENGTH (prop);
5397
5398 if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5399 return NULL;
5400
5401 buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5402 buf[2] = method;
5403 buf[3] = cmp_len;
5404
5405 if (method == COMPOSITION_RELATIVE)
5406 buf[0] = 4;
5407 else
5408 {
5409 Lisp_Object components;
5410 int len, i;
5411
5412 components = COMPOSITION_COMPONENTS (prop);
5413 if (VECTORP (components))
5414 {
5415 len = XVECTOR (components)->size;
5416 for (i = 0; i < len; i++)
5417 buf[4 + i] = XINT (AREF (components, i));
5418 }
5419 else if (STRINGP (components))
5420 {
5421 int i_byte;
5422
5423 len = XSTRING (components)->size;
5424 i = i_byte = 0;
5425 while (i < len)
5426 FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5427 }
5428 else if (INTEGERP (components))
5429 {
5430 len = 1;
5431 buf[4] = XINT (components);
5432 }
5433 else if (CONSP (components))
5434 {
5435 for (len = 0; CONSP (components);
5436 len++, components = XCDR (components))
5437 buf[4 + len] = XINT (XCAR (components));
5438 }
5439 else
5440 abort ();
5441 buf[0] = 4 + len;
5442 }
5443 return (buf + buf[0]);
5444 }
5445
5446 #define CHARBUF_SIZE 0x4000
5447
5448 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5449 do { \
5450 int size = CHARBUF_SIZE;; \
5451 \
5452 coding->charbuf = NULL; \
5453 while (size > 1024) \
5454 { \
5455 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5456 if (coding->charbuf) \
5457 break; \
5458 size >>= 1; \
5459 } \
5460 if (! coding->charbuf) \
5461 { \
5462 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5463 return coding->result; \
5464 } \
5465 coding->charbuf_size = size; \
5466 } while (0)
5467
5468
5469 static void
5470 produce_annotation (coding)
5471 struct coding_system *coding;
5472 {
5473 int *charbuf = coding->charbuf;
5474 int *charbuf_end = charbuf + coding->charbuf_used;
5475
5476 while (charbuf < charbuf_end)
5477 {
5478 if (*charbuf >= 0)
5479 charbuf++;
5480 else
5481 {
5482 int len = -*charbuf;
5483 switch (charbuf[2])
5484 {
5485 case CODING_ANNOTATE_COMPOSITION_MASK:
5486 produce_composition (coding, charbuf);
5487 break;
5488 default:
5489 abort ();
5490 }
5491 charbuf += len;
5492 }
5493 }
5494 }
5495
5496 /* Decode the data at CODING->src_object into CODING->dst_object.
5497 CODING->src_object is a buffer, a string, or nil.
5498 CODING->dst_object is a buffer.
5499
5500 If CODING->src_object is a buffer, it must be the current buffer.
5501 In this case, if CODING->src_pos is positive, it is a position of
5502 the source text in the buffer, otherwise, the source text is in the
5503 gap area of the buffer, and CODING->src_pos specifies the offset of
5504 the text from GPT (which must be the same as PT). If this is the
5505 same buffer as CODING->dst_object, CODING->src_pos must be
5506 negative.
5507
5508 If CODING->src_object is a string, CODING->src_pos in an index to
5509 that string.
5510
5511 If CODING->src_object is nil, CODING->source must already point to
5512 the non-relocatable memory area. In this case, CODING->src_pos is
5513 an offset from CODING->source.
5514
5515 The decoded data is inserted at the current point of the buffer
5516 CODING->dst_object.
5517 */
5518
5519 static int
5520 decode_coding (coding)
5521 struct coding_system *coding;
5522 {
5523 Lisp_Object attrs;
5524
5525 if (BUFFERP (coding->src_object)
5526 && coding->src_pos > 0
5527 && coding->src_pos < GPT
5528 && coding->src_pos + coding->src_chars > GPT)
5529 move_gap_both (coding->src_pos, coding->src_pos_byte);
5530
5531 if (BUFFERP (coding->dst_object))
5532 {
5533 if (current_buffer != XBUFFER (coding->dst_object))
5534 set_buffer_internal (XBUFFER (coding->dst_object));
5535 if (GPT != PT)
5536 move_gap_both (PT, PT_BYTE);
5537 }
5538
5539 coding->consumed = coding->consumed_char = 0;
5540 coding->produced = coding->produced_char = 0;
5541 coding->chars_at_source = 0;
5542 coding->result = CODING_RESULT_SUCCESS;
5543 coding->errors = 0;
5544
5545 ALLOC_CONVERSION_WORK_AREA (coding);
5546
5547 attrs = CODING_ID_ATTRS (coding->id);
5548
5549 do
5550 {
5551 coding_set_source (coding);
5552 coding->annotated = 0;
5553 (*(coding->decoder)) (coding);
5554 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5555 translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5556 coding_set_destination (coding);
5557 produce_chars (coding);
5558 if (coding->annotated)
5559 produce_annotation (coding);
5560 }
5561 while (coding->consumed < coding->src_bytes
5562 && ! coding->result);
5563
5564 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5565 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5566 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5567 decode_eol (coding);
5568
5569 coding->carryover_bytes = 0;
5570 if (coding->consumed < coding->src_bytes)
5571 {
5572 int nbytes = coding->src_bytes - coding->consumed;
5573 unsigned char *src;
5574
5575 coding_set_source (coding);
5576 coding_set_destination (coding);
5577 src = coding->source + coding->consumed;
5578
5579 if (coding->mode & CODING_MODE_LAST_BLOCK)
5580 {
5581 /* Flush out unprocessed data as binary chars. We are sure
5582 that the number of data is less than the size of
5583 coding->charbuf. */
5584 int *charbuf = coding->charbuf;
5585
5586 while (nbytes-- > 0)
5587 {
5588 int c = *src++;
5589 *charbuf++ = (c & 0x80 ? - c : c);
5590 }
5591 produce_chars (coding);
5592 }
5593 else
5594 {
5595 /* Record unprocessed bytes in coding->carryover. We are
5596 sure that the number of data is less than the size of
5597 coding->carryover. */
5598 unsigned char *p = coding->carryover;
5599
5600 coding->carryover_bytes = nbytes;
5601 while (nbytes-- > 0)
5602 *p++ = *src++;
5603 }
5604 coding->consumed = coding->src_bytes;
5605 }
5606
5607 return coding->result;
5608 }
5609
5610 static void
5611 consume_chars (coding)
5612 struct coding_system *coding;
5613 {
5614 int *buf = coding->charbuf;
5615 /* -1 is to compensate for CRLF. */
5616 int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5617 const unsigned char *src = coding->source + coding->consumed;
5618 int pos = coding->src_pos + coding->consumed_char;
5619 int end_pos = coding->src_pos + coding->src_chars;
5620 int multibytep = coding->src_multibyte;
5621 Lisp_Object eol_type;
5622 int c;
5623 int start, end, stop;
5624 Lisp_Object object, prop;
5625
5626 eol_type = CODING_ID_EOL_TYPE (coding->id);
5627 if (VECTORP (eol_type))
5628 eol_type = Qunix;
5629
5630 object = coding->src_object;
5631
5632 /* Note: composition handling is not yet implemented. */
5633 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5634
5635 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5636 && find_composition (pos, end_pos, &start, &end, &prop, object)
5637 && end <= end_pos
5638 && (start >= pos
5639 || (find_composition (end, end_pos, &start, &end, &prop, object)
5640 && end <= end_pos)))
5641 stop = start;
5642 else
5643 stop = end_pos;
5644
5645 while (buf < buf_end)
5646 {
5647 if (pos == stop)
5648 {
5649 int *p;
5650
5651 if (pos == end_pos)
5652 break;
5653 p = save_composition_data (buf, buf_end, prop);
5654 if (p == NULL)
5655 break;
5656 buf = p;
5657 if (find_composition (end, end_pos, &start, &end, &prop, object)
5658 && end <= end_pos)
5659 stop = start;
5660 else
5661 stop = end_pos;
5662 }
5663
5664 if (! multibytep)
5665 c = *src++;
5666 else
5667 c = STRING_CHAR_ADVANCE (src);
5668 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5669 c = '\n';
5670 if (! EQ (eol_type, Qunix))
5671 {
5672 if (c == '\n')
5673 {
5674 if (EQ (eol_type, Qdos))
5675 *buf++ = '\r';
5676 else
5677 c = '\r';
5678 }
5679 }
5680 *buf++ = c;
5681 pos++;
5682 }
5683
5684 coding->consumed = src - coding->source;
5685 coding->consumed_char = pos - coding->src_pos;
5686 coding->charbuf_used = buf - coding->charbuf;
5687 coding->chars_at_source = 0;
5688 }
5689
5690
5691 /* Encode the text at CODING->src_object into CODING->dst_object.
5692 CODING->src_object is a buffer or a string.
5693 CODING->dst_object is a buffer or nil.
5694
5695 If CODING->src_object is a buffer, it must be the current buffer.
5696 In this case, if CODING->src_pos is positive, it is a position of
5697 the source text in the buffer, otherwise. the source text is in the
5698 gap area of the buffer, and coding->src_pos specifies the offset of
5699 the text from GPT (which must be the same as PT). If this is the
5700 same buffer as CODING->dst_object, CODING->src_pos must be
5701 negative and CODING should not have `pre-write-conversion'.
5702
5703 If CODING->src_object is a string, CODING should not have
5704 `pre-write-conversion'.
5705
5706 If CODING->dst_object is a buffer, the encoded data is inserted at
5707 the current point of that buffer.
5708
5709 If CODING->dst_object is nil, the encoded data is placed at the
5710 memory area specified by CODING->destination. */
5711
5712 static int
5713 encode_coding (coding)
5714 struct coding_system *coding;
5715 {
5716 Lisp_Object attrs;
5717
5718 attrs = CODING_ID_ATTRS (coding->id);
5719
5720 if (BUFFERP (coding->dst_object))
5721 {
5722 set_buffer_internal (XBUFFER (coding->dst_object));
5723 coding->dst_multibyte
5724 = ! NILP (current_buffer->enable_multibyte_characters);
5725 }
5726
5727 coding->consumed = coding->consumed_char = 0;
5728 coding->produced = coding->produced_char = 0;
5729 coding->result = CODING_RESULT_SUCCESS;
5730 coding->errors = 0;
5731
5732 ALLOC_CONVERSION_WORK_AREA (coding);
5733
5734 do {
5735 coding_set_source (coding);
5736 consume_chars (coding);
5737
5738 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5739 translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5740
5741 coding_set_destination (coding);
5742 (*(coding->encoder)) (coding);
5743 } while (coding->consumed_char < coding->src_chars);
5744
5745 if (BUFFERP (coding->dst_object))
5746 insert_from_gap (coding->produced_char, coding->produced);
5747
5748 return (coding->result);
5749 }
5750
5751 /* Work buffer */
5752
5753 /* List of currently used working buffer. */
5754 Lisp_Object Vcode_conversion_work_buf_list;
5755
5756 /* A working buffer used by the top level conversion. */
5757 Lisp_Object Vcode_conversion_reused_work_buf;
5758
5759
5760 /* Return a working buffer that can be freely used by the following
5761 code conversion. MULTIBYTEP specifies the multibyteness of the
5762 buffer. */
5763
5764 Lisp_Object
5765 make_conversion_work_buffer (multibytep)
5766 int multibytep;
5767 {
5768 struct buffer *current = current_buffer;
5769 Lisp_Object buf;
5770
5771 if (NILP (Vcode_conversion_work_buf_list))
5772 {
5773 if (NILP (Vcode_conversion_reused_work_buf))
5774 Vcode_conversion_reused_work_buf
5775 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5776 Vcode_conversion_work_buf_list
5777 = Fcons (Vcode_conversion_reused_work_buf, Qnil);
5778 }
5779 else
5780 {
5781 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
5782 char str[128];
5783
5784 sprintf (str, " *code-conversion-work*<%d>", depth);
5785 Vcode_conversion_work_buf_list
5786 = Fcons (Fget_buffer_create (build_string (str)),
5787 Vcode_conversion_work_buf_list);
5788 }
5789
5790 buf = XCAR (Vcode_conversion_work_buf_list);
5791 set_buffer_internal (XBUFFER (buf));
5792 current_buffer->undo_list = Qt;
5793 Ferase_buffer ();
5794 Fset_buffer_multibyte (multibytep ? Qt : Qnil, Qnil);
5795 set_buffer_internal (current);
5796 return buf;
5797 }
5798
5799 static struct coding_system *saved_coding;
5800
5801 Lisp_Object
5802 code_conversion_restore (info)
5803 Lisp_Object info;
5804 {
5805 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
5806 Lisp_Object buf;
5807
5808 if (depth > 0)
5809 {
5810 buf = XCAR (Vcode_conversion_work_buf_list);
5811 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5812 if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5813 Fkill_buffer (buf);
5814 }
5815
5816 if (EQ (saved_coding->dst_object, Qt)
5817 && saved_coding->destination)
5818 xfree (saved_coding->destination);
5819
5820 return save_excursion_restore (info);
5821 }
5822
5823
5824 int
5825 decode_coding_gap (coding, chars, bytes)
5826 struct coding_system *coding;
5827 EMACS_INT chars, bytes;
5828 {
5829 int count = specpdl_ptr - specpdl;
5830
5831 saved_coding = coding;
5832 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5833
5834 coding->src_object = Fcurrent_buffer ();
5835 coding->src_chars = chars;
5836 coding->src_bytes = bytes;
5837 coding->src_pos = -chars;
5838 coding->src_pos_byte = -bytes;
5839 coding->src_multibyte = chars < bytes;
5840 coding->dst_object = coding->src_object;
5841 coding->dst_pos = PT;
5842 coding->dst_pos_byte = PT_BYTE;
5843 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
5844
5845 if (CODING_REQUIRE_DETECTION (coding))
5846 detect_coding (coding);
5847
5848 decode_coding (coding);
5849
5850 unbind_to (count, Qnil);
5851 return coding->result;
5852 }
5853
5854 int
5855 encode_coding_gap (coding, chars, bytes)
5856 struct coding_system *coding;
5857 EMACS_INT chars, bytes;
5858 {
5859 int count = specpdl_ptr - specpdl;
5860 Lisp_Object buffer;
5861
5862 saved_coding = coding;
5863 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5864
5865 buffer = Fcurrent_buffer ();
5866 coding->src_object = buffer;
5867 coding->src_chars = chars;
5868 coding->src_bytes = bytes;
5869 coding->src_pos = -chars;
5870 coding->src_pos_byte = -bytes;
5871 coding->src_multibyte = chars < bytes;
5872 coding->dst_object = coding->src_object;
5873 coding->dst_pos = PT;
5874 coding->dst_pos_byte = PT_BYTE;
5875
5876 encode_coding (coding);
5877
5878 unbind_to (count, Qnil);
5879 return coding->result;
5880 }
5881
5882
5883 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5884 SRC_OBJECT into DST_OBJECT by coding context CODING.
5885
5886 SRC_OBJECT is a buffer, a string, or Qnil.
5887
5888 If it is a buffer, the text is at point of the buffer. FROM and TO
5889 are positions in the buffer.
5890
5891 If it is a string, the text is at the beginning of the string.
5892 FROM and TO are indices to the string.
5893
5894 If it is nil, the text is at coding->source. FROM and TO are
5895 indices to coding->source.
5896
5897 DST_OBJECT is a buffer, Qt, or Qnil.
5898
5899 If it is a buffer, the decoded text is inserted at point of the
5900 buffer. If the buffer is the same as SRC_OBJECT, the source text
5901 is deleted.
5902
5903 If it is Qt, a string is made from the decoded text, and
5904 set in CODING->dst_object.
5905
5906 If it is Qnil, the decoded text is stored at CODING->destination.
5907 The called must allocate CODING->dst_bytes bytes at
5908 CODING->destination by xmalloc. If the decoded text is longer than
5909 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5910 */
5911
5912 void
5913 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5914 dst_object)
5915 struct coding_system *coding;
5916 Lisp_Object src_object;
5917 EMACS_INT from, from_byte, to, to_byte;
5918 Lisp_Object dst_object;
5919 {
5920 int count = specpdl_ptr - specpdl;
5921 unsigned char *destination;
5922 EMACS_INT dst_bytes;
5923 EMACS_INT chars = to - from;
5924 EMACS_INT bytes = to_byte - from_byte;
5925 Lisp_Object attrs;
5926
5927 saved_coding = coding;
5928 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5929
5930 if (NILP (dst_object))
5931 {
5932 destination = coding->destination;
5933 dst_bytes = coding->dst_bytes;
5934 }
5935
5936 coding->src_object = src_object;
5937 coding->src_chars = chars;
5938 coding->src_bytes = bytes;
5939 coding->src_multibyte = chars < bytes;
5940
5941 if (STRINGP (src_object))
5942 {
5943 coding->src_pos = from;
5944 coding->src_pos_byte = from_byte;
5945 }
5946 else if (BUFFERP (src_object))
5947 {
5948 set_buffer_internal (XBUFFER (src_object));
5949 if (from != GPT)
5950 move_gap_both (from, from_byte);
5951 if (EQ (src_object, dst_object))
5952 {
5953 TEMP_SET_PT_BOTH (from, from_byte);
5954 del_range_both (from, from_byte, to, to_byte, 1);
5955 coding->src_pos = -chars;
5956 coding->src_pos_byte = -bytes;
5957 }
5958 else
5959 {
5960 coding->src_pos = from;
5961 coding->src_pos_byte = from_byte;
5962 }
5963 }
5964
5965 if (CODING_REQUIRE_DETECTION (coding))
5966 detect_coding (coding);
5967 attrs = CODING_ID_ATTRS (coding->id);
5968
5969 if (! NILP (CODING_ATTR_POST_READ (attrs))
5970 || EQ (dst_object, Qt))
5971 {
5972 coding->dst_object = make_conversion_work_buffer (1);
5973 coding->dst_pos = BEG;
5974 coding->dst_pos_byte = BEG_BYTE;
5975 coding->dst_multibyte = 1;
5976 }
5977 else if (BUFFERP (dst_object))
5978 {
5979 coding->dst_object = dst_object;
5980 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
5981 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
5982 coding->dst_multibyte
5983 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
5984 }
5985 else
5986 {
5987 coding->dst_object = Qnil;
5988 coding->dst_multibyte = 1;
5989 }
5990
5991 decode_coding (coding);
5992
5993 if (BUFFERP (coding->dst_object))
5994 set_buffer_internal (XBUFFER (coding->dst_object));
5995
5996 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5997 {
5998 struct gcpro gcpro1, gcpro2;
5999 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6000 Lisp_Object val;
6001
6002 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6003 GCPRO2 (coding->src_object, coding->dst_object);
6004 val = call1 (CODING_ATTR_POST_READ (attrs),
6005 make_number (coding->produced_char));
6006 UNGCPRO;
6007 CHECK_NATNUM (val);
6008 coding->produced_char += Z - prev_Z;
6009 coding->produced += Z_BYTE - prev_Z_BYTE;
6010 }
6011
6012 if (EQ (dst_object, Qt))
6013 {
6014 coding->dst_object = Fbuffer_string ();
6015 }
6016 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6017 {
6018 set_buffer_internal (XBUFFER (coding->dst_object));
6019 if (dst_bytes < coding->produced)
6020 {
6021 destination
6022 = (unsigned char *) xrealloc (destination, coding->produced);
6023 if (! destination)
6024 {
6025 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6026 unbind_to (count, Qnil);
6027 return;
6028 }
6029 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6030 move_gap_both (BEGV, BEGV_BYTE);
6031 bcopy (BEGV_ADDR, destination, coding->produced);
6032 coding->destination = destination;
6033 }
6034 }
6035
6036 unbind_to (count, Qnil);
6037 }
6038
6039
6040 void
6041 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6042 dst_object)
6043 struct coding_system *coding;
6044 Lisp_Object src_object;
6045 EMACS_INT from, from_byte, to, to_byte;
6046 Lisp_Object dst_object;
6047 {
6048 int count = specpdl_ptr - specpdl;
6049 EMACS_INT chars = to - from;
6050 EMACS_INT bytes = to_byte - from_byte;
6051 Lisp_Object attrs;
6052
6053 saved_coding = coding;
6054 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6055
6056 coding->src_object = src_object;
6057 coding->src_chars = chars;
6058 coding->src_bytes = bytes;
6059 coding->src_multibyte = chars < bytes;
6060
6061 attrs = CODING_ID_ATTRS (coding->id);
6062
6063 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6064 {
6065 coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
6066 set_buffer_internal (XBUFFER (coding->src_object));
6067 if (STRINGP (src_object))
6068 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6069 else if (BUFFERP (src_object))
6070 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6071 else
6072 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6073
6074 if (EQ (src_object, dst_object))
6075 {
6076 set_buffer_internal (XBUFFER (src_object));
6077 del_range_both (from, from_byte, to, to_byte, 1);
6078 set_buffer_internal (XBUFFER (coding->src_object));
6079 }
6080
6081 call2 (CODING_ATTR_PRE_WRITE (attrs),
6082 make_number (BEG), make_number (Z));
6083 coding->src_object = Fcurrent_buffer ();
6084 if (BEG != GPT)
6085 move_gap_both (BEG, BEG_BYTE);
6086 coding->src_chars = Z - BEG;
6087 coding->src_bytes = Z_BYTE - BEG_BYTE;
6088 coding->src_pos = BEG;
6089 coding->src_pos_byte = BEG_BYTE;
6090 coding->src_multibyte = Z < Z_BYTE;
6091 }
6092 else if (STRINGP (src_object))
6093 {
6094 coding->src_pos = from;
6095 coding->src_pos_byte = from_byte;
6096 }
6097 else if (BUFFERP (src_object))
6098 {
6099 set_buffer_internal (XBUFFER (src_object));
6100 if (from != GPT)
6101 move_gap_both (from, from_byte);
6102 if (EQ (src_object, dst_object))
6103 {
6104 del_range_both (from, from_byte, to, to_byte, 1);
6105 coding->src_pos = -chars;
6106 coding->src_pos_byte = -bytes;
6107 }
6108 else
6109 {
6110 coding->src_pos = from;
6111 coding->src_pos_byte = from_byte;
6112 }
6113 }
6114
6115 if (BUFFERP (dst_object))
6116 {
6117 coding->dst_object = dst_object;
6118 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6119 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6120 coding->dst_multibyte
6121 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6122 }
6123 else if (EQ (dst_object, Qt))
6124 {
6125 coding->dst_object = Qnil;
6126 coding->dst_bytes = coding->src_chars;
6127 if (coding->dst_bytes == 0)
6128 coding->dst_bytes = 1;
6129 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6130 coding->dst_multibyte = 0;
6131 }
6132 else
6133 {
6134 coding->dst_object = Qnil;
6135 coding->dst_multibyte = 0;
6136 }
6137
6138 encode_coding (coding);
6139
6140 if (EQ (dst_object, Qt))
6141 {
6142 if (BUFFERP (coding->dst_object))
6143 coding->dst_object = Fbuffer_string ();
6144 else
6145 {
6146 coding->dst_object
6147 = make_unibyte_string ((char *) coding->destination,
6148 coding->produced);
6149 xfree (coding->destination);
6150 }
6151 }
6152
6153 unbind_to (count, Qnil);
6154 }
6155
6156
6157 Lisp_Object
6158 preferred_coding_system ()
6159 {
6160 int id = coding_categories[coding_priorities[0]].id;
6161
6162 return CODING_ID_NAME (id);
6163 }
6164
6165 \f
6166 #ifdef emacs
6167 /*** 8. Emacs Lisp library functions ***/
6168
6169 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6170 doc: /* Return t if OBJECT is nil or a coding-system.
6171 See the documentation of `define-coding-system' for information
6172 about coding-system objects. */)
6173 (obj)
6174 Lisp_Object obj;
6175 {
6176 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6177 }
6178
6179 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6180 Sread_non_nil_coding_system, 1, 1, 0,
6181 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6182 (prompt)
6183 Lisp_Object prompt;
6184 {
6185 Lisp_Object val;
6186 do
6187 {
6188 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6189 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6190 }
6191 while (XSTRING (val)->size == 0);
6192 return (Fintern (val, Qnil));
6193 }
6194
6195 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6196 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6197 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6198 (prompt, default_coding_system)
6199 Lisp_Object prompt, default_coding_system;
6200 {
6201 Lisp_Object val;
6202 if (SYMBOLP (default_coding_system))
6203 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6204 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6205 Qt, Qnil, Qcoding_system_history,
6206 default_coding_system, Qnil);
6207 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6208 }
6209
6210 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6211 1, 1, 0,
6212 doc: /* Check validity of CODING-SYSTEM.
6213 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6214 It is valid if it is a symbol with a non-nil `coding-system' property.
6215 The value of property should be a vector of length 5. */)
6216 (coding_system)
6217 Lisp_Object coding_system;
6218 {
6219 CHECK_SYMBOL (coding_system);
6220 if (!NILP (Fcoding_system_p (coding_system)))
6221 return coding_system;
6222 while (1)
6223 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6224 }
6225
6226 \f
6227 Lisp_Object
6228 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6229 unsigned char *src;
6230 int src_bytes, highest;
6231 int multibytep;
6232 Lisp_Object coding_system;
6233 {
6234 unsigned char *src_end = src + src_bytes;
6235 int mask = CATEGORY_MASK_ANY;
6236 int detected = 0;
6237 int c, i;
6238 Lisp_Object attrs, eol_type;
6239 Lisp_Object val;
6240 struct coding_system coding;
6241
6242 if (NILP (coding_system))
6243 coding_system = Qundecided;
6244 setup_coding_system (coding_system, &coding);
6245 attrs = CODING_ID_ATTRS (coding.id);
6246 eol_type = CODING_ID_EOL_TYPE (coding.id);
6247
6248 coding.source = src;
6249 coding.src_bytes = src_bytes;
6250 coding.src_multibyte = multibytep;
6251 coding.consumed = 0;
6252
6253 if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
6254 {
6255 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6256 }
6257 else
6258 {
6259 coding_system = Qnil;
6260 for (; src < src_end; src++)
6261 {
6262 c = *src;
6263 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6264 || c == ISO_CODE_SI
6265 || c == ISO_CODE_SO)))
6266 break;
6267 }
6268 coding.head_ascii = src - coding.source;
6269
6270 if (src < src_end)
6271 for (i = 0; i < coding_category_raw_text; i++)
6272 {
6273 enum coding_category category = coding_priorities[i];
6274 struct coding_system *this = coding_categories + category;
6275
6276 if (category >= coding_category_raw_text
6277 || detected & (1 << category))
6278 continue;
6279
6280 if (this->id < 0)
6281 {
6282 /* No coding system of this category is defined. */
6283 mask &= ~(1 << category);
6284 }
6285 else
6286 {
6287 detected |= detected_mask[category];
6288 if ((*(coding_categories[category].detector)) (&coding, &mask)
6289 && highest)
6290 {
6291 mask &= detected_mask[category];
6292 break;
6293 }
6294 }
6295 }
6296 }
6297
6298 if (!mask)
6299 val = Fcons (make_number (coding_category_raw_text), Qnil);
6300 else if (mask == CATEGORY_MASK_ANY)
6301 val = Fcons (make_number (coding_category_undecided), Qnil);
6302 else if (highest)
6303 {
6304 for (i = 0; i < coding_category_raw_text; i++)
6305 if (mask & (1 << coding_priorities[i]))
6306 {
6307 val = Fcons (make_number (coding_priorities[i]), Qnil);
6308 break;
6309 }
6310 }
6311 else
6312 {
6313 val = Qnil;
6314 for (i = coding_category_raw_text - 1; i >= 0; i--)
6315 if (mask & (1 << coding_priorities[i]))
6316 val = Fcons (make_number (coding_priorities[i]), val);
6317 }
6318
6319 {
6320 int one_byte_eol = -1, two_byte_eol = -1;
6321 Lisp_Object tail;
6322
6323 for (tail = val; CONSP (tail); tail = XCDR (tail))
6324 {
6325 struct coding_system *this
6326 = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6327 : &coding);
6328 int this_eol;
6329
6330 attrs = CODING_ID_ATTRS (this->id);
6331 eol_type = CODING_ID_EOL_TYPE (this->id);
6332 XSETCAR (tail, CODING_ID_NAME (this->id));
6333 if (VECTORP (eol_type))
6334 {
6335 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6336 {
6337 if (two_byte_eol < 0)
6338 two_byte_eol = detect_eol (this, coding.source, src_bytes);
6339 this_eol = two_byte_eol;
6340 }
6341 else
6342 {
6343 if (one_byte_eol < 0)
6344 one_byte_eol =detect_eol (this, coding.source, src_bytes);
6345 this_eol = one_byte_eol;
6346 }
6347 if (this_eol == EOL_SEEN_LF)
6348 XSETCAR (tail, AREF (eol_type, 0));
6349 else if (this_eol == EOL_SEEN_CRLF)
6350 XSETCAR (tail, AREF (eol_type, 1));
6351 else if (this_eol == EOL_SEEN_CR)
6352 XSETCAR (tail, AREF (eol_type, 2));
6353 }
6354 }
6355 }
6356
6357 return (highest ? XCAR (val) : val);
6358 }
6359
6360
6361 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6362 2, 3, 0,
6363 doc: /* Detect coding system of the text in the region between START and END.
6364 Return a list of possible coding systems ordered by priority.
6365
6366 If only ASCII characters are found, it returns a list of single element
6367 `undecided' or its subsidiary coding system according to a detected
6368 end-of-line format.
6369
6370 If optional argument HIGHEST is non-nil, return the coding system of
6371 highest priority. */)
6372 (start, end, highest)
6373 Lisp_Object start, end, highest;
6374 {
6375 int from, to;
6376 int from_byte, to_byte;
6377
6378 CHECK_NUMBER_COERCE_MARKER (start);
6379 CHECK_NUMBER_COERCE_MARKER (end);
6380
6381 validate_region (&start, &end);
6382 from = XINT (start), to = XINT (end);
6383 from_byte = CHAR_TO_BYTE (from);
6384 to_byte = CHAR_TO_BYTE (to);
6385
6386 if (from < GPT && to >= GPT)
6387 move_gap_both (to, to_byte);
6388
6389 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6390 to_byte - from_byte,
6391 !NILP (highest),
6392 !NILP (current_buffer
6393 ->enable_multibyte_characters),
6394 Qnil);
6395 }
6396
6397 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6398 1, 2, 0,
6399 doc: /* Detect coding system of the text in STRING.
6400 Return a list of possible coding systems ordered by priority.
6401
6402 If only ASCII characters are found, it returns a list of single element
6403 `undecided' or its subsidiary coding system according to a detected
6404 end-of-line format.
6405
6406 If optional argument HIGHEST is non-nil, return the coding system of
6407 highest priority. */)
6408 (string, highest)
6409 Lisp_Object string, highest;
6410 {
6411 CHECK_STRING (string);
6412
6413 return detect_coding_system (XSTRING (string)->data,
6414 STRING_BYTES (XSTRING (string)),
6415 !NILP (highest),
6416 STRING_MULTIBYTE (string),
6417 Qnil);
6418 }
6419
6420
6421 static INLINE int
6422 char_encodable_p (c, attrs)
6423 int c;
6424 Lisp_Object attrs;
6425 {
6426 Lisp_Object tail;
6427 struct charset *charset;
6428
6429 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6430 CONSP (tail); tail = XCDR (tail))
6431 {
6432 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6433 if (CHAR_CHARSET_P (c, charset))
6434 break;
6435 }
6436 return (! NILP (tail));
6437 }
6438
6439
6440 /* Return a list of coding systems that safely encode the text between
6441 START and END. If EXCLUDE is non-nil, it is a list of coding
6442 systems not to check. The returned list doesn't contain any such
6443 coding systems. In any case, if the text contains only ASCII or is
6444 unibyte, return t. */
6445
6446 DEFUN ("find-coding-systems-region-internal",
6447 Ffind_coding_systems_region_internal,
6448 Sfind_coding_systems_region_internal, 2, 3, 0,
6449 doc: /* Internal use only. */)
6450 (start, end, exclude)
6451 Lisp_Object start, end, exclude;
6452 {
6453 Lisp_Object coding_attrs_list, safe_codings;
6454 EMACS_INT start_byte, end_byte;
6455 const unsigned char *p, *pbeg, *pend;
6456 int c;
6457 Lisp_Object tail, elt;
6458
6459 if (STRINGP (start))
6460 {
6461 if (!STRING_MULTIBYTE (start)
6462 || XSTRING (start)->size == STRING_BYTES (XSTRING (start)))
6463 return Qt;
6464 start_byte = 0;
6465 end_byte = STRING_BYTES (XSTRING (start));
6466 }
6467 else
6468 {
6469 CHECK_NUMBER_COERCE_MARKER (start);
6470 CHECK_NUMBER_COERCE_MARKER (end);
6471 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6472 args_out_of_range (start, end);
6473 if (NILP (current_buffer->enable_multibyte_characters))
6474 return Qt;
6475 start_byte = CHAR_TO_BYTE (XINT (start));
6476 end_byte = CHAR_TO_BYTE (XINT (end));
6477 if (XINT (end) - XINT (start) == end_byte - start_byte)
6478 return Qt;
6479
6480 if (start < GPT && end > GPT)
6481 {
6482 if ((GPT - start) < (end - GPT))
6483 move_gap_both (start, start_byte);
6484 else
6485 move_gap_both (end, end_byte);
6486 }
6487 }
6488
6489 coding_attrs_list = Qnil;
6490 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6491 if (NILP (exclude)
6492 || NILP (Fmemq (XCAR (tail), exclude)))
6493 {
6494 Lisp_Object attrs;
6495
6496 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6497 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6498 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6499 coding_attrs_list = Fcons (attrs, coding_attrs_list);
6500 }
6501
6502 if (STRINGP (start))
6503 p = pbeg = XSTRING (start)->data;
6504 else
6505 p = pbeg = BYTE_POS_ADDR (start_byte);
6506 pend = p + (end_byte - start_byte);
6507
6508 while (p < pend && ASCII_BYTE_P (*p)) p++;
6509 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6510
6511 while (p < pend)
6512 {
6513 if (ASCII_BYTE_P (*p))
6514 p++;
6515 else
6516 {
6517 c = STRING_CHAR_ADVANCE (p);
6518
6519 charset_map_loaded = 0;
6520 for (tail = coding_attrs_list; CONSP (tail);)
6521 {
6522 elt = XCAR (tail);
6523 if (NILP (elt))
6524 tail = XCDR (tail);
6525 else if (char_encodable_p (c, elt))
6526 tail = XCDR (tail);
6527 else if (CONSP (XCDR (tail)))
6528 {
6529 XSETCAR (tail, XCAR (XCDR (tail)));
6530 XSETCDR (tail, XCDR (XCDR (tail)));
6531 }
6532 else
6533 {
6534 XSETCAR (tail, Qnil);
6535 tail = XCDR (tail);
6536 }
6537 }
6538 if (charset_map_loaded)
6539 {
6540 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6541
6542 if (STRINGP (start))
6543 pbeg = XSTRING (start)->data;
6544 else
6545 pbeg = BYTE_POS_ADDR (start_byte);
6546 p = pbeg + p_offset;
6547 pend = pbeg + pend_offset;
6548 }
6549 }
6550 }
6551
6552 safe_codings = Qnil;
6553 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6554 if (! NILP (XCAR (tail)))
6555 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6556
6557 return safe_codings;
6558 }
6559
6560
6561 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6562 Scheck_coding_systems_region, 3, 3, 0,
6563 doc: /* Check if the region is encodable by coding systems.
6564
6565 START and END are buffer positions specifying the region.
6566 CODING-SYSTEM-LIST is a list of coding systems to check.
6567
6568 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6569 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6570 whole region, POS0, POS1, ... are buffer positions where non-encodable
6571 characters are found.
6572
6573 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6574 value is nil.
6575
6576 START may be a string. In that case, check if the string is
6577 encodable, and the value contains indices to the string instead of
6578 buffer positions. END is ignored. */)
6579 (start, end, coding_system_list)
6580 Lisp_Object start, end, coding_system_list;
6581 {
6582 Lisp_Object list;
6583 EMACS_INT start_byte, end_byte;
6584 int pos;
6585 const unsigned char *p, *pbeg, *pend;
6586 int c;
6587 Lisp_Object tail, elt;
6588
6589 if (STRINGP (start))
6590 {
6591 if (!STRING_MULTIBYTE (start)
6592 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6593 return Qnil;
6594 start_byte = 0;
6595 end_byte = STRING_BYTES (XSTRING (start));
6596 pos = 0;
6597 }
6598 else
6599 {
6600 CHECK_NUMBER_COERCE_MARKER (start);
6601 CHECK_NUMBER_COERCE_MARKER (end);
6602 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6603 args_out_of_range (start, end);
6604 if (NILP (current_buffer->enable_multibyte_characters))
6605 return Qnil;
6606 start_byte = CHAR_TO_BYTE (XINT (start));
6607 end_byte = CHAR_TO_BYTE (XINT (end));
6608 if (XINT (end) - XINT (start) == end_byte - start_byte)
6609 return Qt;
6610
6611 if (start < GPT && end > GPT)
6612 {
6613 if ((GPT - start) < (end - GPT))
6614 move_gap_both (start, start_byte);
6615 else
6616 move_gap_both (end, end_byte);
6617 }
6618 pos = start;
6619 }
6620
6621 list = Qnil;
6622 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6623 {
6624 elt = XCAR (tail);
6625 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6626 Qnil)),
6627 list);
6628 }
6629
6630 if (STRINGP (start))
6631 p = pbeg = XSTRING (start)->data;
6632 else
6633 p = pbeg = BYTE_POS_ADDR (start_byte);
6634 pend = p + (end_byte - start_byte);
6635
6636 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6637 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6638
6639 while (p < pend)
6640 {
6641 if (ASCII_BYTE_P (*p))
6642 p++;
6643 else
6644 {
6645 c = STRING_CHAR_ADVANCE (p);
6646
6647 charset_map_loaded = 0;
6648 for (tail = list; CONSP (tail); tail = XCDR (tail))
6649 {
6650 elt = XCDR (XCAR (tail));
6651 if (! char_encodable_p (c, XCAR (elt)))
6652 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6653 }
6654 if (charset_map_loaded)
6655 {
6656 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6657
6658 if (STRINGP (start))
6659 pbeg = XSTRING (start)->data;
6660 else
6661 pbeg = BYTE_POS_ADDR (start_byte);
6662 p = pbeg + p_offset;
6663 pend = pbeg + pend_offset;
6664 }
6665 }
6666 pos++;
6667 }
6668
6669 tail = list;
6670 list = Qnil;
6671 for (; CONSP (tail); tail = XCDR (tail))
6672 {
6673 elt = XCAR (tail);
6674 if (CONSP (XCDR (XCDR (elt))))
6675 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6676 list);
6677 }
6678
6679 return list;
6680 }
6681
6682
6683
6684 Lisp_Object
6685 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6686 Lisp_Object start, end, coding_system, dst_object;
6687 int encodep, norecord;
6688 {
6689 struct coding_system coding;
6690 EMACS_INT from, from_byte, to, to_byte;
6691 Lisp_Object src_object;
6692
6693 CHECK_NUMBER_COERCE_MARKER (start);
6694 CHECK_NUMBER_COERCE_MARKER (end);
6695 if (NILP (coding_system))
6696 coding_system = Qno_conversion;
6697 else
6698 CHECK_CODING_SYSTEM (coding_system);
6699 src_object = Fcurrent_buffer ();
6700 if (NILP (dst_object))
6701 dst_object = src_object;
6702 else if (! EQ (dst_object, Qt))
6703 CHECK_BUFFER (dst_object);
6704
6705 validate_region (&start, &end);
6706 from = XFASTINT (start);
6707 from_byte = CHAR_TO_BYTE (from);
6708 to = XFASTINT (end);
6709 to_byte = CHAR_TO_BYTE (to);
6710
6711 setup_coding_system (coding_system, &coding);
6712 coding.mode |= CODING_MODE_LAST_BLOCK;
6713
6714 if (encodep)
6715 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6716 dst_object);
6717 else
6718 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6719 dst_object);
6720 if (! norecord)
6721 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6722
6723 if (coding.result != CODING_RESULT_SUCCESS)
6724 error ("Code conversion error: %d", coding.result);
6725
6726 return (BUFFERP (dst_object)
6727 ? make_number (coding.produced_char)
6728 : coding.dst_object);
6729 }
6730
6731
6732 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6733 3, 4, "r\nzCoding system: ",
6734 doc: /* Decode the current region from the specified coding system.
6735 When called from a program, takes four arguments:
6736 START, END, CODING-SYSTEM, and DESTINATION.
6737 START and END are buffer positions.
6738
6739 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6740 If nil, the region between START and END is replace by the decoded text.
6741 If buffer, the decoded text is inserted in the buffer.
6742 If t, the decoded text is returned.
6743
6744 This function sets `last-coding-system-used' to the precise coding system
6745 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6746 not fully specified.)
6747 It returns the length of the decoded text. */)
6748 (start, end, coding_system, destination)
6749 Lisp_Object start, end, coding_system, destination;
6750 {
6751 return code_convert_region (start, end, coding_system, destination, 0, 0);
6752 }
6753
6754 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6755 3, 4, "r\nzCoding system: ",
6756 doc: /* Encode the current region by specified coding system.
6757 When called from a program, takes three arguments:
6758 START, END, and CODING-SYSTEM. START and END are buffer positions.
6759
6760 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6761 If nil, the region between START and END is replace by the encoded text.
6762 If buffer, the encoded text is inserted in the buffer.
6763 If t, the encoded text is returned.
6764
6765 This function sets `last-coding-system-used' to the precise coding system
6766 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6767 not fully specified.)
6768 It returns the length of the encoded text. */)
6769 (start, end, coding_system, destination)
6770 Lisp_Object start, end, coding_system, destination;
6771 {
6772 return code_convert_region (start, end, coding_system, destination, 1, 0);
6773 }
6774
6775 Lisp_Object
6776 code_convert_string (string, coding_system, dst_object,
6777 encodep, nocopy, norecord)
6778 Lisp_Object string, coding_system, dst_object;
6779 int encodep, nocopy, norecord;
6780 {
6781 struct coding_system coding;
6782 EMACS_INT chars, bytes;
6783
6784 CHECK_STRING (string);
6785 if (NILP (coding_system))
6786 {
6787 if (! norecord)
6788 Vlast_coding_system_used = Qno_conversion;
6789 if (NILP (dst_object))
6790 return (nocopy ? Fcopy_sequence (string) : string);
6791 }
6792
6793 if (NILP (coding_system))
6794 coding_system = Qno_conversion;
6795 else
6796 CHECK_CODING_SYSTEM (coding_system);
6797 if (NILP (dst_object))
6798 dst_object = Qt;
6799 else if (! EQ (dst_object, Qt))
6800 CHECK_BUFFER (dst_object);
6801
6802 setup_coding_system (coding_system, &coding);
6803 coding.mode |= CODING_MODE_LAST_BLOCK;
6804 chars = XSTRING (string)->size;
6805 bytes = STRING_BYTES (XSTRING (string));
6806 if (encodep)
6807 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6808 else
6809 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6810 if (! norecord)
6811 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6812
6813 if (coding.result != CODING_RESULT_SUCCESS)
6814 error ("Code conversion error: %d", coding.result);
6815
6816 return (BUFFERP (dst_object)
6817 ? make_number (coding.produced_char)
6818 : coding.dst_object);
6819 }
6820
6821
6822 /* Encode or decode STRING according to CODING_SYSTEM.
6823 Do not set Vlast_coding_system_used.
6824
6825 This function is called only from macros DECODE_FILE and
6826 ENCODE_FILE, thus we ignore character composition. */
6827
6828 Lisp_Object
6829 code_convert_string_norecord (string, coding_system, encodep)
6830 Lisp_Object string, coding_system;
6831 int encodep;
6832 {
6833 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
6834 }
6835
6836
6837 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6838 2, 4, 0,
6839 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6840
6841 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6842 if the decoding operation is trivial.
6843
6844 Optional fourth arg BUFFER non-nil meant that the decoded text is
6845 inserted in BUFFER instead of returned as a string. In this case,
6846 the return value is BUFFER.
6847
6848 This function sets `last-coding-system-used' to the precise coding system
6849 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6850 not fully specified. */)
6851 (string, coding_system, nocopy, buffer)
6852 Lisp_Object string, coding_system, nocopy, buffer;
6853 {
6854 return code_convert_string (string, coding_system, buffer,
6855 0, ! NILP (nocopy), 0);
6856 }
6857
6858 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6859 2, 4, 0,
6860 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6861
6862 Optional third arg NOCOPY non-nil means it is OK to return STRING
6863 itself if the encoding operation is trivial.
6864
6865 Optional fourth arg BUFFER non-nil meant that the encoded text is
6866 inserted in BUFFER instead of returned as a string. In this case,
6867 the return value is BUFFER.
6868
6869 This function sets `last-coding-system-used' to the precise coding system
6870 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6871 not fully specified.) */)
6872 (string, coding_system, nocopy, buffer)
6873 Lisp_Object string, coding_system, nocopy, buffer;
6874 {
6875 return code_convert_string (string, coding_system, buffer,
6876 1, ! NILP (nocopy), 1);
6877 }
6878
6879 \f
6880 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6881 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6882 Return the corresponding character. */)
6883 (code)
6884 Lisp_Object code;
6885 {
6886 Lisp_Object spec, attrs, val;
6887 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6888 int c;
6889
6890 CHECK_NATNUM (code);
6891 c = XFASTINT (code);
6892 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6893 attrs = AREF (spec, 0);
6894
6895 if (ASCII_BYTE_P (c)
6896 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6897 return code;
6898
6899 val = CODING_ATTR_CHARSET_LIST (attrs);
6900 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6901 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6902 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
6903
6904 if (c <= 0x7F)
6905 charset = charset_roman;
6906 else if (c >= 0xA0 && c < 0xDF)
6907 {
6908 charset = charset_kana;
6909 c -= 0x80;
6910 }
6911 else
6912 {
6913 int s1 = c >> 8, s2 = c & 0xFF;
6914
6915 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6916 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6917 error ("Invalid code: %d", code);
6918 SJIS_TO_JIS (c);
6919 charset = charset_kanji;
6920 }
6921 c = DECODE_CHAR (charset, c);
6922 if (c < 0)
6923 error ("Invalid code: %d", code);
6924 return make_number (c);
6925 }
6926
6927
6928 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6929 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6930 Return the corresponding code in SJIS. */)
6931 (ch)
6932 Lisp_Object ch;
6933 {
6934 Lisp_Object spec, attrs, charset_list;
6935 int c;
6936 struct charset *charset;
6937 unsigned code;
6938
6939 CHECK_CHARACTER (ch);
6940 c = XFASTINT (ch);
6941 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6942 attrs = AREF (spec, 0);
6943
6944 if (ASCII_CHAR_P (c)
6945 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6946 return ch;
6947
6948 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6949 charset = char_charset (c, charset_list, &code);
6950 if (code == CHARSET_INVALID_CODE (charset))
6951 error ("Can't encode by shift_jis encoding: %d", c);
6952 JIS_TO_SJIS (code);
6953
6954 return make_number (code);
6955 }
6956
6957 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6958 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6959 Return the corresponding character. */)
6960 (code)
6961 Lisp_Object code;
6962 {
6963 Lisp_Object spec, attrs, val;
6964 struct charset *charset_roman, *charset_big5, *charset;
6965 int c;
6966
6967 CHECK_NATNUM (code);
6968 c = XFASTINT (code);
6969 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6970 attrs = AREF (spec, 0);
6971
6972 if (ASCII_BYTE_P (c)
6973 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6974 return code;
6975
6976 val = CODING_ATTR_CHARSET_LIST (attrs);
6977 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6978 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
6979
6980 if (c <= 0x7F)
6981 charset = charset_roman;
6982 else
6983 {
6984 int b1 = c >> 8, b2 = c & 0x7F;
6985 if (b1 < 0xA1 || b1 > 0xFE
6986 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
6987 error ("Invalid code: %d", code);
6988 charset = charset_big5;
6989 }
6990 c = DECODE_CHAR (charset, (unsigned )c);
6991 if (c < 0)
6992 error ("Invalid code: %d", code);
6993 return make_number (c);
6994 }
6995
6996 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6997 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6998 Return the corresponding character code in Big5. */)
6999 (ch)
7000 Lisp_Object ch;
7001 {
7002 Lisp_Object spec, attrs, charset_list;
7003 struct charset *charset;
7004 int c;
7005 unsigned code;
7006
7007 CHECK_CHARACTER (ch);
7008 c = XFASTINT (ch);
7009 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7010 attrs = AREF (spec, 0);
7011 if (ASCII_CHAR_P (c)
7012 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7013 return ch;
7014
7015 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7016 charset = char_charset (c, charset_list, &code);
7017 if (code == CHARSET_INVALID_CODE (charset))
7018 error ("Can't encode by Big5 encoding: %d", c);
7019
7020 return make_number (code);
7021 }
7022
7023 \f
7024 DEFUN ("set-terminal-coding-system-internal",
7025 Fset_terminal_coding_system_internal,
7026 Sset_terminal_coding_system_internal, 1, 1, 0,
7027 doc: /* Internal use only. */)
7028 (coding_system)
7029 Lisp_Object coding_system;
7030 {
7031 CHECK_SYMBOL (coding_system);
7032 setup_coding_system (Fcheck_coding_system (coding_system),
7033 &terminal_coding);
7034
7035 /* We had better not send unsafe characters to terminal. */
7036 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7037 /* Characer composition should be disabled. */
7038 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7039 terminal_coding.src_multibyte = 1;
7040 terminal_coding.dst_multibyte = 0;
7041 return Qnil;
7042 }
7043
7044 DEFUN ("set-safe-terminal-coding-system-internal",
7045 Fset_safe_terminal_coding_system_internal,
7046 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7047 doc: /* Internal use only. */)
7048 (coding_system)
7049 Lisp_Object coding_system;
7050 {
7051 CHECK_SYMBOL (coding_system);
7052 setup_coding_system (Fcheck_coding_system (coding_system),
7053 &safe_terminal_coding);
7054 /* Characer composition should be disabled. */
7055 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7056 safe_terminal_coding.src_multibyte = 1;
7057 safe_terminal_coding.dst_multibyte = 0;
7058 return Qnil;
7059 }
7060
7061 DEFUN ("terminal-coding-system",
7062 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7063 doc: /* Return coding system specified for terminal output. */)
7064 ()
7065 {
7066 return CODING_ID_NAME (terminal_coding.id);
7067 }
7068
7069 DEFUN ("set-keyboard-coding-system-internal",
7070 Fset_keyboard_coding_system_internal,
7071 Sset_keyboard_coding_system_internal, 1, 1, 0,
7072 doc: /* Internal use only. */)
7073 (coding_system)
7074 Lisp_Object coding_system;
7075 {
7076 CHECK_SYMBOL (coding_system);
7077 setup_coding_system (Fcheck_coding_system (coding_system),
7078 &keyboard_coding);
7079 /* Characer composition should be disabled. */
7080 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7081 return Qnil;
7082 }
7083
7084 DEFUN ("keyboard-coding-system",
7085 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7086 doc: /* Return coding system specified for decoding keyboard input. */)
7087 ()
7088 {
7089 return CODING_ID_NAME (keyboard_coding.id);
7090 }
7091
7092 \f
7093 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7094 Sfind_operation_coding_system, 1, MANY, 0,
7095 doc: /* Choose a coding system for an operation based on the target name.
7096 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7097 DECODING-SYSTEM is the coding system to use for decoding
7098 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7099 for encoding (in case OPERATION does encoding).
7100
7101 The first argument OPERATION specifies an I/O primitive:
7102 For file I/O, `insert-file-contents' or `write-region'.
7103 For process I/O, `call-process', `call-process-region', or `start-process'.
7104 For network I/O, `open-network-stream'.
7105
7106 The remaining arguments should be the same arguments that were passed
7107 to the primitive. Depending on which primitive, one of those arguments
7108 is selected as the TARGET. For example, if OPERATION does file I/O,
7109 whichever argument specifies the file name is TARGET.
7110
7111 TARGET has a meaning which depends on OPERATION:
7112 For file I/O, TARGET is a file name.
7113 For process I/O, TARGET is a process name.
7114 For network I/O, TARGET is a service name or a port number
7115
7116 This function looks up what specified for TARGET in,
7117 `file-coding-system-alist', `process-coding-system-alist',
7118 or `network-coding-system-alist' depending on OPERATION.
7119 They may specify a coding system, a cons of coding systems,
7120 or a function symbol to call.
7121 In the last case, we call the function with one argument,
7122 which is a list of all the arguments given to this function.
7123
7124 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7125 (nargs, args)
7126 int nargs;
7127 Lisp_Object *args;
7128 {
7129 Lisp_Object operation, target_idx, target, val;
7130 register Lisp_Object chain;
7131
7132 if (nargs < 2)
7133 error ("Too few arguments");
7134 operation = args[0];
7135 if (!SYMBOLP (operation)
7136 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7137 error ("Invalid first arguement");
7138 if (nargs < 1 + XINT (target_idx))
7139 error ("Too few arguments for operation: %s",
7140 XSYMBOL (operation)->name->data);
7141 target = args[XINT (target_idx) + 1];
7142 if (!(STRINGP (target)
7143 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7144 error ("Invalid %dth argument", XINT (target_idx) + 1);
7145
7146 chain = ((EQ (operation, Qinsert_file_contents)
7147 || EQ (operation, Qwrite_region))
7148 ? Vfile_coding_system_alist
7149 : (EQ (operation, Qopen_network_stream)
7150 ? Vnetwork_coding_system_alist
7151 : Vprocess_coding_system_alist));
7152 if (NILP (chain))
7153 return Qnil;
7154
7155 for (; CONSP (chain); chain = XCDR (chain))
7156 {
7157 Lisp_Object elt;
7158
7159 elt = XCAR (chain);
7160 if (CONSP (elt)
7161 && ((STRINGP (target)
7162 && STRINGP (XCAR (elt))
7163 && fast_string_match (XCAR (elt), target) >= 0)
7164 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7165 {
7166 val = XCDR (elt);
7167 /* Here, if VAL is both a valid coding system and a valid
7168 function symbol, we return VAL as a coding system. */
7169 if (CONSP (val))
7170 return val;
7171 if (! SYMBOLP (val))
7172 return Qnil;
7173 if (! NILP (Fcoding_system_p (val)))
7174 return Fcons (val, val);
7175 if (! NILP (Ffboundp (val)))
7176 {
7177 val = call1 (val, Flist (nargs, args));
7178 if (CONSP (val))
7179 return val;
7180 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7181 return Fcons (val, val);
7182 }
7183 return Qnil;
7184 }
7185 }
7186 return Qnil;
7187 }
7188
7189 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7190 Sset_coding_system_priority, 0, MANY, 0,
7191 doc: /* Assign higher priority to the coding systems given as arguments.
7192 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
7193 (nargs, args)
7194 int nargs;
7195 Lisp_Object *args;
7196 {
7197 int i, j;
7198 int changed[coding_category_max];
7199 enum coding_category priorities[coding_category_max];
7200
7201 bzero (changed, sizeof changed);
7202
7203 for (i = j = 0; i < nargs; i++)
7204 {
7205 enum coding_category category;
7206 Lisp_Object spec, attrs;
7207
7208 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7209 attrs = AREF (spec, 0);
7210 category = XINT (CODING_ATTR_CATEGORY (attrs));
7211 if (changed[category])
7212 /* Ignore this coding system because a coding system of the
7213 same category already had a higher priority. */
7214 continue;
7215 changed[category] = 1;
7216 priorities[j++] = category;
7217 if (coding_categories[category].id >= 0
7218 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7219 setup_coding_system (args[i], &coding_categories[category]);
7220 }
7221
7222 /* Now we have decided top J priorities. Reflect the order of the
7223 original priorities to the remaining priorities. */
7224
7225 for (i = j, j = 0; i < coding_category_max; i++, j++)
7226 {
7227 while (j < coding_category_max
7228 && changed[coding_priorities[j]])
7229 j++;
7230 if (j == coding_category_max)
7231 abort ();
7232 priorities[i] = coding_priorities[j];
7233 }
7234
7235 bcopy (priorities, coding_priorities, sizeof priorities);
7236 return Qnil;
7237 }
7238
7239 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7240 Scoding_system_priority_list, 0, 1, 0,
7241 doc: /* Return a list of coding systems ordered by their priorities.
7242 HIGHESTP non-nil means just return the highest priority one. */)
7243 (highestp)
7244 Lisp_Object highestp;
7245 {
7246 int i;
7247 Lisp_Object val;
7248
7249 for (i = 0, val = Qnil; i < coding_category_max; i++)
7250 {
7251 enum coding_category category = coding_priorities[i];
7252 int id = coding_categories[category].id;
7253 Lisp_Object attrs;
7254
7255 if (id < 0)
7256 continue;
7257 attrs = CODING_ID_ATTRS (id);
7258 if (! NILP (highestp))
7259 return CODING_ATTR_BASE_NAME (attrs);
7260 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7261 }
7262 return Fnreverse (val);
7263 }
7264
7265 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7266
7267 static Lisp_Object
7268 make_subsidiaries (base)
7269 Lisp_Object base;
7270 {
7271 Lisp_Object subsidiaries;
7272 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7273 char *buf = (char *) alloca (base_name_len + 6);
7274 int i;
7275
7276 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7277 subsidiaries = Fmake_vector (make_number (3), Qnil);
7278 for (i = 0; i < 3; i++)
7279 {
7280 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7281 ASET (subsidiaries, i, intern (buf));
7282 }
7283 return subsidiaries;
7284 }
7285
7286
7287 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7288 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7289 doc: /* For internal use only.
7290 usage: (define-coding-system-internal ...) */)
7291 (nargs, args)
7292 int nargs;
7293 Lisp_Object *args;
7294 {
7295 Lisp_Object name;
7296 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7297 Lisp_Object attrs; /* Vector of attributes. */
7298 Lisp_Object eol_type;
7299 Lisp_Object aliases;
7300 Lisp_Object coding_type, charset_list, safe_charsets;
7301 enum coding_category category;
7302 Lisp_Object tail, val;
7303 int max_charset_id = 0;
7304 int i;
7305
7306 if (nargs < coding_arg_max)
7307 goto short_args;
7308
7309 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7310
7311 name = args[coding_arg_name];
7312 CHECK_SYMBOL (name);
7313 CODING_ATTR_BASE_NAME (attrs) = name;
7314
7315 val = args[coding_arg_mnemonic];
7316 if (! STRINGP (val))
7317 CHECK_CHARACTER (val);
7318 CODING_ATTR_MNEMONIC (attrs) = val;
7319
7320 coding_type = args[coding_arg_coding_type];
7321 CHECK_SYMBOL (coding_type);
7322 CODING_ATTR_TYPE (attrs) = coding_type;
7323
7324 charset_list = args[coding_arg_charset_list];
7325 if (SYMBOLP (charset_list))
7326 {
7327 if (EQ (charset_list, Qiso_2022))
7328 {
7329 if (! EQ (coding_type, Qiso_2022))
7330 error ("Invalid charset-list");
7331 charset_list = Viso_2022_charset_list;
7332 }
7333 else if (EQ (charset_list, Qemacs_mule))
7334 {
7335 if (! EQ (coding_type, Qemacs_mule))
7336 error ("Invalid charset-list");
7337 charset_list = Vemacs_mule_charset_list;
7338 }
7339 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7340 if (max_charset_id < XFASTINT (XCAR (tail)))
7341 max_charset_id = XFASTINT (XCAR (tail));
7342 }
7343 else
7344 {
7345 charset_list = Fcopy_sequence (charset_list);
7346 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7347 {
7348 struct charset *charset;
7349
7350 val = Fcar (tail);
7351 CHECK_CHARSET_GET_CHARSET (val, charset);
7352 if (EQ (coding_type, Qiso_2022)
7353 ? CHARSET_ISO_FINAL (charset) < 0
7354 : EQ (coding_type, Qemacs_mule)
7355 ? CHARSET_EMACS_MULE_ID (charset) < 0
7356 : 0)
7357 error ("Can't handle charset `%s'",
7358 XSYMBOL (CHARSET_NAME (charset))->name->data);
7359
7360 XCAR (tail) = make_number (charset->id);
7361 if (max_charset_id < charset->id)
7362 max_charset_id = charset->id;
7363 }
7364 }
7365 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7366
7367 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7368 make_number (255));
7369 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7370 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7371 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7372
7373 val = args[coding_arg_decode_translation_table];
7374 if (! NILP (val))
7375 CHECK_CHAR_TABLE (val);
7376 CODING_ATTR_DECODE_TBL (attrs) = val;
7377
7378 val = args[coding_arg_encode_translation_table];
7379 if (! NILP (val))
7380 CHECK_CHAR_TABLE (val);
7381 CODING_ATTR_ENCODE_TBL (attrs) = val;
7382
7383 val = args[coding_arg_post_read_conversion];
7384 CHECK_SYMBOL (val);
7385 CODING_ATTR_POST_READ (attrs) = val;
7386
7387 val = args[coding_arg_pre_write_conversion];
7388 CHECK_SYMBOL (val);
7389 CODING_ATTR_PRE_WRITE (attrs) = val;
7390
7391 val = args[coding_arg_default_char];
7392 if (NILP (val))
7393 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7394 else
7395 {
7396 CHECK_CHARACTER (val);
7397 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7398 }
7399
7400 val = args[coding_arg_plist];
7401 CHECK_LIST (val);
7402 CODING_ATTR_PLIST (attrs) = val;
7403
7404 if (EQ (coding_type, Qcharset))
7405 {
7406 /* Generate a lisp vector of 256 elements. Each element is nil,
7407 integer, or a list of charset IDs.
7408
7409 If Nth element is nil, the byte code N is invalid in this
7410 coding system.
7411
7412 If Nth element is a number NUM, N is the first byte of a
7413 charset whose ID is NUM.
7414
7415 If Nth element is a list of charset IDs, N is the first byte
7416 of one of them. The list is sorted by dimensions of the
7417 charsets. A charset of smaller dimension comes firtst.
7418 */
7419 val = Fmake_vector (make_number (256), Qnil);
7420
7421 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7422 {
7423 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7424 int dim = CHARSET_DIMENSION (charset);
7425 int idx = (dim - 1) * 4;
7426
7427 for (i = charset->code_space[idx];
7428 i <= charset->code_space[idx + 1]; i++)
7429 {
7430 Lisp_Object tmp, tmp2;
7431 int dim2;
7432
7433 tmp = AREF (val, i);
7434 if (NILP (tmp))
7435 tmp = XCAR (tail);
7436 else if (NUMBERP (tmp))
7437 {
7438 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
7439 if (dim < dim2)
7440 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
7441 else
7442 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
7443 }
7444 else
7445 {
7446 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
7447 {
7448 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
7449 if (dim < dim2)
7450 break;
7451 }
7452 if (NILP (tmp2))
7453 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
7454 else
7455 {
7456 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
7457 XSETCAR (tmp2, XCAR (tail));
7458 }
7459 }
7460 ASET (val, i, tmp);
7461 }
7462 }
7463 ASET (attrs, coding_attr_charset_valids, val);
7464 category = coding_category_charset;
7465 }
7466 else if (EQ (coding_type, Qccl))
7467 {
7468 Lisp_Object valids;
7469
7470 if (nargs < coding_arg_ccl_max)
7471 goto short_args;
7472
7473 val = args[coding_arg_ccl_decoder];
7474 CHECK_CCL_PROGRAM (val);
7475 if (VECTORP (val))
7476 val = Fcopy_sequence (val);
7477 ASET (attrs, coding_attr_ccl_decoder, val);
7478
7479 val = args[coding_arg_ccl_encoder];
7480 CHECK_CCL_PROGRAM (val);
7481 if (VECTORP (val))
7482 val = Fcopy_sequence (val);
7483 ASET (attrs, coding_attr_ccl_encoder, val);
7484
7485 val = args[coding_arg_ccl_valids];
7486 valids = Fmake_string (make_number (256), make_number (0));
7487 for (tail = val; !NILP (tail); tail = Fcdr (tail))
7488 {
7489 val = Fcar (tail);
7490 if (INTEGERP (val))
7491 ASET (valids, XINT (val), make_number (1));
7492 else
7493 {
7494 int from, to;
7495
7496 CHECK_CONS (val);
7497 CHECK_NUMBER (XCAR (val));
7498 CHECK_NUMBER (XCDR (val));
7499 from = XINT (XCAR (val));
7500 to = XINT (XCDR (val));
7501 for (i = from; i <= to; i++)
7502 ASET (valids, i, make_number (1));
7503 }
7504 }
7505 ASET (attrs, coding_attr_ccl_valids, valids);
7506
7507 category = coding_category_ccl;
7508 }
7509 else if (EQ (coding_type, Qutf_16))
7510 {
7511 Lisp_Object bom, endian;
7512
7513 if (nargs < coding_arg_utf16_max)
7514 goto short_args;
7515
7516 bom = args[coding_arg_utf16_bom];
7517 if (! NILP (bom) && ! EQ (bom, Qt))
7518 {
7519 CHECK_CONS (bom);
7520 CHECK_CODING_SYSTEM (XCAR (bom));
7521 CHECK_CODING_SYSTEM (XCDR (bom));
7522 }
7523 ASET (attrs, coding_attr_utf_16_bom, bom);
7524
7525 endian = args[coding_arg_utf16_endian];
7526 ASET (attrs, coding_attr_utf_16_endian, endian);
7527
7528 category = (CONSP (bom)
7529 ? coding_category_utf_16_auto
7530 : NILP (bom)
7531 ? (NILP (endian)
7532 ? coding_category_utf_16_be_nosig
7533 : coding_category_utf_16_le_nosig)
7534 : (NILP (endian)
7535 ? coding_category_utf_16_be
7536 : coding_category_utf_16_le));
7537 }
7538 else if (EQ (coding_type, Qiso_2022))
7539 {
7540 Lisp_Object initial, reg_usage, request, flags;
7541 int i, id;
7542
7543 if (nargs < coding_arg_iso2022_max)
7544 goto short_args;
7545
7546 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7547 CHECK_VECTOR (initial);
7548 for (i = 0; i < 4; i++)
7549 {
7550 val = Faref (initial, make_number (i));
7551 if (! NILP (val))
7552 {
7553 CHECK_CHARSET_GET_ID (val, id);
7554 ASET (initial, i, make_number (id));
7555 }
7556 else
7557 ASET (initial, i, make_number (-1));
7558 }
7559
7560 reg_usage = args[coding_arg_iso2022_reg_usage];
7561 CHECK_CONS (reg_usage);
7562 CHECK_NATNUM (XCAR (reg_usage));
7563 CHECK_NATNUM (XCDR (reg_usage));
7564
7565 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7566 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
7567 {
7568 int id;
7569
7570 val = Fcar (tail);
7571 CHECK_CONS (val);
7572 CHECK_CHARSET_GET_ID (XCAR (val), id);
7573 CHECK_NATNUM (XCDR (val));
7574 if (XINT (XCDR (val)) >= 4)
7575 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7576 XCAR (val) = make_number (id);
7577 }
7578
7579 flags = args[coding_arg_iso2022_flags];
7580 CHECK_NATNUM (flags);
7581 i = XINT (flags);
7582 if (EQ (args[coding_arg_charset_list], Qiso_2022))
7583 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7584
7585 ASET (attrs, coding_attr_iso_initial, initial);
7586 ASET (attrs, coding_attr_iso_usage, reg_usage);
7587 ASET (attrs, coding_attr_iso_request, request);
7588 ASET (attrs, coding_attr_iso_flags, flags);
7589 setup_iso_safe_charsets (attrs);
7590
7591 if (i & CODING_ISO_FLAG_SEVEN_BITS)
7592 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7593 | CODING_ISO_FLAG_SINGLE_SHIFT))
7594 ? coding_category_iso_7_else
7595 : EQ (args[coding_arg_charset_list], Qiso_2022)
7596 ? coding_category_iso_7
7597 : coding_category_iso_7_tight);
7598 else
7599 {
7600 int id = XINT (AREF (initial, 1));
7601
7602 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
7603 || EQ (args[coding_arg_charset_list], Qiso_2022)
7604 || id < 0)
7605 ? coding_category_iso_8_else
7606 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7607 ? coding_category_iso_8_1
7608 : coding_category_iso_8_2);
7609 }
7610 }
7611 else if (EQ (coding_type, Qemacs_mule))
7612 {
7613 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7614 ASET (attrs, coding_attr_emacs_mule_full, Qt);
7615
7616 category = coding_category_emacs_mule;
7617 }
7618 else if (EQ (coding_type, Qshift_jis))
7619 {
7620
7621 struct charset *charset;
7622
7623 if (XINT (Flength (charset_list)) != 3)
7624 error ("There should be just three charsets");
7625
7626 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7627 if (CHARSET_DIMENSION (charset) != 1)
7628 error ("Dimension of charset %s is not one",
7629 XSYMBOL (CHARSET_NAME (charset))->name->data);
7630
7631 charset_list = XCDR (charset_list);
7632 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7633 if (CHARSET_DIMENSION (charset) != 1)
7634 error ("Dimension of charset %s is not one",
7635 XSYMBOL (CHARSET_NAME (charset))->name->data);
7636
7637 charset_list = XCDR (charset_list);
7638 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7639 if (CHARSET_DIMENSION (charset) != 2)
7640 error ("Dimension of charset %s is not two",
7641 XSYMBOL (CHARSET_NAME (charset))->name->data);
7642
7643 category = coding_category_sjis;
7644 Vsjis_coding_system = name;
7645 }
7646 else if (EQ (coding_type, Qbig5))
7647 {
7648 struct charset *charset;
7649
7650 if (XINT (Flength (charset_list)) != 2)
7651 error ("There should be just two charsets");
7652
7653 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7654 if (CHARSET_DIMENSION (charset) != 1)
7655 error ("Dimension of charset %s is not one",
7656 XSYMBOL (CHARSET_NAME (charset))->name->data);
7657
7658 charset_list = XCDR (charset_list);
7659 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7660 if (CHARSET_DIMENSION (charset) != 2)
7661 error ("Dimension of charset %s is not two",
7662 XSYMBOL (CHARSET_NAME (charset))->name->data);
7663
7664 category = coding_category_big5;
7665 Vbig5_coding_system = name;
7666 }
7667 else if (EQ (coding_type, Qraw_text))
7668 category = coding_category_raw_text;
7669 else if (EQ (coding_type, Qutf_8))
7670 category = coding_category_utf_8;
7671 else if (EQ (coding_type, Qundecided))
7672 category = coding_category_undecided;
7673 else
7674 error ("Invalid coding system type: %s",
7675 XSYMBOL (coding_type)->name->data);
7676
7677 CODING_ATTR_CATEGORY (attrs) = make_number (category);
7678
7679 eol_type = args[coding_arg_eol_type];
7680 if (! NILP (eol_type)
7681 && ! EQ (eol_type, Qunix)
7682 && ! EQ (eol_type, Qdos)
7683 && ! EQ (eol_type, Qmac))
7684 error ("Invalid eol-type");
7685
7686 aliases = Fcons (name, Qnil);
7687
7688 if (NILP (eol_type))
7689 {
7690 eol_type = make_subsidiaries (name);
7691 for (i = 0; i < 3; i++)
7692 {
7693 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7694
7695 this_name = AREF (eol_type, i);
7696 this_aliases = Fcons (this_name, Qnil);
7697 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7698 this_spec = Fmake_vector (make_number (3), attrs);
7699 ASET (this_spec, 1, this_aliases);
7700 ASET (this_spec, 2, this_eol_type);
7701 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7702 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7703 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7704 Vcoding_system_alist);
7705 }
7706 }
7707
7708 spec_vec = Fmake_vector (make_number (3), attrs);
7709 ASET (spec_vec, 1, aliases);
7710 ASET (spec_vec, 2, eol_type);
7711
7712 Fputhash (name, spec_vec, Vcoding_system_hash_table);
7713 Vcoding_system_list = Fcons (name, Vcoding_system_list);
7714 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7715 Vcoding_system_alist);
7716
7717 {
7718 int id = coding_categories[category].id;
7719
7720 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7721 setup_coding_system (name, &coding_categories[category]);
7722 }
7723
7724 return Qnil;
7725
7726 short_args:
7727 return Fsignal (Qwrong_number_of_arguments,
7728 Fcons (intern ("define-coding-system-internal"),
7729 make_number (nargs)));
7730 }
7731
7732 /* Fixme: should this record the alias relationships for
7733 diagnostics? */
7734 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7735 Sdefine_coding_system_alias, 2, 2, 0,
7736 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7737 (alias, coding_system)
7738 Lisp_Object alias, coding_system;
7739 {
7740 Lisp_Object spec, aliases, eol_type;
7741
7742 CHECK_SYMBOL (alias);
7743 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7744 aliases = AREF (spec, 1);
7745 while (!NILP (XCDR (aliases)))
7746 aliases = XCDR (aliases);
7747 XCDR (aliases) = Fcons (alias, Qnil);
7748
7749 eol_type = AREF (spec, 2);
7750 if (VECTORP (eol_type))
7751 {
7752 Lisp_Object subsidiaries;
7753 int i;
7754
7755 subsidiaries = make_subsidiaries (alias);
7756 for (i = 0; i < 3; i++)
7757 Fdefine_coding_system_alias (AREF (subsidiaries, i),
7758 AREF (eol_type, i));
7759
7760 ASET (spec, 2, subsidiaries);
7761 }
7762
7763 Fputhash (alias, spec, Vcoding_system_hash_table);
7764 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
7765 Vcoding_system_alist);
7766
7767 return Qnil;
7768 }
7769
7770 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7771 1, 1, 0,
7772 doc: /* Return the base of CODING-SYSTEM.
7773 Any alias or subsidiary coding system is not a base coding system. */)
7774 (coding_system)
7775 Lisp_Object coding_system;
7776 {
7777 Lisp_Object spec, attrs;
7778
7779 if (NILP (coding_system))
7780 return (Qno_conversion);
7781 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7782 attrs = AREF (spec, 0);
7783 return CODING_ATTR_BASE_NAME (attrs);
7784 }
7785
7786 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7787 1, 1, 0,
7788 doc: "Return the property list of CODING-SYSTEM.")
7789 (coding_system)
7790 Lisp_Object coding_system;
7791 {
7792 Lisp_Object spec, attrs;
7793
7794 if (NILP (coding_system))
7795 coding_system = Qno_conversion;
7796 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7797 attrs = AREF (spec, 0);
7798 return CODING_ATTR_PLIST (attrs);
7799 }
7800
7801
7802 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7803 1, 1, 0,
7804 doc: /* Return the list of aliases of CODING-SYSTEM. */)
7805 (coding_system)
7806 Lisp_Object coding_system;
7807 {
7808 Lisp_Object spec;
7809
7810 if (NILP (coding_system))
7811 coding_system = Qno_conversion;
7812 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7813 return AREF (spec, 1);
7814 }
7815
7816 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7817 Scoding_system_eol_type, 1, 1, 0,
7818 doc: /* Return eol-type of CODING-SYSTEM.
7819 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7820
7821 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7822 and CR respectively.
7823
7824 A vector value indicates that a format of end-of-line should be
7825 detected automatically. Nth element of the vector is the subsidiary
7826 coding system whose eol-type is N. */)
7827 (coding_system)
7828 Lisp_Object coding_system;
7829 {
7830 Lisp_Object spec, eol_type;
7831 int n;
7832
7833 if (NILP (coding_system))
7834 coding_system = Qno_conversion;
7835 if (! CODING_SYSTEM_P (coding_system))
7836 return Qnil;
7837 spec = CODING_SYSTEM_SPEC (coding_system);
7838 eol_type = AREF (spec, 2);
7839 if (VECTORP (eol_type))
7840 return Fcopy_sequence (eol_type);
7841 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7842 return make_number (n);
7843 }
7844
7845 #endif /* emacs */
7846
7847 \f
7848 /*** 9. Post-amble ***/
7849
7850 void
7851 init_coding_once ()
7852 {
7853 int i;
7854
7855 for (i = 0; i < coding_category_max; i++)
7856 {
7857 coding_categories[i].id = -1;
7858 coding_priorities[i] = i;
7859 }
7860
7861 /* ISO2022 specific initialize routine. */
7862 for (i = 0; i < 0x20; i++)
7863 iso_code_class[i] = ISO_control_0;
7864 for (i = 0x21; i < 0x7F; i++)
7865 iso_code_class[i] = ISO_graphic_plane_0;
7866 for (i = 0x80; i < 0xA0; i++)
7867 iso_code_class[i] = ISO_control_1;
7868 for (i = 0xA1; i < 0xFF; i++)
7869 iso_code_class[i] = ISO_graphic_plane_1;
7870 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7871 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7872 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7873 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7874 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7875 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7876 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7877 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7878 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7879 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7880
7881 inhibit_pre_post_conversion = 0;
7882
7883 for (i = 0; i < 256; i++)
7884 {
7885 emacs_mule_bytes[i] = 1;
7886 }
7887 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
7888 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
7889 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
7890 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
7891 }
7892
7893 #ifdef emacs
7894
7895 void
7896 syms_of_coding ()
7897 {
7898 staticpro (&Vcoding_system_hash_table);
7899 Vcoding_system_hash_table = Fmakehash (Qeq);
7900
7901 staticpro (&Vsjis_coding_system);
7902 Vsjis_coding_system = Qnil;
7903
7904 staticpro (&Vbig5_coding_system);
7905 Vbig5_coding_system = Qnil;
7906
7907 staticpro (&Vcode_conversion_work_buf_list);
7908 Vcode_conversion_work_buf_list = Qnil;
7909
7910 staticpro (&Vcode_conversion_reused_work_buf);
7911 Vcode_conversion_reused_work_buf = Qnil;
7912
7913 DEFSYM (Qcharset, "charset");
7914 DEFSYM (Qtarget_idx, "target-idx");
7915 DEFSYM (Qcoding_system_history, "coding-system-history");
7916 Fset (Qcoding_system_history, Qnil);
7917
7918 /* Target FILENAME is the first argument. */
7919 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7920 /* Target FILENAME is the third argument. */
7921 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7922
7923 DEFSYM (Qcall_process, "call-process");
7924 /* Target PROGRAM is the first argument. */
7925 Fput (Qcall_process, Qtarget_idx, make_number (0));
7926
7927 DEFSYM (Qcall_process_region, "call-process-region");
7928 /* Target PROGRAM is the third argument. */
7929 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7930
7931 DEFSYM (Qstart_process, "start-process");
7932 /* Target PROGRAM is the third argument. */
7933 Fput (Qstart_process, Qtarget_idx, make_number (2));
7934
7935 DEFSYM (Qopen_network_stream, "open-network-stream");
7936 /* Target SERVICE is the fourth argument. */
7937 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7938
7939 DEFSYM (Qcoding_system, "coding-system");
7940 DEFSYM (Qcoding_aliases, "coding-aliases");
7941
7942 DEFSYM (Qeol_type, "eol-type");
7943 DEFSYM (Qunix, "unix");
7944 DEFSYM (Qdos, "dos");
7945
7946 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7947 DEFSYM (Qpost_read_conversion, "post-read-conversion");
7948 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7949 DEFSYM (Qdefault_char, "default-char");
7950 DEFSYM (Qundecided, "undecided");
7951 DEFSYM (Qno_conversion, "no-conversion");
7952 DEFSYM (Qraw_text, "raw-text");
7953
7954 DEFSYM (Qiso_2022, "iso-2022");
7955
7956 DEFSYM (Qutf_8, "utf-8");
7957
7958 DEFSYM (Qutf_16, "utf-16");
7959 DEFSYM (Qutf_16_be, "utf-16-be");
7960 DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
7961 DEFSYM (Qutf_16_le, "utf-16-l3");
7962 DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
7963 DEFSYM (Qsignature, "signature");
7964 DEFSYM (Qendian, "endian");
7965 DEFSYM (Qbig, "big");
7966 DEFSYM (Qlittle, "little");
7967
7968 DEFSYM (Qshift_jis, "shift-jis");
7969 DEFSYM (Qbig5, "big5");
7970
7971 DEFSYM (Qcoding_system_p, "coding-system-p");
7972
7973 DEFSYM (Qcoding_system_error, "coding-system-error");
7974 Fput (Qcoding_system_error, Qerror_conditions,
7975 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7976 Fput (Qcoding_system_error, Qerror_message,
7977 build_string ("Invalid coding system"));
7978
7979 /* Intern this now in case it isn't already done.
7980 Setting this variable twice is harmless.
7981 But don't staticpro it here--that is done in alloc.c. */
7982 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7983
7984 DEFSYM (Qtranslation_table, "translation-table");
7985 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7986 DEFSYM (Qtranslation_table_id, "translation-table-id");
7987 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
7988 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
7989
7990 DEFSYM (Qvalid_codes, "valid-codes");
7991
7992 DEFSYM (Qemacs_mule, "emacs-mule");
7993
7994 Vcoding_category_table
7995 = Fmake_vector (make_number (coding_category_max), Qnil);
7996 staticpro (&Vcoding_category_table);
7997 /* Followings are target of code detection. */
7998 ASET (Vcoding_category_table, coding_category_iso_7,
7999 intern ("coding-category-iso-7"));
8000 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8001 intern ("coding-category-iso-7-tight"));
8002 ASET (Vcoding_category_table, coding_category_iso_8_1,
8003 intern ("coding-category-iso-8-1"));
8004 ASET (Vcoding_category_table, coding_category_iso_8_2,
8005 intern ("coding-category-iso-8-2"));
8006 ASET (Vcoding_category_table, coding_category_iso_7_else,
8007 intern ("coding-category-iso-7-else"));
8008 ASET (Vcoding_category_table, coding_category_iso_8_else,
8009 intern ("coding-category-iso-8-else"));
8010 ASET (Vcoding_category_table, coding_category_utf_8,
8011 intern ("coding-category-utf-8"));
8012 ASET (Vcoding_category_table, coding_category_utf_16_be,
8013 intern ("coding-category-utf-16-be"));
8014 ASET (Vcoding_category_table, coding_category_utf_16_le,
8015 intern ("coding-category-utf-16-le"));
8016 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8017 intern ("coding-category-utf-16-be-nosig"));
8018 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8019 intern ("coding-category-utf-16-le-nosig"));
8020 ASET (Vcoding_category_table, coding_category_charset,
8021 intern ("coding-category-charset"));
8022 ASET (Vcoding_category_table, coding_category_sjis,
8023 intern ("coding-category-sjis"));
8024 ASET (Vcoding_category_table, coding_category_big5,
8025 intern ("coding-category-big5"));
8026 ASET (Vcoding_category_table, coding_category_ccl,
8027 intern ("coding-category-ccl"));
8028 ASET (Vcoding_category_table, coding_category_emacs_mule,
8029 intern ("coding-category-emacs-mule"));
8030 /* Followings are NOT target of code detection. */
8031 ASET (Vcoding_category_table, coding_category_raw_text,
8032 intern ("coding-category-raw-text"));
8033 ASET (Vcoding_category_table, coding_category_undecided,
8034 intern ("coding-category-undecided"));
8035
8036 defsubr (&Scoding_system_p);
8037 defsubr (&Sread_coding_system);
8038 defsubr (&Sread_non_nil_coding_system);
8039 defsubr (&Scheck_coding_system);
8040 defsubr (&Sdetect_coding_region);
8041 defsubr (&Sdetect_coding_string);
8042 defsubr (&Sfind_coding_systems_region_internal);
8043 defsubr (&Scheck_coding_systems_region);
8044 defsubr (&Sdecode_coding_region);
8045 defsubr (&Sencode_coding_region);
8046 defsubr (&Sdecode_coding_string);
8047 defsubr (&Sencode_coding_string);
8048 defsubr (&Sdecode_sjis_char);
8049 defsubr (&Sencode_sjis_char);
8050 defsubr (&Sdecode_big5_char);
8051 defsubr (&Sencode_big5_char);
8052 defsubr (&Sset_terminal_coding_system_internal);
8053 defsubr (&Sset_safe_terminal_coding_system_internal);
8054 defsubr (&Sterminal_coding_system);
8055 defsubr (&Sset_keyboard_coding_system_internal);
8056 defsubr (&Skeyboard_coding_system);
8057 defsubr (&Sfind_operation_coding_system);
8058 defsubr (&Sset_coding_system_priority);
8059 defsubr (&Sdefine_coding_system_internal);
8060 defsubr (&Sdefine_coding_system_alias);
8061 defsubr (&Scoding_system_base);
8062 defsubr (&Scoding_system_plist);
8063 defsubr (&Scoding_system_aliases);
8064 defsubr (&Scoding_system_eol_type);
8065 defsubr (&Scoding_system_priority_list);
8066
8067 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8068 doc: /* List of coding systems.
8069
8070 Do not alter the value of this variable manually. This variable should be
8071 updated by the functions `define-coding-system' and
8072 `define-coding-system-alias'. */);
8073 Vcoding_system_list = Qnil;
8074
8075 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8076 doc: /* Alist of coding system names.
8077 Each element is one element list of coding system name.
8078 This variable is given to `completing-read' as TABLE argument.
8079
8080 Do not alter the value of this variable manually. This variable should be
8081 updated by the functions `make-coding-system' and
8082 `define-coding-system-alias'. */);
8083 Vcoding_system_alist = Qnil;
8084
8085 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8086 doc: /* List of coding-categories (symbols) ordered by priority.
8087
8088 On detecting a coding system, Emacs tries code detection algorithms
8089 associated with each coding-category one by one in this order. When
8090 one algorithm agrees with a byte sequence of source text, the coding
8091 system bound to the corresponding coding-category is selected. */);
8092 {
8093 int i;
8094
8095 Vcoding_category_list = Qnil;
8096 for (i = coding_category_max - 1; i >= 0; i--)
8097 Vcoding_category_list
8098 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8099 Vcoding_category_list);
8100 }
8101
8102 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8103 doc: /* Specify the coding system for read operations.
8104 It is useful to bind this variable with `let', but do not set it globally.
8105 If the value is a coding system, it is used for decoding on read operation.
8106 If not, an appropriate element is used from one of the coding system alists:
8107 There are three such tables, `file-coding-system-alist',
8108 `process-coding-system-alist', and `network-coding-system-alist'. */);
8109 Vcoding_system_for_read = Qnil;
8110
8111 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8112 doc: /* Specify the coding system for write operations.
8113 Programs bind this variable with `let', but you should not set it globally.
8114 If the value is a coding system, it is used for encoding of output,
8115 when writing it to a file and when sending it to a file or subprocess.
8116
8117 If this does not specify a coding system, an appropriate element
8118 is used from one of the coding system alists:
8119 There are three such tables, `file-coding-system-alist',
8120 `process-coding-system-alist', and `network-coding-system-alist'.
8121 For output to files, if the above procedure does not specify a coding system,
8122 the value of `buffer-file-coding-system' is used. */);
8123 Vcoding_system_for_write = Qnil;
8124
8125 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8126 doc: /*
8127 Coding system used in the latest file or process I/O. */);
8128 Vlast_coding_system_used = Qnil;
8129
8130 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8131 doc: /*
8132 *Non-nil means always inhibit code conversion of end-of-line format.
8133 See info node `Coding Systems' and info node `Text and Binary' concerning
8134 such conversion. */);
8135 inhibit_eol_conversion = 0;
8136
8137 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8138 doc: /*
8139 Non-nil means process buffer inherits coding system of process output.
8140 Bind it to t if the process output is to be treated as if it were a file
8141 read from some filesystem. */);
8142 inherit_process_coding_system = 0;
8143
8144 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8145 doc: /*
8146 Alist to decide a coding system to use for a file I/O operation.
8147 The format is ((PATTERN . VAL) ...),
8148 where PATTERN is a regular expression matching a file name,
8149 VAL is a coding system, a cons of coding systems, or a function symbol.
8150 If VAL is a coding system, it is used for both decoding and encoding
8151 the file contents.
8152 If VAL is a cons of coding systems, the car part is used for decoding,
8153 and the cdr part is used for encoding.
8154 If VAL is a function symbol, the function must return a coding system
8155 or a cons of coding systems which are used as above. The function gets
8156 the arguments with which `find-operation-coding-systems' was called.
8157
8158 See also the function `find-operation-coding-system'
8159 and the variable `auto-coding-alist'. */);
8160 Vfile_coding_system_alist = Qnil;
8161
8162 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8163 doc: /*
8164 Alist to decide a coding system to use for a process I/O operation.
8165 The format is ((PATTERN . VAL) ...),
8166 where PATTERN is a regular expression matching a program name,
8167 VAL is a coding system, a cons of coding systems, or a function symbol.
8168 If VAL is a coding system, it is used for both decoding what received
8169 from the program and encoding what sent to the program.
8170 If VAL is a cons of coding systems, the car part is used for decoding,
8171 and the cdr part is used for encoding.
8172 If VAL is a function symbol, the function must return a coding system
8173 or a cons of coding systems which are used as above.
8174
8175 See also the function `find-operation-coding-system'. */);
8176 Vprocess_coding_system_alist = Qnil;
8177
8178 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8179 doc: /*
8180 Alist to decide a coding system to use for a network I/O operation.
8181 The format is ((PATTERN . VAL) ...),
8182 where PATTERN is a regular expression matching a network service name
8183 or is a port number to connect to,
8184 VAL is a coding system, a cons of coding systems, or a function symbol.
8185 If VAL is a coding system, it is used for both decoding what received
8186 from the network stream and encoding what sent to the network stream.
8187 If VAL is a cons of coding systems, the car part is used for decoding,
8188 and the cdr part is used for encoding.
8189 If VAL is a function symbol, the function must return a coding system
8190 or a cons of coding systems which are used as above.
8191
8192 See also the function `find-operation-coding-system'. */);
8193 Vnetwork_coding_system_alist = Qnil;
8194
8195 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8196 doc: /* Coding system to use with system messages.
8197 Also used for decoding keyboard input on X Window system. */);
8198 Vlocale_coding_system = Qnil;
8199
8200 /* The eol mnemonics are reset in startup.el system-dependently. */
8201 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8202 doc: /*
8203 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8204 eol_mnemonic_unix = build_string (":");
8205
8206 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8207 doc: /*
8208 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8209 eol_mnemonic_dos = build_string ("\\");
8210
8211 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8212 doc: /*
8213 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8214 eol_mnemonic_mac = build_string ("/");
8215
8216 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8217 doc: /*
8218 *String displayed in mode line when end-of-line format is not yet determined. */);
8219 eol_mnemonic_undecided = build_string (":");
8220
8221 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8222 doc: /*
8223 *Non-nil enables character translation while encoding and decoding. */);
8224 Venable_character_translation = Qt;
8225
8226 DEFVAR_LISP ("standard-translation-table-for-decode",
8227 &Vstandard_translation_table_for_decode,
8228 doc: /* Table for translating characters while decoding. */);
8229 Vstandard_translation_table_for_decode = Qnil;
8230
8231 DEFVAR_LISP ("standard-translation-table-for-encode",
8232 &Vstandard_translation_table_for_encode,
8233 doc: /* Table for translating characters while encoding. */);
8234 Vstandard_translation_table_for_encode = Qnil;
8235
8236 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8237 doc: /* Alist of charsets vs revision numbers.
8238 While encoding, if a charset (car part of an element) is found,
8239 designate it with the escape sequence identifying revision (cdr part
8240 of the element). */);
8241 Vcharset_revision_table = Qnil;
8242
8243 DEFVAR_LISP ("default-process-coding-system",
8244 &Vdefault_process_coding_system,
8245 doc: /* Cons of coding systems used for process I/O by default.
8246 The car part is used for decoding a process output,
8247 the cdr part is used for encoding a text to be sent to a process. */);
8248 Vdefault_process_coding_system = Qnil;
8249
8250 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8251 doc: /*
8252 Table of extra Latin codes in the range 128..159 (inclusive).
8253 This is a vector of length 256.
8254 If Nth element is non-nil, the existence of code N in a file
8255 \(or output of subprocess) doesn't prevent it to be detected as
8256 a coding system of ISO 2022 variant which has a flag
8257 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8258 or reading output of a subprocess.
8259 Only 128th through 159th elements has a meaning. */);
8260 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8261
8262 DEFVAR_LISP ("select-safe-coding-system-function",
8263 &Vselect_safe_coding_system_function,
8264 doc: /*
8265 Function to call to select safe coding system for encoding a text.
8266
8267 If set, this function is called to force a user to select a proper
8268 coding system which can encode the text in the case that a default
8269 coding system used in each operation can't encode the text.
8270
8271 The default value is `select-safe-coding-system' (which see). */);
8272 Vselect_safe_coding_system_function = Qnil;
8273
8274 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8275 &inhibit_iso_escape_detection,
8276 doc: /*
8277 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8278
8279 By default, on reading a file, Emacs tries to detect how the text is
8280 encoded. This code detection is sensitive to escape sequences. If
8281 the sequence is valid as ISO2022, the code is determined as one of
8282 the ISO2022 encodings, and the file is decoded by the corresponding
8283 coding system (e.g. `iso-2022-7bit').
8284
8285 However, there may be a case that you want to read escape sequences in
8286 a file as is. In such a case, you can set this variable to non-nil.
8287 Then, as the code detection ignores any escape sequences, no file is
8288 detected as encoded in some ISO2022 encoding. The result is that all
8289 escape sequences become visible in a buffer.
8290
8291 The default value is nil, and it is strongly recommended not to change
8292 it. That is because many Emacs Lisp source files that contain
8293 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8294 in Emacs's distribution, and they won't be decoded correctly on
8295 reading if you suppress escape sequence detection.
8296
8297 The other way to read escape sequences in a file without decoding is
8298 to explicitly specify some coding system that doesn't use ISO2022's
8299 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8300 inhibit_iso_escape_detection = 0;
8301
8302 {
8303 Lisp_Object args[coding_arg_max];
8304 Lisp_Object plist[14];
8305 int i;
8306
8307 for (i = 0; i < coding_arg_max; i++)
8308 args[i] = Qnil;
8309
8310 plist[0] = intern (":name");
8311 plist[1] = args[coding_arg_name] = Qno_conversion;
8312 plist[2] = intern (":mnemonic");
8313 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8314 plist[4] = intern (":coding-type");
8315 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8316 plist[6] = intern (":ascii-compatible-p");
8317 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8318 plist[8] = intern (":default-char");
8319 plist[9] = args[coding_arg_default_char] = make_number (0);
8320 plist[10] = intern (":docstring");
8321 plist[11] = build_string ("Do no conversion.\n\
8322 \n\
8323 When you visit a file with this coding, the file is read into a\n\
8324 unibyte buffer as is, thus each byte of a file is treated as a\n\
8325 character.");
8326 plist[12] = intern (":eol-type");
8327 plist[13] = args[coding_arg_eol_type] = Qunix;
8328 args[coding_arg_plist] = Flist (14, plist);
8329 Fdefine_coding_system_internal (coding_arg_max, args);
8330 }
8331
8332 setup_coding_system (Qno_conversion, &keyboard_coding);
8333 setup_coding_system (Qno_conversion, &terminal_coding);
8334 setup_coding_system (Qno_conversion, &safe_terminal_coding);
8335 }
8336
8337 char *
8338 emacs_strerror (error_number)
8339 int error_number;
8340 {
8341 char *str;
8342
8343 synchronize_system_messages_locale ();
8344 str = strerror (error_number);
8345
8346 if (! NILP (Vlocale_coding_system))
8347 {
8348 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8349 Vlocale_coding_system,
8350 0);
8351 str = (char *) XSTRING (dec)->data;
8352 }
8353
8354 return str;
8355 }
8356
8357 #endif /* emacs */