(CODING_ISO_FLAG_USE_ROMAN): New macro
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used by Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for a text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwize (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
151
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specifies each bit of this integer.
154
155 Below is the template of these functions. */
156
157 #if 0
158 static int
159 detect_coding_XXX (coding, mask)
160 struct coding_system *coding;
161 int *mask;
162 {
163 unsigned char *src = coding->source;
164 unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int c;
167 int found = 0;
168 ...;
169
170 while (1)
171 {
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
175 /* Check if it conforms to XXX. If not, break the loop. */
176 }
177 /* As the data is invalid for XXX, reset a proper bits. */
178 *mask &= ~CODING_CATEGORY_XXX;
179 return 0;
180 no_more_source:
181 /* The source exausted. */
182 if (!found)
183 /* ASCII characters only. */
184 return 0;
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask &= CODING_CATEGORY_XXX;
187 return 1;
188 }
189 #endif
190
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
197
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
202
203 Below is the template of these functions. */
204
205 #if 0
206 static void
207 decode_coding_XXXX (coding)
208 struct coding_system *coding;
209 {
210 unsigned char *src = coding->source + coding->consumed;
211 unsigned char *src_end = coding->source + coding->src_bytes;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base;
216 /* A buffer to produce decoded characters. */
217 int *charbuf = coding->charbuf;
218 int *charbuf_end = charbuf + coding->charbuf_size;
219 int multibytep = coding->src_multibyte;
220
221 while (1)
222 {
223 src_base = src;
224 if (charbuf < charbuf_end)
225 /* No more room to produce a decoded character. */
226 break;
227 ONE_MORE_BYTE (c);
228 /* Decode it. */
229 }
230
231 no_more_source:
232 if (src_base < src_end
233 && coding->mode & CODING_MODE_LAST_BLOCK)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base < src_end && charbuf < charbuf_end)
237 *charbuf++ = *src_base++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding->consumed = coding->consumed_char = src_base - coding->source;
241 /* Remember how many characters we produced. */
242 coding->charbuf_used = charbuf - coding->charbuf;
243 }
244 #endif
245
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
252
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
257
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
261
262 Below is a template of these functions. */
263 #if 0
264 static void
265 encode_coding_XXX (coding)
266 struct coding_system *coding;
267 {
268 int multibytep = coding->dst_multibyte;
269 int *charbuf = coding->charbuf;
270 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
271 unsigned char *dst = coding->destination + coding->produced;
272 unsigned char *dst_end = coding->destination + coding->dst_bytes;
273 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
274 int produced_chars = 0;
275
276 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 {
278 int c = *charbuf;
279 /* Encode C into DST, and increment DST. */
280 }
281 label_no_more_destination:
282 /* How many chars and bytes we produced. */
283 coding->produced_char += produced_chars;
284 coding->produced = dst - coding->destination;
285 }
286 #endif
287
288 \f
289 /*** 1. Preamble ***/
290
291 #include <config.h>
292 #include <stdio.h>
293
294 #include "lisp.h"
295 #include "buffer.h"
296 #include "character.h"
297 #include "charset.h"
298 #include "ccl.h"
299 #include "composite.h"
300 #include "coding.h"
301 #include "window.h"
302
303 Lisp_Object Vcoding_system_hash_table;
304
305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
306 Lisp_Object Qunix, Qdos;
307 extern Lisp_Object Qmac; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system;
309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
310 Lisp_Object Qdefault_char;
311 Lisp_Object Qno_conversion, Qundecided;
312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
313 Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
314 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
315 Lisp_Object Qcoding_system_history;
316 Lisp_Object Qvalid_codes;
317
318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
320 Lisp_Object Qstart_process, Qopen_network_stream;
321 Lisp_Object Qtarget_idx;
322
323 Lisp_Object Vselect_safe_coding_system_function;
324
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
327 /* Mnemonic string to indicate format of end-of-line is not yet
328 decided. */
329 Lisp_Object eol_mnemonic_undecided;
330
331 #ifdef emacs
332
333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
334
335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
336
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule, Qraw_text;
340
341 /* Coding-systems are handed between Emacs Lisp programs and C internal
342 routines by the following three variables. */
343 /* Coding-system for reading files and receiving data from process. */
344 Lisp_Object Vcoding_system_for_read;
345 /* Coding-system for writing files and sending data to process. */
346 Lisp_Object Vcoding_system_for_write;
347 /* Coding-system actually used in the latest I/O. */
348 Lisp_Object Vlast_coding_system_used;
349
350 /* A vector of length 256 which contains information about special
351 Latin codes (especially for dealing with Microsoft codes). */
352 Lisp_Object Vlatin_extra_code_table;
353
354 /* Flag to inhibit code conversion of end-of-line format. */
355 int inhibit_eol_conversion;
356
357 /* Flag to inhibit ISO2022 escape sequence detection. */
358 int inhibit_iso_escape_detection;
359
360 /* Flag to make buffer-file-coding-system inherit from process-coding. */
361 int inherit_process_coding_system;
362
363 /* Coding system to be used to encode text for terminal display. */
364 struct coding_system terminal_coding;
365
366 /* Coding system to be used to encode text for terminal display when
367 terminal coding system is nil. */
368 struct coding_system safe_terminal_coding;
369
370 /* Coding system of what is sent from terminal keyboard. */
371 struct coding_system keyboard_coding;
372
373 Lisp_Object Vfile_coding_system_alist;
374 Lisp_Object Vprocess_coding_system_alist;
375 Lisp_Object Vnetwork_coding_system_alist;
376
377 Lisp_Object Vlocale_coding_system;
378
379 #endif /* emacs */
380
381 /* Flag to tell if we look up translation table on character code
382 conversion. */
383 Lisp_Object Venable_character_translation;
384 /* Standard translation table to look up on decoding (reading). */
385 Lisp_Object Vstandard_translation_table_for_decode;
386 /* Standard translation table to look up on encoding (writing). */
387 Lisp_Object Vstandard_translation_table_for_encode;
388
389 Lisp_Object Qtranslation_table;
390 Lisp_Object Qtranslation_table_id;
391 Lisp_Object Qtranslation_table_for_decode;
392 Lisp_Object Qtranslation_table_for_encode;
393
394 /* Alist of charsets vs revision number. */
395 static Lisp_Object Vcharset_revision_table;
396
397 /* Default coding systems used for process I/O. */
398 Lisp_Object Vdefault_process_coding_system;
399
400 /* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404 static int inhibit_pre_post_conversion;
405
406 /* Two special coding systems. */
407 Lisp_Object Vsjis_coding_system;
408 Lisp_Object Vbig5_coding_system;
409
410
411 static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
412 static void decode_coding_utf_8 P_ ((struct coding_system *));
413 static int encode_coding_utf_8 P_ ((struct coding_system *));
414
415 static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
416 static void decode_coding_utf_16 P_ ((struct coding_system *));
417 static int encode_coding_utf_16 P_ ((struct coding_system *));
418
419 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
420 static void decode_coding_iso_2022 P_ ((struct coding_system *));
421 static int encode_coding_iso_2022 P_ ((struct coding_system *));
422
423 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
424 static void decode_coding_emacs_mule P_ ((struct coding_system *));
425 static int encode_coding_emacs_mule P_ ((struct coding_system *));
426
427 static int detect_coding_sjis P_ ((struct coding_system *, int *));
428 static void decode_coding_sjis P_ ((struct coding_system *));
429 static int encode_coding_sjis P_ ((struct coding_system *));
430
431 static int detect_coding_big5 P_ ((struct coding_system *, int *));
432 static void decode_coding_big5 P_ ((struct coding_system *));
433 static int encode_coding_big5 P_ ((struct coding_system *));
434
435 static int detect_coding_ccl P_ ((struct coding_system *, int *));
436 static void decode_coding_ccl P_ ((struct coding_system *));
437 static int encode_coding_ccl P_ ((struct coding_system *));
438
439 static void decode_coding_raw_text P_ ((struct coding_system *));
440 static int encode_coding_raw_text P_ ((struct coding_system *));
441
442
443 /* ISO2022 section */
444
445 #define CODING_ISO_INITIAL(coding, reg) \
446 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
447 coding_attr_iso_initial), \
448 reg)))
449
450
451 #define CODING_ISO_REQUEST(coding, charset_id) \
452 ((charset_id <= (coding)->max_charset_id \
453 ? (coding)->safe_charsets[charset_id] \
454 : -1))
455
456
457 #define CODING_ISO_FLAGS(coding) \
458 ((coding)->spec.iso_2022.flags)
459 #define CODING_ISO_DESIGNATION(coding, reg) \
460 ((coding)->spec.iso_2022.current_designation[reg])
461 #define CODING_ISO_INVOCATION(coding, plane) \
462 ((coding)->spec.iso_2022.current_invocation[plane])
463 #define CODING_ISO_SINGLE_SHIFTING(coding) \
464 ((coding)->spec.iso_2022.single_shifting)
465 #define CODING_ISO_BOL(coding) \
466 ((coding)->spec.iso_2022.bol)
467 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
468 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
469
470 /* Control characters of ISO2022. */
471 /* code */ /* function */
472 #define ISO_CODE_LF 0x0A /* line-feed */
473 #define ISO_CODE_CR 0x0D /* carriage-return */
474 #define ISO_CODE_SO 0x0E /* shift-out */
475 #define ISO_CODE_SI 0x0F /* shift-in */
476 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
477 #define ISO_CODE_ESC 0x1B /* escape */
478 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
479 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
480 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
481
482 /* All code (1-byte) of ISO2022 is classified into one of the
483 followings. */
484 enum iso_code_class_type
485 {
486 ISO_control_0, /* Control codes in the range
487 0x00..0x1F and 0x7F, except for the
488 following 5 codes. */
489 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
490 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
491 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
492 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
493 ISO_escape, /* ISO_CODE_SO (0x1B) */
494 ISO_control_1, /* Control codes in the range
495 0x80..0x9F, except for the
496 following 3 codes. */
497 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
498 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
499 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
500 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
501 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
502 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
503 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
504 };
505
506 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
507 `iso-flags' attribute of an iso2022 coding system. */
508
509 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
510 instead of the correct short-form sequence (e.g. ESC $ A). */
511 #define CODING_ISO_FLAG_LONG_FORM 0x0001
512
513 /* If set, reset graphic planes and registers at end-of-line to the
514 initial state. */
515 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
516
517 /* If set, reset graphic planes and registers before any control
518 characters to the initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
520
521 /* If set, encode by 7-bit environment. */
522 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
523
524 /* If set, use locking-shift function. */
525 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
526
527 /* If set, use single-shift function. Overwrite
528 CODING_ISO_FLAG_LOCKING_SHIFT. */
529 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
530
531 /* If set, use designation escape sequence. */
532 #define CODING_ISO_FLAG_DESIGNATION 0x0040
533
534 /* If set, produce revision number sequence. */
535 #define CODING_ISO_FLAG_REVISION 0x0080
536
537 /* If set, produce ISO6429's direction specifying sequence. */
538 #define CODING_ISO_FLAG_DIRECTION 0x0100
539
540 /* If set, assume designation states are reset at beginning of line on
541 output. */
542 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
543
544 /* If set, designation sequence should be placed at beginning of line
545 on output. */
546 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
547
548 /* If set, do not encode unsafe charactes on output. */
549 #define CODING_ISO_FLAG_SAFE 0x0800
550
551 /* If set, extra latin codes (128..159) are accepted as a valid code
552 on input. */
553 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
554
555 #define CODING_ISO_FLAG_COMPOSITION 0x2000
556
557 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
558
559 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
560
561 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
562
563 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
564
565 /* A character to be produced on output if encoding of the original
566 character is prohibited by CODING_ISO_FLAG_SAFE. */
567 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
568
569
570 /* UTF-16 section */
571 #define CODING_UTF_16_BOM(coding) \
572 ((coding)->spec.utf_16.bom)
573
574 #define CODING_UTF_16_ENDIAN(coding) \
575 ((coding)->spec.utf_16.endian)
576
577 #define CODING_UTF_16_SURROGATE(coding) \
578 ((coding)->spec.utf_16.surrogate)
579
580
581 /* CCL section */
582 #define CODING_CCL_DECODER(coding) \
583 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
584 #define CODING_CCL_ENCODER(coding) \
585 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
586 #define CODING_CCL_VALIDS(coding) \
587 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
588 ->data)
589
590 /* Index for each coding category in `coding_category_table' */
591
592 enum coding_category
593 {
594 coding_category_iso_7,
595 coding_category_iso_7_tight,
596 coding_category_iso_8_1,
597 coding_category_iso_8_2,
598 coding_category_iso_7_else,
599 coding_category_iso_8_else,
600 coding_category_utf_8,
601 coding_category_utf_16_auto,
602 coding_category_utf_16_be,
603 coding_category_utf_16_le,
604 coding_category_utf_16_be_nosig,
605 coding_category_utf_16_le_nosig,
606 coding_category_charset,
607 coding_category_sjis,
608 coding_category_big5,
609 coding_category_ccl,
610 coding_category_emacs_mule,
611 /* All above are targets of code detection. */
612 coding_category_raw_text,
613 coding_category_undecided,
614 coding_category_max
615 };
616
617 /* Definitions of flag bits used in detect_coding_XXXX. */
618 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
619 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
620 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
621 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
622 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
623 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
624 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
625 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
626 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
627 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
628 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
629 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
630 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
631 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
632 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
633 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
634
635 /* This value is returned if detect_coding_mask () find nothing other
636 than ASCII characters. */
637 #define CATEGORY_MASK_ANY \
638 (CATEGORY_MASK_ISO_7 \
639 | CATEGORY_MASK_ISO_7_TIGHT \
640 | CATEGORY_MASK_ISO_8_1 \
641 | CATEGORY_MASK_ISO_8_2 \
642 | CATEGORY_MASK_ISO_7_ELSE \
643 | CATEGORY_MASK_ISO_8_ELSE \
644 | CATEGORY_MASK_UTF_8 \
645 | CATEGORY_MASK_UTF_16_BE \
646 | CATEGORY_MASK_UTF_16_LE \
647 | CATEGORY_MASK_UTF_16_BE_NOSIG \
648 | CATEGORY_MASK_UTF_16_LE_NOSIG \
649 | CATEGORY_MASK_CHARSET \
650 | CATEGORY_MASK_SJIS \
651 | CATEGORY_MASK_BIG5 \
652 | CATEGORY_MASK_CCL \
653 | CATEGORY_MASK_EMACS_MULE)
654
655
656 #define CATEGORY_MASK_ISO_7BIT \
657 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
658
659 #define CATEGORY_MASK_ISO_8BIT \
660 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
661
662 #define CATEGORY_MASK_ISO_ELSE \
663 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
664
665 #define CATEGORY_MASK_ISO_ESCAPE \
666 (CATEGORY_MASK_ISO_7 \
667 | CATEGORY_MASK_ISO_7_TIGHT \
668 | CATEGORY_MASK_ISO_7_ELSE \
669 | CATEGORY_MASK_ISO_8_ELSE)
670
671 #define CATEGORY_MASK_ISO \
672 ( CATEGORY_MASK_ISO_7BIT \
673 | CATEGORY_MASK_ISO_8BIT \
674 | CATEGORY_MASK_ISO_ELSE)
675
676 #define CATEGORY_MASK_UTF_16 \
677 (CATEGORY_MASK_UTF_16_BE \
678 | CATEGORY_MASK_UTF_16_LE \
679 | CATEGORY_MASK_UTF_16_BE_NOSIG \
680 | CATEGORY_MASK_UTF_16_LE_NOSIG)
681
682
683 /* List of symbols `coding-category-xxx' ordered by priority. This
684 variable is exposed to Emacs Lisp. */
685 static Lisp_Object Vcoding_category_list;
686
687 /* Table of coding categories (Lisp symbols). This variable is for
688 internal use oly. */
689 static Lisp_Object Vcoding_category_table;
690
691 /* Table of coding-categories ordered by priority. */
692 static enum coding_category coding_priorities[coding_category_max];
693
694 /* Nth element is a coding context for the coding system bound to the
695 Nth coding category. */
696 static struct coding_system coding_categories[coding_category_max];
697
698 static int detected_mask[coding_category_raw_text] =
699 { CATEGORY_MASK_ISO,
700 CATEGORY_MASK_ISO,
701 CATEGORY_MASK_ISO,
702 CATEGORY_MASK_ISO,
703 CATEGORY_MASK_ISO,
704 CATEGORY_MASK_ISO,
705 CATEGORY_MASK_UTF_8,
706 CATEGORY_MASK_UTF_16,
707 CATEGORY_MASK_UTF_16,
708 CATEGORY_MASK_UTF_16,
709 CATEGORY_MASK_UTF_16,
710 CATEGORY_MASK_UTF_16,
711 CATEGORY_MASK_CHARSET,
712 CATEGORY_MASK_SJIS,
713 CATEGORY_MASK_BIG5,
714 CATEGORY_MASK_CCL,
715 CATEGORY_MASK_EMACS_MULE
716 };
717
718 /*** Commonly used macros and functions ***/
719
720 #ifndef min
721 #define min(a, b) ((a) < (b) ? (a) : (b))
722 #endif
723 #ifndef max
724 #define max(a, b) ((a) > (b) ? (a) : (b))
725 #endif
726
727 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
728 do { \
729 attrs = CODING_ID_ATTRS (coding->id); \
730 eol_type = CODING_ID_EOL_TYPE (coding->id); \
731 if (VECTORP (eol_type)) \
732 eol_type = Qunix; \
733 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
734 } while (0)
735
736
737 /* Safely get one byte from the source text pointed by SRC which ends
738 at SRC_END, and set C to that byte. If there are not enough bytes
739 in the source, it jumps to `no_more_source'. The caller
740 should declare and set these variables appropriately in advance:
741 src, src_end, multibytep
742 */
743
744 #define ONE_MORE_BYTE(c) \
745 do { \
746 if (src == src_end) \
747 { \
748 if (src_base < src) \
749 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
750 goto no_more_source; \
751 } \
752 c = *src++; \
753 if (multibytep && (c & 0x80)) \
754 { \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
758 } \
759 consumed_chars++; \
760 } while (0)
761
762
763 #define ONE_MORE_BYTE_NO_CHECK(c) \
764 do { \
765 c = *src++; \
766 if (multibytep && (c & 0x80)) \
767 { \
768 if ((c & 0xFE) != 0xC0) \
769 error ("Undecodable char found"); \
770 c = ((c & 1) << 6) | *src++; \
771 } \
772 consumed_chars++; \
773 } while (0)
774
775
776 /* Store a byte C in the place pointed by DST and increment DST to the
777 next free point, and increment PRODUCED_CHARS. The caller should
778 assure that C is 0..127, and declare and set the variable `dst'
779 appropriately in advance.
780 */
781
782
783 #define EMIT_ONE_ASCII_BYTE(c) \
784 do { \
785 produced_chars++; \
786 *dst++ = (c); \
787 } while (0)
788
789
790 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
791
792 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
793 do { \
794 produced_chars += 2; \
795 *dst++ = (c1), *dst++ = (c2); \
796 } while (0)
797
798
799 /* Store a byte C in the place pointed by DST and increment DST to the
800 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
801 nonzero, store in an appropriate multibyte from. The caller should
802 declare and set the variables `dst' and `multibytep' appropriately
803 in advance. */
804
805 #define EMIT_ONE_BYTE(c) \
806 do { \
807 produced_chars++; \
808 if (multibytep) \
809 { \
810 int ch = (c); \
811 if (ch >= 0x80) \
812 ch = BYTE8_TO_CHAR (ch); \
813 CHAR_STRING_ADVANCE (ch, dst); \
814 } \
815 else \
816 *dst++ = (c); \
817 } while (0)
818
819
820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
821
822 #define EMIT_TWO_BYTES(c1, c2) \
823 do { \
824 produced_chars += 2; \
825 if (multibytep) \
826 { \
827 int ch; \
828 \
829 ch = (c1); \
830 if (ch >= 0x80) \
831 ch = BYTE8_TO_CHAR (ch); \
832 CHAR_STRING_ADVANCE (ch, dst); \
833 ch = (c2); \
834 if (ch >= 0x80) \
835 ch = BYTE8_TO_CHAR (ch); \
836 CHAR_STRING_ADVANCE (ch, dst); \
837 } \
838 else \
839 { \
840 *dst++ = (c1); \
841 *dst++ = (c2); \
842 } \
843 } while (0)
844
845
846 #define EMIT_THREE_BYTES(c1, c2, c3) \
847 do { \
848 EMIT_ONE_BYTE (c1); \
849 EMIT_TWO_BYTES (c2, c3); \
850 } while (0)
851
852
853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
854 do { \
855 EMIT_TWO_BYTES (c1, c2); \
856 EMIT_TWO_BYTES (c3, c4); \
857 } while (0)
858
859
860 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
861 do { \
862 charset_map_loaded = 0; \
863 c = DECODE_CHAR (charset, code); \
864 if (charset_map_loaded) \
865 { \
866 unsigned char *orig = coding->source; \
867 EMACS_INT offset; \
868 \
869 coding_set_source (coding); \
870 offset = coding->source - orig; \
871 src += offset; \
872 src_base += offset; \
873 src_end += offset; \
874 } \
875 } while (0)
876
877
878 #define ASSURE_DESTINATION(bytes) \
879 do { \
880 if (dst + (bytes) >= dst_end) \
881 { \
882 int more_bytes = charbuf_end - charbuf + (bytes); \
883 \
884 dst = alloc_destination (coding, more_bytes, dst); \
885 dst_end = coding->destination + coding->dst_bytes; \
886 } \
887 } while (0)
888
889
890
891 static void
892 coding_set_source (coding)
893 struct coding_system *coding;
894 {
895 if (BUFFERP (coding->src_object))
896 {
897 if (coding->src_pos < 0)
898 coding->source = GAP_END_ADDR + coding->src_pos_byte;
899 else
900 {
901 struct buffer *buf = XBUFFER (coding->src_object);
902 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
903 unsigned char *beg_addr = BUF_BEG_ADDR (buf);
904
905 coding->source = beg_addr + coding->src_pos_byte - 1;
906 if (coding->src_pos_byte >= gpt_byte)
907 coding->source += BUF_GAP_SIZE (buf);
908 }
909 }
910 else if (STRINGP (coding->src_object))
911 {
912 coding->source = (XSTRING (coding->src_object)->data
913 + coding->src_pos_byte);
914 }
915 else
916 /* Otherwise, the source is C string and is never relocated
917 automatically. Thus we don't have to update anything. */
918 ;
919 }
920
921 static void
922 coding_set_destination (coding)
923 struct coding_system *coding;
924 {
925 if (BUFFERP (coding->dst_object))
926 {
927 /* We are sure that coding->dst_pos_byte is before the gap of the
928 buffer. */
929 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
930 + coding->dst_pos_byte - 1);
931 if (coding->src_pos < 0)
932 coding->dst_bytes = (GAP_END_ADDR
933 - (coding->src_bytes - coding->consumed)
934 - coding->destination);
935 else
936 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
937 - coding->destination);
938 }
939 else
940 /* Otherwise, the destination is C string and is never relocated
941 automatically. Thus we don't have to update anything. */
942 ;
943 }
944
945
946 static void
947 coding_alloc_by_realloc (coding, bytes)
948 struct coding_system *coding;
949 EMACS_INT bytes;
950 {
951 coding->destination = (unsigned char *) xrealloc (coding->destination,
952 coding->dst_bytes + bytes);
953 coding->dst_bytes += bytes;
954 }
955
956 static void
957 coding_alloc_by_making_gap (coding, bytes)
958 struct coding_system *coding;
959 EMACS_INT bytes;
960 {
961 if (BUFFERP (coding->dst_object)
962 && EQ (coding->src_object, coding->dst_object))
963 {
964 EMACS_INT add = coding->src_bytes - coding->consumed;
965
966 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
967 make_gap (bytes);
968 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
969 }
970 else
971 {
972 Lisp_Object this_buffer;
973
974 this_buffer = Fcurrent_buffer ();
975 set_buffer_internal (XBUFFER (coding->dst_object));
976 make_gap (bytes);
977 set_buffer_internal (XBUFFER (this_buffer));
978 }
979 }
980
981
982 static unsigned char *
983 alloc_destination (coding, nbytes, dst)
984 struct coding_system *coding;
985 int nbytes;
986 unsigned char *dst;
987 {
988 EMACS_INT offset = dst - coding->destination;
989
990 if (BUFFERP (coding->dst_object))
991 coding_alloc_by_making_gap (coding, nbytes);
992 else
993 coding_alloc_by_realloc (coding, nbytes);
994 coding->result = CODING_RESULT_SUCCESS;
995 coding_set_destination (coding);
996 dst = coding->destination + offset;
997 return dst;
998 }
999
1000 \f
1001 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1002
1003
1004
1005 \f
1006 /*** 3. UTF-8 ***/
1007
1008 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1009 Check if a text is encoded in UTF-8. If it is, return
1010 CATEGORY_MASK_UTF_8, else return 0. */
1011
1012 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1013 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1014 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1015 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1016 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1017 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1018
1019 static int
1020 detect_coding_utf_8 (coding, mask)
1021 struct coding_system *coding;
1022 int *mask;
1023 {
1024 unsigned char *src = coding->source, *src_base = src;
1025 unsigned char *src_end = coding->source + coding->src_bytes;
1026 int multibytep = coding->src_multibyte;
1027 int consumed_chars = 0;
1028 int found = 0;
1029
1030 /* A coding system of this category is always ASCII compatible. */
1031 src += coding->head_ascii;
1032
1033 while (1)
1034 {
1035 int c, c1, c2, c3, c4;
1036
1037 ONE_MORE_BYTE (c);
1038 if (UTF_8_1_OCTET_P (c))
1039 continue;
1040 ONE_MORE_BYTE (c1);
1041 if (! UTF_8_EXTRA_OCTET_P (c1))
1042 break;
1043 if (UTF_8_2_OCTET_LEADING_P (c))
1044 {
1045 found++;
1046 continue;
1047 }
1048 ONE_MORE_BYTE (c2);
1049 if (! UTF_8_EXTRA_OCTET_P (c2))
1050 break;
1051 if (UTF_8_3_OCTET_LEADING_P (c))
1052 {
1053 found++;
1054 continue;
1055 }
1056 ONE_MORE_BYTE (c3);
1057 if (! UTF_8_EXTRA_OCTET_P (c3))
1058 break;
1059 if (UTF_8_4_OCTET_LEADING_P (c))
1060 {
1061 found++;
1062 continue;
1063 }
1064 ONE_MORE_BYTE (c4);
1065 if (! UTF_8_EXTRA_OCTET_P (c4))
1066 break;
1067 if (UTF_8_5_OCTET_LEADING_P (c))
1068 {
1069 found++;
1070 continue;
1071 }
1072 break;
1073 }
1074 *mask &= ~CATEGORY_MASK_UTF_8;
1075 return 0;
1076
1077 no_more_source:
1078 if (! found)
1079 return 0;
1080 *mask &= CATEGORY_MASK_UTF_8;
1081 return 1;
1082 }
1083
1084
1085 /* Fixme: deal with surrogates? */
1086 static void
1087 decode_coding_utf_8 (coding)
1088 struct coding_system *coding;
1089 {
1090 unsigned char *src = coding->source + coding->consumed;
1091 unsigned char *src_end = coding->source + coding->src_bytes;
1092 unsigned char *src_base;
1093 int *charbuf = coding->charbuf;
1094 int *charbuf_end = charbuf + coding->charbuf_size;
1095 int consumed_chars = 0, consumed_chars_base;
1096 int multibytep = coding->src_multibyte;
1097 Lisp_Object attr, eol_type, charset_list;
1098
1099 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1100
1101 while (1)
1102 {
1103 int c, c1, c2, c3, c4, c5;
1104
1105 src_base = src;
1106 consumed_chars_base = consumed_chars;
1107
1108 if (charbuf >= charbuf_end)
1109 break;
1110
1111 ONE_MORE_BYTE (c1);
1112 if (UTF_8_1_OCTET_P(c1))
1113 {
1114 c = c1;
1115 if (c == '\r')
1116 {
1117 if (EQ (eol_type, Qdos))
1118 {
1119 if (src == src_end)
1120 goto no_more_source;
1121 if (*src == '\n')
1122 ONE_MORE_BYTE (c);
1123 }
1124 else if (EQ (eol_type, Qmac))
1125 c = '\n';
1126 }
1127 }
1128 else
1129 {
1130 ONE_MORE_BYTE (c2);
1131 if (! UTF_8_EXTRA_OCTET_P (c2))
1132 goto invalid_code;
1133 if (UTF_8_2_OCTET_LEADING_P (c1))
1134 {
1135 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1136 /* Reject overlong sequences here and below. Encoders
1137 producing them are incorrect, they can be misleading,
1138 and they mess up read/write invariance. */
1139 if (c < 128)
1140 goto invalid_code;
1141 }
1142 else
1143 {
1144 ONE_MORE_BYTE (c3);
1145 if (! UTF_8_EXTRA_OCTET_P (c3))
1146 goto invalid_code;
1147 if (UTF_8_3_OCTET_LEADING_P (c1))
1148 {
1149 c = (((c1 & 0xF) << 12)
1150 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1151 if (c < 0x800)
1152 goto invalid_code;
1153 }
1154 else
1155 {
1156 ONE_MORE_BYTE (c4);
1157 if (! UTF_8_EXTRA_OCTET_P (c4))
1158 goto invalid_code;
1159 if (UTF_8_4_OCTET_LEADING_P (c1))
1160 {
1161 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1162 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1163 if (c < 0x10000)
1164 goto invalid_code;
1165 }
1166 else
1167 {
1168 ONE_MORE_BYTE (c5);
1169 if (! UTF_8_EXTRA_OCTET_P (c5))
1170 goto invalid_code;
1171 if (UTF_8_5_OCTET_LEADING_P (c1))
1172 {
1173 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1174 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1175 | (c5 & 0x3F));
1176 if ((c > MAX_CHAR) || (c < 0x200000))
1177 goto invalid_code;
1178 }
1179 else
1180 goto invalid_code;
1181 }
1182 }
1183 }
1184 }
1185
1186 *charbuf++ = c;
1187 continue;
1188
1189 invalid_code:
1190 src = src_base;
1191 consumed_chars = consumed_chars_base;
1192 ONE_MORE_BYTE (c);
1193 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1194 coding->errors++;
1195 }
1196
1197 no_more_source:
1198 coding->consumed_char += consumed_chars_base;
1199 coding->consumed = src_base - coding->source;
1200 coding->charbuf_used = charbuf - coding->charbuf;
1201 }
1202
1203
1204 static int
1205 encode_coding_utf_8 (coding)
1206 struct coding_system *coding;
1207 {
1208 int multibytep = coding->dst_multibyte;
1209 int *charbuf = coding->charbuf;
1210 int *charbuf_end = charbuf + coding->charbuf_used;
1211 unsigned char *dst = coding->destination + coding->produced;
1212 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1213 int produced_chars = 0;
1214 int c;
1215
1216 if (multibytep)
1217 {
1218 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1219
1220 while (charbuf < charbuf_end)
1221 {
1222 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1223
1224 ASSURE_DESTINATION (safe_room);
1225 c = *charbuf++;
1226 CHAR_STRING_ADVANCE (c, pend);
1227 for (p = str; p < pend; p++)
1228 EMIT_ONE_BYTE (*p);
1229 }
1230 }
1231 else
1232 {
1233 int safe_room = MAX_MULTIBYTE_LENGTH;
1234
1235 while (charbuf < charbuf_end)
1236 {
1237 ASSURE_DESTINATION (safe_room);
1238 c = *charbuf++;
1239 dst += CHAR_STRING (c, dst);
1240 produced_chars++;
1241 }
1242 }
1243 coding->result = CODING_RESULT_SUCCESS;
1244 coding->produced_char += produced_chars;
1245 coding->produced = dst - coding->destination;
1246 return 0;
1247 }
1248
1249
1250 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1251 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1252 Little Endian (otherwise). If it is, return
1253 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1254 else return 0. */
1255
1256 #define UTF_16_HIGH_SURROGATE_P(val) \
1257 (((val) & 0xFC00) == 0xD800)
1258
1259 #define UTF_16_LOW_SURROGATE_P(val) \
1260 (((val) & 0xFC00) == 0xDC00)
1261
1262 #define UTF_16_INVALID_P(val) \
1263 (((val) == 0xFFFE) \
1264 || ((val) == 0xFFFF) \
1265 || UTF_16_LOW_SURROGATE_P (val))
1266
1267
1268 static int
1269 detect_coding_utf_16 (coding, mask)
1270 struct coding_system *coding;
1271 int *mask;
1272 {
1273 unsigned char *src = coding->source, *src_base = src;
1274 unsigned char *src_end = coding->source + coding->src_bytes;
1275 int multibytep = coding->src_multibyte;
1276 int consumed_chars = 0;
1277 int c1, c2;
1278
1279 ONE_MORE_BYTE (c1);
1280 ONE_MORE_BYTE (c2);
1281
1282 if ((c1 == 0xFF) && (c2 == 0xFE))
1283 {
1284 *mask &= CATEGORY_MASK_UTF_16_LE;
1285 return 1;
1286 }
1287 else if ((c1 == 0xFE) && (c2 == 0xFF))
1288 {
1289 *mask &= CATEGORY_MASK_UTF_16_BE;
1290 return 1;
1291 }
1292 no_more_source:
1293 return 0;
1294 }
1295
1296 static void
1297 decode_coding_utf_16 (coding)
1298 struct coding_system *coding;
1299 {
1300 unsigned char *src = coding->source + coding->consumed;
1301 unsigned char *src_end = coding->source + coding->src_bytes;
1302 unsigned char *src_base;
1303 int *charbuf = coding->charbuf;
1304 int *charbuf_end = charbuf + coding->charbuf_size;
1305 int consumed_chars = 0, consumed_chars_base;
1306 int multibytep = coding->src_multibyte;
1307 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1308 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1309 int surrogate = CODING_UTF_16_SURROGATE (coding);
1310 Lisp_Object attr, eol_type, charset_list;
1311
1312 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1313
1314 if (bom != utf_16_without_bom)
1315 {
1316 int c, c1, c2;
1317
1318 src_base = src;
1319 ONE_MORE_BYTE (c1);
1320 ONE_MORE_BYTE (c2);
1321 c = (c1 << 8) | c2;
1322 if (bom == utf_16_with_bom)
1323 {
1324 if (endian == utf_16_big_endian
1325 ? c != 0xFFFE : c != 0xFEFF)
1326 {
1327 /* We are sure that there's enouph room at CHARBUF. */
1328 *charbuf++ = c1;
1329 *charbuf++ = c2;
1330 coding->errors++;
1331 }
1332 }
1333 else
1334 {
1335 if (c == 0xFFFE)
1336 CODING_UTF_16_ENDIAN (coding)
1337 = endian = utf_16_big_endian;
1338 else if (c == 0xFEFF)
1339 CODING_UTF_16_ENDIAN (coding)
1340 = endian = utf_16_little_endian;
1341 else
1342 {
1343 CODING_UTF_16_ENDIAN (coding)
1344 = endian = utf_16_big_endian;
1345 src = src_base;
1346 }
1347 }
1348 CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1349 }
1350
1351 while (1)
1352 {
1353 int c, c1, c2;
1354
1355 src_base = src;
1356 consumed_chars_base = consumed_chars;
1357
1358 if (charbuf + 2 >= charbuf_end)
1359 break;
1360
1361 ONE_MORE_BYTE (c1);
1362 ONE_MORE_BYTE (c2);
1363 c = (endian == utf_16_big_endian
1364 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1365 if (surrogate)
1366 {
1367 if (! UTF_16_LOW_SURROGATE_P (c))
1368 {
1369 if (endian == utf_16_big_endian)
1370 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1371 else
1372 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1373 *charbuf++ = c1;
1374 *charbuf++ = c2;
1375 coding->errors++;
1376 if (UTF_16_HIGH_SURROGATE_P (c))
1377 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1378 else
1379 *charbuf++ = c;
1380 }
1381 else
1382 {
1383 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1384 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1385 *charbuf++ = c;
1386 }
1387 }
1388 else
1389 {
1390 if (UTF_16_HIGH_SURROGATE_P (c))
1391 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1392 else
1393 *charbuf++ = c;
1394 }
1395 }
1396
1397 no_more_source:
1398 coding->consumed_char += consumed_chars_base;
1399 coding->consumed = src_base - coding->source;
1400 coding->charbuf_used = charbuf - coding->charbuf;
1401 }
1402
1403 static int
1404 encode_coding_utf_16 (coding)
1405 struct coding_system *coding;
1406 {
1407 int multibytep = coding->dst_multibyte;
1408 int *charbuf = coding->charbuf;
1409 int *charbuf_end = charbuf + coding->charbuf_used;
1410 unsigned char *dst = coding->destination + coding->produced;
1411 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1412 int safe_room = 8;
1413 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1414 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1415 int produced_chars = 0;
1416 Lisp_Object attrs, eol_type, charset_list;
1417 int c;
1418
1419 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1420
1421 if (bom == utf_16_with_bom)
1422 {
1423 ASSURE_DESTINATION (safe_room);
1424 if (big_endian)
1425 EMIT_TWO_BYTES (0xFF, 0xFE);
1426 else
1427 EMIT_TWO_BYTES (0xFE, 0xFF);
1428 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1429 }
1430
1431 while (charbuf < charbuf_end)
1432 {
1433 ASSURE_DESTINATION (safe_room);
1434 c = *charbuf++;
1435 if (c >= MAX_UNICODE_CHAR)
1436 c = coding->default_char;
1437
1438 if (c < 0x10000)
1439 {
1440 if (big_endian)
1441 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1442 else
1443 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1444 }
1445 else
1446 {
1447 int c1, c2;
1448
1449 c -= 0x10000;
1450 c1 = (c >> 10) + 0xD800;
1451 c2 = (c & 0x3FF) + 0xDC00;
1452 if (big_endian)
1453 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1454 else
1455 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1456 }
1457 }
1458 coding->result = CODING_RESULT_SUCCESS;
1459 coding->produced = dst - coding->destination;
1460 coding->produced_char += produced_chars;
1461 return 0;
1462 }
1463
1464 \f
1465 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1466
1467 /* Emacs' internal format for representation of multiple character
1468 sets is a kind of multi-byte encoding, i.e. characters are
1469 represented by variable-length sequences of one-byte codes.
1470
1471 ASCII characters and control characters (e.g. `tab', `newline') are
1472 represented by one-byte sequences which are their ASCII codes, in
1473 the range 0x00 through 0x7F.
1474
1475 8-bit characters of the range 0x80..0x9F are represented by
1476 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1477 code + 0x20).
1478
1479 8-bit characters of the range 0xA0..0xFF are represented by
1480 one-byte sequences which are their 8-bit code.
1481
1482 The other characters are represented by a sequence of `base
1483 leading-code', optional `extended leading-code', and one or two
1484 `position-code's. The length of the sequence is determined by the
1485 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1486 whereas extended leading-code and position-code take the range 0xA0
1487 through 0xFF. See `charset.h' for more details about leading-code
1488 and position-code.
1489
1490 --- CODE RANGE of Emacs' internal format ---
1491 character set range
1492 ------------- -----
1493 ascii 0x00..0x7F
1494 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1495 eight-bit-graphic 0xA0..0xBF
1496 ELSE 0x81..0x9D + [0xA0..0xFF]+
1497 ---------------------------------------------
1498
1499 As this is the internal character representation, the format is
1500 usually not used externally (i.e. in a file or in a data sent to a
1501 process). But, it is possible to have a text externally in this
1502 format (i.e. by encoding by the coding system `emacs-mule').
1503
1504 In that case, a sequence of one-byte codes has a slightly different
1505 form.
1506
1507 At first, all characters in eight-bit-control are represented by
1508 one-byte sequences which are their 8-bit code.
1509
1510 Next, character composition data are represented by the byte
1511 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1512 where,
1513 METHOD is 0xF0 plus one of composition method (enum
1514 composition_method),
1515
1516 BYTES is 0xA0 plus a byte length of this composition data,
1517
1518 CHARS is 0x20 plus a number of characters composed by this
1519 data,
1520
1521 COMPONENTs are characters of multibye form or composition
1522 rules encoded by two-byte of ASCII codes.
1523
1524 In addition, for backward compatibility, the following formats are
1525 also recognized as composition data on decoding.
1526
1527 0x80 MSEQ ...
1528 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1529
1530 Here,
1531 MSEQ is a multibyte form but in these special format:
1532 ASCII: 0xA0 ASCII_CODE+0x80,
1533 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1534 RULE is a one byte code of the range 0xA0..0xF0 that
1535 represents a composition rule.
1536 */
1537
1538 char emacs_mule_bytes[256];
1539
1540 /* Leading-code followed by extended leading-code. */
1541 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1542 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1543 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1544 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1545
1546
1547 int
1548 emacs_mule_char (coding, src, nbytes, nchars)
1549 struct coding_system *coding;
1550 unsigned char *src;
1551 int *nbytes, *nchars;
1552 {
1553 unsigned char *src_end = coding->source + coding->src_bytes;
1554 int multibytep = coding->src_multibyte;
1555 unsigned char *src_base = src;
1556 struct charset *charset;
1557 unsigned code;
1558 int c;
1559 int consumed_chars = 0;
1560
1561 ONE_MORE_BYTE (c);
1562 switch (emacs_mule_bytes[c])
1563 {
1564 case 2:
1565 if (! (charset = emacs_mule_charset[c]))
1566 goto invalid_code;
1567 ONE_MORE_BYTE (c);
1568 code = c & 0x7F;
1569 break;
1570
1571 case 3:
1572 if (c == LEADING_CODE_PRIVATE_11
1573 || c == LEADING_CODE_PRIVATE_12)
1574 {
1575 ONE_MORE_BYTE (c);
1576 if (! (charset = emacs_mule_charset[c]))
1577 goto invalid_code;
1578 ONE_MORE_BYTE (c);
1579 code = c & 0x7F;
1580 }
1581 else
1582 {
1583 if (! (charset = emacs_mule_charset[c]))
1584 goto invalid_code;
1585 ONE_MORE_BYTE (c);
1586 code = (c & 0x7F) << 8;
1587 ONE_MORE_BYTE (c);
1588 code |= c & 0x7F;
1589 }
1590 break;
1591
1592 case 4:
1593 ONE_MORE_BYTE (c);
1594 if (! (charset = emacs_mule_charset[c]))
1595 goto invalid_code;
1596 ONE_MORE_BYTE (c);
1597 code = (c & 0x7F) << 8;
1598 ONE_MORE_BYTE (c);
1599 code |= c & 0x7F;
1600 break;
1601
1602 case 1:
1603 code = c;
1604 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1605 : code < 0xA0 ? charset_8_bit_control
1606 : charset_8_bit_graphic);
1607 break;
1608
1609 default:
1610 abort ();
1611 }
1612 c = DECODE_CHAR (charset, code);
1613 if (c < 0)
1614 goto invalid_code;
1615 *nbytes = src - src_base;
1616 *nchars = consumed_chars;
1617 return c;
1618
1619 no_more_source:
1620 return -2;
1621
1622 invalid_code:
1623 return -1;
1624 }
1625
1626
1627 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1628 Check if a text is encoded in `emacs-mule'. */
1629
1630 static int
1631 detect_coding_emacs_mule (coding, mask)
1632 struct coding_system *coding;
1633 int *mask;
1634 {
1635 unsigned char *src = coding->source, *src_base = src;
1636 unsigned char *src_end = coding->source + coding->src_bytes;
1637 int multibytep = coding->src_multibyte;
1638 int consumed_chars = 0;
1639 int c;
1640 int found = 0;
1641
1642 /* A coding system of this category is always ASCII compatible. */
1643 src += coding->head_ascii;
1644
1645 while (1)
1646 {
1647 ONE_MORE_BYTE (c);
1648
1649 if (c == 0x80)
1650 {
1651 /* Perhaps the start of composite character. We simple skip
1652 it because analyzing it is too heavy for detecting. But,
1653 at least, we check that the composite character
1654 constitues of more than 4 bytes. */
1655 unsigned char *src_base;
1656
1657 repeat:
1658 src_base = src;
1659 do
1660 {
1661 ONE_MORE_BYTE (c);
1662 }
1663 while (c >= 0xA0);
1664
1665 if (src - src_base <= 4)
1666 break;
1667 found = 1;
1668 if (c == 0x80)
1669 goto repeat;
1670 }
1671
1672 if (c < 0x80)
1673 {
1674 if (c < 0x20
1675 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1676 break;
1677 }
1678 else
1679 {
1680 unsigned char *src_base = src - 1;
1681
1682 do
1683 {
1684 ONE_MORE_BYTE (c);
1685 }
1686 while (c >= 0xA0);
1687 if (src - src_base != emacs_mule_bytes[*src_base])
1688 break;
1689 found = 1;
1690 }
1691 }
1692 *mask &= ~CATEGORY_MASK_EMACS_MULE;
1693 return 0;
1694
1695 no_more_source:
1696 if (!found)
1697 return 0;
1698 *mask &= CATEGORY_MASK_EMACS_MULE;
1699 return 1;
1700 }
1701
1702
1703 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1704
1705 /* Decode a character represented as a component of composition
1706 sequence of Emacs 20/21 style at SRC. Set C to that character and
1707 update SRC to the head of next character (or an encoded composition
1708 rule). If SRC doesn't points a composition component, set C to -1.
1709 If SRC points an invalid byte sequence, global exit by a return
1710 value 0. */
1711
1712 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1713 if (1) \
1714 { \
1715 int c; \
1716 int nbytes, nchars; \
1717 \
1718 if (src == src_end) \
1719 break; \
1720 c = emacs_mule_char (coding, src, &nbytes, &nchars); \
1721 if (c < 0) \
1722 { \
1723 if (c == -2) \
1724 break; \
1725 goto invalid_code; \
1726 } \
1727 *buf++ = c; \
1728 src += nbytes; \
1729 consumed_chars += nchars; \
1730 } \
1731 else
1732
1733
1734 /* Decode a composition rule represented as a component of composition
1735 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1736 and increment BUF. If SRC points an invalid byte sequence, set C
1737 to -1. */
1738
1739 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1740 do { \
1741 int c, gref, nref; \
1742 \
1743 if (src >= src_end) \
1744 goto invalid_code; \
1745 ONE_MORE_BYTE_NO_CHECK (c); \
1746 c -= 0x20; \
1747 if (c < 0 || c >= 81) \
1748 goto invalid_code; \
1749 \
1750 gref = c / 9, nref = c % 9; \
1751 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1752 } while (0)
1753
1754
1755 /* Decode a composition rule represented as a component of composition
1756 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1757 and increment BUF. If SRC points an invalid byte sequence, set C
1758 to -1. */
1759
1760 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1761 do { \
1762 int gref, nref; \
1763 \
1764 if (src + 1>= src_end) \
1765 goto invalid_code; \
1766 ONE_MORE_BYTE_NO_CHECK (gref); \
1767 gref -= 0x20; \
1768 ONE_MORE_BYTE_NO_CHECK (nref); \
1769 nref -= 0x20; \
1770 if (gref < 0 || gref >= 81 \
1771 || nref < 0 || nref >= 81) \
1772 goto invalid_code; \
1773 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1774 } while (0)
1775
1776
1777 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1778 do { \
1779 *buf++ = -5; \
1780 *buf++ = coding->produced_char + char_offset; \
1781 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1782 *buf++ = method; \
1783 *buf++ = nchars; \
1784 } while (0)
1785
1786
1787 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1788 do { \
1789 /* Emacs 21 style format. The first three bytes at SRC are \
1790 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1791 the byte length of this composition information, CHARS is the \
1792 number of characters composed by this composition. */ \
1793 enum composition_method method = c - 0xF2; \
1794 int *charbuf_base = charbuf; \
1795 int consumed_chars_limit; \
1796 int nbytes, nchars; \
1797 \
1798 ONE_MORE_BYTE (c); \
1799 nbytes = c - 0xA0; \
1800 if (nbytes < 3) \
1801 goto invalid_code; \
1802 ONE_MORE_BYTE (c); \
1803 nchars = c - 0xA0; \
1804 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1805 consumed_chars_limit = consumed_chars_base + nbytes; \
1806 if (method != COMPOSITION_RELATIVE) \
1807 { \
1808 int i = 0; \
1809 while (consumed_chars < consumed_chars_limit) \
1810 { \
1811 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1812 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1813 else \
1814 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1815 i++; \
1816 } \
1817 if (consumed_chars < consumed_chars_limit) \
1818 goto invalid_code; \
1819 charbuf_base[0] -= i; \
1820 } \
1821 } while (0)
1822
1823
1824 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1825 do { \
1826 /* Emacs 20 style format for relative composition. */ \
1827 /* Store multibyte form of characters to be composed. */ \
1828 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1829 int *buf = components; \
1830 int i, j; \
1831 \
1832 src = src_base; \
1833 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1834 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1835 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1836 if (i < 2) \
1837 goto invalid_code; \
1838 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1839 for (j = 0; j < i; j++) \
1840 *charbuf++ = components[j]; \
1841 } while (0)
1842
1843
1844 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1845 do { \
1846 /* Emacs 20 style format for rule-base composition. */ \
1847 /* Store multibyte form of characters to be composed. */ \
1848 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1849 int *buf = components; \
1850 int i, j; \
1851 \
1852 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1853 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1854 { \
1855 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1856 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1857 } \
1858 if (i < 1 || (buf - components) % 2 == 0) \
1859 goto invalid_code; \
1860 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1861 goto no_more_source; \
1862 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1863 for (j = 0; j < i; j++) \
1864 *charbuf++ = components[j]; \
1865 for (j = 0; j < i; j += 2) \
1866 *charbuf++ = components[j]; \
1867 } while (0)
1868
1869
1870 static void
1871 decode_coding_emacs_mule (coding)
1872 struct coding_system *coding;
1873 {
1874 unsigned char *src = coding->source + coding->consumed;
1875 unsigned char *src_end = coding->source + coding->src_bytes;
1876 unsigned char *src_base;
1877 int *charbuf = coding->charbuf;
1878 int *charbuf_end = charbuf + coding->charbuf_size;
1879 int consumed_chars = 0, consumed_chars_base;
1880 int char_offset = 0;
1881 int multibytep = coding->src_multibyte;
1882 Lisp_Object attrs, eol_type, charset_list;
1883
1884 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1885
1886 while (1)
1887 {
1888 int c;
1889
1890 src_base = src;
1891 consumed_chars_base = consumed_chars;
1892
1893 if (charbuf >= charbuf_end)
1894 break;
1895
1896 ONE_MORE_BYTE (c);
1897
1898 if (c < 0x80)
1899 {
1900 if (c == '\r')
1901 {
1902 if (EQ (eol_type, Qdos))
1903 {
1904 if (src == src_end)
1905 goto no_more_source;
1906 if (*src == '\n')
1907 ONE_MORE_BYTE (c);
1908 }
1909 else if (EQ (eol_type, Qmac))
1910 c = '\n';
1911 }
1912 *charbuf++ = c;
1913 char_offset++;
1914 }
1915 else if (c == 0x80)
1916 {
1917 if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1918 break;
1919 ONE_MORE_BYTE (c);
1920 if (c - 0xF2 >= COMPOSITION_RELATIVE
1921 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1922 DECODE_EMACS_MULE_21_COMPOSITION (c);
1923 else if (c < 0xC0)
1924 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1925 else if (c == 0xFF)
1926 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1927 else
1928 goto invalid_code;
1929 coding->annotated = 1;
1930 }
1931 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1932 {
1933 int nbytes, nchars;
1934 src = src_base;
1935 consumed_chars = consumed_chars_base;
1936 c = emacs_mule_char (coding, src, &nbytes, &nchars);
1937 if (c < 0)
1938 {
1939 if (c == -2)
1940 break;
1941 goto invalid_code;
1942 }
1943 *charbuf++ = c;
1944 src += nbytes;
1945 consumed_chars += nchars;
1946 char_offset++;
1947 }
1948 continue;
1949
1950 invalid_code:
1951 src = src_base;
1952 consumed_chars = consumed_chars_base;
1953 ONE_MORE_BYTE (c);
1954 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1955 coding->errors++;
1956 }
1957
1958 no_more_source:
1959 coding->consumed_char += consumed_chars_base;
1960 coding->consumed = src_base - coding->source;
1961 coding->charbuf_used = charbuf - coding->charbuf;
1962 }
1963
1964
1965 #define EMACS_MULE_LEADING_CODES(id, codes) \
1966 do { \
1967 if (id < 0xA0) \
1968 codes[0] = id, codes[1] = 0; \
1969 else if (id < 0xE0) \
1970 codes[0] = 0x9A, codes[1] = id; \
1971 else if (id < 0xF0) \
1972 codes[0] = 0x9B, codes[1] = id; \
1973 else if (id < 0xF5) \
1974 codes[0] = 0x9C, codes[1] = id; \
1975 else \
1976 codes[0] = 0x9D, codes[1] = id; \
1977 } while (0);
1978
1979
1980 static int
1981 encode_coding_emacs_mule (coding)
1982 struct coding_system *coding;
1983 {
1984 int multibytep = coding->dst_multibyte;
1985 int *charbuf = coding->charbuf;
1986 int *charbuf_end = charbuf + coding->charbuf_used;
1987 unsigned char *dst = coding->destination + coding->produced;
1988 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1989 int safe_room = 8;
1990 int produced_chars = 0;
1991 Lisp_Object attrs, eol_type, charset_list;
1992 int c;
1993
1994 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1995
1996 while (charbuf < charbuf_end)
1997 {
1998 ASSURE_DESTINATION (safe_room);
1999 c = *charbuf++;
2000 if (ASCII_CHAR_P (c))
2001 EMIT_ONE_ASCII_BYTE (c);
2002 else
2003 {
2004 struct charset *charset;
2005 unsigned code;
2006 int dimension;
2007 int emacs_mule_id;
2008 unsigned char leading_codes[2];
2009
2010 charset = char_charset (c, charset_list, &code);
2011 if (! charset)
2012 {
2013 c = coding->default_char;
2014 if (ASCII_CHAR_P (c))
2015 {
2016 EMIT_ONE_ASCII_BYTE (c);
2017 continue;
2018 }
2019 charset = char_charset (c, charset_list, &code);
2020 }
2021 dimension = CHARSET_DIMENSION (charset);
2022 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2023 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2024 EMIT_ONE_BYTE (leading_codes[0]);
2025 if (leading_codes[1])
2026 EMIT_ONE_BYTE (leading_codes[1]);
2027 if (dimension == 1)
2028 EMIT_ONE_BYTE (code);
2029 else
2030 {
2031 EMIT_ONE_BYTE (code >> 8);
2032 EMIT_ONE_BYTE (code & 0xFF);
2033 }
2034 }
2035 }
2036 coding->result = CODING_RESULT_SUCCESS;
2037 coding->produced_char += produced_chars;
2038 coding->produced = dst - coding->destination;
2039 return 0;
2040 }
2041
2042 \f
2043 /*** 7. ISO2022 handlers ***/
2044
2045 /* The following note describes the coding system ISO2022 briefly.
2046 Since the intention of this note is to help understand the
2047 functions in this file, some parts are NOT ACCURATE or OVERLY
2048 SIMPLIFIED. For thorough understanding, please refer to the
2049 original document of ISO2022.
2050
2051 ISO2022 provides many mechanisms to encode several character sets
2052 in 7-bit and 8-bit environments. For 7-bite environments, all text
2053 is encoded using bytes less than 128. This may make the encoded
2054 text a little bit longer, but the text passes more easily through
2055 several gateways, some of which strip off MSB (Most Signigant Bit).
2056
2057 There are two kinds of character sets: control character set and
2058 graphic character set. The former contains control characters such
2059 as `newline' and `escape' to provide control functions (control
2060 functions are also provided by escape sequences). The latter
2061 contains graphic characters such as 'A' and '-'. Emacs recognizes
2062 two control character sets and many graphic character sets.
2063
2064 Graphic character sets are classified into one of the following
2065 four classes, according to the number of bytes (DIMENSION) and
2066 number of characters in one dimension (CHARS) of the set:
2067 - DIMENSION1_CHARS94
2068 - DIMENSION1_CHARS96
2069 - DIMENSION2_CHARS94
2070 - DIMENSION2_CHARS96
2071
2072 In addition, each character set is assigned an identification tag,
2073 unique for each set, called "final character" (denoted as <F>
2074 hereafter). The <F> of each character set is decided by ECMA(*)
2075 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2076 (0x30..0x3F are for private use only).
2077
2078 Note (*): ECMA = European Computer Manufacturers Association
2079
2080 Here are examples of graphic character set [NAME(<F>)]:
2081 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2082 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2083 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2084 o DIMENSION2_CHARS96 -- none for the moment
2085
2086 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2087 C0 [0x00..0x1F] -- control character plane 0
2088 GL [0x20..0x7F] -- graphic character plane 0
2089 C1 [0x80..0x9F] -- control character plane 1
2090 GR [0xA0..0xFF] -- graphic character plane 1
2091
2092 A control character set is directly designated and invoked to C0 or
2093 C1 by an escape sequence. The most common case is that:
2094 - ISO646's control character set is designated/invoked to C0, and
2095 - ISO6429's control character set is designated/invoked to C1,
2096 and usually these designations/invocations are omitted in encoded
2097 text. In a 7-bit environment, only C0 can be used, and a control
2098 character for C1 is encoded by an appropriate escape sequence to
2099 fit into the environment. All control characters for C1 are
2100 defined to have corresponding escape sequences.
2101
2102 A graphic character set is at first designated to one of four
2103 graphic registers (G0 through G3), then these graphic registers are
2104 invoked to GL or GR. These designations and invocations can be
2105 done independently. The most common case is that G0 is invoked to
2106 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2107 these invocations and designations are omitted in encoded text.
2108 In a 7-bit environment, only GL can be used.
2109
2110 When a graphic character set of CHARS94 is invoked to GL, codes
2111 0x20 and 0x7F of the GL area work as control characters SPACE and
2112 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2113 be used.
2114
2115 There are two ways of invocation: locking-shift and single-shift.
2116 With locking-shift, the invocation lasts until the next different
2117 invocation, whereas with single-shift, the invocation affects the
2118 following character only and doesn't affect the locking-shift
2119 state. Invocations are done by the following control characters or
2120 escape sequences:
2121
2122 ----------------------------------------------------------------------
2123 abbrev function cntrl escape seq description
2124 ----------------------------------------------------------------------
2125 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2126 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2127 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2128 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2129 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2130 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2131 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2132 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2133 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2134 ----------------------------------------------------------------------
2135 (*) These are not used by any known coding system.
2136
2137 Control characters for these functions are defined by macros
2138 ISO_CODE_XXX in `coding.h'.
2139
2140 Designations are done by the following escape sequences:
2141 ----------------------------------------------------------------------
2142 escape sequence description
2143 ----------------------------------------------------------------------
2144 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2145 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2146 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2147 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2148 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2149 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2150 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2151 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2152 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2153 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2154 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2155 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2156 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2157 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2158 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2159 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2160 ----------------------------------------------------------------------
2161
2162 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2163 of dimension 1, chars 94, and final character <F>, etc...
2164
2165 Note (*): Although these designations are not allowed in ISO2022,
2166 Emacs accepts them on decoding, and produces them on encoding
2167 CHARS96 character sets in a coding system which is characterized as
2168 7-bit environment, non-locking-shift, and non-single-shift.
2169
2170 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2171 '(' must be omitted. We refer to this as "short-form" hereafter.
2172
2173 Now you may notice that there are a lot of ways for encoding the
2174 same multilingual text in ISO2022. Actually, there exist many
2175 coding systems such as Compound Text (used in X11's inter client
2176 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2177 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2178 localized platforms), and all of these are variants of ISO2022.
2179
2180 In addition to the above, Emacs handles two more kinds of escape
2181 sequences: ISO6429's direction specification and Emacs' private
2182 sequence for specifying character composition.
2183
2184 ISO6429's direction specification takes the following form:
2185 o CSI ']' -- end of the current direction
2186 o CSI '0' ']' -- end of the current direction
2187 o CSI '1' ']' -- start of left-to-right text
2188 o CSI '2' ']' -- start of right-to-left text
2189 The control character CSI (0x9B: control sequence introducer) is
2190 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2191
2192 Character composition specification takes the following form:
2193 o ESC '0' -- start relative composition
2194 o ESC '1' -- end composition
2195 o ESC '2' -- start rule-base composition (*)
2196 o ESC '3' -- start relative composition with alternate chars (**)
2197 o ESC '4' -- start rule-base composition with alternate chars (**)
2198 Since these are not standard escape sequences of any ISO standard,
2199 the use of them for these meaning is restricted to Emacs only.
2200
2201 (*) This form is used only in Emacs 20.5 and the older versions,
2202 but the newer versions can safely decode it.
2203 (**) This form is used only in Emacs 21.1 and the newer versions,
2204 and the older versions can't decode it.
2205
2206 Here's a list of examples usages of these composition escape
2207 sequences (categorized by `enum composition_method').
2208
2209 COMPOSITION_RELATIVE:
2210 ESC 0 CHAR [ CHAR ] ESC 1
2211 COMPOSITOIN_WITH_RULE:
2212 ESC 2 CHAR [ RULE CHAR ] ESC 1
2213 COMPOSITION_WITH_ALTCHARS:
2214 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2215 COMPOSITION_WITH_RULE_ALTCHARS:
2216 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2217
2218 enum iso_code_class_type iso_code_class[256];
2219
2220 #define SAFE_CHARSET_P(coding, id) \
2221 ((id) <= (coding)->max_charset_id \
2222 && (coding)->safe_charsets[id] >= 0)
2223
2224
2225 #define SHIFT_OUT_OK(category) \
2226 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2227
2228 static void
2229 setup_iso_safe_charsets (attrs)
2230 Lisp_Object attrs;
2231 {
2232 Lisp_Object charset_list, safe_charsets;
2233 Lisp_Object request;
2234 Lisp_Object reg_usage;
2235 Lisp_Object tail;
2236 int reg94, reg96;
2237 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2238 int max_charset_id;
2239
2240 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2241 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2242 && ! EQ (charset_list, Viso_2022_charset_list))
2243 {
2244 CODING_ATTR_CHARSET_LIST (attrs)
2245 = charset_list = Viso_2022_charset_list;
2246 ASET (attrs, coding_attr_safe_charsets, Qnil);
2247 }
2248
2249 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2250 return;
2251
2252 max_charset_id = 0;
2253 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2254 {
2255 int id = XINT (XCAR (tail));
2256 if (max_charset_id < id)
2257 max_charset_id = id;
2258 }
2259
2260 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2261 make_number (255));
2262 request = AREF (attrs, coding_attr_iso_request);
2263 reg_usage = AREF (attrs, coding_attr_iso_usage);
2264 reg94 = XINT (XCAR (reg_usage));
2265 reg96 = XINT (XCDR (reg_usage));
2266
2267 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2268 {
2269 Lisp_Object id;
2270 Lisp_Object reg;
2271 struct charset *charset;
2272
2273 id = XCAR (tail);
2274 charset = CHARSET_FROM_ID (XINT (id));
2275 reg = Fcdr (Fassq (id, request));
2276 if (! NILP (reg))
2277 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2278 else if (charset->iso_chars_96)
2279 {
2280 if (reg96 < 4)
2281 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2282 }
2283 else
2284 {
2285 if (reg94 < 4)
2286 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2287 }
2288 }
2289 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2290 }
2291
2292
2293 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2294 Check if a text is encoded in ISO2022. If it is, returns an
2295 integer in which appropriate flag bits any of:
2296 CATEGORY_MASK_ISO_7
2297 CATEGORY_MASK_ISO_7_TIGHT
2298 CATEGORY_MASK_ISO_8_1
2299 CATEGORY_MASK_ISO_8_2
2300 CATEGORY_MASK_ISO_7_ELSE
2301 CATEGORY_MASK_ISO_8_ELSE
2302 are set. If a code which should never appear in ISO2022 is found,
2303 returns 0. */
2304
2305 static int
2306 detect_coding_iso_2022 (coding, mask)
2307 struct coding_system *coding;
2308 int *mask;
2309 {
2310 unsigned char *src = coding->source, *src_base = src;
2311 unsigned char *src_end = coding->source + coding->src_bytes;
2312 int multibytep = coding->src_multibyte;
2313 int mask_iso = CATEGORY_MASK_ISO;
2314 int mask_found = 0, mask_8bit_found = 0;
2315 int reg[4], shift_out = 0, single_shifting = 0;
2316 int id;
2317 int c, c1;
2318 int consumed_chars = 0;
2319 int i;
2320
2321 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2322 {
2323 struct coding_system *this = &(coding_categories[i]);
2324 Lisp_Object attrs, val;
2325
2326 attrs = CODING_ID_ATTRS (this->id);
2327 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2328 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2329 setup_iso_safe_charsets (attrs);
2330 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2331 this->max_charset_id = XSTRING (val)->size - 1;
2332 this->safe_charsets = (char *) XSTRING (val)->data;
2333 }
2334
2335 /* A coding system of this category is always ASCII compatible. */
2336 src += coding->head_ascii;
2337
2338 reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2339 while (mask_iso && src < src_end)
2340 {
2341 ONE_MORE_BYTE (c);
2342 switch (c)
2343 {
2344 case ISO_CODE_ESC:
2345 if (inhibit_iso_escape_detection)
2346 break;
2347 single_shifting = 0;
2348 ONE_MORE_BYTE (c);
2349 if (c >= '(' && c <= '/')
2350 {
2351 /* Designation sequence for a charset of dimension 1. */
2352 ONE_MORE_BYTE (c1);
2353 if (c1 < ' ' || c1 >= 0x80
2354 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2355 /* Invalid designation sequence. Just ignore. */
2356 break;
2357 reg[(c - '(') % 4] = id;
2358 }
2359 else if (c == '$')
2360 {
2361 /* Designation sequence for a charset of dimension 2. */
2362 ONE_MORE_BYTE (c);
2363 if (c >= '@' && c <= 'B')
2364 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2365 reg[0] = id = iso_charset_table[1][0][c];
2366 else if (c >= '(' && c <= '/')
2367 {
2368 ONE_MORE_BYTE (c1);
2369 if (c1 < ' ' || c1 >= 0x80
2370 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2371 /* Invalid designation sequence. Just ignore. */
2372 break;
2373 reg[(c - '(') % 4] = id;
2374 }
2375 else
2376 /* Invalid designation sequence. Just ignore. */
2377 break;
2378 }
2379 else if (c == 'N' || c == 'O')
2380 {
2381 /* ESC <Fe> for SS2 or SS3. */
2382 mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
2383 break;
2384 }
2385 else if (c >= '0' && c <= '4')
2386 {
2387 /* ESC <Fp> for start/end composition. */
2388 mask_found |= CATEGORY_MASK_ISO;
2389 break;
2390 }
2391 else
2392 {
2393 /* Invalid escape sequence. */
2394 mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2395 break;
2396 }
2397
2398 /* We found a valid designation sequence for CHARSET. */
2399 mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2400 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2401 id))
2402 mask_found |= CATEGORY_MASK_ISO_7;
2403 else
2404 mask_iso &= ~CATEGORY_MASK_ISO_7;
2405 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2406 id))
2407 mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
2408 else
2409 mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2410 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2411 id))
2412 mask_found |= CATEGORY_MASK_ISO_7_ELSE;
2413 else
2414 mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2415 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2416 id))
2417 mask_found |= CATEGORY_MASK_ISO_8_ELSE;
2418 else
2419 mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
2420 break;
2421
2422 case ISO_CODE_SO:
2423 if (inhibit_iso_escape_detection)
2424 break;
2425 single_shifting = 0;
2426 if (shift_out == 0
2427 && (reg[1] >= 0
2428 || SHIFT_OUT_OK (coding_category_iso_7_else)
2429 || SHIFT_OUT_OK (coding_category_iso_8_else)))
2430 {
2431 /* Locking shift out. */
2432 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2433 mask_found |= CATEGORY_MASK_ISO_ELSE;
2434 }
2435 break;
2436
2437 case ISO_CODE_SI:
2438 if (inhibit_iso_escape_detection)
2439 break;
2440 single_shifting = 0;
2441 if (shift_out == 1)
2442 {
2443 /* Locking shift in. */
2444 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2445 mask_found |= CATEGORY_MASK_ISO_ELSE;
2446 }
2447 break;
2448
2449 case ISO_CODE_CSI:
2450 single_shifting = 0;
2451 case ISO_CODE_SS2:
2452 case ISO_CODE_SS3:
2453 {
2454 int newmask = CATEGORY_MASK_ISO_8_ELSE;
2455
2456 if (inhibit_iso_escape_detection)
2457 break;
2458 if (c != ISO_CODE_CSI)
2459 {
2460 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2461 & CODING_ISO_FLAG_SINGLE_SHIFT)
2462 newmask |= CATEGORY_MASK_ISO_8_1;
2463 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2464 & CODING_ISO_FLAG_SINGLE_SHIFT)
2465 newmask |= CATEGORY_MASK_ISO_8_2;
2466 single_shifting = 1;
2467 }
2468 if (VECTORP (Vlatin_extra_code_table)
2469 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2470 {
2471 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2472 & CODING_ISO_FLAG_LATIN_EXTRA)
2473 newmask |= CATEGORY_MASK_ISO_8_1;
2474 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2475 & CODING_ISO_FLAG_LATIN_EXTRA)
2476 newmask |= CATEGORY_MASK_ISO_8_2;
2477 }
2478 mask_iso &= newmask;
2479 mask_found |= newmask;
2480 }
2481 break;
2482
2483 default:
2484 if (c < 0x80)
2485 {
2486 single_shifting = 0;
2487 break;
2488 }
2489 else if (c < 0xA0)
2490 {
2491 single_shifting = 0;
2492 mask_8bit_found = 1;
2493 if (VECTORP (Vlatin_extra_code_table)
2494 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2495 {
2496 int newmask = 0;
2497
2498 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2499 & CODING_ISO_FLAG_LATIN_EXTRA)
2500 newmask |= CATEGORY_MASK_ISO_8_1;
2501 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2502 & CODING_ISO_FLAG_LATIN_EXTRA)
2503 newmask |= CATEGORY_MASK_ISO_8_2;
2504 mask_iso &= newmask;
2505 mask_found |= newmask;
2506 }
2507 else
2508 return 0;
2509 }
2510 else
2511 {
2512 mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2513 | CATEGORY_MASK_ISO_7_ELSE);
2514 mask_found |= CATEGORY_MASK_ISO_8_1;
2515 mask_8bit_found = 1;
2516 /* Check the length of succeeding codes of the range
2517 0xA0..0FF. If the byte length is odd, we exclude
2518 CATEGORY_MASK_ISO_8_2. We can check this only
2519 when we are not single shifting. */
2520 if (!single_shifting
2521 && mask_iso & CATEGORY_MASK_ISO_8_2)
2522 {
2523 int i = 1;
2524 while (src < src_end)
2525 {
2526 ONE_MORE_BYTE (c);
2527 if (c < 0xA0)
2528 break;
2529 i++;
2530 }
2531
2532 if (i & 1 && src < src_end)
2533 mask_iso &= ~CATEGORY_MASK_ISO_8_2;
2534 else
2535 mask_found |= CATEGORY_MASK_ISO_8_2;
2536 }
2537 }
2538 break;
2539 }
2540 }
2541 no_more_source:
2542 if (!mask_iso)
2543 {
2544 *mask &= ~CATEGORY_MASK_ISO;
2545 return 0;
2546 }
2547 if (!mask_found)
2548 return 0;
2549 *mask &= mask_iso & mask_found;
2550 if (! mask_8bit_found)
2551 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2552 return 1;
2553 }
2554
2555
2556 /* Set designation state into CODING. */
2557 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2558 do { \
2559 int id, prev; \
2560 \
2561 if (final < '0' || final >= 128 \
2562 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2563 || !SAFE_CHARSET_P (coding, id)) \
2564 { \
2565 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2566 goto invalid_code; \
2567 } \
2568 prev = CODING_ISO_DESIGNATION (coding, reg); \
2569 if (id == charset_jisx0201_roman) \
2570 { \
2571 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2572 id = charset_ascii; \
2573 } \
2574 else if (id == charset_jisx0208_1978) \
2575 { \
2576 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2577 id = charset_jisx0208; \
2578 } \
2579 CODING_ISO_DESIGNATION (coding, reg) = id; \
2580 /* If there was an invalid designation to REG previously, and this \
2581 designation is ASCII to REG, we should keep this designation \
2582 sequence. */ \
2583 if (prev == -2 && id == charset_ascii) \
2584 goto invalid_code; \
2585 } while (0)
2586
2587
2588 #define MAYBE_FINISH_COMPOSITION() \
2589 do { \
2590 int i; \
2591 if (composition_state == COMPOSING_NO) \
2592 break; \
2593 /* It is assured that we have enough room for producing \
2594 characters stored in the table `components'. */ \
2595 if (charbuf + component_idx > charbuf_end) \
2596 goto no_more_source; \
2597 composition_state = COMPOSING_NO; \
2598 if (method == COMPOSITION_RELATIVE \
2599 || method == COMPOSITION_WITH_ALTCHARS) \
2600 { \
2601 for (i = 0; i < component_idx; i++) \
2602 *charbuf++ = components[i]; \
2603 char_offset += component_idx; \
2604 } \
2605 else \
2606 { \
2607 for (i = 0; i < component_idx; i += 2) \
2608 *charbuf++ = components[i]; \
2609 char_offset += (component_idx / 2) + 1; \
2610 } \
2611 } while (0)
2612
2613
2614 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2615 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2616 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2617 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2618 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2619 */
2620
2621 #define DECODE_COMPOSITION_START(c1) \
2622 do { \
2623 if (c1 == '0' \
2624 && composition_state == COMPOSING_COMPONENT_RULE) \
2625 { \
2626 component_len = component_idx; \
2627 composition_state = COMPOSING_CHAR; \
2628 } \
2629 else \
2630 { \
2631 unsigned char *p; \
2632 \
2633 MAYBE_FINISH_COMPOSITION (); \
2634 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2635 goto no_more_source; \
2636 for (p = src; p < src_end - 1; p++) \
2637 if (*p == ISO_CODE_ESC && p[1] == '1') \
2638 break; \
2639 if (p == src_end - 1) \
2640 { \
2641 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2642 goto invalid_code; \
2643 goto no_more_source; \
2644 } \
2645 \
2646 /* This is surely the start of a composition. */ \
2647 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2648 : c1 == '2' ? COMPOSITION_WITH_RULE \
2649 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2650 : COMPOSITION_WITH_RULE_ALTCHARS); \
2651 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2652 : COMPOSING_COMPONENT_CHAR); \
2653 component_idx = component_len = 0; \
2654 } \
2655 } while (0)
2656
2657
2658 /* Handle compositoin end sequence ESC 1. */
2659
2660 #define DECODE_COMPOSITION_END() \
2661 do { \
2662 int nchars = (component_len > 0 ? component_idx - component_len \
2663 : method == COMPOSITION_RELATIVE ? component_idx \
2664 : (component_idx + 1) / 2); \
2665 int i; \
2666 int *saved_charbuf = charbuf; \
2667 \
2668 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2669 if (method != COMPOSITION_RELATIVE) \
2670 { \
2671 if (component_len == 0) \
2672 for (i = 0; i < component_idx; i++) \
2673 *charbuf++ = components[i]; \
2674 else \
2675 for (i = 0; i < component_len; i++) \
2676 *charbuf++ = components[i]; \
2677 *saved_charbuf = saved_charbuf - charbuf; \
2678 } \
2679 if (method == COMPOSITION_WITH_RULE) \
2680 for (i = 0; i < component_idx; i += 2, char_offset++) \
2681 *charbuf++ = components[i]; \
2682 else \
2683 for (i = component_len; i < component_idx; i++, char_offset++) \
2684 *charbuf++ = components[i]; \
2685 coding->annotated = 1; \
2686 composition_state = COMPOSING_NO; \
2687 } while (0)
2688
2689
2690 /* Decode a composition rule from the byte C1 (and maybe one more byte
2691 from SRC) and store one encoded composition rule in
2692 coding->cmp_data. */
2693
2694 #define DECODE_COMPOSITION_RULE(c1) \
2695 do { \
2696 (c1) -= 32; \
2697 if (c1 < 81) /* old format (before ver.21) */ \
2698 { \
2699 int gref = (c1) / 9; \
2700 int nref = (c1) % 9; \
2701 if (gref == 4) gref = 10; \
2702 if (nref == 4) nref = 10; \
2703 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2704 } \
2705 else if (c1 < 93) /* new format (after ver.21) */ \
2706 { \
2707 ONE_MORE_BYTE (c2); \
2708 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2709 } \
2710 else \
2711 c1 = 0; \
2712 } while (0)
2713
2714
2715 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2716
2717 static void
2718 decode_coding_iso_2022 (coding)
2719 struct coding_system *coding;
2720 {
2721 unsigned char *src = coding->source + coding->consumed;
2722 unsigned char *src_end = coding->source + coding->src_bytes;
2723 unsigned char *src_base;
2724 int *charbuf = coding->charbuf;
2725 int *charbuf_end = charbuf + coding->charbuf_size - 4;
2726 int consumed_chars = 0, consumed_chars_base;
2727 int char_offset = 0;
2728 int multibytep = coding->src_multibyte;
2729 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2730 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2731 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2732 struct charset *charset;
2733 int c;
2734 /* For handling composition sequence. */
2735 #define COMPOSING_NO 0
2736 #define COMPOSING_CHAR 1
2737 #define COMPOSING_RULE 2
2738 #define COMPOSING_COMPONENT_CHAR 3
2739 #define COMPOSING_COMPONENT_RULE 4
2740
2741 int composition_state = COMPOSING_NO;
2742 enum composition_method method;
2743 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2744 int component_idx;
2745 int component_len;
2746 Lisp_Object attrs, eol_type, charset_list;
2747
2748 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2749 setup_iso_safe_charsets (attrs);
2750
2751 while (1)
2752 {
2753 int c1, c2;
2754
2755 src_base = src;
2756 consumed_chars_base = consumed_chars;
2757
2758 if (charbuf >= charbuf_end)
2759 break;
2760
2761 ONE_MORE_BYTE (c1);
2762
2763 /* We produce no character or one character. */
2764 switch (iso_code_class [c1])
2765 {
2766 case ISO_0x20_or_0x7F:
2767 if (composition_state != COMPOSING_NO)
2768 {
2769 if (composition_state == COMPOSING_RULE
2770 || composition_state == COMPOSING_COMPONENT_RULE)
2771 {
2772 DECODE_COMPOSITION_RULE (c1);
2773 components[component_idx++] = c1;
2774 composition_state--;
2775 continue;
2776 }
2777 }
2778 if (charset_id_0 < 0
2779 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2780 /* This is SPACE or DEL. */
2781 charset = CHARSET_FROM_ID (charset_ascii);
2782 else
2783 charset = CHARSET_FROM_ID (charset_id_0);
2784 break;
2785
2786 case ISO_graphic_plane_0:
2787 if (composition_state != COMPOSING_NO)
2788 {
2789 if (composition_state == COMPOSING_RULE
2790 || composition_state == COMPOSING_COMPONENT_RULE)
2791 {
2792 DECODE_COMPOSITION_RULE (c1);
2793 components[component_idx++] = c1;
2794 composition_state--;
2795 continue;
2796 }
2797 }
2798 charset = CHARSET_FROM_ID (charset_id_0);
2799 break;
2800
2801 case ISO_0xA0_or_0xFF:
2802 if (charset_id_1 < 0
2803 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2804 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2805 goto invalid_code;
2806 /* This is a graphic character, we fall down ... */
2807
2808 case ISO_graphic_plane_1:
2809 if (charset_id_1 < 0)
2810 goto invalid_code;
2811 charset = CHARSET_FROM_ID (charset_id_1);
2812 break;
2813
2814 case ISO_carriage_return:
2815 if (c1 == '\r')
2816 {
2817 if (EQ (eol_type, Qdos))
2818 {
2819 if (src == src_end)
2820 goto no_more_source;
2821 if (*src == '\n')
2822 ONE_MORE_BYTE (c1);
2823 }
2824 else if (EQ (eol_type, Qmac))
2825 c1 = '\n';
2826 }
2827 /* fall through */
2828
2829 case ISO_control_0:
2830 MAYBE_FINISH_COMPOSITION ();
2831 charset = CHARSET_FROM_ID (charset_ascii);
2832 break;
2833
2834 case ISO_control_1:
2835 MAYBE_FINISH_COMPOSITION ();
2836 goto invalid_code;
2837
2838 case ISO_shift_out:
2839 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2840 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2841 goto invalid_code;
2842 CODING_ISO_INVOCATION (coding, 0) = 1;
2843 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2844 continue;
2845
2846 case ISO_shift_in:
2847 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2848 goto invalid_code;
2849 CODING_ISO_INVOCATION (coding, 0) = 0;
2850 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2851 continue;
2852
2853 case ISO_single_shift_2_7:
2854 case ISO_single_shift_2:
2855 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2856 goto invalid_code;
2857 /* SS2 is handled as an escape sequence of ESC 'N' */
2858 c1 = 'N';
2859 goto label_escape_sequence;
2860
2861 case ISO_single_shift_3:
2862 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2863 goto invalid_code;
2864 /* SS2 is handled as an escape sequence of ESC 'O' */
2865 c1 = 'O';
2866 goto label_escape_sequence;
2867
2868 case ISO_control_sequence_introducer:
2869 /* CSI is handled as an escape sequence of ESC '[' ... */
2870 c1 = '[';
2871 goto label_escape_sequence;
2872
2873 case ISO_escape:
2874 ONE_MORE_BYTE (c1);
2875 label_escape_sequence:
2876 /* Escape sequences handled here are invocation,
2877 designation, direction specification, and character
2878 composition specification. */
2879 switch (c1)
2880 {
2881 case '&': /* revision of following character set */
2882 ONE_MORE_BYTE (c1);
2883 if (!(c1 >= '@' && c1 <= '~'))
2884 goto invalid_code;
2885 ONE_MORE_BYTE (c1);
2886 if (c1 != ISO_CODE_ESC)
2887 goto invalid_code;
2888 ONE_MORE_BYTE (c1);
2889 goto label_escape_sequence;
2890
2891 case '$': /* designation of 2-byte character set */
2892 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2893 goto invalid_code;
2894 ONE_MORE_BYTE (c1);
2895 if (c1 >= '@' && c1 <= 'B')
2896 { /* designation of JISX0208.1978, GB2312.1980,
2897 or JISX0208.1980 */
2898 DECODE_DESIGNATION (0, 2, 0, c1);
2899 }
2900 else if (c1 >= 0x28 && c1 <= 0x2B)
2901 { /* designation of DIMENSION2_CHARS94 character set */
2902 ONE_MORE_BYTE (c2);
2903 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2904 }
2905 else if (c1 >= 0x2C && c1 <= 0x2F)
2906 { /* designation of DIMENSION2_CHARS96 character set */
2907 ONE_MORE_BYTE (c2);
2908 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2909 }
2910 else
2911 goto invalid_code;
2912 /* We must update these variables now. */
2913 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2914 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2915 continue;
2916
2917 case 'n': /* invocation of locking-shift-2 */
2918 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2919 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2920 goto invalid_code;
2921 CODING_ISO_INVOCATION (coding, 0) = 2;
2922 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2923 continue;
2924
2925 case 'o': /* invocation of locking-shift-3 */
2926 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2927 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2928 goto invalid_code;
2929 CODING_ISO_INVOCATION (coding, 0) = 3;
2930 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2931 continue;
2932
2933 case 'N': /* invocation of single-shift-2 */
2934 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2935 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2936 goto invalid_code;
2937 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2938 ONE_MORE_BYTE (c1);
2939 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2940 goto invalid_code;
2941 break;
2942
2943 case 'O': /* invocation of single-shift-3 */
2944 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2945 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2946 goto invalid_code;
2947 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2948 ONE_MORE_BYTE (c1);
2949 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2950 goto invalid_code;
2951 break;
2952
2953 case '0': case '2': case '3': case '4': /* start composition */
2954 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2955 goto invalid_code;
2956 DECODE_COMPOSITION_START (c1);
2957 continue;
2958
2959 case '1': /* end composition */
2960 if (composition_state == COMPOSING_NO)
2961 goto invalid_code;
2962 DECODE_COMPOSITION_END ();
2963 continue;
2964
2965 case '[': /* specification of direction */
2966 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2967 goto invalid_code;
2968 /* For the moment, nested direction is not supported.
2969 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2970 left-to-right, and nozero means right-to-left. */
2971 ONE_MORE_BYTE (c1);
2972 switch (c1)
2973 {
2974 case ']': /* end of the current direction */
2975 coding->mode &= ~CODING_MODE_DIRECTION;
2976
2977 case '0': /* end of the current direction */
2978 case '1': /* start of left-to-right direction */
2979 ONE_MORE_BYTE (c1);
2980 if (c1 == ']')
2981 coding->mode &= ~CODING_MODE_DIRECTION;
2982 else
2983 goto invalid_code;
2984 break;
2985
2986 case '2': /* start of right-to-left direction */
2987 ONE_MORE_BYTE (c1);
2988 if (c1 == ']')
2989 coding->mode |= CODING_MODE_DIRECTION;
2990 else
2991 goto invalid_code;
2992 break;
2993
2994 default:
2995 goto invalid_code;
2996 }
2997 continue;
2998
2999 default:
3000 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3001 goto invalid_code;
3002 if (c1 >= 0x28 && c1 <= 0x2B)
3003 { /* designation of DIMENSION1_CHARS94 character set */
3004 ONE_MORE_BYTE (c2);
3005 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3006 }
3007 else if (c1 >= 0x2C && c1 <= 0x2F)
3008 { /* designation of DIMENSION1_CHARS96 character set */
3009 ONE_MORE_BYTE (c2);
3010 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3011 }
3012 else
3013 goto invalid_code;
3014 /* We must update these variables now. */
3015 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3016 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3017 continue;
3018 }
3019 }
3020
3021 /* Now we know CHARSET and 1st position code C1 of a character.
3022 Produce a decoded character while getting 2nd position code
3023 C2 if necessary. */
3024 c1 &= 0x7F;
3025 if (CHARSET_DIMENSION (charset) > 1)
3026 {
3027 ONE_MORE_BYTE (c2);
3028 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3029 /* C2 is not in a valid range. */
3030 goto invalid_code;
3031 c1 = (c1 << 8) | (c2 & 0x7F);
3032 if (CHARSET_DIMENSION (charset) > 2)
3033 {
3034 ONE_MORE_BYTE (c2);
3035 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3036 /* C2 is not in a valid range. */
3037 goto invalid_code;
3038 c1 = (c1 << 8) | (c2 & 0x7F);
3039 }
3040 }
3041
3042 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3043 if (c < 0)
3044 {
3045 MAYBE_FINISH_COMPOSITION ();
3046 for (; src_base < src; src_base++, char_offset++)
3047 {
3048 if (ASCII_BYTE_P (*src_base))
3049 *charbuf++ = *src_base;
3050 else
3051 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3052 }
3053 }
3054 else if (composition_state == COMPOSING_NO)
3055 {
3056 *charbuf++ = c;
3057 char_offset++;
3058 }
3059 else
3060 {
3061 components[component_idx++] = c;
3062 if (method == COMPOSITION_WITH_RULE
3063 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3064 && composition_state == COMPOSING_COMPONENT_CHAR))
3065 composition_state++;
3066 }
3067 continue;
3068
3069 invalid_code:
3070 MAYBE_FINISH_COMPOSITION ();
3071 src = src_base;
3072 consumed_chars = consumed_chars_base;
3073 ONE_MORE_BYTE (c);
3074 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3075 coding->errors++;
3076 }
3077
3078 no_more_source:
3079 coding->consumed_char += consumed_chars_base;
3080 coding->consumed = src_base - coding->source;
3081 coding->charbuf_used = charbuf - coding->charbuf;
3082 }
3083
3084
3085 /* ISO2022 encoding stuff. */
3086
3087 /*
3088 It is not enough to say just "ISO2022" on encoding, we have to
3089 specify more details. In Emacs, each coding system of ISO2022
3090 variant has the following specifications:
3091 1. Initial designation to G0 thru G3.
3092 2. Allows short-form designation?
3093 3. ASCII should be designated to G0 before control characters?
3094 4. ASCII should be designated to G0 at end of line?
3095 5. 7-bit environment or 8-bit environment?
3096 6. Use locking-shift?
3097 7. Use Single-shift?
3098 And the following two are only for Japanese:
3099 8. Use ASCII in place of JIS0201-1976-Roman?
3100 9. Use JISX0208-1983 in place of JISX0208-1978?
3101 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3102 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3103 details.
3104 */
3105
3106 /* Produce codes (escape sequence) for designating CHARSET to graphic
3107 register REG at DST, and increment DST. If <final-char> of CHARSET is
3108 '@', 'A', or 'B' and the coding system CODING allows, produce
3109 designation sequence of short-form. */
3110
3111 #define ENCODE_DESIGNATION(charset, reg, coding) \
3112 do { \
3113 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3114 char *intermediate_char_94 = "()*+"; \
3115 char *intermediate_char_96 = ",-./"; \
3116 int revision = -1; \
3117 int c; \
3118 \
3119 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3120 revision = XINT (CHARSET_ISO_REVISION (charset)); \
3121 \
3122 if (revision >= 0) \
3123 { \
3124 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3125 EMIT_ONE_BYTE ('@' + revision); \
3126 } \
3127 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3128 if (CHARSET_DIMENSION (charset) == 1) \
3129 { \
3130 if (! CHARSET_ISO_CHARS_96 (charset)) \
3131 c = intermediate_char_94[reg]; \
3132 else \
3133 c = intermediate_char_96[reg]; \
3134 EMIT_ONE_ASCII_BYTE (c); \
3135 } \
3136 else \
3137 { \
3138 EMIT_ONE_ASCII_BYTE ('$'); \
3139 if (! CHARSET_ISO_CHARS_96 (charset)) \
3140 { \
3141 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3142 || reg != 0 \
3143 || final_char < '@' || final_char > 'B') \
3144 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3145 } \
3146 else \
3147 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3148 } \
3149 EMIT_ONE_ASCII_BYTE (final_char); \
3150 \
3151 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3152 } while (0)
3153
3154
3155 /* The following two macros produce codes (control character or escape
3156 sequence) for ISO2022 single-shift functions (single-shift-2 and
3157 single-shift-3). */
3158
3159 #define ENCODE_SINGLE_SHIFT_2 \
3160 do { \
3161 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3162 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3163 else \
3164 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3165 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3166 } while (0)
3167
3168
3169 #define ENCODE_SINGLE_SHIFT_3 \
3170 do { \
3171 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3172 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3173 else \
3174 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3175 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3176 } while (0)
3177
3178
3179 /* The following four macros produce codes (control character or
3180 escape sequence) for ISO2022 locking-shift functions (shift-in,
3181 shift-out, locking-shift-2, and locking-shift-3). */
3182
3183 #define ENCODE_SHIFT_IN \
3184 do { \
3185 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3186 CODING_ISO_INVOCATION (coding, 0) = 0; \
3187 } while (0)
3188
3189
3190 #define ENCODE_SHIFT_OUT \
3191 do { \
3192 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3193 CODING_ISO_INVOCATION (coding, 0) = 1; \
3194 } while (0)
3195
3196
3197 #define ENCODE_LOCKING_SHIFT_2 \
3198 do { \
3199 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3200 CODING_ISO_INVOCATION (coding, 0) = 2; \
3201 } while (0)
3202
3203
3204 #define ENCODE_LOCKING_SHIFT_3 \
3205 do { \
3206 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3207 CODING_ISO_INVOCATION (coding, 0) = 3; \
3208 } while (0)
3209
3210
3211 /* Produce codes for a DIMENSION1 character whose character set is
3212 CHARSET and whose position-code is C1. Designation and invocation
3213 sequences are also produced in advance if necessary. */
3214
3215 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3216 do { \
3217 int id = CHARSET_ID (charset); \
3218 \
3219 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3220 && id == charset_ascii) \
3221 { \
3222 id = charset_jisx0201_roman; \
3223 charset = CHARSET_FROM_ID (id); \
3224 } \
3225 \
3226 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3227 { \
3228 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3229 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3230 else \
3231 EMIT_ONE_BYTE (c1 | 0x80); \
3232 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3233 break; \
3234 } \
3235 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3236 { \
3237 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3238 break; \
3239 } \
3240 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3241 { \
3242 EMIT_ONE_BYTE (c1 | 0x80); \
3243 break; \
3244 } \
3245 else \
3246 /* Since CHARSET is not yet invoked to any graphic planes, we \
3247 must invoke it, or, at first, designate it to some graphic \
3248 register. Then repeat the loop to actually produce the \
3249 character. */ \
3250 dst = encode_invocation_designation (charset, coding, dst, \
3251 &produced_chars); \
3252 } while (1)
3253
3254
3255 /* Produce codes for a DIMENSION2 character whose character set is
3256 CHARSET and whose position-codes are C1 and C2. Designation and
3257 invocation codes are also produced in advance if necessary. */
3258
3259 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3260 do { \
3261 int id = CHARSET_ID (charset); \
3262 \
3263 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3264 && id == charset_jisx0208) \
3265 { \
3266 id = charset_jisx0208_1978; \
3267 charset = CHARSET_FROM_ID (id); \
3268 } \
3269 \
3270 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3271 { \
3272 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3273 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3274 else \
3275 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3276 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3277 break; \
3278 } \
3279 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3280 { \
3281 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3282 break; \
3283 } \
3284 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3285 { \
3286 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3287 break; \
3288 } \
3289 else \
3290 /* Since CHARSET is not yet invoked to any graphic planes, we \
3291 must invoke it, or, at first, designate it to some graphic \
3292 register. Then repeat the loop to actually produce the \
3293 character. */ \
3294 dst = encode_invocation_designation (charset, coding, dst, \
3295 &produced_chars); \
3296 } while (1)
3297
3298
3299 #define ENCODE_ISO_CHARACTER(charset, c) \
3300 do { \
3301 int code = ENCODE_CHAR ((charset),(c)); \
3302 \
3303 if (CHARSET_DIMENSION (charset) == 1) \
3304 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3305 else \
3306 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3307 } while (0)
3308
3309
3310 /* Produce designation and invocation codes at a place pointed by DST
3311 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3312 Return new DST. */
3313
3314 unsigned char *
3315 encode_invocation_designation (charset, coding, dst, p_nchars)
3316 struct charset *charset;
3317 struct coding_system *coding;
3318 unsigned char *dst;
3319 int *p_nchars;
3320 {
3321 int multibytep = coding->dst_multibyte;
3322 int produced_chars = *p_nchars;
3323 int reg; /* graphic register number */
3324 int id = CHARSET_ID (charset);
3325
3326 /* At first, check designations. */
3327 for (reg = 0; reg < 4; reg++)
3328 if (id == CODING_ISO_DESIGNATION (coding, reg))
3329 break;
3330
3331 if (reg >= 4)
3332 {
3333 /* CHARSET is not yet designated to any graphic registers. */
3334 /* At first check the requested designation. */
3335 reg = CODING_ISO_REQUEST (coding, id);
3336 if (reg < 0)
3337 /* Since CHARSET requests no special designation, designate it
3338 to graphic register 0. */
3339 reg = 0;
3340
3341 ENCODE_DESIGNATION (charset, reg, coding);
3342 }
3343
3344 if (CODING_ISO_INVOCATION (coding, 0) != reg
3345 && CODING_ISO_INVOCATION (coding, 1) != reg)
3346 {
3347 /* Since the graphic register REG is not invoked to any graphic
3348 planes, invoke it to graphic plane 0. */
3349 switch (reg)
3350 {
3351 case 0: /* graphic register 0 */
3352 ENCODE_SHIFT_IN;
3353 break;
3354
3355 case 1: /* graphic register 1 */
3356 ENCODE_SHIFT_OUT;
3357 break;
3358
3359 case 2: /* graphic register 2 */
3360 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3361 ENCODE_SINGLE_SHIFT_2;
3362 else
3363 ENCODE_LOCKING_SHIFT_2;
3364 break;
3365
3366 case 3: /* graphic register 3 */
3367 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3368 ENCODE_SINGLE_SHIFT_3;
3369 else
3370 ENCODE_LOCKING_SHIFT_3;
3371 break;
3372 }
3373 }
3374
3375 *p_nchars = produced_chars;
3376 return dst;
3377 }
3378
3379 /* The following three macros produce codes for indicating direction
3380 of text. */
3381 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3382 do { \
3383 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3384 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3385 else \
3386 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3387 } while (0)
3388
3389
3390 #define ENCODE_DIRECTION_R2L() \
3391 do { \
3392 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3393 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3394 } while (0)
3395
3396
3397 #define ENCODE_DIRECTION_L2R() \
3398 do { \
3399 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3400 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3401 } while (0)
3402
3403
3404 /* Produce codes for designation and invocation to reset the graphic
3405 planes and registers to initial state. */
3406 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3407 do { \
3408 int reg; \
3409 struct charset *charset; \
3410 \
3411 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3412 ENCODE_SHIFT_IN; \
3413 for (reg = 0; reg < 4; reg++) \
3414 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3415 && (CODING_ISO_DESIGNATION (coding, reg) \
3416 != CODING_ISO_INITIAL (coding, reg))) \
3417 { \
3418 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3419 ENCODE_DESIGNATION (charset, reg, coding); \
3420 } \
3421 } while (0)
3422
3423
3424 /* Produce designation sequences of charsets in the line started from
3425 SRC to a place pointed by DST, and return updated DST.
3426
3427 If the current block ends before any end-of-line, we may fail to
3428 find all the necessary designations. */
3429
3430 static unsigned char *
3431 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3432 struct coding_system *coding;
3433 int *charbuf, *charbuf_end;
3434 unsigned char *dst;
3435 {
3436 struct charset *charset;
3437 /* Table of charsets to be designated to each graphic register. */
3438 int r[4];
3439 int c, found = 0, reg;
3440 int produced_chars = 0;
3441 int multibytep = coding->dst_multibyte;
3442 Lisp_Object attrs;
3443 Lisp_Object charset_list;
3444
3445 attrs = CODING_ID_ATTRS (coding->id);
3446 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3447 if (EQ (charset_list, Qiso_2022))
3448 charset_list = Viso_2022_charset_list;
3449
3450 for (reg = 0; reg < 4; reg++)
3451 r[reg] = -1;
3452
3453 while (found < 4)
3454 {
3455 int id;
3456
3457 c = *charbuf++;
3458 if (c == '\n')
3459 break;
3460 charset = char_charset (c, charset_list, NULL);
3461 id = CHARSET_ID (charset);
3462 reg = CODING_ISO_REQUEST (coding, id);
3463 if (reg >= 0 && r[reg] < 0)
3464 {
3465 found++;
3466 r[reg] = id;
3467 }
3468 }
3469
3470 if (found)
3471 {
3472 for (reg = 0; reg < 4; reg++)
3473 if (r[reg] >= 0
3474 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3475 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3476 }
3477
3478 return dst;
3479 }
3480
3481 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3482
3483 static int
3484 encode_coding_iso_2022 (coding)
3485 struct coding_system *coding;
3486 {
3487 int multibytep = coding->dst_multibyte;
3488 int *charbuf = coding->charbuf;
3489 int *charbuf_end = charbuf + coding->charbuf_used;
3490 unsigned char *dst = coding->destination + coding->produced;
3491 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3492 int safe_room = 16;
3493 int bol_designation
3494 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3495 && CODING_ISO_BOL (coding));
3496 int produced_chars = 0;
3497 Lisp_Object attrs, eol_type, charset_list;
3498 int ascii_compatible;
3499 int c;
3500
3501 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3502 setup_iso_safe_charsets (attrs);
3503 coding->safe_charsets
3504 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3505
3506 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3507
3508 while (charbuf < charbuf_end)
3509 {
3510 ASSURE_DESTINATION (safe_room);
3511
3512 if (bol_designation)
3513 {
3514 unsigned char *dst_prev = dst;
3515
3516 /* We have to produce designation sequences if any now. */
3517 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3518 bol_designation = 0;
3519 /* We are sure that designation sequences are all ASCII bytes. */
3520 produced_chars += dst - dst_prev;
3521 }
3522
3523 c = *charbuf++;
3524
3525 /* Now encode the character C. */
3526 if (c < 0x20 || c == 0x7F)
3527 {
3528 if (c == '\n'
3529 || (c == '\r' && EQ (eol_type, Qmac)))
3530 {
3531 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3532 ENCODE_RESET_PLANE_AND_REGISTER ();
3533 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3534 {
3535 int i;
3536
3537 for (i = 0; i < 4; i++)
3538 CODING_ISO_DESIGNATION (coding, i)
3539 = CODING_ISO_INITIAL (coding, i);
3540 }
3541 bol_designation
3542 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3543 }
3544 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3545 ENCODE_RESET_PLANE_AND_REGISTER ();
3546 EMIT_ONE_ASCII_BYTE (c);
3547 }
3548 else if (ASCII_CHAR_P (c))
3549 {
3550 if (ascii_compatible)
3551 EMIT_ONE_ASCII_BYTE (c);
3552 else
3553 {
3554 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3555 ENCODE_ISO_CHARACTER (charset, c);
3556 }
3557 }
3558 else
3559 {
3560 struct charset *charset = char_charset (c, charset_list, NULL);
3561
3562 if (!charset)
3563 {
3564 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3565 {
3566 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3567 charset = CHARSET_FROM_ID (charset_ascii);
3568 }
3569 else
3570 {
3571 c = coding->default_char;
3572 charset = char_charset (c, charset_list, NULL);
3573 }
3574 }
3575 ENCODE_ISO_CHARACTER (charset, c);
3576 }
3577 }
3578
3579 if (coding->mode & CODING_MODE_LAST_BLOCK
3580 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3581 {
3582 ASSURE_DESTINATION (safe_room);
3583 ENCODE_RESET_PLANE_AND_REGISTER ();
3584 }
3585 coding->result = CODING_RESULT_SUCCESS;
3586 CODING_ISO_BOL (coding) = bol_designation;
3587 coding->produced_char += produced_chars;
3588 coding->produced = dst - coding->destination;
3589 return 0;
3590 }
3591
3592 \f
3593 /*** 8,9. SJIS and BIG5 handlers ***/
3594
3595 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3596 quite widely. So, for the moment, Emacs supports them in the bare
3597 C code. But, in the future, they may be supported only by CCL. */
3598
3599 /* SJIS is a coding system encoding three character sets: ASCII, right
3600 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3601 as is. A character of charset katakana-jisx0201 is encoded by
3602 "position-code + 0x80". A character of charset japanese-jisx0208
3603 is encoded in 2-byte but two position-codes are divided and shifted
3604 so that it fit in the range below.
3605
3606 --- CODE RANGE of SJIS ---
3607 (character set) (range)
3608 ASCII 0x00 .. 0x7F
3609 KATAKANA-JISX0201 0xA0 .. 0xDF
3610 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3611 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3612 -------------------------------
3613
3614 */
3615
3616 /* BIG5 is a coding system encoding two character sets: ASCII and
3617 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3618 character set and is encoded in two-byte.
3619
3620 --- CODE RANGE of BIG5 ---
3621 (character set) (range)
3622 ASCII 0x00 .. 0x7F
3623 Big5 (1st byte) 0xA1 .. 0xFE
3624 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3625 --------------------------
3626
3627 */
3628
3629 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3630 Check if a text is encoded in SJIS. If it is, return
3631 CATEGORY_MASK_SJIS, else return 0. */
3632
3633 static int
3634 detect_coding_sjis (coding, mask)
3635 struct coding_system *coding;
3636 int *mask;
3637 {
3638 unsigned char *src = coding->source, *src_base = src;
3639 unsigned char *src_end = coding->source + coding->src_bytes;
3640 int multibytep = coding->src_multibyte;
3641 int consumed_chars = 0;
3642 int found = 0;
3643 int c;
3644
3645 /* A coding system of this category is always ASCII compatible. */
3646 src += coding->head_ascii;
3647
3648 while (1)
3649 {
3650 ONE_MORE_BYTE (c);
3651 if (c < 0x80)
3652 continue;
3653 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3654 {
3655 ONE_MORE_BYTE (c);
3656 if (c < 0x40 || c == 0x7F || c > 0xFC)
3657 break;
3658 found = 1;
3659 }
3660 else if (c >= 0xA0 && c < 0xE0)
3661 found = 1;
3662 else
3663 break;
3664 }
3665 *mask &= ~CATEGORY_MASK_SJIS;
3666 return 0;
3667
3668 no_more_source:
3669 if (!found)
3670 return 0;
3671 *mask &= CATEGORY_MASK_SJIS;
3672 return 1;
3673 }
3674
3675 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3676 Check if a text is encoded in BIG5. If it is, return
3677 CATEGORY_MASK_BIG5, else return 0. */
3678
3679 static int
3680 detect_coding_big5 (coding, mask)
3681 struct coding_system *coding;
3682 int *mask;
3683 {
3684 unsigned char *src = coding->source, *src_base = src;
3685 unsigned char *src_end = coding->source + coding->src_bytes;
3686 int multibytep = coding->src_multibyte;
3687 int consumed_chars = 0;
3688 int found = 0;
3689 int c;
3690
3691 /* A coding system of this category is always ASCII compatible. */
3692 src += coding->head_ascii;
3693
3694 while (1)
3695 {
3696 ONE_MORE_BYTE (c);
3697 if (c < 0x80)
3698 continue;
3699 if (c >= 0xA1)
3700 {
3701 ONE_MORE_BYTE (c);
3702 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3703 return 0;
3704 found = 1;
3705 }
3706 else
3707 break;
3708 }
3709 *mask &= ~CATEGORY_MASK_BIG5;
3710 return 0;
3711
3712 no_more_source:
3713 if (!found)
3714 return 0;
3715 *mask &= CATEGORY_MASK_BIG5;
3716 return 1;
3717 }
3718
3719 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3720 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3721
3722 static void
3723 decode_coding_sjis (coding)
3724 struct coding_system *coding;
3725 {
3726 unsigned char *src = coding->source + coding->consumed;
3727 unsigned char *src_end = coding->source + coding->src_bytes;
3728 unsigned char *src_base;
3729 int *charbuf = coding->charbuf;
3730 int *charbuf_end = charbuf + coding->charbuf_size;
3731 int consumed_chars = 0, consumed_chars_base;
3732 int multibytep = coding->src_multibyte;
3733 struct charset *charset_roman, *charset_kanji, *charset_kana;
3734 Lisp_Object attrs, eol_type, charset_list, val;
3735
3736 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3737
3738 val = charset_list;
3739 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3740 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3741 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
3742
3743 while (1)
3744 {
3745 int c, c1;
3746
3747 src_base = src;
3748 consumed_chars_base = consumed_chars;
3749
3750 if (charbuf >= charbuf_end)
3751 break;
3752
3753 ONE_MORE_BYTE (c);
3754
3755 if (c == '\r')
3756 {
3757 if (EQ (eol_type, Qdos))
3758 {
3759 if (src == src_end)
3760 goto no_more_source;
3761 if (*src == '\n')
3762 ONE_MORE_BYTE (c);
3763 }
3764 else if (EQ (eol_type, Qmac))
3765 c = '\n';
3766 }
3767 else
3768 {
3769 struct charset *charset;
3770
3771 if (c < 0x80)
3772 charset = charset_roman;
3773 else
3774 {
3775 if (c >= 0xF0)
3776 goto invalid_code;
3777 if (c < 0xA0 || c >= 0xE0)
3778 {
3779 /* SJIS -> JISX0208 */
3780 ONE_MORE_BYTE (c1);
3781 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3782 goto invalid_code;
3783 c = (c << 8) | c1;
3784 SJIS_TO_JIS (c);
3785 charset = charset_kanji;
3786 }
3787 else
3788 /* SJIS -> JISX0201-Kana */
3789 charset = charset_kana;
3790 }
3791 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3792 }
3793 *charbuf++ = c;
3794 continue;
3795
3796 invalid_code:
3797 src = src_base;
3798 consumed_chars = consumed_chars_base;
3799 ONE_MORE_BYTE (c);
3800 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3801 coding->errors++;
3802 }
3803
3804 no_more_source:
3805 coding->consumed_char += consumed_chars_base;
3806 coding->consumed = src_base - coding->source;
3807 coding->charbuf_used = charbuf - coding->charbuf;
3808 }
3809
3810 static void
3811 decode_coding_big5 (coding)
3812 struct coding_system *coding;
3813 {
3814 unsigned char *src = coding->source + coding->consumed;
3815 unsigned char *src_end = coding->source + coding->src_bytes;
3816 unsigned char *src_base;
3817 int *charbuf = coding->charbuf;
3818 int *charbuf_end = charbuf + coding->charbuf_size;
3819 int consumed_chars = 0, consumed_chars_base;
3820 int multibytep = coding->src_multibyte;
3821 struct charset *charset_roman, *charset_big5;
3822 Lisp_Object attrs, eol_type, charset_list, val;
3823
3824 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3825 val = charset_list;
3826 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3827 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3828
3829 while (1)
3830 {
3831 int c, c1;
3832
3833 src_base = src;
3834 consumed_chars_base = consumed_chars;
3835
3836 if (charbuf >= charbuf_end)
3837 break;
3838
3839 ONE_MORE_BYTE (c);
3840
3841 if (c == '\r')
3842 {
3843 if (EQ (eol_type, Qdos))
3844 {
3845 if (src == src_end)
3846 goto no_more_source;
3847 if (*src == '\n')
3848 ONE_MORE_BYTE (c);
3849 }
3850 else if (EQ (eol_type, Qmac))
3851 c = '\n';
3852 }
3853 else
3854 {
3855 struct charset *charset;
3856 if (c < 0x80)
3857 charset = charset_roman;
3858 else
3859 {
3860 /* BIG5 -> Big5 */
3861 if (c < 0xA1 || c > 0xFE)
3862 goto invalid_code;
3863 ONE_MORE_BYTE (c1);
3864 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3865 goto invalid_code;
3866 c = c << 8 | c1;
3867 charset = charset_big5;
3868 }
3869 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3870 }
3871
3872 *charbuf++ = c;
3873 continue;
3874
3875 invalid_code:
3876 src = src_base;
3877 consumed_chars = consumed_chars_base;
3878 ONE_MORE_BYTE (c);
3879 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3880 coding->errors++;
3881 }
3882
3883 no_more_source:
3884 coding->consumed_char += consumed_chars_base;
3885 coding->consumed = src_base - coding->source;
3886 coding->charbuf_used = charbuf - coding->charbuf;
3887 }
3888
3889 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3890 This function can encode charsets `ascii', `katakana-jisx0201',
3891 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3892 are sure that all these charsets are registered as official charset
3893 (i.e. do not have extended leading-codes). Characters of other
3894 charsets are produced without any encoding. If SJIS_P is 1, encode
3895 SJIS text, else encode BIG5 text. */
3896
3897 static int
3898 encode_coding_sjis (coding)
3899 struct coding_system *coding;
3900 {
3901 int multibytep = coding->dst_multibyte;
3902 int *charbuf = coding->charbuf;
3903 int *charbuf_end = charbuf + coding->charbuf_used;
3904 unsigned char *dst = coding->destination + coding->produced;
3905 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3906 int safe_room = 4;
3907 int produced_chars = 0;
3908 Lisp_Object attrs, eol_type, charset_list, val;
3909 int ascii_compatible;
3910 struct charset *charset_roman, *charset_kanji, *charset_kana;
3911 int c;
3912
3913 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3914 val = charset_list;
3915 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3916 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3917 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3918
3919 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3920
3921 while (charbuf < charbuf_end)
3922 {
3923 ASSURE_DESTINATION (safe_room);
3924 c = *charbuf++;
3925 /* Now encode the character C. */
3926 if (ASCII_CHAR_P (c) && ascii_compatible)
3927 EMIT_ONE_ASCII_BYTE (c);
3928 else
3929 {
3930 unsigned code;
3931 struct charset *charset = char_charset (c, charset_list, &code);
3932
3933 if (!charset)
3934 {
3935 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3936 {
3937 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3938 charset = CHARSET_FROM_ID (charset_ascii);
3939 }
3940 else
3941 {
3942 c = coding->default_char;
3943 charset = char_charset (c, charset_list, &code);
3944 }
3945 }
3946 if (code == CHARSET_INVALID_CODE (charset))
3947 abort ();
3948 if (charset == charset_kanji)
3949 {
3950 int c1, c2;
3951 JIS_TO_SJIS (code);
3952 c1 = code >> 8, c2 = code & 0xFF;
3953 EMIT_TWO_BYTES (c1, c2);
3954 }
3955 else if (charset == charset_kana)
3956 EMIT_ONE_BYTE (code | 0x80);
3957 else
3958 EMIT_ONE_ASCII_BYTE (code & 0x7F);
3959 }
3960 }
3961 coding->result = CODING_RESULT_SUCCESS;
3962 coding->produced_char += produced_chars;
3963 coding->produced = dst - coding->destination;
3964 return 0;
3965 }
3966
3967 static int
3968 encode_coding_big5 (coding)
3969 struct coding_system *coding;
3970 {
3971 int multibytep = coding->dst_multibyte;
3972 int *charbuf = coding->charbuf;
3973 int *charbuf_end = charbuf + coding->charbuf_used;
3974 unsigned char *dst = coding->destination + coding->produced;
3975 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3976 int safe_room = 4;
3977 int produced_chars = 0;
3978 Lisp_Object attrs, eol_type, charset_list, val;
3979 int ascii_compatible;
3980 struct charset *charset_roman, *charset_big5;
3981 int c;
3982
3983 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3984 val = charset_list;
3985 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3986 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3987 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3988
3989 while (charbuf < charbuf_end)
3990 {
3991 ASSURE_DESTINATION (safe_room);
3992 c = *charbuf++;
3993 /* Now encode the character C. */
3994 if (ASCII_CHAR_P (c) && ascii_compatible)
3995 EMIT_ONE_ASCII_BYTE (c);
3996 else
3997 {
3998 unsigned code;
3999 struct charset *charset = char_charset (c, charset_list, &code);
4000
4001 if (! charset)
4002 {
4003 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4004 {
4005 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4006 charset = CHARSET_FROM_ID (charset_ascii);
4007 }
4008 else
4009 {
4010 c = coding->default_char;
4011 charset = char_charset (c, charset_list, &code);
4012 }
4013 }
4014 if (code == CHARSET_INVALID_CODE (charset))
4015 abort ();
4016 if (charset == charset_big5)
4017 {
4018 int c1, c2;
4019
4020 c1 = code >> 8, c2 = code & 0xFF;
4021 EMIT_TWO_BYTES (c1, c2);
4022 }
4023 else
4024 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4025 }
4026 }
4027 coding->result = CODING_RESULT_SUCCESS;
4028 coding->produced_char += produced_chars;
4029 coding->produced = dst - coding->destination;
4030 return 0;
4031 }
4032
4033 \f
4034 /*** 10. CCL handlers ***/
4035
4036 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4037 Check if a text is encoded in a coding system of which
4038 encoder/decoder are written in CCL program. If it is, return
4039 CATEGORY_MASK_CCL, else return 0. */
4040
4041 static int
4042 detect_coding_ccl (coding, mask)
4043 struct coding_system *coding;
4044 int *mask;
4045 {
4046 unsigned char *src = coding->source, *src_base = src;
4047 unsigned char *src_end = coding->source + coding->src_bytes;
4048 int multibytep = coding->src_multibyte;
4049 int consumed_chars = 0;
4050 int found = 0;
4051 unsigned char *valids = CODING_CCL_VALIDS (coding);
4052 int head_ascii = coding->head_ascii;
4053 Lisp_Object attrs;
4054
4055 coding = &coding_categories[coding_category_ccl];
4056 attrs = CODING_ID_ATTRS (coding->id);
4057 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4058 src += head_ascii;
4059
4060 while (1)
4061 {
4062 int c;
4063 ONE_MORE_BYTE (c);
4064 if (! valids[c])
4065 break;
4066 if (!found && valids[c] > 1)
4067 found = 1;
4068 }
4069 *mask &= ~CATEGORY_MASK_CCL;
4070 return 0;
4071
4072 no_more_source:
4073 if (!found)
4074 return 0;
4075 *mask &= CATEGORY_MASK_CCL;
4076 return 1;
4077 }
4078
4079 static void
4080 decode_coding_ccl (coding)
4081 struct coding_system *coding;
4082 {
4083 unsigned char *src = coding->source + coding->consumed;
4084 unsigned char *src_end = coding->source + coding->src_bytes;
4085 int *charbuf = coding->charbuf;
4086 int *charbuf_end = charbuf + coding->charbuf_size;
4087 int consumed_chars = 0;
4088 int multibytep = coding->src_multibyte;
4089 struct ccl_program ccl;
4090 int source_charbuf[1024];
4091 int source_byteidx[1024];
4092
4093 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4094
4095 while (src < src_end)
4096 {
4097 unsigned char *p = src;
4098 int *source, *source_end;
4099 int i = 0;
4100
4101 if (multibytep)
4102 while (i < 1024 && p < src_end)
4103 {
4104 source_byteidx[i] = p - src;
4105 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4106 }
4107 else
4108 while (i < 1024 && p < src_end)
4109 source_charbuf[i++] = *p++;
4110
4111 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4112 ccl.last_block = 1;
4113
4114 source = source_charbuf;
4115 source_end = source + i;
4116 while (source < source_end)
4117 {
4118 ccl_driver (&ccl, source, charbuf,
4119 source_end - source, charbuf_end - charbuf);
4120 source += ccl.consumed;
4121 charbuf += ccl.produced;
4122 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4123 break;
4124 }
4125 if (source < source_end)
4126 src += source_byteidx[source - source_charbuf];
4127 else
4128 src = p;
4129 consumed_chars += source - source_charbuf;
4130
4131 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4132 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4133 break;
4134 }
4135
4136 switch (ccl.status)
4137 {
4138 case CCL_STAT_SUSPEND_BY_SRC:
4139 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4140 break;
4141 case CCL_STAT_SUSPEND_BY_DST:
4142 break;
4143 case CCL_STAT_QUIT:
4144 case CCL_STAT_INVALID_CMD:
4145 coding->result = CODING_RESULT_INTERRUPT;
4146 break;
4147 default:
4148 coding->result = CODING_RESULT_SUCCESS;
4149 break;
4150 }
4151 coding->consumed_char += consumed_chars;
4152 coding->consumed = src - coding->source;
4153 coding->charbuf_used = charbuf - coding->charbuf;
4154 }
4155
4156 static int
4157 encode_coding_ccl (coding)
4158 struct coding_system *coding;
4159 {
4160 struct ccl_program ccl;
4161 int multibytep = coding->dst_multibyte;
4162 int *charbuf = coding->charbuf;
4163 int *charbuf_end = charbuf + coding->charbuf_used;
4164 unsigned char *dst = coding->destination + coding->produced;
4165 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4166 unsigned char *adjusted_dst_end = dst_end - 1;
4167 int destination_charbuf[1024];
4168 int i, produced_chars = 0;
4169
4170 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4171
4172 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4173 ccl.dst_multibyte = coding->dst_multibyte;
4174
4175 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4176 {
4177 int dst_bytes = dst_end - dst;
4178 if (dst_bytes > 1024)
4179 dst_bytes = 1024;
4180
4181 ccl_driver (&ccl, charbuf, destination_charbuf,
4182 charbuf_end - charbuf, dst_bytes);
4183 charbuf += ccl.consumed;
4184 if (multibytep)
4185 for (i = 0; i < ccl.produced; i++)
4186 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4187 else
4188 {
4189 for (i = 0; i < ccl.produced; i++)
4190 *dst++ = destination_charbuf[i] & 0xFF;
4191 produced_chars += ccl.produced;
4192 }
4193 }
4194
4195 switch (ccl.status)
4196 {
4197 case CCL_STAT_SUSPEND_BY_SRC:
4198 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4199 break;
4200 case CCL_STAT_SUSPEND_BY_DST:
4201 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4202 break;
4203 case CCL_STAT_QUIT:
4204 case CCL_STAT_INVALID_CMD:
4205 coding->result = CODING_RESULT_INTERRUPT;
4206 break;
4207 default:
4208 coding->result = CODING_RESULT_SUCCESS;
4209 break;
4210 }
4211
4212 coding->produced_char += produced_chars;
4213 coding->produced = dst - coding->destination;
4214 return 0;
4215 }
4216
4217
4218 \f
4219 /*** 10, 11. no-conversion handlers ***/
4220
4221 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4222
4223 static void
4224 decode_coding_raw_text (coding)
4225 struct coding_system *coding;
4226 {
4227 coding->chars_at_source = 1;
4228 coding->consumed_char = 0;
4229 coding->consumed = 0;
4230 coding->result = CODING_RESULT_SUCCESS;
4231 }
4232
4233 static int
4234 encode_coding_raw_text (coding)
4235 struct coding_system *coding;
4236 {
4237 int multibytep = coding->dst_multibyte;
4238 int *charbuf = coding->charbuf;
4239 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4240 unsigned char *dst = coding->destination + coding->produced;
4241 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4242 int produced_chars = 0;
4243 int c;
4244
4245 if (multibytep)
4246 {
4247 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4248
4249 if (coding->src_multibyte)
4250 while (charbuf < charbuf_end)
4251 {
4252 ASSURE_DESTINATION (safe_room);
4253 c = *charbuf++;
4254 if (ASCII_CHAR_P (c))
4255 EMIT_ONE_ASCII_BYTE (c);
4256 else if (CHAR_BYTE8_P (c))
4257 {
4258 c = CHAR_TO_BYTE8 (c);
4259 EMIT_ONE_BYTE (c);
4260 }
4261 else
4262 {
4263 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4264
4265 CHAR_STRING_ADVANCE (c, p1);
4266 while (p0 < p1)
4267 EMIT_ONE_BYTE (*p0);
4268 }
4269 }
4270 else
4271 while (charbuf < charbuf_end)
4272 {
4273 ASSURE_DESTINATION (safe_room);
4274 c = *charbuf++;
4275 EMIT_ONE_BYTE (c);
4276 }
4277 }
4278 else
4279 {
4280 if (coding->src_multibyte)
4281 {
4282 int safe_room = MAX_MULTIBYTE_LENGTH;
4283
4284 while (charbuf < charbuf_end)
4285 {
4286 ASSURE_DESTINATION (safe_room);
4287 c = *charbuf++;
4288 if (ASCII_CHAR_P (c))
4289 *dst++ = c;
4290 else if (CHAR_BYTE8_P (c))
4291 *dst++ = CHAR_TO_BYTE8 (c);
4292 else
4293 CHAR_STRING_ADVANCE (c, dst);
4294 produced_chars++;
4295 }
4296 }
4297 else
4298 {
4299 ASSURE_DESTINATION (charbuf_end - charbuf);
4300 while (charbuf < charbuf_end && dst < dst_end)
4301 *dst++ = *charbuf++;
4302 produced_chars = dst - (coding->destination + coding->dst_bytes);
4303 }
4304 }
4305 coding->result = CODING_RESULT_SUCCESS;
4306 coding->produced_char += produced_chars;
4307 coding->produced = dst - coding->destination;
4308 return 0;
4309 }
4310
4311 static int
4312 detect_coding_charset (coding, mask)
4313 struct coding_system *coding;
4314 int *mask;
4315 {
4316 unsigned char *src = coding->source, *src_base = src;
4317 unsigned char *src_end = coding->source + coding->src_bytes;
4318 int multibytep = coding->src_multibyte;
4319 int consumed_chars = 0;
4320 Lisp_Object attrs, valids;
4321
4322 coding = &coding_categories[coding_category_charset];
4323 attrs = CODING_ID_ATTRS (coding->id);
4324 valids = AREF (attrs, coding_attr_charset_valids);
4325
4326 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4327 src += coding->head_ascii;
4328
4329 while (1)
4330 {
4331 int c;
4332
4333 ONE_MORE_BYTE (c);
4334 if (NILP (AREF (valids, c)))
4335 break;
4336 }
4337 *mask &= ~CATEGORY_MASK_CHARSET;
4338 return 0;
4339
4340 no_more_source:
4341 *mask &= CATEGORY_MASK_CHARSET;
4342 return 1;
4343 }
4344
4345 static void
4346 decode_coding_charset (coding)
4347 struct coding_system *coding;
4348 {
4349 unsigned char *src = coding->source + coding->consumed;
4350 unsigned char *src_end = coding->source + coding->src_bytes;
4351 unsigned char *src_base;
4352 int *charbuf = coding->charbuf;
4353 int *charbuf_end = charbuf + coding->charbuf_size;
4354 int consumed_chars = 0, consumed_chars_base;
4355 int multibytep = coding->src_multibyte;
4356 Lisp_Object attrs, eol_type, charset_list, valids;
4357
4358 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4359 valids = AREF (attrs, coding_attr_charset_valids);
4360
4361 while (1)
4362 {
4363 int c;
4364
4365 src_base = src;
4366 consumed_chars_base = consumed_chars;
4367
4368 if (charbuf >= charbuf_end)
4369 break;
4370
4371 ONE_MORE_BYTE (c);
4372 if (c == '\r')
4373 {
4374 /* Here we assume that no charset maps '\r' to something
4375 else. */
4376 if (EQ (eol_type, Qdos))
4377 {
4378 if (src < src_end
4379 && *src == '\n')
4380 ONE_MORE_BYTE (c);
4381 }
4382 else if (EQ (eol_type, Qmac))
4383 c = '\n';
4384 }
4385 else
4386 {
4387 Lisp_Object val;
4388 struct charset *charset;
4389 int dim;
4390 int len = 1;
4391 unsigned code = c;
4392
4393 val = AREF (valids, c);
4394 if (NILP (val))
4395 goto invalid_code;
4396 if (INTEGERP (val))
4397 {
4398 charset = CHARSET_FROM_ID (XFASTINT (val));
4399 dim = CHARSET_DIMENSION (charset);
4400 while (len < dim)
4401 {
4402 ONE_MORE_BYTE (c);
4403 code = (code << 8) | c;
4404 len++;
4405 }
4406 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4407 charset, code, c);
4408 }
4409 else
4410 {
4411 /* VAL is a list of charset IDs. It is assured that the
4412 list is sorted by charset dimensions (smaller one
4413 comes first). */
4414 while (CONSP (val))
4415 {
4416 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4417 dim = CHARSET_DIMENSION (charset);
4418 while (len < dim)
4419 {
4420 ONE_MORE_BYTE (c);
4421 code = (code << 8) | c;
4422 len++;
4423 }
4424 CODING_DECODE_CHAR (coding, src, src_base,
4425 src_end, charset, code, c);
4426 if (c >= 0)
4427 break;
4428 val = XCDR (val);
4429 }
4430 }
4431 if (c < 0)
4432 goto invalid_code;
4433 }
4434 *charbuf++ = c;
4435 continue;
4436
4437 invalid_code:
4438 src = src_base;
4439 consumed_chars = consumed_chars_base;
4440 ONE_MORE_BYTE (c);
4441 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4442 coding->errors++;
4443 }
4444
4445 no_more_source:
4446 coding->consumed_char += consumed_chars_base;
4447 coding->consumed = src_base - coding->source;
4448 coding->charbuf_used = charbuf - coding->charbuf;
4449 }
4450
4451 static int
4452 encode_coding_charset (coding)
4453 struct coding_system *coding;
4454 {
4455 int multibytep = coding->dst_multibyte;
4456 int *charbuf = coding->charbuf;
4457 int *charbuf_end = charbuf + coding->charbuf_used;
4458 unsigned char *dst = coding->destination + coding->produced;
4459 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4460 int safe_room = MAX_MULTIBYTE_LENGTH;
4461 int produced_chars = 0;
4462 Lisp_Object attrs, eol_type, charset_list;
4463 int ascii_compatible;
4464 int c;
4465
4466 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4467 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4468
4469 while (charbuf < charbuf_end)
4470 {
4471 struct charset *charset;
4472 unsigned code;
4473
4474 ASSURE_DESTINATION (safe_room);
4475 c = *charbuf++;
4476 if (ascii_compatible && ASCII_CHAR_P (c))
4477 EMIT_ONE_ASCII_BYTE (c);
4478 else
4479 {
4480 charset = char_charset (c, charset_list, &code);
4481 if (charset)
4482 {
4483 if (CHARSET_DIMENSION (charset) == 1)
4484 EMIT_ONE_BYTE (code);
4485 else if (CHARSET_DIMENSION (charset) == 2)
4486 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4487 else if (CHARSET_DIMENSION (charset) == 3)
4488 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4489 else
4490 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4491 (code >> 8) & 0xFF, code & 0xFF);
4492 }
4493 else
4494 {
4495 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4496 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4497 else
4498 c = coding->default_char;
4499 EMIT_ONE_BYTE (c);
4500 }
4501 }
4502 }
4503
4504 coding->result = CODING_RESULT_SUCCESS;
4505 coding->produced_char += produced_chars;
4506 coding->produced = dst - coding->destination;
4507 return 0;
4508 }
4509
4510 \f
4511 /*** 7. C library functions ***/
4512
4513 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4514 has a property `coding-system'. The value of this property is a
4515 vector of length 5 (called as coding-vector). Among elements of
4516 this vector, the first (element[0]) and the fifth (element[4])
4517 carry important information for decoding/encoding. Before
4518 decoding/encoding, this information should be set in fields of a
4519 structure of type `coding_system'.
4520
4521 A value of property `coding-system' can be a symbol of another
4522 subsidiary coding-system. In that case, Emacs gets coding-vector
4523 from that symbol.
4524
4525 `element[0]' contains information to be set in `coding->type'. The
4526 value and its meaning is as follows:
4527
4528 0 -- coding_type_emacs_mule
4529 1 -- coding_type_sjis
4530 2 -- coding_type_iso_2022
4531 3 -- coding_type_big5
4532 4 -- coding_type_ccl encoder/decoder written in CCL
4533 nil -- coding_type_no_conversion
4534 t -- coding_type_undecided (automatic conversion on decoding,
4535 no-conversion on encoding)
4536
4537 `element[4]' contains information to be set in `coding->flags' and
4538 `coding->spec'. The meaning varies by `coding->type'.
4539
4540 If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4541 of length 32 (of which the first 13 sub-elements are used now).
4542 Meanings of these sub-elements are:
4543
4544 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4545 If the value is an integer of valid charset, the charset is
4546 assumed to be designated to graphic register N initially.
4547
4548 If the value is minus, it is a minus value of charset which
4549 reserves graphic register N, which means that the charset is
4550 not designated initially but should be designated to graphic
4551 register N just before encoding a character in that charset.
4552
4553 If the value is nil, graphic register N is never used on
4554 encoding.
4555
4556 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4557 Each value takes t or nil. See the section ISO2022 of
4558 `coding.h' for more information.
4559
4560 If `coding->type' is `coding_type_big5', element[4] is t to denote
4561 BIG5-ETen or nil to denote BIG5-HKU.
4562
4563 If `coding->type' takes the other value, element[4] is ignored.
4564
4565 Emacs Lisp's coding system also carries information about format of
4566 end-of-line in a value of property `eol-type'. If the value is
4567 integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If
4568 it is not integer, it should be a vector of subsidiary coding
4569 systems of which property `eol-type' has one of above values.
4570
4571 */
4572
4573 /* Setup coding context CODING from information about CODING_SYSTEM.
4574 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4575 CODING_SYSTEM is invalid, signal an error. */
4576
4577 void
4578 setup_coding_system (coding_system, coding)
4579 Lisp_Object coding_system;
4580 struct coding_system *coding;
4581 {
4582 Lisp_Object attrs;
4583 Lisp_Object eol_type;
4584 Lisp_Object coding_type;
4585 Lisp_Object val;
4586
4587 if (NILP (coding_system))
4588 coding_system = Qno_conversion;
4589
4590 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4591
4592 attrs = CODING_ID_ATTRS (coding->id);
4593 eol_type = CODING_ID_EOL_TYPE (coding->id);
4594
4595 coding->mode = 0;
4596 coding->head_ascii = -1;
4597 coding->common_flags
4598 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4599
4600 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4601 coding->max_charset_id = XSTRING (val)->size - 1;
4602 coding->safe_charsets = (char *) XSTRING (val)->data;
4603 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4604
4605 coding_type = CODING_ATTR_TYPE (attrs);
4606 if (EQ (coding_type, Qundecided))
4607 {
4608 coding->detector = NULL;
4609 coding->decoder = decode_coding_raw_text;
4610 coding->encoder = encode_coding_raw_text;
4611 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4612 }
4613 else if (EQ (coding_type, Qiso_2022))
4614 {
4615 int i;
4616 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4617
4618 /* Invoke graphic register 0 to plane 0. */
4619 CODING_ISO_INVOCATION (coding, 0) = 0;
4620 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4621 CODING_ISO_INVOCATION (coding, 1)
4622 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4623 /* Setup the initial status of designation. */
4624 for (i = 0; i < 4; i++)
4625 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4626 /* Not single shifting initially. */
4627 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4628 /* Beginning of buffer should also be regarded as bol. */
4629 CODING_ISO_BOL (coding) = 1;
4630 coding->detector = detect_coding_iso_2022;
4631 coding->decoder = decode_coding_iso_2022;
4632 coding->encoder = encode_coding_iso_2022;
4633 if (flags & CODING_ISO_FLAG_SAFE)
4634 coding->mode |= CODING_MODE_SAFE_ENCODING;
4635 coding->common_flags
4636 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4637 | CODING_REQUIRE_FLUSHING_MASK);
4638 if (flags & CODING_ISO_FLAG_COMPOSITION)
4639 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4640 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4641 {
4642 setup_iso_safe_charsets (attrs);
4643 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4644 coding->max_charset_id = XSTRING (val)->size - 1;
4645 coding->safe_charsets = (char *) XSTRING (val)->data;
4646 }
4647 CODING_ISO_FLAGS (coding) = flags;
4648 }
4649 else if (EQ (coding_type, Qcharset))
4650 {
4651 coding->detector = detect_coding_charset;
4652 coding->decoder = decode_coding_charset;
4653 coding->encoder = encode_coding_charset;
4654 coding->common_flags
4655 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4656 }
4657 else if (EQ (coding_type, Qutf_8))
4658 {
4659 coding->detector = detect_coding_utf_8;
4660 coding->decoder = decode_coding_utf_8;
4661 coding->encoder = encode_coding_utf_8;
4662 coding->common_flags
4663 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4664 }
4665 else if (EQ (coding_type, Qutf_16))
4666 {
4667 val = AREF (attrs, coding_attr_utf_16_bom);
4668 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4669 : EQ (val, Qt) ? utf_16_with_bom
4670 : utf_16_without_bom);
4671 val = AREF (attrs, coding_attr_utf_16_endian);
4672 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4673 : utf_16_little_endian);
4674 CODING_UTF_16_SURROGATE (coding) = 0;
4675 coding->detector = detect_coding_utf_16;
4676 coding->decoder = decode_coding_utf_16;
4677 coding->encoder = encode_coding_utf_16;
4678 coding->common_flags
4679 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4680 }
4681 else if (EQ (coding_type, Qccl))
4682 {
4683 coding->detector = detect_coding_ccl;
4684 coding->decoder = decode_coding_ccl;
4685 coding->encoder = encode_coding_ccl;
4686 coding->common_flags
4687 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4688 | CODING_REQUIRE_FLUSHING_MASK);
4689 }
4690 else if (EQ (coding_type, Qemacs_mule))
4691 {
4692 coding->detector = detect_coding_emacs_mule;
4693 coding->decoder = decode_coding_emacs_mule;
4694 coding->encoder = encode_coding_emacs_mule;
4695 coding->common_flags
4696 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4697 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4698 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4699 {
4700 Lisp_Object tail, safe_charsets;
4701 int max_charset_id = 0;
4702
4703 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4704 tail = XCDR (tail))
4705 if (max_charset_id < XFASTINT (XCAR (tail)))
4706 max_charset_id = XFASTINT (XCAR (tail));
4707 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4708 make_number (255));
4709 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4710 tail = XCDR (tail))
4711 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4712 coding->max_charset_id = max_charset_id;
4713 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4714 }
4715 }
4716 else if (EQ (coding_type, Qshift_jis))
4717 {
4718 coding->detector = detect_coding_sjis;
4719 coding->decoder = decode_coding_sjis;
4720 coding->encoder = encode_coding_sjis;
4721 coding->common_flags
4722 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4723 }
4724 else if (EQ (coding_type, Qbig5))
4725 {
4726 coding->detector = detect_coding_big5;
4727 coding->decoder = decode_coding_big5;
4728 coding->encoder = encode_coding_big5;
4729 coding->common_flags
4730 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4731 }
4732 else /* EQ (coding_type, Qraw_text) */
4733 {
4734 coding->detector = NULL;
4735 coding->decoder = decode_coding_raw_text;
4736 coding->encoder = encode_coding_raw_text;
4737 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4738 }
4739
4740 return;
4741 }
4742
4743 /* Return raw-text or one of its subsidiaries that has the same
4744 eol_type as CODING-SYSTEM. */
4745
4746 Lisp_Object
4747 raw_text_coding_system (coding_system)
4748 Lisp_Object coding_system;
4749 {
4750 Lisp_Object spec, attrs;
4751 Lisp_Object eol_type, raw_text_eol_type;
4752
4753 spec = CODING_SYSTEM_SPEC (coding_system);
4754 attrs = AREF (spec, 0);
4755
4756 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4757 return coding_system;
4758
4759 eol_type = AREF (spec, 2);
4760 if (VECTORP (eol_type))
4761 return Qraw_text;
4762 spec = CODING_SYSTEM_SPEC (Qraw_text);
4763 raw_text_eol_type = AREF (spec, 2);
4764 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4765 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4766 : AREF (raw_text_eol_type, 2));
4767 }
4768
4769
4770 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4771 does, return one of the subsidiary that has the same eol-spec as
4772 PARENT. Otherwise, return CODING_SYSTEM. */
4773
4774 Lisp_Object
4775 coding_inherit_eol_type (coding_system, parent)
4776 Lisp_Object coding_system, parent;
4777 {
4778 Lisp_Object spec, attrs, eol_type;
4779
4780 spec = CODING_SYSTEM_SPEC (coding_system);
4781 attrs = AREF (spec, 0);
4782 eol_type = AREF (spec, 2);
4783 if (VECTORP (eol_type))
4784 {
4785 Lisp_Object parent_spec;
4786 Lisp_Object parent_eol_type;
4787
4788 parent_spec
4789 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4790 parent_eol_type = AREF (parent_spec, 2);
4791 if (EQ (parent_eol_type, Qunix))
4792 coding_system = AREF (eol_type, 0);
4793 else if (EQ (parent_eol_type, Qdos))
4794 coding_system = AREF (eol_type, 1);
4795 else if (EQ (parent_eol_type, Qmac))
4796 coding_system = AREF (eol_type, 2);
4797 }
4798 return coding_system;
4799 }
4800
4801 /* Emacs has a mechanism to automatically detect a coding system if it
4802 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4803 it's impossible to distinguish some coding systems accurately
4804 because they use the same range of codes. So, at first, coding
4805 systems are categorized into 7, those are:
4806
4807 o coding-category-emacs-mule
4808
4809 The category for a coding system which has the same code range
4810 as Emacs' internal format. Assigned the coding-system (Lisp
4811 symbol) `emacs-mule' by default.
4812
4813 o coding-category-sjis
4814
4815 The category for a coding system which has the same code range
4816 as SJIS. Assigned the coding-system (Lisp
4817 symbol) `japanese-shift-jis' by default.
4818
4819 o coding-category-iso-7
4820
4821 The category for a coding system which has the same code range
4822 as ISO2022 of 7-bit environment. This doesn't use any locking
4823 shift and single shift functions. This can encode/decode all
4824 charsets. Assigned the coding-system (Lisp symbol)
4825 `iso-2022-7bit' by default.
4826
4827 o coding-category-iso-7-tight
4828
4829 Same as coding-category-iso-7 except that this can
4830 encode/decode only the specified charsets.
4831
4832 o coding-category-iso-8-1
4833
4834 The category for a coding system which has the same code range
4835 as ISO2022 of 8-bit environment and graphic plane 1 used only
4836 for DIMENSION1 charset. This doesn't use any locking shift
4837 and single shift functions. Assigned the coding-system (Lisp
4838 symbol) `iso-latin-1' by default.
4839
4840 o coding-category-iso-8-2
4841
4842 The category for a coding system which has the same code range
4843 as ISO2022 of 8-bit environment and graphic plane 1 used only
4844 for DIMENSION2 charset. This doesn't use any locking shift
4845 and single shift functions. Assigned the coding-system (Lisp
4846 symbol) `japanese-iso-8bit' by default.
4847
4848 o coding-category-iso-7-else
4849
4850 The category for a coding system which has the same code range
4851 as ISO2022 of 7-bit environemnt but uses locking shift or
4852 single shift functions. Assigned the coding-system (Lisp
4853 symbol) `iso-2022-7bit-lock' by default.
4854
4855 o coding-category-iso-8-else
4856
4857 The category for a coding system which has the same code range
4858 as ISO2022 of 8-bit environemnt but uses locking shift or
4859 single shift functions. Assigned the coding-system (Lisp
4860 symbol) `iso-2022-8bit-ss2' by default.
4861
4862 o coding-category-big5
4863
4864 The category for a coding system which has the same code range
4865 as BIG5. Assigned the coding-system (Lisp symbol)
4866 `cn-big5' by default.
4867
4868 o coding-category-utf-8
4869
4870 The category for a coding system which has the same code range
4871 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4872 symbol) `utf-8' by default.
4873
4874 o coding-category-utf-16-be
4875
4876 The category for a coding system in which a text has an
4877 Unicode signature (cf. Unicode Standard) in the order of BIG
4878 endian at the head. Assigned the coding-system (Lisp symbol)
4879 `utf-16-be' by default.
4880
4881 o coding-category-utf-16-le
4882
4883 The category for a coding system in which a text has an
4884 Unicode signature (cf. Unicode Standard) in the order of
4885 LITTLE endian at the head. Assigned the coding-system (Lisp
4886 symbol) `utf-16-le' by default.
4887
4888 o coding-category-ccl
4889
4890 The category for a coding system of which encoder/decoder is
4891 written in CCL programs. The default value is nil, i.e., no
4892 coding system is assigned.
4893
4894 o coding-category-binary
4895
4896 The category for a coding system not categorized in any of the
4897 above. Assigned the coding-system (Lisp symbol)
4898 `no-conversion' by default.
4899
4900 Each of them is a Lisp symbol and the value is an actual
4901 `coding-system's (this is also a Lisp symbol) assigned by a user.
4902 What Emacs does actually is to detect a category of coding system.
4903 Then, it uses a `coding-system' assigned to it. If Emacs can't
4904 decide only one possible category, it selects a category of the
4905 highest priority. Priorities of categories are also specified by a
4906 user in a Lisp variable `coding-category-list'.
4907
4908 */
4909
4910 #define EOL_SEEN_NONE 0
4911 #define EOL_SEEN_LF 1
4912 #define EOL_SEEN_CR 2
4913 #define EOL_SEEN_CRLF 4
4914
4915 /* Detect how end-of-line of a text of length CODING->src_bytes
4916 pointed by CODING->source is encoded. Return one of
4917 EOL_SEEN_XXX. */
4918
4919 #define MAX_EOL_CHECK_COUNT 3
4920
4921 static int
4922 detect_eol (coding, source, src_bytes)
4923 struct coding_system *coding;
4924 unsigned char *source;
4925 EMACS_INT src_bytes;
4926 {
4927 Lisp_Object attrs, coding_type;
4928 unsigned char *src = source, *src_end = src + src_bytes;
4929 unsigned char c;
4930 int total = 0;
4931 int eol_seen = EOL_SEEN_NONE;
4932
4933 attrs = CODING_ID_ATTRS (coding->id);
4934 coding_type = CODING_ATTR_TYPE (attrs);
4935
4936 if (EQ (coding_type, Qccl))
4937 {
4938 int msb, lsb;
4939
4940 msb = coding->spec.utf_16.endian == utf_16_little_endian;
4941 lsb = 1 - msb;
4942
4943 while (src + 1 < src_end)
4944 {
4945 c = src[lsb];
4946 if (src[msb] == 0 && (c == '\n' || c == '\r'))
4947 {
4948 int this_eol;
4949
4950 if (c == '\n')
4951 this_eol = EOL_SEEN_LF;
4952 else if (src + 3 >= src_end
4953 || src[msb + 2] != 0
4954 || src[lsb + 2] != '\n')
4955 this_eol = EOL_SEEN_CR;
4956 else
4957 this_eol = EOL_SEEN_CRLF;
4958
4959 if (eol_seen == EOL_SEEN_NONE)
4960 /* This is the first end-of-line. */
4961 eol_seen = this_eol;
4962 else if (eol_seen != this_eol)
4963 {
4964 /* The found type is different from what found before. */
4965 eol_seen = EOL_SEEN_LF;
4966 break;
4967 }
4968 if (++total == MAX_EOL_CHECK_COUNT)
4969 break;
4970 }
4971 src += 2;
4972 }
4973 }
4974 else
4975 {
4976 while (src < src_end)
4977 {
4978 c = *src++;
4979 if (c == '\n' || c == '\r')
4980 {
4981 int this_eol;
4982
4983 if (c == '\n')
4984 this_eol = EOL_SEEN_LF;
4985 else if (src >= src_end || *src != '\n')
4986 this_eol = EOL_SEEN_CR;
4987 else
4988 this_eol = EOL_SEEN_CRLF, src++;
4989
4990 if (eol_seen == EOL_SEEN_NONE)
4991 /* This is the first end-of-line. */
4992 eol_seen = this_eol;
4993 else if (eol_seen != this_eol)
4994 {
4995 /* The found type is different from what found before. */
4996 eol_seen = EOL_SEEN_LF;
4997 break;
4998 }
4999 if (++total == MAX_EOL_CHECK_COUNT)
5000 break;
5001 }
5002 }
5003 }
5004 return eol_seen;
5005 }
5006
5007
5008 static void
5009 adjust_coding_eol_type (coding, eol_seen)
5010 struct coding_system *coding;
5011 int eol_seen;
5012 {
5013 Lisp_Object eol_type;
5014
5015 eol_type = CODING_ID_EOL_TYPE (coding->id);
5016 if (eol_seen & EOL_SEEN_LF)
5017 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5018 else if (eol_type & EOL_SEEN_CRLF)
5019 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5020 else if (eol_type & EOL_SEEN_CR)
5021 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5022 }
5023
5024 /* Detect how a text specified in CODING is encoded. If a coding
5025 system is detected, update fields of CODING by the detected coding
5026 system. */
5027
5028 void
5029 detect_coding (coding)
5030 struct coding_system *coding;
5031 {
5032 unsigned char *src, *src_end;
5033 Lisp_Object attrs, coding_type;
5034
5035 coding->consumed = coding->consumed_char = 0;
5036 coding->produced = coding->produced_char = 0;
5037 coding_set_source (coding);
5038
5039 src_end = coding->source + coding->src_bytes;
5040
5041 /* If we have not yet decided the text encoding type, detect it
5042 now. */
5043 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5044 {
5045 int mask = CATEGORY_MASK_ANY;
5046 int c, i;
5047
5048 for (src = coding->source; src < src_end; src++)
5049 {
5050 c = *src;
5051 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5052 || c == ISO_CODE_SI
5053 || c == ISO_CODE_SO)))
5054 break;
5055 }
5056 coding->head_ascii = src - (coding->source + coding->consumed);
5057
5058 if (coding->head_ascii < coding->src_bytes)
5059 {
5060 int detected = 0;
5061
5062 for (i = 0; i < coding_category_raw_text; i++)
5063 {
5064 enum coding_category category = coding_priorities[i];
5065 struct coding_system *this = coding_categories + category;
5066
5067 if (category >= coding_category_raw_text
5068 || detected & (1 << category))
5069 continue;
5070
5071 if (this->id < 0)
5072 {
5073 /* No coding system of this category is defined. */
5074 mask &= ~(1 << category);
5075 }
5076 else
5077 {
5078 detected |= detected_mask[category];
5079 if ((*(this->detector)) (coding, &mask))
5080 break;
5081 }
5082 }
5083 if (! mask)
5084 setup_coding_system (Qraw_text, coding);
5085 else if (mask != CATEGORY_MASK_ANY)
5086 for (i = 0; i < coding_category_raw_text; i++)
5087 {
5088 enum coding_category category = coding_priorities[i];
5089 struct coding_system *this = coding_categories + category;
5090
5091 if (mask & (1 << category))
5092 {
5093 setup_coding_system (CODING_ID_NAME (this->id), coding);
5094 break;
5095 }
5096 }
5097 }
5098 }
5099
5100 attrs = CODING_ID_ATTRS (coding->id);
5101 coding_type = CODING_ATTR_TYPE (attrs);
5102
5103 /* If we have not yet decided the EOL type, detect it now. But, the
5104 detection is impossible for a CCL based coding system, in which
5105 case, we detct the EOL type after decoding. */
5106 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5107 && ! EQ (coding_type, Qccl))
5108 {
5109 int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
5110
5111 if (eol_seen != EOL_SEEN_NONE)
5112 adjust_coding_eol_type (coding, eol_seen);
5113 }
5114 }
5115
5116
5117 static void
5118 decode_eol (coding)
5119 struct coding_system *coding;
5120 {
5121 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5122 {
5123 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5124 unsigned char *pend = p + coding->produced;
5125 int eol_seen = EOL_SEEN_NONE;
5126
5127 for (; p < pend; p++)
5128 {
5129 if (*p == '\n')
5130 eol_seen |= EOL_SEEN_LF;
5131 else if (*p == '\r')
5132 {
5133 if (p + 1 < pend && *(p + 1) == '\n')
5134 {
5135 eol_seen |= EOL_SEEN_CRLF;
5136 p++;
5137 }
5138 else
5139 eol_seen |= EOL_SEEN_CR;
5140 }
5141 }
5142 if (eol_seen != EOL_SEEN_NONE)
5143 adjust_coding_eol_type (coding, eol_seen);
5144 }
5145
5146 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5147 {
5148 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5149 unsigned char *pend = p + coding->produced;
5150
5151 for (; p < pend; p++)
5152 if (*p == '\r')
5153 *p = '\n';
5154 }
5155 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5156 {
5157 unsigned char *p, *pbeg, *pend;
5158 Lisp_Object undo_list;
5159
5160 move_gap_both (coding->dst_pos + coding->produced_char,
5161 coding->dst_pos_byte + coding->produced);
5162 undo_list = current_buffer->undo_list;
5163 current_buffer->undo_list = Qt;
5164 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5165 current_buffer->undo_list = undo_list;
5166 pbeg = GPT_ADDR;
5167 pend = pbeg + coding->produced;
5168
5169 for (p = pend - 1; p >= pbeg; p--)
5170 if (*p == '\r')
5171 {
5172 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5173 pend--;
5174 }
5175 coding->produced_char -= coding->produced - (pend - pbeg);
5176 coding->produced = pend - pbeg;
5177 insert_from_gap (coding->produced_char, coding->produced);
5178 }
5179 }
5180
5181 static void
5182 translate_chars (coding, table)
5183 struct coding_system *coding;
5184 Lisp_Object table;
5185 {
5186 int *charbuf = coding->charbuf;
5187 int *charbuf_end = charbuf + coding->charbuf_used;
5188 int c;
5189
5190 if (coding->chars_at_source)
5191 return;
5192
5193 while (charbuf < charbuf_end)
5194 {
5195 c = *charbuf;
5196 if (c < 0)
5197 charbuf += c;
5198 else
5199 *charbuf++ = translate_char (table, c);
5200 }
5201 }
5202
5203 static int
5204 produce_chars (coding)
5205 struct coding_system *coding;
5206 {
5207 unsigned char *dst = coding->destination + coding->produced;
5208 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5209 int produced;
5210 int produced_chars = 0;
5211
5212 if (! coding->chars_at_source)
5213 {
5214 /* Characters are in coding->charbuf. */
5215 int *buf = coding->charbuf;
5216 int *buf_end = buf + coding->charbuf_used;
5217 unsigned char *adjusted_dst_end;
5218
5219 if (BUFFERP (coding->src_object)
5220 && EQ (coding->src_object, coding->dst_object))
5221 dst_end = coding->source + coding->consumed;
5222 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5223
5224 while (buf < buf_end)
5225 {
5226 int c = *buf++;
5227
5228 if (dst >= adjusted_dst_end)
5229 {
5230 dst = alloc_destination (coding,
5231 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5232 dst);
5233 dst_end = coding->destination + coding->dst_bytes;
5234 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5235 }
5236 if (c >= 0)
5237 {
5238 if (coding->dst_multibyte
5239 || ! CHAR_BYTE8_P (c))
5240 CHAR_STRING_ADVANCE (c, dst);
5241 else
5242 *dst++ = CHAR_TO_BYTE8 (c);
5243 produced_chars++;
5244 }
5245 else
5246 /* This is an annotation data. */
5247 buf -= c + 1;
5248 }
5249 }
5250 else
5251 {
5252 unsigned char *src = coding->source;
5253 unsigned char *src_end = src + coding->src_bytes;
5254 Lisp_Object eol_type;
5255
5256 eol_type = CODING_ID_EOL_TYPE (coding->id);
5257
5258 if (coding->src_multibyte != coding->dst_multibyte)
5259 {
5260 if (coding->src_multibyte)
5261 {
5262 int multibytep = 1;
5263 int consumed_chars;
5264
5265 while (1)
5266 {
5267 unsigned char *src_base = src;
5268 int c;
5269
5270 ONE_MORE_BYTE (c);
5271 if (c == '\r')
5272 {
5273 if (EQ (eol_type, Qdos))
5274 {
5275 if (src < src_end
5276 && *src == '\n')
5277 c = *src++;
5278 }
5279 else if (EQ (eol_type, Qmac))
5280 c = '\n';
5281 }
5282 if (dst == dst_end)
5283 {
5284 coding->consumed = src - coding->source;
5285
5286 if (EQ (coding->src_object, coding->dst_object))
5287 dst_end = src;
5288 if (dst == dst_end)
5289 {
5290 dst = alloc_destination (coding, src_end - src + 1,
5291 dst);
5292 dst_end = coding->destination + coding->dst_bytes;
5293 coding_set_source (coding);
5294 src = coding->source + coding->consumed;
5295 src_end = coding->source + coding->src_bytes;
5296 }
5297 }
5298 *dst++ = c;
5299 produced_chars++;
5300 }
5301 no_more_source:
5302 ;
5303 }
5304 else
5305 while (src < src_end)
5306 {
5307 int multibytep = 1;
5308 int c = *src++;
5309
5310 if (c == '\r')
5311 {
5312 if (EQ (eol_type, Qdos))
5313 {
5314 if (src < src_end
5315 && *src == '\n')
5316 c = *src++;
5317 }
5318 else if (EQ (eol_type, Qmac))
5319 c = '\n';
5320 }
5321 if (dst >= dst_end - 1)
5322 {
5323 coding->consumed = src - coding->source;
5324
5325 if (EQ (coding->src_object, coding->dst_object))
5326 dst_end = src;
5327 if (dst >= dst_end - 1)
5328 {
5329 dst = alloc_destination (coding, src_end - src + 2,
5330 dst);
5331 dst_end = coding->destination + coding->dst_bytes;
5332 coding_set_source (coding);
5333 src = coding->source + coding->consumed;
5334 src_end = coding->source + coding->src_bytes;
5335 }
5336 }
5337 EMIT_ONE_BYTE (c);
5338 }
5339 }
5340 else
5341 {
5342 if (!EQ (coding->src_object, coding->dst_object))
5343 {
5344 int require = coding->src_bytes - coding->dst_bytes;
5345
5346 if (require > 0)
5347 {
5348 EMACS_INT offset = src - coding->source;
5349
5350 dst = alloc_destination (coding, require, dst);
5351 coding_set_source (coding);
5352 src = coding->source + offset;
5353 src_end = coding->source + coding->src_bytes;
5354 }
5355 }
5356 produced_chars = coding->src_chars;
5357 while (src < src_end)
5358 {
5359 int c = *src++;
5360
5361 if (c == '\r')
5362 {
5363 if (EQ (eol_type, Qdos))
5364 {
5365 if (src < src_end
5366 && *src == '\n')
5367 c = *src++;
5368 produced_chars--;
5369 }
5370 else if (EQ (eol_type, Qmac))
5371 c = '\n';
5372 }
5373 *dst++ = c;
5374 }
5375 }
5376 coding->consumed = coding->src_bytes;
5377 coding->consumed_char = coding->src_chars;
5378 }
5379
5380 produced = dst - (coding->destination + coding->produced);
5381 if (BUFFERP (coding->dst_object))
5382 insert_from_gap (produced_chars, produced);
5383 coding->produced += produced;
5384 coding->produced_char += produced_chars;
5385 return produced_chars;
5386 }
5387
5388 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5389 or
5390 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5391 */
5392
5393 static INLINE void
5394 produce_composition (coding, charbuf)
5395 struct coding_system *coding;
5396 int *charbuf;
5397 {
5398 Lisp_Object buffer;
5399 int len;
5400 EMACS_INT pos;
5401 enum composition_method method;
5402 int cmp_len;
5403 Lisp_Object components;
5404
5405 buffer = coding->dst_object;
5406 len = -charbuf[0];
5407 pos = coding->dst_pos + charbuf[1];
5408 method = (enum composition_method) (charbuf[3]);
5409 cmp_len = charbuf[4];
5410
5411 if (method == COMPOSITION_RELATIVE)
5412 components = Qnil;
5413 else
5414 {
5415 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5416 int i;
5417
5418 len -= 5;
5419 charbuf += 5;
5420 for (i = 0; i < len; i++)
5421 args[i] = make_number (charbuf[i]);
5422 components = (method == COMPOSITION_WITH_ALTCHARS
5423 ? Fstring (len, args) : Fvector (len, args));
5424 }
5425 compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5426 }
5427
5428 static int *
5429 save_composition_data (buf, buf_end, prop)
5430 int *buf, *buf_end;
5431 Lisp_Object prop;
5432 {
5433 enum composition_method method = COMPOSITION_METHOD (prop);
5434 int cmp_len = COMPOSITION_LENGTH (prop);
5435
5436 if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5437 return NULL;
5438
5439 buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5440 buf[2] = method;
5441 buf[3] = cmp_len;
5442
5443 if (method == COMPOSITION_RELATIVE)
5444 buf[0] = 4;
5445 else
5446 {
5447 Lisp_Object components;
5448 int len, i;
5449
5450 components = COMPOSITION_COMPONENTS (prop);
5451 if (VECTORP (components))
5452 {
5453 len = XVECTOR (components)->size;
5454 for (i = 0; i < len; i++)
5455 buf[4 + i] = XINT (AREF (components, i));
5456 }
5457 else if (STRINGP (components))
5458 {
5459 int i_byte;
5460
5461 len = XSTRING (components)->size;
5462 i = i_byte = 0;
5463 while (i < len)
5464 FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5465 }
5466 else if (INTEGERP (components))
5467 {
5468 len = 1;
5469 buf[4] = XINT (components);
5470 }
5471 else if (CONSP (components))
5472 {
5473 for (len = 0; CONSP (components);
5474 len++, components = XCDR (components))
5475 buf[4 + len] = XINT (XCAR (components));
5476 }
5477 else
5478 abort ();
5479 buf[0] = 4 + len;
5480 }
5481 return (buf + buf[0]);
5482 }
5483
5484 #define CHARBUF_SIZE 0x4000
5485
5486 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5487 do { \
5488 int size = CHARBUF_SIZE;; \
5489 \
5490 coding->charbuf = NULL; \
5491 while (size > 1024) \
5492 { \
5493 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5494 if (coding->charbuf) \
5495 break; \
5496 size >>= 1; \
5497 } \
5498 if (! coding->charbuf) \
5499 { \
5500 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5501 return coding->result; \
5502 } \
5503 coding->charbuf_size = size; \
5504 } while (0)
5505
5506
5507 static void
5508 produce_annotation (coding)
5509 struct coding_system *coding;
5510 {
5511 int *charbuf = coding->charbuf;
5512 int *charbuf_end = charbuf + coding->charbuf_used;
5513
5514 while (charbuf < charbuf_end)
5515 {
5516 if (*charbuf >= 0)
5517 charbuf++;
5518 else
5519 {
5520 int len = -*charbuf;
5521 switch (charbuf[2])
5522 {
5523 case CODING_ANNOTATE_COMPOSITION_MASK:
5524 produce_composition (coding, charbuf);
5525 break;
5526 default:
5527 abort ();
5528 }
5529 charbuf += len;
5530 }
5531 }
5532 }
5533
5534 /* Decode the data at CODING->src_object into CODING->dst_object.
5535 CODING->src_object is a buffer, a string, or nil.
5536 CODING->dst_object is a buffer.
5537
5538 If CODING->src_object is a buffer, it must be the current buffer.
5539 In this case, if CODING->src_pos is positive, it is a position of
5540 the source text in the buffer, otherwise, the source text is in the
5541 gap area of the buffer, and CODING->src_pos specifies the offset of
5542 the text from GPT (which must be the same as PT). If this is the
5543 same buffer as CODING->dst_object, CODING->src_pos must be
5544 negative.
5545
5546 If CODING->src_object is a string, CODING->src_pos in an index to
5547 that string.
5548
5549 If CODING->src_object is nil, CODING->source must already point to
5550 the non-relocatable memory area. In this case, CODING->src_pos is
5551 an offset from CODING->source.
5552
5553 The decoded data is inserted at the current point of the buffer
5554 CODING->dst_object.
5555 */
5556
5557 static int
5558 decode_coding (coding)
5559 struct coding_system *coding;
5560 {
5561 Lisp_Object attrs;
5562
5563 if (BUFFERP (coding->src_object)
5564 && coding->src_pos > 0
5565 && coding->src_pos < GPT
5566 && coding->src_pos + coding->src_chars > GPT)
5567 move_gap_both (coding->src_pos, coding->src_pos_byte);
5568
5569 if (BUFFERP (coding->dst_object))
5570 {
5571 if (current_buffer != XBUFFER (coding->dst_object))
5572 set_buffer_internal (XBUFFER (coding->dst_object));
5573 if (GPT != PT)
5574 move_gap_both (PT, PT_BYTE);
5575 }
5576
5577 coding->consumed = coding->consumed_char = 0;
5578 coding->produced = coding->produced_char = 0;
5579 coding->chars_at_source = 0;
5580 coding->result = CODING_RESULT_SUCCESS;
5581 coding->errors = 0;
5582
5583 ALLOC_CONVERSION_WORK_AREA (coding);
5584
5585 attrs = CODING_ID_ATTRS (coding->id);
5586
5587 do
5588 {
5589 coding_set_source (coding);
5590 coding->annotated = 0;
5591 (*(coding->decoder)) (coding);
5592 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5593 translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5594 coding_set_destination (coding);
5595 produce_chars (coding);
5596 if (coding->annotated)
5597 produce_annotation (coding);
5598 }
5599 while (coding->consumed < coding->src_bytes
5600 && ! coding->result);
5601
5602 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5603 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5604 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5605 decode_eol (coding);
5606
5607 coding->carryover_bytes = 0;
5608 if (coding->consumed < coding->src_bytes)
5609 {
5610 int nbytes = coding->src_bytes - coding->consumed;
5611 unsigned char *src;
5612
5613 coding_set_source (coding);
5614 coding_set_destination (coding);
5615 src = coding->source + coding->consumed;
5616
5617 if (coding->mode & CODING_MODE_LAST_BLOCK)
5618 {
5619 /* Flush out unprocessed data as binary chars. We are sure
5620 that the number of data is less than the size of
5621 coding->charbuf. */
5622 int *charbuf = coding->charbuf;
5623
5624 while (nbytes-- > 0)
5625 {
5626 int c = *src++;
5627 *charbuf++ = (c & 0x80 ? - c : c);
5628 }
5629 produce_chars (coding);
5630 }
5631 else
5632 {
5633 /* Record unprocessed bytes in coding->carryover. We are
5634 sure that the number of data is less than the size of
5635 coding->carryover. */
5636 unsigned char *p = coding->carryover;
5637
5638 coding->carryover_bytes = nbytes;
5639 while (nbytes-- > 0)
5640 *p++ = *src++;
5641 }
5642 coding->consumed = coding->src_bytes;
5643 }
5644
5645 return coding->result;
5646 }
5647
5648 static void
5649 consume_chars (coding)
5650 struct coding_system *coding;
5651 {
5652 int *buf = coding->charbuf;
5653 /* -1 is to compensate for CRLF. */
5654 int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5655 unsigned char *src = coding->source + coding->consumed;
5656 int pos = coding->src_pos + coding->consumed_char;
5657 int end_pos = coding->src_pos + coding->src_chars;
5658 int multibytep = coding->src_multibyte;
5659 Lisp_Object eol_type;
5660 int c;
5661 int start, end, stop;
5662 Lisp_Object object, prop;
5663
5664 eol_type = CODING_ID_EOL_TYPE (coding->id);
5665 if (VECTORP (eol_type))
5666 eol_type = Qunix;
5667
5668 object = coding->src_object;
5669
5670 /* Note: composition handling is not yet implemented. */
5671 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5672
5673 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5674 && find_composition (pos, end_pos, &start, &end, &prop, object)
5675 && end <= end_pos
5676 && (start >= pos
5677 || (find_composition (end, end_pos, &start, &end, &prop, object)
5678 && end <= end_pos)))
5679 stop = start;
5680 else
5681 stop = end_pos;
5682
5683 while (buf < buf_end)
5684 {
5685 if (pos == stop)
5686 {
5687 int *p;
5688
5689 if (pos == end_pos)
5690 break;
5691 p = save_composition_data (buf, buf_end, prop);
5692 if (p == NULL)
5693 break;
5694 buf = p;
5695 if (find_composition (end, end_pos, &start, &end, &prop, object)
5696 && end <= end_pos)
5697 stop = start;
5698 else
5699 stop = end_pos;
5700 }
5701
5702 if (! multibytep)
5703 c = *src++;
5704 else
5705 c = STRING_CHAR_ADVANCE (src);
5706 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5707 c = '\n';
5708 if (! EQ (eol_type, Qunix))
5709 {
5710 if (c == '\n')
5711 {
5712 if (EQ (eol_type, Qdos))
5713 *buf++ = '\r';
5714 else
5715 c = '\r';
5716 }
5717 }
5718 *buf++ = c;
5719 pos++;
5720 }
5721
5722 coding->consumed = src - coding->source;
5723 coding->consumed_char = pos - coding->src_pos;
5724 coding->charbuf_used = buf - coding->charbuf;
5725 coding->chars_at_source = 0;
5726 }
5727
5728
5729 /* Encode the text at CODING->src_object into CODING->dst_object.
5730 CODING->src_object is a buffer or a string.
5731 CODING->dst_object is a buffer or nil.
5732
5733 If CODING->src_object is a buffer, it must be the current buffer.
5734 In this case, if CODING->src_pos is positive, it is a position of
5735 the source text in the buffer, otherwise. the source text is in the
5736 gap area of the buffer, and coding->src_pos specifies the offset of
5737 the text from GPT (which must be the same as PT). If this is the
5738 same buffer as CODING->dst_object, CODING->src_pos must be
5739 negative and CODING should not have `pre-write-conversion'.
5740
5741 If CODING->src_object is a string, CODING should not have
5742 `pre-write-conversion'.
5743
5744 If CODING->dst_object is a buffer, the encoded data is inserted at
5745 the current point of that buffer.
5746
5747 If CODING->dst_object is nil, the encoded data is placed at the
5748 memory area specified by CODING->destination. */
5749
5750 static int
5751 encode_coding (coding)
5752 struct coding_system *coding;
5753 {
5754 Lisp_Object attrs;
5755
5756 attrs = CODING_ID_ATTRS (coding->id);
5757
5758 if (BUFFERP (coding->dst_object))
5759 {
5760 set_buffer_internal (XBUFFER (coding->dst_object));
5761 coding->dst_multibyte
5762 = ! NILP (current_buffer->enable_multibyte_characters);
5763 }
5764
5765 coding->consumed = coding->consumed_char = 0;
5766 coding->produced = coding->produced_char = 0;
5767 coding->result = CODING_RESULT_SUCCESS;
5768 coding->errors = 0;
5769
5770 ALLOC_CONVERSION_WORK_AREA (coding);
5771
5772 do {
5773 coding_set_source (coding);
5774 consume_chars (coding);
5775
5776 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5777 translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5778
5779 coding_set_destination (coding);
5780 (*(coding->encoder)) (coding);
5781 } while (coding->consumed_char < coding->src_chars);
5782
5783 if (BUFFERP (coding->dst_object))
5784 insert_from_gap (coding->produced_char, coding->produced);
5785
5786 return (coding->result);
5787 }
5788
5789 /* Work buffer */
5790
5791 /* List of currently used working buffer. */
5792 Lisp_Object Vcode_conversion_work_buf_list;
5793
5794 /* A working buffer used by the top level conversion. */
5795 Lisp_Object Vcode_conversion_reused_work_buf;
5796
5797
5798 /* Return a working buffer that can be freely used by the following
5799 code conversion. MULTIBYTEP specifies the multibyteness of the
5800 buffer. */
5801
5802 Lisp_Object
5803 make_conversion_work_buffer (multibytep)
5804 int multibytep;
5805 {
5806 struct buffer *current = current_buffer;
5807 Lisp_Object buf;
5808
5809 if (NILP (Vcode_conversion_work_buf_list))
5810 {
5811 if (NILP (Vcode_conversion_reused_work_buf))
5812 Vcode_conversion_reused_work_buf
5813 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5814 Vcode_conversion_work_buf_list
5815 = Fcons (Vcode_conversion_reused_work_buf, Qnil);
5816 }
5817 else
5818 {
5819 int depth = Flength (Vcode_conversion_work_buf_list);
5820 char str[128];
5821
5822 sprintf (str, " *code-conversion-work*<%d>", depth);
5823 Vcode_conversion_work_buf_list
5824 = Fcons (Fget_buffer_create (build_string (str)),
5825 Vcode_conversion_work_buf_list);
5826 }
5827
5828 buf = XCAR (Vcode_conversion_work_buf_list);
5829 set_buffer_internal (XBUFFER (buf));
5830 current_buffer->undo_list = Qt;
5831 Ferase_buffer ();
5832 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5833 set_buffer_internal (current);
5834 return buf;
5835 }
5836
5837 static struct coding_system *saved_coding;
5838
5839 Lisp_Object
5840 code_conversion_restore (info)
5841 Lisp_Object info;
5842 {
5843 int depth = Flength (Vcode_conversion_work_buf_list);
5844 Lisp_Object buf;
5845
5846 if (depth > 0)
5847 {
5848 buf = XCAR (Vcode_conversion_work_buf_list);
5849 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5850 if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5851 Fkill_buffer (buf);
5852 }
5853
5854 if (saved_coding->dst_object == Qt
5855 && saved_coding->destination)
5856 xfree (saved_coding->destination);
5857
5858 return save_excursion_restore (info);
5859 }
5860
5861
5862 int
5863 decode_coding_gap (coding, chars, bytes)
5864 struct coding_system *coding;
5865 EMACS_INT chars, bytes;
5866 {
5867 int count = specpdl_ptr - specpdl;
5868
5869 saved_coding = coding;
5870 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5871
5872 coding->src_object = Fcurrent_buffer ();
5873 coding->src_chars = chars;
5874 coding->src_bytes = bytes;
5875 coding->src_pos = -chars;
5876 coding->src_pos_byte = -bytes;
5877 coding->src_multibyte = chars < bytes;
5878 coding->dst_object = coding->src_object;
5879 coding->dst_pos = PT;
5880 coding->dst_pos_byte = PT_BYTE;
5881 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
5882
5883 if (CODING_REQUIRE_DETECTION (coding))
5884 detect_coding (coding);
5885
5886 decode_coding (coding);
5887
5888 unbind_to (count, Qnil);
5889 return coding->result;
5890 }
5891
5892 int
5893 encode_coding_gap (coding, chars, bytes)
5894 struct coding_system *coding;
5895 EMACS_INT chars, bytes;
5896 {
5897 int count = specpdl_ptr - specpdl;
5898 Lisp_Object buffer;
5899
5900 saved_coding = coding;
5901 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5902
5903 buffer = Fcurrent_buffer ();
5904 coding->src_object = buffer;
5905 coding->src_chars = chars;
5906 coding->src_bytes = bytes;
5907 coding->src_pos = -chars;
5908 coding->src_pos_byte = -bytes;
5909 coding->src_multibyte = chars < bytes;
5910 coding->dst_object = coding->src_object;
5911 coding->dst_pos = PT;
5912 coding->dst_pos_byte = PT_BYTE;
5913
5914 encode_coding (coding);
5915
5916 unbind_to (count, Qnil);
5917 return coding->result;
5918 }
5919
5920
5921 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5922 SRC_OBJECT into DST_OBJECT by coding context CODING.
5923
5924 SRC_OBJECT is a buffer, a string, or Qnil.
5925
5926 If it is a buffer, the text is at point of the buffer. FROM and TO
5927 are positions in the buffer.
5928
5929 If it is a string, the text is at the beginning of the string.
5930 FROM and TO are indices to the string.
5931
5932 If it is nil, the text is at coding->source. FROM and TO are
5933 indices to coding->source.
5934
5935 DST_OBJECT is a buffer, Qt, or Qnil.
5936
5937 If it is a buffer, the decoded text is inserted at point of the
5938 buffer. If the buffer is the same as SRC_OBJECT, the source text
5939 is deleted.
5940
5941 If it is Qt, a string is made from the decoded text, and
5942 set in CODING->dst_object.
5943
5944 If it is Qnil, the decoded text is stored at CODING->destination.
5945 The called must allocate CODING->dst_bytes bytes at
5946 CODING->destination by xmalloc. If the decoded text is longer than
5947 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5948 */
5949
5950 void
5951 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5952 dst_object)
5953 struct coding_system *coding;
5954 Lisp_Object src_object;
5955 EMACS_INT from, from_byte, to, to_byte;
5956 Lisp_Object dst_object;
5957 {
5958 int count = specpdl_ptr - specpdl;
5959 unsigned char *destination;
5960 EMACS_INT dst_bytes;
5961 EMACS_INT chars = to - from;
5962 EMACS_INT bytes = to_byte - from_byte;
5963 Lisp_Object attrs;
5964
5965 saved_coding = coding;
5966 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5967
5968 if (NILP (dst_object))
5969 {
5970 destination = coding->destination;
5971 dst_bytes = coding->dst_bytes;
5972 }
5973
5974 coding->src_object = src_object;
5975 coding->src_chars = chars;
5976 coding->src_bytes = bytes;
5977 coding->src_multibyte = chars < bytes;
5978
5979 if (STRINGP (src_object))
5980 {
5981 coding->src_pos = from;
5982 coding->src_pos_byte = from_byte;
5983 }
5984 else if (BUFFERP (src_object))
5985 {
5986 set_buffer_internal (XBUFFER (src_object));
5987 if (from != GPT)
5988 move_gap_both (from, from_byte);
5989 if (EQ (src_object, dst_object))
5990 {
5991 TEMP_SET_PT_BOTH (from, from_byte);
5992 del_range_both (from, from_byte, to, to_byte, 1);
5993 coding->src_pos = -chars;
5994 coding->src_pos_byte = -bytes;
5995 }
5996 else
5997 {
5998 coding->src_pos = from;
5999 coding->src_pos_byte = from_byte;
6000 }
6001 }
6002
6003 if (CODING_REQUIRE_DETECTION (coding))
6004 detect_coding (coding);
6005 attrs = CODING_ID_ATTRS (coding->id);
6006
6007 if (! NILP (CODING_ATTR_POST_READ (attrs))
6008 || EQ (dst_object, Qt))
6009 {
6010 coding->dst_object = make_conversion_work_buffer (1);
6011 coding->dst_pos = BEG;
6012 coding->dst_pos_byte = BEG_BYTE;
6013 coding->dst_multibyte = 1;
6014 }
6015 else if (BUFFERP (dst_object))
6016 {
6017 coding->dst_object = dst_object;
6018 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6019 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6020 coding->dst_multibyte
6021 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6022 }
6023 else
6024 {
6025 coding->dst_object = Qnil;
6026 coding->dst_multibyte = 1;
6027 }
6028
6029 decode_coding (coding);
6030
6031 if (BUFFERP (coding->dst_object))
6032 set_buffer_internal (XBUFFER (coding->dst_object));
6033
6034 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6035 {
6036 struct gcpro gcpro1, gcpro2;
6037 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6038 Lisp_Object val;
6039
6040 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6041 GCPRO2 (coding->src_object, coding->dst_object);
6042 val = call1 (CODING_ATTR_POST_READ (attrs),
6043 make_number (coding->produced_char));
6044 UNGCPRO;
6045 CHECK_NATNUM (val);
6046 coding->produced_char += Z - prev_Z;
6047 coding->produced += Z_BYTE - prev_Z_BYTE;
6048 }
6049
6050 if (EQ (dst_object, Qt))
6051 {
6052 coding->dst_object = Fbuffer_string ();
6053 }
6054 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6055 {
6056 set_buffer_internal (XBUFFER (coding->dst_object));
6057 if (dst_bytes < coding->produced)
6058 {
6059 destination
6060 = (unsigned char *) xrealloc (destination, coding->produced);
6061 if (! destination)
6062 {
6063 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6064 unbind_to (count, Qnil);
6065 return;
6066 }
6067 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6068 move_gap_both (BEGV, BEGV_BYTE);
6069 bcopy (BEGV_ADDR, destination, coding->produced);
6070 coding->destination = destination;
6071 }
6072 }
6073
6074 unbind_to (count, Qnil);
6075 }
6076
6077
6078 void
6079 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6080 dst_object)
6081 struct coding_system *coding;
6082 Lisp_Object src_object;
6083 EMACS_INT from, from_byte, to, to_byte;
6084 Lisp_Object dst_object;
6085 {
6086 int count = specpdl_ptr - specpdl;
6087 EMACS_INT chars = to - from;
6088 EMACS_INT bytes = to_byte - from_byte;
6089 Lisp_Object attrs;
6090
6091 saved_coding = coding;
6092 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6093
6094 coding->src_object = src_object;
6095 coding->src_chars = chars;
6096 coding->src_bytes = bytes;
6097 coding->src_multibyte = chars < bytes;
6098
6099 attrs = CODING_ID_ATTRS (coding->id);
6100
6101 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6102 {
6103 coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
6104 set_buffer_internal (XBUFFER (coding->src_object));
6105 if (STRINGP (src_object))
6106 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6107 else if (BUFFERP (src_object))
6108 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6109 else
6110 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6111
6112 if (EQ (src_object, dst_object))
6113 {
6114 set_buffer_internal (XBUFFER (src_object));
6115 del_range_both (from, from_byte, to, to_byte, 1);
6116 set_buffer_internal (XBUFFER (coding->src_object));
6117 }
6118
6119 call2 (CODING_ATTR_PRE_WRITE (attrs),
6120 make_number (BEG), make_number (Z));
6121 coding->src_object = Fcurrent_buffer ();
6122 if (BEG != GPT)
6123 move_gap_both (BEG, BEG_BYTE);
6124 coding->src_chars = Z - BEG;
6125 coding->src_bytes = Z_BYTE - BEG_BYTE;
6126 coding->src_pos = BEG;
6127 coding->src_pos_byte = BEG_BYTE;
6128 coding->src_multibyte = Z < Z_BYTE;
6129 }
6130 else if (STRINGP (src_object))
6131 {
6132 coding->src_pos = from;
6133 coding->src_pos_byte = from_byte;
6134 }
6135 else if (BUFFERP (src_object))
6136 {
6137 set_buffer_internal (XBUFFER (src_object));
6138 if (from != GPT)
6139 move_gap_both (from, from_byte);
6140 if (EQ (src_object, dst_object))
6141 {
6142 del_range_both (from, from_byte, to, to_byte, 1);
6143 coding->src_pos = -chars;
6144 coding->src_pos_byte = -bytes;
6145 }
6146 else
6147 {
6148 coding->src_pos = from;
6149 coding->src_pos_byte = from_byte;
6150 }
6151 }
6152
6153 if (BUFFERP (dst_object))
6154 {
6155 coding->dst_object = dst_object;
6156 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6157 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6158 coding->dst_multibyte
6159 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6160 }
6161 else if (EQ (dst_object, Qt))
6162 {
6163 coding->dst_object = Qnil;
6164 coding->dst_bytes = coding->src_chars;
6165 if (coding->dst_bytes == 0)
6166 coding->dst_bytes = 1;
6167 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6168 coding->dst_multibyte = 0;
6169 }
6170 else
6171 {
6172 coding->dst_object = Qnil;
6173 coding->dst_multibyte = 0;
6174 }
6175
6176 encode_coding (coding);
6177
6178 if (EQ (dst_object, Qt))
6179 {
6180 if (BUFFERP (coding->dst_object))
6181 coding->dst_object = Fbuffer_string ();
6182 else
6183 {
6184 coding->dst_object
6185 = make_unibyte_string ((char *) coding->destination,
6186 coding->produced);
6187 xfree (coding->destination);
6188 }
6189 }
6190
6191 unbind_to (count, Qnil);
6192 }
6193
6194
6195 Lisp_Object
6196 preferred_coding_system ()
6197 {
6198 int id = coding_categories[coding_priorities[0]].id;
6199
6200 return CODING_ID_NAME (id);
6201 }
6202
6203 \f
6204 #ifdef emacs
6205 /*** 8. Emacs Lisp library functions ***/
6206
6207 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6208 doc: /* Return t if OBJECT is nil or a coding-system.
6209 See the documentation of `define-coding-system' for information
6210 about coding-system objects. */)
6211 (obj)
6212 Lisp_Object obj;
6213 {
6214 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6215 }
6216
6217 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6218 Sread_non_nil_coding_system, 1, 1, 0,
6219 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6220 (prompt)
6221 Lisp_Object prompt;
6222 {
6223 Lisp_Object val;
6224 do
6225 {
6226 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6227 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6228 }
6229 while (XSTRING (val)->size == 0);
6230 return (Fintern (val, Qnil));
6231 }
6232
6233 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6234 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6235 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6236 (prompt, default_coding_system)
6237 Lisp_Object prompt, default_coding_system;
6238 {
6239 Lisp_Object val;
6240 if (SYMBOLP (default_coding_system))
6241 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6242 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6243 Qt, Qnil, Qcoding_system_history,
6244 default_coding_system, Qnil);
6245 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6246 }
6247
6248 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6249 1, 1, 0,
6250 doc: /* Check validity of CODING-SYSTEM.
6251 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6252 It is valid if it is a symbol with a non-nil `coding-system' property.
6253 The value of property should be a vector of length 5. */)
6254 (coding_system)
6255 Lisp_Object coding_system;
6256 {
6257 CHECK_SYMBOL (coding_system);
6258 if (!NILP (Fcoding_system_p (coding_system)))
6259 return coding_system;
6260 while (1)
6261 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6262 }
6263
6264 \f
6265 Lisp_Object
6266 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6267 unsigned char *src;
6268 int src_bytes, highest;
6269 int multibytep;
6270 Lisp_Object coding_system;
6271 {
6272 unsigned char *src_end = src + src_bytes;
6273 int mask = CATEGORY_MASK_ANY;
6274 int detected = 0;
6275 int c, i;
6276 Lisp_Object attrs, eol_type;
6277 Lisp_Object val;
6278 struct coding_system coding;
6279
6280 if (NILP (coding_system))
6281 coding_system = Qundecided;
6282 setup_coding_system (coding_system, &coding);
6283 attrs = CODING_ID_ATTRS (coding.id);
6284 eol_type = CODING_ID_EOL_TYPE (coding.id);
6285
6286 coding.source = src;
6287 coding.src_bytes = src_bytes;
6288 coding.src_multibyte = multibytep;
6289 coding.consumed = 0;
6290
6291 if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
6292 {
6293 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6294 }
6295 else
6296 {
6297 coding_system = Qnil;
6298 for (; src < src_end; src++)
6299 {
6300 c = *src;
6301 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6302 || c == ISO_CODE_SI
6303 || c == ISO_CODE_SO)))
6304 break;
6305 }
6306 coding.head_ascii = src - coding.source;
6307
6308 if (src < src_end)
6309 for (i = 0; i < coding_category_raw_text; i++)
6310 {
6311 enum coding_category category = coding_priorities[i];
6312 struct coding_system *this = coding_categories + category;
6313
6314 if (category >= coding_category_raw_text
6315 || detected & (1 << category))
6316 continue;
6317
6318 if (this->id < 0)
6319 {
6320 /* No coding system of this category is defined. */
6321 mask &= ~(1 << category);
6322 }
6323 else
6324 {
6325 detected |= detected_mask[category];
6326 if ((*(coding_categories[category].detector)) (&coding, &mask)
6327 && highest)
6328 {
6329 mask &= detected_mask[category];
6330 break;
6331 }
6332 }
6333 }
6334 }
6335
6336 if (!mask)
6337 val = Fcons (make_number (coding_category_raw_text), Qnil);
6338 else if (mask == CATEGORY_MASK_ANY)
6339 val = Fcons (make_number (coding_category_undecided), Qnil);
6340 else if (highest)
6341 {
6342 for (i = 0; i < coding_category_raw_text; i++)
6343 if (mask & (1 << coding_priorities[i]))
6344 {
6345 val = Fcons (make_number (coding_priorities[i]), Qnil);
6346 break;
6347 }
6348 }
6349 else
6350 {
6351 val = Qnil;
6352 for (i = coding_category_raw_text - 1; i >= 0; i--)
6353 if (mask & (1 << coding_priorities[i]))
6354 val = Fcons (make_number (coding_priorities[i]), val);
6355 }
6356
6357 {
6358 int one_byte_eol = -1, two_byte_eol = -1;
6359 Lisp_Object tail;
6360
6361 for (tail = val; CONSP (tail); tail = XCDR (tail))
6362 {
6363 struct coding_system *this
6364 = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6365 : &coding);
6366 int this_eol;
6367
6368 attrs = CODING_ID_ATTRS (this->id);
6369 eol_type = CODING_ID_EOL_TYPE (this->id);
6370 XSETCAR (tail, CODING_ID_NAME (this->id));
6371 if (VECTORP (eol_type))
6372 {
6373 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6374 {
6375 if (two_byte_eol < 0)
6376 two_byte_eol = detect_eol (this, coding.source, src_bytes);
6377 this_eol = two_byte_eol;
6378 }
6379 else
6380 {
6381 if (one_byte_eol < 0)
6382 one_byte_eol =detect_eol (this, coding.source, src_bytes);
6383 this_eol = one_byte_eol;
6384 }
6385 if (this_eol == EOL_SEEN_LF)
6386 XSETCAR (tail, AREF (eol_type, 0));
6387 else if (this_eol == EOL_SEEN_CRLF)
6388 XSETCAR (tail, AREF (eol_type, 1));
6389 else if (this_eol == EOL_SEEN_CR)
6390 XSETCAR (tail, AREF (eol_type, 2));
6391 }
6392 }
6393 }
6394
6395 return (highest ? XCAR (val) : val);
6396 }
6397
6398
6399 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6400 2, 3, 0,
6401 doc: /* Detect coding system of the text in the region between START and END.
6402 Return a list of possible coding systems ordered by priority.
6403
6404 If only ASCII characters are found, it returns a list of single element
6405 `undecided' or its subsidiary coding system according to a detected
6406 end-of-line format.
6407
6408 If optional argument HIGHEST is non-nil, return the coding system of
6409 highest priority. */)
6410 (start, end, highest)
6411 Lisp_Object start, end, highest;
6412 {
6413 int from, to;
6414 int from_byte, to_byte;
6415
6416 CHECK_NUMBER_COERCE_MARKER (start);
6417 CHECK_NUMBER_COERCE_MARKER (end);
6418
6419 validate_region (&start, &end);
6420 from = XINT (start), to = XINT (end);
6421 from_byte = CHAR_TO_BYTE (from);
6422 to_byte = CHAR_TO_BYTE (to);
6423
6424 if (from < GPT && to >= GPT)
6425 move_gap_both (to, to_byte);
6426
6427 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6428 to_byte - from_byte,
6429 !NILP (highest),
6430 !NILP (current_buffer
6431 ->enable_multibyte_characters),
6432 Qnil);
6433 }
6434
6435 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6436 1, 2, 0,
6437 doc: /* Detect coding system of the text in STRING.
6438 Return a list of possible coding systems ordered by priority.
6439
6440 If only ASCII characters are found, it returns a list of single element
6441 `undecided' or its subsidiary coding system according to a detected
6442 end-of-line format.
6443
6444 If optional argument HIGHEST is non-nil, return the coding system of
6445 highest priority. */)
6446 (string, highest)
6447 Lisp_Object string, highest;
6448 {
6449 CHECK_STRING (string);
6450
6451 return detect_coding_system (XSTRING (string)->data,
6452 STRING_BYTES (XSTRING (string)),
6453 !NILP (highest),
6454 STRING_MULTIBYTE (string),
6455 Qnil);
6456 }
6457
6458
6459 static INLINE int
6460 char_encodable_p (c, attrs)
6461 int c;
6462 Lisp_Object attrs;
6463 {
6464 Lisp_Object tail;
6465 struct charset *charset;
6466
6467 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6468 CONSP (tail); tail = XCDR (tail))
6469 {
6470 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6471 if (CHAR_CHARSET_P (c, charset))
6472 break;
6473 }
6474 return (! NILP (tail));
6475 }
6476
6477
6478 /* Return a list of coding systems that safely encode the text between
6479 START and END. If EXCLUDE is non-nil, it is a list of coding
6480 systems not to check. The returned list doesn't contain any such
6481 coding systems. In any case, If the text contains only ASCII or is
6482 unibyte, return t. */
6483
6484 DEFUN ("find-coding-systems-region-internal",
6485 Ffind_coding_systems_region_internal,
6486 Sfind_coding_systems_region_internal, 2, 3, 0,
6487 doc: /* Internal use only. */)
6488 (start, end, exclude)
6489 Lisp_Object start, end, exclude;
6490 {
6491 Lisp_Object coding_attrs_list, safe_codings;
6492 EMACS_INT start_byte, end_byte;
6493 unsigned char *p, *pbeg, *pend;
6494 int c;
6495 Lisp_Object tail, elt;
6496
6497 if (STRINGP (start))
6498 {
6499 if (!STRING_MULTIBYTE (start)
6500 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6501 return Qt;
6502 start_byte = 0;
6503 end_byte = STRING_BYTES (XSTRING (start));
6504 }
6505 else
6506 {
6507 CHECK_NUMBER_COERCE_MARKER (start);
6508 CHECK_NUMBER_COERCE_MARKER (end);
6509 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6510 args_out_of_range (start, end);
6511 if (NILP (current_buffer->enable_multibyte_characters))
6512 return Qt;
6513 start_byte = CHAR_TO_BYTE (XINT (start));
6514 end_byte = CHAR_TO_BYTE (XINT (end));
6515 if (XINT (end) - XINT (start) == end_byte - start_byte)
6516 return Qt;
6517
6518 if (start < GPT && end > GPT)
6519 {
6520 if ((GPT - start) < (end - GPT))
6521 move_gap_both (start, start_byte);
6522 else
6523 move_gap_both (end, end_byte);
6524 }
6525 }
6526
6527 coding_attrs_list = Qnil;
6528 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6529 if (NILP (exclude)
6530 || NILP (Fmemq (XCAR (tail), exclude)))
6531 {
6532 Lisp_Object attrs;
6533
6534 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6535 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6536 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6537 coding_attrs_list = Fcons (attrs, coding_attrs_list);
6538 }
6539
6540 if (STRINGP (start))
6541 p = pbeg = XSTRING (start)->data;
6542 else
6543 p = pbeg = BYTE_POS_ADDR (start_byte);
6544 pend = p + (end_byte - start_byte);
6545
6546 while (p < pend && ASCII_BYTE_P (*p)) p++;
6547 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6548
6549 while (p < pend)
6550 {
6551 if (ASCII_BYTE_P (*p))
6552 p++;
6553 else
6554 {
6555 c = STRING_CHAR_ADVANCE (p);
6556
6557 charset_map_loaded = 0;
6558 for (tail = coding_attrs_list; CONSP (tail);)
6559 {
6560 elt = XCAR (tail);
6561 if (NILP (elt))
6562 tail = XCDR (tail);
6563 else if (char_encodable_p (c, elt))
6564 tail = XCDR (tail);
6565 else if (CONSP (XCDR (tail)))
6566 {
6567 XSETCAR (tail, XCAR (XCDR (tail)));
6568 XSETCDR (tail, XCDR (XCDR (tail)));
6569 }
6570 else
6571 {
6572 XSETCAR (tail, Qnil);
6573 tail = XCDR (tail);
6574 }
6575 }
6576 if (charset_map_loaded)
6577 {
6578 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6579
6580 if (STRINGP (start))
6581 pbeg = XSTRING (start)->data;
6582 else
6583 pbeg = BYTE_POS_ADDR (start_byte);
6584 p = pbeg + p_offset;
6585 pend = pbeg + pend_offset;
6586 }
6587 }
6588 }
6589
6590 safe_codings = Qnil;
6591 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6592 if (! NILP (XCAR (tail)))
6593 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6594
6595 return safe_codings;
6596 }
6597
6598
6599 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6600 Scheck_coding_systems_region, 3, 3, 0,
6601 doc: /* Check if the region is encodable by coding systems.
6602
6603 START and END are buffer positions specifying the region.
6604 CODING-SYSTEM-LIST is a list of coding systems to check.
6605
6606 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6607 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6608 whole region, POS0, POS1, ... are buffer positions where non-encodable
6609 characters are found.
6610
6611 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6612 value is nil.
6613
6614 START may be a string. In that case, check if the string is
6615 encodable, and the value contains indices to the string instead of
6616 buffer positions. END is ignored. */)
6617 (start, end, coding_system_list)
6618 Lisp_Object start, end, coding_system_list;
6619 {
6620 Lisp_Object list;
6621 EMACS_INT start_byte, end_byte;
6622 int pos;
6623 unsigned char *p, *pbeg, *pend;
6624 int c;
6625 Lisp_Object tail, elt;
6626
6627 if (STRINGP (start))
6628 {
6629 if (!STRING_MULTIBYTE (start)
6630 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6631 return Qnil;
6632 start_byte = 0;
6633 end_byte = STRING_BYTES (XSTRING (start));
6634 pos = 0;
6635 }
6636 else
6637 {
6638 CHECK_NUMBER_COERCE_MARKER (start);
6639 CHECK_NUMBER_COERCE_MARKER (end);
6640 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6641 args_out_of_range (start, end);
6642 if (NILP (current_buffer->enable_multibyte_characters))
6643 return Qnil;
6644 start_byte = CHAR_TO_BYTE (XINT (start));
6645 end_byte = CHAR_TO_BYTE (XINT (end));
6646 if (XINT (end) - XINT (start) == end_byte - start_byte)
6647 return Qt;
6648
6649 if (start < GPT && end > GPT)
6650 {
6651 if ((GPT - start) < (end - GPT))
6652 move_gap_both (start, start_byte);
6653 else
6654 move_gap_both (end, end_byte);
6655 }
6656 pos = start;
6657 }
6658
6659 list = Qnil;
6660 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6661 {
6662 elt = XCAR (tail);
6663 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6664 Qnil)),
6665 list);
6666 }
6667
6668 if (STRINGP (start))
6669 p = pbeg = XSTRING (start)->data;
6670 else
6671 p = pbeg = BYTE_POS_ADDR (start_byte);
6672 pend = p + (end_byte - start_byte);
6673
6674 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6675 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6676
6677 while (p < pend)
6678 {
6679 if (ASCII_BYTE_P (*p))
6680 p++;
6681 else
6682 {
6683 c = STRING_CHAR_ADVANCE (p);
6684
6685 charset_map_loaded = 0;
6686 for (tail = list; CONSP (tail); tail = XCDR (tail))
6687 {
6688 elt = XCDR (XCAR (tail));
6689 if (! char_encodable_p (c, XCAR (elt)))
6690 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6691 }
6692 if (charset_map_loaded)
6693 {
6694 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6695
6696 if (STRINGP (start))
6697 pbeg = XSTRING (start)->data;
6698 else
6699 pbeg = BYTE_POS_ADDR (start_byte);
6700 p = pbeg + p_offset;
6701 pend = pbeg + pend_offset;
6702 }
6703 }
6704 pos++;
6705 }
6706
6707 tail = list;
6708 list = Qnil;
6709 for (; CONSP (tail); tail = XCDR (tail))
6710 {
6711 elt = XCAR (tail);
6712 if (CONSP (XCDR (XCDR (elt))))
6713 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6714 list);
6715 }
6716
6717 return list;
6718 }
6719
6720
6721
6722 Lisp_Object
6723 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6724 Lisp_Object start, end, coding_system, dst_object;
6725 int encodep, norecord;
6726 {
6727 struct coding_system coding;
6728 EMACS_INT from, from_byte, to, to_byte;
6729 Lisp_Object src_object;
6730
6731 CHECK_NUMBER_COERCE_MARKER (start);
6732 CHECK_NUMBER_COERCE_MARKER (end);
6733 if (NILP (coding_system))
6734 coding_system = Qno_conversion;
6735 else
6736 CHECK_CODING_SYSTEM (coding_system);
6737 src_object = Fcurrent_buffer ();
6738 if (NILP (dst_object))
6739 dst_object = src_object;
6740 else if (! EQ (dst_object, Qt))
6741 CHECK_BUFFER (dst_object);
6742
6743 validate_region (&start, &end);
6744 from = XFASTINT (start);
6745 from_byte = CHAR_TO_BYTE (from);
6746 to = XFASTINT (end);
6747 to_byte = CHAR_TO_BYTE (to);
6748
6749 setup_coding_system (coding_system, &coding);
6750 coding.mode |= CODING_MODE_LAST_BLOCK;
6751
6752 if (encodep)
6753 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6754 dst_object);
6755 else
6756 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6757 dst_object);
6758 if (! norecord)
6759 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6760
6761 if (coding.result != CODING_RESULT_SUCCESS)
6762 error ("Code conversion error: %d", coding.result);
6763
6764 return (BUFFERP (dst_object)
6765 ? make_number (coding.produced_char)
6766 : coding.dst_object);
6767 }
6768
6769
6770 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6771 3, 4, "r\nzCoding system: ",
6772 doc: /* Decode the current region from the specified coding system.
6773 When called from a program, takes four arguments:
6774 START, END, CODING-SYSTEM, and DESTINATION.
6775 START and END are buffer positions.
6776
6777 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6778 If nil, the region between START and END is replace by the decoded text.
6779 If buffer, the decoded text is inserted in the buffer.
6780 If t, the decoded text is returned.
6781
6782 This function sets `last-coding-system-used' to the precise coding system
6783 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6784 not fully specified.)
6785 It returns the length of the decoded text. */)
6786 (start, end, coding_system, destination)
6787 Lisp_Object start, end, coding_system, destination;
6788 {
6789 return code_convert_region (start, end, coding_system, destination, 0, 0);
6790 }
6791
6792 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6793 3, 4, "r\nzCoding system: ",
6794 doc: /* Encode the current region by specified coding system.
6795 When called from a program, takes three arguments:
6796 START, END, and CODING-SYSTEM. START and END are buffer positions.
6797
6798 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6799 If nil, the region between START and END is replace by the encoded text.
6800 If buffer, the encoded text is inserted in the buffer.
6801 If t, the encoded text is returned.
6802
6803 This function sets `last-coding-system-used' to the precise coding system
6804 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6805 not fully specified.)
6806 It returns the length of the encoded text. */)
6807 (start, end, coding_system, destination)
6808 Lisp_Object start, end, coding_system, destination;
6809 {
6810 return code_convert_region (start, end, coding_system, destination, 1, 0);
6811 }
6812
6813 Lisp_Object
6814 code_convert_string (string, coding_system, dst_object,
6815 encodep, nocopy, norecord)
6816 Lisp_Object string, coding_system, dst_object;
6817 int encodep, nocopy, norecord;
6818 {
6819 struct coding_system coding;
6820 EMACS_INT chars, bytes;
6821
6822 CHECK_STRING (string);
6823 if (NILP (coding_system))
6824 {
6825 if (! norecord)
6826 Vlast_coding_system_used = Qno_conversion;
6827 if (NILP (dst_object))
6828 return (nocopy ? Fcopy_sequence (string) : string);
6829 }
6830
6831 if (NILP (coding_system))
6832 coding_system = Qno_conversion;
6833 else
6834 CHECK_CODING_SYSTEM (coding_system);
6835 if (NILP (dst_object))
6836 dst_object = Qt;
6837 else if (! EQ (dst_object, Qt))
6838 CHECK_BUFFER (dst_object);
6839
6840 setup_coding_system (coding_system, &coding);
6841 coding.mode |= CODING_MODE_LAST_BLOCK;
6842 chars = XSTRING (string)->size;
6843 bytes = STRING_BYTES (XSTRING (string));
6844 if (encodep)
6845 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6846 else
6847 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6848 if (! norecord)
6849 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6850
6851 if (coding.result != CODING_RESULT_SUCCESS)
6852 error ("Code conversion error: %d", coding.result);
6853
6854 return (BUFFERP (dst_object)
6855 ? make_number (coding.produced_char)
6856 : coding.dst_object);
6857 }
6858
6859
6860 /* Encode or decode STRING according to CODING_SYSTEM.
6861 Do not set Vlast_coding_system_used.
6862
6863 This function is called only from macros DECODE_FILE and
6864 ENCODE_FILE, thus we ignore character composition. */
6865
6866 Lisp_Object
6867 code_convert_string_norecord (string, coding_system, encodep)
6868 Lisp_Object string, coding_system;
6869 int encodep;
6870 {
6871 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
6872 }
6873
6874
6875 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6876 2, 4, 0,
6877 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6878
6879 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6880 if the decoding operation is trivial.
6881
6882 Optional fourth arg BUFFER non-nil meant that the decoded text is
6883 inserted in BUFFER instead of returned as a astring. In this case,
6884 the return value is BUFFER.
6885
6886 This function sets `last-coding-system-used' to the precise coding system
6887 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6888 not fully specified. */)
6889 (string, coding_system, nocopy, buffer)
6890 Lisp_Object string, coding_system, nocopy, buffer;
6891 {
6892 return code_convert_string (string, coding_system, buffer,
6893 0, ! NILP (nocopy), 0);
6894 }
6895
6896 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6897 2, 4, 0,
6898 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6899
6900 Optional third arg NOCOPY non-nil means it is OK to return STRING
6901 itself if the encoding operation is trivial.
6902
6903 Optional fourth arg BUFFER non-nil meant that the encoded text is
6904 inserted in BUFFER instead of returned as a astring. In this case,
6905 the return value is BUFFER.
6906
6907 This function sets `last-coding-system-used' to the precise coding system
6908 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6909 not fully specified.) */)
6910 (string, coding_system, nocopy, buffer)
6911 Lisp_Object string, coding_system, nocopy, buffer;
6912 {
6913 return code_convert_string (string, coding_system, buffer,
6914 nocopy, ! NILP (nocopy), 1);
6915 }
6916
6917 \f
6918 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6919 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6920 Return the corresponding character. */)
6921 (code)
6922 Lisp_Object code;
6923 {
6924 Lisp_Object spec, attrs, val;
6925 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6926 int c;
6927
6928 CHECK_NATNUM (code);
6929 c = XFASTINT (code);
6930 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6931 attrs = AREF (spec, 0);
6932
6933 if (ASCII_BYTE_P (c)
6934 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6935 return code;
6936
6937 val = CODING_ATTR_CHARSET_LIST (attrs);
6938 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6939 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6940 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
6941
6942 if (c <= 0x7F)
6943 charset = charset_roman;
6944 else if (c >= 0xA0 && c < 0xDF)
6945 {
6946 charset = charset_kana;
6947 c -= 0x80;
6948 }
6949 else
6950 {
6951 int s1 = c >> 8, s2 = c & 0xFF;
6952
6953 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6954 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6955 error ("Invalid code: %d", code);
6956 SJIS_TO_JIS (c);
6957 charset = charset_kanji;
6958 }
6959 c = DECODE_CHAR (charset, c);
6960 if (c < 0)
6961 error ("Invalid code: %d", code);
6962 return make_number (c);
6963 }
6964
6965
6966 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6967 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6968 Return the corresponding code in SJIS. */)
6969 (ch)
6970 Lisp_Object ch;
6971 {
6972 Lisp_Object spec, attrs, charset_list;
6973 int c;
6974 struct charset *charset;
6975 unsigned code;
6976
6977 CHECK_CHARACTER (ch);
6978 c = XFASTINT (ch);
6979 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6980 attrs = AREF (spec, 0);
6981
6982 if (ASCII_CHAR_P (c)
6983 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6984 return ch;
6985
6986 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6987 charset = char_charset (c, charset_list, &code);
6988 if (code == CHARSET_INVALID_CODE (charset))
6989 error ("Can't encode by shift_jis encoding: %d", c);
6990 JIS_TO_SJIS (code);
6991
6992 return make_number (code);
6993 }
6994
6995 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6996 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6997 Return the corresponding character. */)
6998 (code)
6999 Lisp_Object code;
7000 {
7001 Lisp_Object spec, attrs, val;
7002 struct charset *charset_roman, *charset_big5, *charset;
7003 int c;
7004
7005 CHECK_NATNUM (code);
7006 c = XFASTINT (code);
7007 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7008 attrs = AREF (spec, 0);
7009
7010 if (ASCII_BYTE_P (c)
7011 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7012 return code;
7013
7014 val = CODING_ATTR_CHARSET_LIST (attrs);
7015 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7016 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7017
7018 if (c <= 0x7F)
7019 charset = charset_roman;
7020 else
7021 {
7022 int b1 = c >> 8, b2 = c & 0x7F;
7023 if (b1 < 0xA1 || b1 > 0xFE
7024 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7025 error ("Invalid code: %d", code);
7026 charset = charset_big5;
7027 }
7028 c = DECODE_CHAR (charset, (unsigned )c);
7029 if (c < 0)
7030 error ("Invalid code: %d", code);
7031 return make_number (c);
7032 }
7033
7034 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7035 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7036 Return the corresponding character code in Big5. */)
7037 (ch)
7038 Lisp_Object ch;
7039 {
7040 Lisp_Object spec, attrs, charset_list;
7041 struct charset *charset;
7042 int c;
7043 unsigned code;
7044
7045 CHECK_CHARACTER (ch);
7046 c = XFASTINT (ch);
7047 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7048 attrs = AREF (spec, 0);
7049 if (ASCII_CHAR_P (c)
7050 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7051 return ch;
7052
7053 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7054 charset = char_charset (c, charset_list, &code);
7055 if (code == CHARSET_INVALID_CODE (charset))
7056 error ("Can't encode by Big5 encoding: %d", c);
7057
7058 return make_number (code);
7059 }
7060
7061 \f
7062 DEFUN ("set-terminal-coding-system-internal",
7063 Fset_terminal_coding_system_internal,
7064 Sset_terminal_coding_system_internal, 1, 1, 0,
7065 doc: /* Internal use only. */)
7066 (coding_system)
7067 Lisp_Object coding_system;
7068 {
7069 CHECK_SYMBOL (coding_system);
7070 setup_coding_system (Fcheck_coding_system (coding_system),
7071 &terminal_coding);
7072
7073 /* We had better not send unsafe characters to terminal. */
7074 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7075 /* Characer composition should be disabled. */
7076 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7077 terminal_coding.src_multibyte = 1;
7078 terminal_coding.dst_multibyte = 0;
7079 return Qnil;
7080 }
7081
7082 DEFUN ("set-safe-terminal-coding-system-internal",
7083 Fset_safe_terminal_coding_system_internal,
7084 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7085 doc: /* Internal use only. */)
7086 (coding_system)
7087 Lisp_Object coding_system;
7088 {
7089 CHECK_SYMBOL (coding_system);
7090 setup_coding_system (Fcheck_coding_system (coding_system),
7091 &safe_terminal_coding);
7092 /* Characer composition should be disabled. */
7093 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7094 safe_terminal_coding.src_multibyte = 1;
7095 safe_terminal_coding.dst_multibyte = 0;
7096 return Qnil;
7097 }
7098
7099 DEFUN ("terminal-coding-system",
7100 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7101 doc: /* Return coding system specified for terminal output. */)
7102 ()
7103 {
7104 return CODING_ID_NAME (terminal_coding.id);
7105 }
7106
7107 DEFUN ("set-keyboard-coding-system-internal",
7108 Fset_keyboard_coding_system_internal,
7109 Sset_keyboard_coding_system_internal, 1, 1, 0,
7110 doc: /* Internal use only. */)
7111 (coding_system)
7112 Lisp_Object coding_system;
7113 {
7114 CHECK_SYMBOL (coding_system);
7115 setup_coding_system (Fcheck_coding_system (coding_system),
7116 &keyboard_coding);
7117 /* Characer composition should be disabled. */
7118 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7119 return Qnil;
7120 }
7121
7122 DEFUN ("keyboard-coding-system",
7123 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7124 doc: /* Return coding system specified for decoding keyboard input. */)
7125 ()
7126 {
7127 return CODING_ID_NAME (keyboard_coding.id);
7128 }
7129
7130 \f
7131 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7132 Sfind_operation_coding_system, 1, MANY, 0,
7133 doc: /* Choose a coding system for an operation based on the target name.
7134 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7135 DECODING-SYSTEM is the coding system to use for decoding
7136 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7137 for encoding (in case OPERATION does encoding).
7138
7139 The first argument OPERATION specifies an I/O primitive:
7140 For file I/O, `insert-file-contents' or `write-region'.
7141 For process I/O, `call-process', `call-process-region', or `start-process'.
7142 For network I/O, `open-network-stream'.
7143
7144 The remaining arguments should be the same arguments that were passed
7145 to the primitive. Depending on which primitive, one of those arguments
7146 is selected as the TARGET. For example, if OPERATION does file I/O,
7147 whichever argument specifies the file name is TARGET.
7148
7149 TARGET has a meaning which depends on OPERATION:
7150 For file I/O, TARGET is a file name.
7151 For process I/O, TARGET is a process name.
7152 For network I/O, TARGET is a service name or a port number
7153
7154 This function looks up what specified for TARGET in,
7155 `file-coding-system-alist', `process-coding-system-alist',
7156 or `network-coding-system-alist' depending on OPERATION.
7157 They may specify a coding system, a cons of coding systems,
7158 or a function symbol to call.
7159 In the last case, we call the function with one argument,
7160 which is a list of all the arguments given to this function.
7161
7162 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7163 (nargs, args)
7164 int nargs;
7165 Lisp_Object *args;
7166 {
7167 Lisp_Object operation, target_idx, target, val;
7168 register Lisp_Object chain;
7169
7170 if (nargs < 2)
7171 error ("Too few arguments");
7172 operation = args[0];
7173 if (!SYMBOLP (operation)
7174 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7175 error ("Invalid first arguement");
7176 if (nargs < 1 + XINT (target_idx))
7177 error ("Too few arguments for operation: %s",
7178 XSYMBOL (operation)->name->data);
7179 target = args[XINT (target_idx) + 1];
7180 if (!(STRINGP (target)
7181 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7182 error ("Invalid %dth argument", XINT (target_idx) + 1);
7183
7184 chain = ((EQ (operation, Qinsert_file_contents)
7185 || EQ (operation, Qwrite_region))
7186 ? Vfile_coding_system_alist
7187 : (EQ (operation, Qopen_network_stream)
7188 ? Vnetwork_coding_system_alist
7189 : Vprocess_coding_system_alist));
7190 if (NILP (chain))
7191 return Qnil;
7192
7193 for (; CONSP (chain); chain = XCDR (chain))
7194 {
7195 Lisp_Object elt;
7196
7197 elt = XCAR (chain);
7198 if (CONSP (elt)
7199 && ((STRINGP (target)
7200 && STRINGP (XCAR (elt))
7201 && fast_string_match (XCAR (elt), target) >= 0)
7202 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7203 {
7204 val = XCDR (elt);
7205 /* Here, if VAL is both a valid coding system and a valid
7206 function symbol, we return VAL as a coding system. */
7207 if (CONSP (val))
7208 return val;
7209 if (! SYMBOLP (val))
7210 return Qnil;
7211 if (! NILP (Fcoding_system_p (val)))
7212 return Fcons (val, val);
7213 if (! NILP (Ffboundp (val)))
7214 {
7215 val = call1 (val, Flist (nargs, args));
7216 if (CONSP (val))
7217 return val;
7218 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7219 return Fcons (val, val);
7220 }
7221 return Qnil;
7222 }
7223 }
7224 return Qnil;
7225 }
7226
7227 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7228 Sset_coding_system_priority, 1, MANY, 0,
7229 doc: /* Assign higher priority to the coding systems given as arguments.
7230 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
7231 (nargs, args)
7232 int nargs;
7233 Lisp_Object *args;
7234 {
7235 int i, j;
7236 int changed[coding_category_max];
7237 enum coding_category priorities[coding_category_max];
7238
7239 bzero (changed, sizeof changed);
7240
7241 for (i = j = 0; i < nargs; i++)
7242 {
7243 enum coding_category category;
7244 Lisp_Object spec, attrs;
7245
7246 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7247 attrs = AREF (spec, 0);
7248 category = XINT (CODING_ATTR_CATEGORY (attrs));
7249 if (changed[category])
7250 /* Ignore this coding system because a coding system of the
7251 same category already had a higher priority. */
7252 continue;
7253 changed[category] = 1;
7254 priorities[j++] = category;
7255 if (coding_categories[category].id >= 0
7256 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7257 setup_coding_system (args[i], &coding_categories[category]);
7258 }
7259
7260 /* Now we have decided top J priorities. Reflect the order of the
7261 original priorities to the remaining priorities. */
7262
7263 for (i = j, j = 0; i < coding_category_max; i++, j++)
7264 {
7265 while (j < coding_category_max
7266 && changed[coding_priorities[j]])
7267 j++;
7268 if (j == coding_category_max)
7269 abort ();
7270 priorities[i] = coding_priorities[j];
7271 }
7272
7273 bcopy (priorities, coding_priorities, sizeof priorities);
7274 return Qnil;
7275 }
7276
7277 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7278 Scoding_system_priority_list, 0, 1, 0,
7279 doc: /* Return a list of coding systems ordered by their priorities.
7280 HIGHESTP non-nil means just return the highest priority one. */)
7281 (highestp)
7282 Lisp_Object highestp;
7283 {
7284 int i;
7285 Lisp_Object val;
7286
7287 for (i = 0, val = Qnil; i < coding_category_max; i++)
7288 {
7289 enum coding_category category = coding_priorities[i];
7290 int id = coding_categories[category].id;
7291 Lisp_Object attrs;
7292
7293 if (id < 0)
7294 continue;
7295 attrs = CODING_ID_ATTRS (id);
7296 if (! NILP (highestp))
7297 return CODING_ATTR_BASE_NAME (attrs);
7298 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7299 }
7300 return Fnreverse (val);
7301 }
7302
7303 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7304
7305 static Lisp_Object
7306 make_subsidiaries (base)
7307 Lisp_Object base;
7308 {
7309 Lisp_Object subsidiaries;
7310 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7311 char *buf = (char *) alloca (base_name_len + 6);
7312 int i;
7313
7314 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7315 subsidiaries = Fmake_vector (make_number (3), Qnil);
7316 for (i = 0; i < 3; i++)
7317 {
7318 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7319 ASET (subsidiaries, i, intern (buf));
7320 }
7321 return subsidiaries;
7322 }
7323
7324
7325 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7326 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7327 doc: /* For internal use only.
7328 usage: (define-coding-system-internal ...) */)
7329 (nargs, args)
7330 int nargs;
7331 Lisp_Object *args;
7332 {
7333 Lisp_Object name;
7334 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7335 Lisp_Object attrs; /* Vector of attributes. */
7336 Lisp_Object eol_type;
7337 Lisp_Object aliases;
7338 Lisp_Object coding_type, charset_list, safe_charsets;
7339 enum coding_category category;
7340 Lisp_Object tail, val;
7341 int max_charset_id = 0;
7342 int i;
7343
7344 if (nargs < coding_arg_max)
7345 goto short_args;
7346
7347 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7348
7349 name = args[coding_arg_name];
7350 CHECK_SYMBOL (name);
7351 CODING_ATTR_BASE_NAME (attrs) = name;
7352
7353 val = args[coding_arg_mnemonic];
7354 if (! STRINGP (val))
7355 CHECK_CHARACTER (val);
7356 CODING_ATTR_MNEMONIC (attrs) = val;
7357
7358 coding_type = args[coding_arg_coding_type];
7359 CHECK_SYMBOL (coding_type);
7360 CODING_ATTR_TYPE (attrs) = coding_type;
7361
7362 charset_list = args[coding_arg_charset_list];
7363 if (SYMBOLP (charset_list))
7364 {
7365 if (EQ (charset_list, Qiso_2022))
7366 {
7367 if (! EQ (coding_type, Qiso_2022))
7368 error ("Invalid charset-list");
7369 charset_list = Viso_2022_charset_list;
7370 }
7371 else if (EQ (charset_list, Qemacs_mule))
7372 {
7373 if (! EQ (coding_type, Qemacs_mule))
7374 error ("Invalid charset-list");
7375 charset_list = Vemacs_mule_charset_list;
7376 }
7377 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7378 if (max_charset_id < XFASTINT (XCAR (tail)))
7379 max_charset_id = XFASTINT (XCAR (tail));
7380 }
7381 else
7382 {
7383 charset_list = Fcopy_sequence (charset_list);
7384 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7385 {
7386 struct charset *charset;
7387
7388 val = Fcar (tail);
7389 CHECK_CHARSET_GET_CHARSET (val, charset);
7390 if (EQ (coding_type, Qiso_2022)
7391 ? CHARSET_ISO_FINAL (charset) < 0
7392 : EQ (coding_type, Qemacs_mule)
7393 ? CHARSET_EMACS_MULE_ID (charset) < 0
7394 : 0)
7395 error ("Can't handle charset `%s'",
7396 XSYMBOL (CHARSET_NAME (charset))->name->data);
7397
7398 XCAR (tail) = make_number (charset->id);
7399 if (max_charset_id < charset->id)
7400 max_charset_id = charset->id;
7401 }
7402 }
7403 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7404
7405 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7406 make_number (255));
7407 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7408 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7409 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7410
7411 val = args[coding_arg_decode_translation_table];
7412 if (! NILP (val))
7413 CHECK_CHAR_TABLE (val);
7414 CODING_ATTR_DECODE_TBL (attrs) = val;
7415
7416 val = args[coding_arg_encode_translation_table];
7417 if (! NILP (val))
7418 CHECK_CHAR_TABLE (val);
7419 CODING_ATTR_ENCODE_TBL (attrs) = val;
7420
7421 val = args[coding_arg_post_read_conversion];
7422 CHECK_SYMBOL (val);
7423 CODING_ATTR_POST_READ (attrs) = val;
7424
7425 val = args[coding_arg_pre_write_conversion];
7426 CHECK_SYMBOL (val);
7427 CODING_ATTR_PRE_WRITE (attrs) = val;
7428
7429 val = args[coding_arg_default_char];
7430 if (NILP (val))
7431 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7432 else
7433 {
7434 CHECK_CHARACTER (val);
7435 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7436 }
7437
7438 val = args[coding_arg_plist];
7439 CHECK_LIST (val);
7440 CODING_ATTR_PLIST (attrs) = val;
7441
7442 if (EQ (coding_type, Qcharset))
7443 {
7444 /* Generate a lisp vector of 256 elements. Each element is nil,
7445 integer, or a list of charset IDs.
7446
7447 If Nth element is nil, the byte code N is invalid in this
7448 coding system.
7449
7450 If Nth element is a number NUM, N is the first byte of a
7451 charset whose ID is NUM.
7452
7453 If Nth element is a list of charset IDs, N is the first byte
7454 of one of them. The list is sorted by dimensions of the
7455 charsets. A charset of smaller dimension comes firtst.
7456 */
7457 val = Fmake_vector (make_number (256), Qnil);
7458
7459 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7460 {
7461 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7462 int dim = CHARSET_DIMENSION (charset);
7463 int idx = (dim - 1) * 4;
7464
7465 for (i = charset->code_space[idx];
7466 i <= charset->code_space[idx + 1]; i++)
7467 {
7468 Lisp_Object tmp, tmp2;
7469 int dim2;
7470
7471 tmp = AREF (val, i);
7472 if (NILP (tmp))
7473 tmp = XCAR (tail);
7474 else if (NUMBERP (tmp))
7475 {
7476 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
7477 if (dim < dim2)
7478 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
7479 else
7480 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
7481 }
7482 else
7483 {
7484 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
7485 {
7486 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
7487 if (dim < dim2)
7488 break;
7489 }
7490 if (NILP (tmp2))
7491 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
7492 else
7493 {
7494 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
7495 XSETCAR (tmp2, XCAR (tail));
7496 }
7497 }
7498 ASET (val, i, tmp);
7499 }
7500 }
7501 ASET (attrs, coding_attr_charset_valids, val);
7502 category = coding_category_charset;
7503 }
7504 else if (EQ (coding_type, Qccl))
7505 {
7506 Lisp_Object valids;
7507
7508 if (nargs < coding_arg_ccl_max)
7509 goto short_args;
7510
7511 val = args[coding_arg_ccl_decoder];
7512 CHECK_CCL_PROGRAM (val);
7513 if (VECTORP (val))
7514 val = Fcopy_sequence (val);
7515 ASET (attrs, coding_attr_ccl_decoder, val);
7516
7517 val = args[coding_arg_ccl_encoder];
7518 CHECK_CCL_PROGRAM (val);
7519 if (VECTORP (val))
7520 val = Fcopy_sequence (val);
7521 ASET (attrs, coding_attr_ccl_encoder, val);
7522
7523 val = args[coding_arg_ccl_valids];
7524 valids = Fmake_string (make_number (256), make_number (0));
7525 for (tail = val; !NILP (tail); tail = Fcdr (tail))
7526 {
7527 val = Fcar (tail);
7528 if (INTEGERP (val))
7529 ASET (valids, XINT (val), 1);
7530 else
7531 {
7532 int from, to;
7533
7534 CHECK_CONS (val);
7535 CHECK_NUMBER (XCAR (val));
7536 CHECK_NUMBER (XCDR (val));
7537 from = XINT (XCAR (val));
7538 to = XINT (XCDR (val));
7539 for (i = from; i <= to; i++)
7540 ASET (valids, i, 1);
7541 }
7542 }
7543 ASET (attrs, coding_attr_ccl_valids, valids);
7544
7545 category = coding_category_ccl;
7546 }
7547 else if (EQ (coding_type, Qutf_16))
7548 {
7549 Lisp_Object bom, endian;
7550
7551 if (nargs < coding_arg_utf16_max)
7552 goto short_args;
7553
7554 bom = args[coding_arg_utf16_bom];
7555 if (! NILP (bom) && ! EQ (bom, Qt))
7556 {
7557 CHECK_CONS (bom);
7558 CHECK_CODING_SYSTEM (XCAR (bom));
7559 CHECK_CODING_SYSTEM (XCDR (bom));
7560 }
7561 ASET (attrs, coding_attr_utf_16_bom, bom);
7562
7563 endian = args[coding_arg_utf16_endian];
7564 ASET (attrs, coding_attr_utf_16_endian, endian);
7565
7566 category = (CONSP (bom)
7567 ? coding_category_utf_16_auto
7568 : NILP (bom)
7569 ? (NILP (endian)
7570 ? coding_category_utf_16_be_nosig
7571 : coding_category_utf_16_le_nosig)
7572 : (NILP (endian)
7573 ? coding_category_utf_16_be
7574 : coding_category_utf_16_le));
7575 }
7576 else if (EQ (coding_type, Qiso_2022))
7577 {
7578 Lisp_Object initial, reg_usage, request, flags;
7579 int i, id;
7580
7581 if (nargs < coding_arg_iso2022_max)
7582 goto short_args;
7583
7584 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7585 CHECK_VECTOR (initial);
7586 for (i = 0; i < 4; i++)
7587 {
7588 val = Faref (initial, make_number (i));
7589 if (! NILP (val))
7590 {
7591 CHECK_CHARSET_GET_ID (val, id);
7592 ASET (initial, i, make_number (id));
7593 }
7594 else
7595 ASET (initial, i, make_number (-1));
7596 }
7597
7598 reg_usage = args[coding_arg_iso2022_reg_usage];
7599 CHECK_CONS (reg_usage);
7600 CHECK_NATNUM (XCAR (reg_usage));
7601 CHECK_NATNUM (XCDR (reg_usage));
7602
7603 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7604 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
7605 {
7606 int id;
7607
7608 val = Fcar (tail);
7609 CHECK_CONS (val);
7610 CHECK_CHARSET_GET_ID (XCAR (val), id);
7611 CHECK_NATNUM (XCDR (val));
7612 if (XINT (XCDR (val)) >= 4)
7613 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7614 XCAR (val) = make_number (id);
7615 }
7616
7617 flags = args[coding_arg_iso2022_flags];
7618 CHECK_NATNUM (flags);
7619 i = XINT (flags);
7620 if (EQ (args[coding_arg_charset_list], Qiso_2022))
7621 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7622
7623 ASET (attrs, coding_attr_iso_initial, initial);
7624 ASET (attrs, coding_attr_iso_usage, reg_usage);
7625 ASET (attrs, coding_attr_iso_request, request);
7626 ASET (attrs, coding_attr_iso_flags, flags);
7627 setup_iso_safe_charsets (attrs);
7628
7629 if (i & CODING_ISO_FLAG_SEVEN_BITS)
7630 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7631 | CODING_ISO_FLAG_SINGLE_SHIFT))
7632 ? coding_category_iso_7_else
7633 : EQ (args[coding_arg_charset_list], Qiso_2022)
7634 ? coding_category_iso_7
7635 : coding_category_iso_7_tight);
7636 else
7637 {
7638 int id = XINT (AREF (initial, 1));
7639
7640 category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7641 | CODING_ISO_FLAG_SINGLE_SHIFT))
7642 || EQ (args[coding_arg_charset_list], Qiso_2022)
7643 || id < 0)
7644 ? coding_category_iso_8_else
7645 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7646 ? coding_category_iso_8_1
7647 : coding_category_iso_8_2);
7648 }
7649 }
7650 else if (EQ (coding_type, Qemacs_mule))
7651 {
7652 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7653 ASET (attrs, coding_attr_emacs_mule_full, Qt);
7654
7655 category = coding_category_emacs_mule;
7656 }
7657 else if (EQ (coding_type, Qshift_jis))
7658 {
7659
7660 struct charset *charset;
7661
7662 if (XINT (Flength (charset_list)) != 3)
7663 error ("There should be just three charsets");
7664
7665 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7666 if (CHARSET_DIMENSION (charset) != 1)
7667 error ("Dimension of charset %s is not one",
7668 XSYMBOL (CHARSET_NAME (charset))->name->data);
7669
7670 charset_list = XCDR (charset_list);
7671 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7672 if (CHARSET_DIMENSION (charset) != 1)
7673 error ("Dimension of charset %s is not one",
7674 XSYMBOL (CHARSET_NAME (charset))->name->data);
7675
7676 charset_list = XCDR (charset_list);
7677 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7678 if (CHARSET_DIMENSION (charset) != 2)
7679 error ("Dimension of charset %s is not two",
7680 XSYMBOL (CHARSET_NAME (charset))->name->data);
7681
7682 category = coding_category_sjis;
7683 Vsjis_coding_system = name;
7684 }
7685 else if (EQ (coding_type, Qbig5))
7686 {
7687 struct charset *charset;
7688
7689 if (XINT (Flength (charset_list)) != 2)
7690 error ("There should be just two charsets");
7691
7692 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7693 if (CHARSET_DIMENSION (charset) != 1)
7694 error ("Dimension of charset %s is not one",
7695 XSYMBOL (CHARSET_NAME (charset))->name->data);
7696
7697 charset_list = XCDR (charset_list);
7698 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7699 if (CHARSET_DIMENSION (charset) != 2)
7700 error ("Dimension of charset %s is not two",
7701 XSYMBOL (CHARSET_NAME (charset))->name->data);
7702
7703 category = coding_category_big5;
7704 Vbig5_coding_system = name;
7705 }
7706 else if (EQ (coding_type, Qraw_text))
7707 category = coding_category_raw_text;
7708 else if (EQ (coding_type, Qutf_8))
7709 category = coding_category_utf_8;
7710 else if (EQ (coding_type, Qundecided))
7711 category = coding_category_undecided;
7712 else
7713 error ("Invalid coding system type: %s",
7714 XSYMBOL (coding_type)->name->data);
7715
7716 CODING_ATTR_CATEGORY (attrs) = make_number (category);
7717
7718 eol_type = args[coding_arg_eol_type];
7719 if (! NILP (eol_type)
7720 && ! EQ (eol_type, Qunix)
7721 && ! EQ (eol_type, Qdos)
7722 && ! EQ (eol_type, Qmac))
7723 error ("Invalid eol-type");
7724
7725 aliases = Fcons (name, Qnil);
7726
7727 if (NILP (eol_type))
7728 {
7729 eol_type = make_subsidiaries (name);
7730 for (i = 0; i < 3; i++)
7731 {
7732 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7733
7734 this_name = AREF (eol_type, i);
7735 this_aliases = Fcons (this_name, Qnil);
7736 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7737 this_spec = Fmake_vector (make_number (3), attrs);
7738 ASET (this_spec, 1, this_aliases);
7739 ASET (this_spec, 2, this_eol_type);
7740 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7741 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7742 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7743 Vcoding_system_alist);
7744 }
7745 }
7746
7747 spec_vec = Fmake_vector (make_number (3), attrs);
7748 ASET (spec_vec, 1, aliases);
7749 ASET (spec_vec, 2, eol_type);
7750
7751 Fputhash (name, spec_vec, Vcoding_system_hash_table);
7752 Vcoding_system_list = Fcons (name, Vcoding_system_list);
7753 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7754 Vcoding_system_alist);
7755
7756 {
7757 int id = coding_categories[category].id;
7758
7759 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7760 setup_coding_system (name, &coding_categories[category]);
7761 }
7762
7763 return Qnil;
7764
7765 short_args:
7766 return Fsignal (Qwrong_number_of_arguments,
7767 Fcons (intern ("define-coding-system-internal"),
7768 make_number (nargs)));
7769 }
7770
7771 /* Fixme: should this record the alias relationships for
7772 diagnostics? */
7773 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7774 Sdefine_coding_system_alias, 2, 2, 0,
7775 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7776 (alias, coding_system)
7777 Lisp_Object alias, coding_system;
7778 {
7779 Lisp_Object spec, aliases, eol_type;
7780
7781 CHECK_SYMBOL (alias);
7782 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7783 aliases = AREF (spec, 1);
7784 while (!NILP (XCDR (aliases)))
7785 aliases = XCDR (aliases);
7786 XCDR (aliases) = Fcons (alias, Qnil);
7787
7788 eol_type = AREF (spec, 2);
7789 if (VECTORP (eol_type))
7790 {
7791 Lisp_Object subsidiaries;
7792 int i;
7793
7794 subsidiaries = make_subsidiaries (alias);
7795 for (i = 0; i < 3; i++)
7796 Fdefine_coding_system_alias (AREF (subsidiaries, i),
7797 AREF (eol_type, i));
7798
7799 ASET (spec, 2, subsidiaries);
7800 }
7801
7802 Fputhash (alias, spec, Vcoding_system_hash_table);
7803 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
7804 Vcoding_system_alist);
7805
7806 return Qnil;
7807 }
7808
7809 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7810 1, 1, 0,
7811 doc: /* Return the base of CODING-SYSTEM.
7812 Any alias or subsidiary coding system is not a base coding system. */)
7813 (coding_system)
7814 Lisp_Object coding_system;
7815 {
7816 Lisp_Object spec, attrs;
7817
7818 if (NILP (coding_system))
7819 return (Qno_conversion);
7820 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7821 attrs = AREF (spec, 0);
7822 return CODING_ATTR_BASE_NAME (attrs);
7823 }
7824
7825 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7826 1, 1, 0,
7827 doc: "Return the property list of CODING-SYSTEM.")
7828 (coding_system)
7829 Lisp_Object coding_system;
7830 {
7831 Lisp_Object spec, attrs;
7832
7833 if (NILP (coding_system))
7834 coding_system = Qno_conversion;
7835 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7836 attrs = AREF (spec, 0);
7837 return CODING_ATTR_PLIST (attrs);
7838 }
7839
7840
7841 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7842 1, 1, 0,
7843 doc: /* Return the list of aliases of CODING-SYSTEM. */)
7844 (coding_system)
7845 Lisp_Object coding_system;
7846 {
7847 Lisp_Object spec;
7848
7849 if (NILP (coding_system))
7850 coding_system = Qno_conversion;
7851 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7852 return AREF (spec, 1);
7853 }
7854
7855 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7856 Scoding_system_eol_type, 1, 1, 0,
7857 doc: /* Return eol-type of CODING-SYSTEM.
7858 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7859
7860 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7861 and CR respectively.
7862
7863 A vector value indicates that a format of end-of-line should be
7864 detected automatically. Nth element of the vector is the subsidiary
7865 coding system whose eol-type is N. */)
7866 (coding_system)
7867 Lisp_Object coding_system;
7868 {
7869 Lisp_Object spec, eol_type;
7870 int n;
7871
7872 if (NILP (coding_system))
7873 coding_system = Qno_conversion;
7874 if (! CODING_SYSTEM_P (coding_system))
7875 return Qnil;
7876 spec = CODING_SYSTEM_SPEC (coding_system);
7877 eol_type = AREF (spec, 2);
7878 if (VECTORP (eol_type))
7879 return Fcopy_sequence (eol_type);
7880 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7881 return make_number (n);
7882 }
7883
7884 #endif /* emacs */
7885
7886 \f
7887 /*** 9. Post-amble ***/
7888
7889 void
7890 init_coding_once ()
7891 {
7892 int i;
7893
7894 for (i = 0; i < coding_category_max; i++)
7895 {
7896 coding_categories[i].id = -1;
7897 coding_priorities[i] = i;
7898 }
7899
7900 /* ISO2022 specific initialize routine. */
7901 for (i = 0; i < 0x20; i++)
7902 iso_code_class[i] = ISO_control_0;
7903 for (i = 0x21; i < 0x7F; i++)
7904 iso_code_class[i] = ISO_graphic_plane_0;
7905 for (i = 0x80; i < 0xA0; i++)
7906 iso_code_class[i] = ISO_control_1;
7907 for (i = 0xA1; i < 0xFF; i++)
7908 iso_code_class[i] = ISO_graphic_plane_1;
7909 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7910 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7911 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7912 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7913 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7914 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7915 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7916 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7917 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7918 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7919
7920 inhibit_pre_post_conversion = 0;
7921
7922 for (i = 0; i < 256; i++)
7923 {
7924 emacs_mule_bytes[i] = 1;
7925 }
7926 emacs_mule_bytes[LEADING_CODE_PRIVATE_11] = 3;
7927 emacs_mule_bytes[LEADING_CODE_PRIVATE_12] = 3;
7928 emacs_mule_bytes[LEADING_CODE_PRIVATE_21] = 4;
7929 emacs_mule_bytes[LEADING_CODE_PRIVATE_22] = 4;
7930 }
7931
7932 #ifdef emacs
7933
7934 void
7935 syms_of_coding ()
7936 {
7937 staticpro (&Vcoding_system_hash_table);
7938 Vcoding_system_hash_table = Fmakehash (Qeq);
7939
7940 staticpro (&Vsjis_coding_system);
7941 Vsjis_coding_system = Qnil;
7942
7943 staticpro (&Vbig5_coding_system);
7944 Vbig5_coding_system = Qnil;
7945
7946 staticpro (&Vcode_conversion_work_buf_list);
7947 Vcode_conversion_work_buf_list = Qnil;
7948
7949 staticpro (&Vcode_conversion_reused_work_buf);
7950 Vcode_conversion_reused_work_buf = Qnil;
7951
7952 DEFSYM (Qcharset, "charset");
7953 DEFSYM (Qtarget_idx, "target-idx");
7954 DEFSYM (Qcoding_system_history, "coding-system-history");
7955 Fset (Qcoding_system_history, Qnil);
7956
7957 /* Target FILENAME is the first argument. */
7958 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7959 /* Target FILENAME is the third argument. */
7960 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7961
7962 DEFSYM (Qcall_process, "call-process");
7963 /* Target PROGRAM is the first argument. */
7964 Fput (Qcall_process, Qtarget_idx, make_number (0));
7965
7966 DEFSYM (Qcall_process_region, "call-process-region");
7967 /* Target PROGRAM is the third argument. */
7968 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7969
7970 DEFSYM (Qstart_process, "start-process");
7971 /* Target PROGRAM is the third argument. */
7972 Fput (Qstart_process, Qtarget_idx, make_number (2));
7973
7974 DEFSYM (Qopen_network_stream, "open-network-stream");
7975 /* Target SERVICE is the fourth argument. */
7976 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7977
7978 DEFSYM (Qcoding_system, "coding-system");
7979 DEFSYM (Qcoding_aliases, "coding-aliases");
7980
7981 DEFSYM (Qeol_type, "eol-type");
7982 DEFSYM (Qunix, "unix");
7983 DEFSYM (Qdos, "dos");
7984
7985 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7986 DEFSYM (Qpost_read_conversion, "post-read-conversion");
7987 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7988 DEFSYM (Qdefault_char, "default-char");
7989 DEFSYM (Qundecided, "undecided");
7990 DEFSYM (Qno_conversion, "no-conversion");
7991 DEFSYM (Qraw_text, "raw-text");
7992
7993 DEFSYM (Qiso_2022, "iso-2022");
7994
7995 DEFSYM (Qutf_8, "utf-8");
7996
7997 DEFSYM (Qutf_16, "utf-16");
7998 DEFSYM (Qutf_16_be, "utf-16-be");
7999 DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
8000 DEFSYM (Qutf_16_le, "utf-16-l3");
8001 DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
8002 DEFSYM (Qsignature, "signature");
8003 DEFSYM (Qendian, "endian");
8004 DEFSYM (Qbig, "big");
8005 DEFSYM (Qlittle, "little");
8006
8007 DEFSYM (Qshift_jis, "shift-jis");
8008 DEFSYM (Qbig5, "big5");
8009
8010 DEFSYM (Qcoding_system_p, "coding-system-p");
8011
8012 DEFSYM (Qcoding_system_error, "coding-system-error");
8013 Fput (Qcoding_system_error, Qerror_conditions,
8014 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8015 Fput (Qcoding_system_error, Qerror_message,
8016 build_string ("Invalid coding system"));
8017
8018 /* Intern this now in case it isn't already done.
8019 Setting this variable twice is harmless.
8020 But don't staticpro it here--that is done in alloc.c. */
8021 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8022
8023 DEFSYM (Qtranslation_table, "translation-table");
8024 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8025 DEFSYM (Qtranslation_table_id, "translation-table-id");
8026 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8027 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8028
8029 DEFSYM (Qvalid_codes, "valid-codes");
8030
8031 DEFSYM (Qemacs_mule, "emacs-mule");
8032
8033 Vcoding_category_table
8034 = Fmake_vector (make_number (coding_category_max), Qnil);
8035 staticpro (&Vcoding_category_table);
8036 /* Followings are target of code detection. */
8037 ASET (Vcoding_category_table, coding_category_iso_7,
8038 intern ("coding-category-iso-7"));
8039 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8040 intern ("coding-category-iso-7-tight"));
8041 ASET (Vcoding_category_table, coding_category_iso_8_1,
8042 intern ("coding-category-iso-8-1"));
8043 ASET (Vcoding_category_table, coding_category_iso_8_2,
8044 intern ("coding-category-iso-8-2"));
8045 ASET (Vcoding_category_table, coding_category_iso_7_else,
8046 intern ("coding-category-iso-7-else"));
8047 ASET (Vcoding_category_table, coding_category_iso_8_else,
8048 intern ("coding-category-iso-8-else"));
8049 ASET (Vcoding_category_table, coding_category_utf_8,
8050 intern ("coding-category-utf-8"));
8051 ASET (Vcoding_category_table, coding_category_utf_16_be,
8052 intern ("coding-category-utf-16-be"));
8053 ASET (Vcoding_category_table, coding_category_utf_16_le,
8054 intern ("coding-category-utf-16-le"));
8055 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8056 intern ("coding-category-utf-16-be-nosig"));
8057 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8058 intern ("coding-category-utf-16-le-nosig"));
8059 ASET (Vcoding_category_table, coding_category_charset,
8060 intern ("coding-category-charset"));
8061 ASET (Vcoding_category_table, coding_category_sjis,
8062 intern ("coding-category-sjis"));
8063 ASET (Vcoding_category_table, coding_category_big5,
8064 intern ("coding-category-big5"));
8065 ASET (Vcoding_category_table, coding_category_ccl,
8066 intern ("coding-category-ccl"));
8067 ASET (Vcoding_category_table, coding_category_emacs_mule,
8068 intern ("coding-category-emacs-mule"));
8069 /* Followings are NOT target of code detection. */
8070 ASET (Vcoding_category_table, coding_category_raw_text,
8071 intern ("coding-category-raw-text"));
8072 ASET (Vcoding_category_table, coding_category_undecided,
8073 intern ("coding-category-undecided"));
8074
8075 defsubr (&Scoding_system_p);
8076 defsubr (&Sread_coding_system);
8077 defsubr (&Sread_non_nil_coding_system);
8078 defsubr (&Scheck_coding_system);
8079 defsubr (&Sdetect_coding_region);
8080 defsubr (&Sdetect_coding_string);
8081 defsubr (&Sfind_coding_systems_region_internal);
8082 defsubr (&Scheck_coding_systems_region);
8083 defsubr (&Sdecode_coding_region);
8084 defsubr (&Sencode_coding_region);
8085 defsubr (&Sdecode_coding_string);
8086 defsubr (&Sencode_coding_string);
8087 defsubr (&Sdecode_sjis_char);
8088 defsubr (&Sencode_sjis_char);
8089 defsubr (&Sdecode_big5_char);
8090 defsubr (&Sencode_big5_char);
8091 defsubr (&Sset_terminal_coding_system_internal);
8092 defsubr (&Sset_safe_terminal_coding_system_internal);
8093 defsubr (&Sterminal_coding_system);
8094 defsubr (&Sset_keyboard_coding_system_internal);
8095 defsubr (&Skeyboard_coding_system);
8096 defsubr (&Sfind_operation_coding_system);
8097 defsubr (&Sset_coding_system_priority);
8098 defsubr (&Sdefine_coding_system_internal);
8099 defsubr (&Sdefine_coding_system_alias);
8100 defsubr (&Scoding_system_base);
8101 defsubr (&Scoding_system_plist);
8102 defsubr (&Scoding_system_aliases);
8103 defsubr (&Scoding_system_eol_type);
8104 defsubr (&Scoding_system_priority_list);
8105
8106 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8107 doc: /* List of coding systems.
8108
8109 Do not alter the value of this variable manually. This variable should be
8110 updated by the functions `define-coding-system' and
8111 `define-coding-system-alias'. */);
8112 Vcoding_system_list = Qnil;
8113
8114 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8115 doc: /* Alist of coding system names.
8116 Each element is one element list of coding system name.
8117 This variable is given to `completing-read' as TABLE argument.
8118
8119 Do not alter the value of this variable manually. This variable should be
8120 updated by the functions `make-coding-system' and
8121 `define-coding-system-alias'. */);
8122 Vcoding_system_alist = Qnil;
8123
8124 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8125 doc: /* List of coding-categories (symbols) ordered by priority.
8126
8127 On detecting a coding system, Emacs tries code detection algorithms
8128 associated with each coding-category one by one in this order. When
8129 one algorithm agrees with a byte sequence of source text, the coding
8130 system bound to the corresponding coding-category is selected. */);
8131 {
8132 int i;
8133
8134 Vcoding_category_list = Qnil;
8135 for (i = coding_category_max - 1; i >= 0; i--)
8136 Vcoding_category_list
8137 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8138 Vcoding_category_list);
8139 }
8140
8141 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8142 doc: /* Specify the coding system for read operations.
8143 It is useful to bind this variable with `let', but do not set it globally.
8144 If the value is a coding system, it is used for decoding on read operation.
8145 If not, an appropriate element is used from one of the coding system alists:
8146 There are three such tables, `file-coding-system-alist',
8147 `process-coding-system-alist', and `network-coding-system-alist'. */);
8148 Vcoding_system_for_read = Qnil;
8149
8150 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8151 doc: /* Specify the coding system for write operations.
8152 Programs bind this variable with `let', but you should not set it globally.
8153 If the value is a coding system, it is used for encoding of output,
8154 when writing it to a file and when sending it to a file or subprocess.
8155
8156 If this does not specify a coding system, an appropriate element
8157 is used from one of the coding system alists:
8158 There are three such tables, `file-coding-system-alist',
8159 `process-coding-system-alist', and `network-coding-system-alist'.
8160 For output to files, if the above procedure does not specify a coding system,
8161 the value of `buffer-file-coding-system' is used. */);
8162 Vcoding_system_for_write = Qnil;
8163
8164 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8165 doc: /*
8166 Coding system used in the latest file or process I/O. */);
8167 Vlast_coding_system_used = Qnil;
8168
8169 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8170 doc: /*
8171 *Non-nil means always inhibit code conversion of end-of-line format.
8172 See info node `Coding Systems' and info node `Text and Binary' concerning
8173 such conversion. */);
8174 inhibit_eol_conversion = 0;
8175
8176 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8177 doc: /*
8178 Non-nil means process buffer inherits coding system of process output.
8179 Bind it to t if the process output is to be treated as if it were a file
8180 read from some filesystem. */);
8181 inherit_process_coding_system = 0;
8182
8183 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8184 doc: /*
8185 Alist to decide a coding system to use for a file I/O operation.
8186 The format is ((PATTERN . VAL) ...),
8187 where PATTERN is a regular expression matching a file name,
8188 VAL is a coding system, a cons of coding systems, or a function symbol.
8189 If VAL is a coding system, it is used for both decoding and encoding
8190 the file contents.
8191 If VAL is a cons of coding systems, the car part is used for decoding,
8192 and the cdr part is used for encoding.
8193 If VAL is a function symbol, the function must return a coding system
8194 or a cons of coding systems which are used as above. The function gets
8195 the arguments with which `find-operation-coding-systems' was called.
8196
8197 See also the function `find-operation-coding-system'
8198 and the variable `auto-coding-alist'. */);
8199 Vfile_coding_system_alist = Qnil;
8200
8201 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8202 doc: /*
8203 Alist to decide a coding system to use for a process I/O operation.
8204 The format is ((PATTERN . VAL) ...),
8205 where PATTERN is a regular expression matching a program name,
8206 VAL is a coding system, a cons of coding systems, or a function symbol.
8207 If VAL is a coding system, it is used for both decoding what received
8208 from the program and encoding what sent to the program.
8209 If VAL is a cons of coding systems, the car part is used for decoding,
8210 and the cdr part is used for encoding.
8211 If VAL is a function symbol, the function must return a coding system
8212 or a cons of coding systems which are used as above.
8213
8214 See also the function `find-operation-coding-system'. */);
8215 Vprocess_coding_system_alist = Qnil;
8216
8217 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8218 doc: /*
8219 Alist to decide a coding system to use for a network I/O operation.
8220 The format is ((PATTERN . VAL) ...),
8221 where PATTERN is a regular expression matching a network service name
8222 or is a port number to connect to,
8223 VAL is a coding system, a cons of coding systems, or a function symbol.
8224 If VAL is a coding system, it is used for both decoding what received
8225 from the network stream and encoding what sent to the network stream.
8226 If VAL is a cons of coding systems, the car part is used for decoding,
8227 and the cdr part is used for encoding.
8228 If VAL is a function symbol, the function must return a coding system
8229 or a cons of coding systems which are used as above.
8230
8231 See also the function `find-operation-coding-system'. */);
8232 Vnetwork_coding_system_alist = Qnil;
8233
8234 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8235 doc: /* Coding system to use with system messages.
8236 Also used for decoding keyboard input on X Window system. */);
8237 Vlocale_coding_system = Qnil;
8238
8239 /* The eol mnemonics are reset in startup.el system-dependently. */
8240 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8241 doc: /*
8242 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8243 eol_mnemonic_unix = build_string (":");
8244
8245 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8246 doc: /*
8247 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8248 eol_mnemonic_dos = build_string ("\\");
8249
8250 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8251 doc: /*
8252 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8253 eol_mnemonic_mac = build_string ("/");
8254
8255 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8256 doc: /*
8257 *String displayed in mode line when end-of-line format is not yet determined. */);
8258 eol_mnemonic_undecided = build_string (":");
8259
8260 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8261 doc: /*
8262 *Non-nil enables character translation while encoding and decoding. */);
8263 Venable_character_translation = Qt;
8264
8265 DEFVAR_LISP ("standard-translation-table-for-decode",
8266 &Vstandard_translation_table_for_decode,
8267 doc: /* Table for translating characters while decoding. */);
8268 Vstandard_translation_table_for_decode = Qnil;
8269
8270 DEFVAR_LISP ("standard-translation-table-for-encode",
8271 &Vstandard_translation_table_for_encode,
8272 doc: /* Table for translating characters while encoding. */);
8273 Vstandard_translation_table_for_encode = Qnil;
8274
8275 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8276 doc: /* Alist of charsets vs revision numbers.
8277 While encoding, if a charset (car part of an element) is found,
8278 designate it with the escape sequence identifying revision (cdr part
8279 of the element). */);
8280 Vcharset_revision_table = Qnil;
8281
8282 DEFVAR_LISP ("default-process-coding-system",
8283 &Vdefault_process_coding_system,
8284 doc: /* Cons of coding systems used for process I/O by default.
8285 The car part is used for decoding a process output,
8286 the cdr part is used for encoding a text to be sent to a process. */);
8287 Vdefault_process_coding_system = Qnil;
8288
8289 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8290 doc: /*
8291 Table of extra Latin codes in the range 128..159 (inclusive).
8292 This is a vector of length 256.
8293 If Nth element is non-nil, the existence of code N in a file
8294 \(or output of subprocess) doesn't prevent it to be detected as
8295 a coding system of ISO 2022 variant which has a flag
8296 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8297 or reading output of a subprocess.
8298 Only 128th through 159th elements has a meaning. */);
8299 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8300
8301 DEFVAR_LISP ("select-safe-coding-system-function",
8302 &Vselect_safe_coding_system_function,
8303 doc: /*
8304 Function to call to select safe coding system for encoding a text.
8305
8306 If set, this function is called to force a user to select a proper
8307 coding system which can encode the text in the case that a default
8308 coding system used in each operation can't encode the text.
8309
8310 The default value is `select-safe-coding-system' (which see). */);
8311 Vselect_safe_coding_system_function = Qnil;
8312
8313 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8314 &inhibit_iso_escape_detection,
8315 doc: /*
8316 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8317
8318 By default, on reading a file, Emacs tries to detect how the text is
8319 encoded. This code detection is sensitive to escape sequences. If
8320 the sequence is valid as ISO2022, the code is determined as one of
8321 the ISO2022 encodings, and the file is decoded by the corresponding
8322 coding system (e.g. `iso-2022-7bit').
8323
8324 However, there may be a case that you want to read escape sequences in
8325 a file as is. In such a case, you can set this variable to non-nil.
8326 Then, as the code detection ignores any escape sequences, no file is
8327 detected as encoded in some ISO2022 encoding. The result is that all
8328 escape sequences become visible in a buffer.
8329
8330 The default value is nil, and it is strongly recommended not to change
8331 it. That is because many Emacs Lisp source files that contain
8332 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8333 in Emacs's distribution, and they won't be decoded correctly on
8334 reading if you suppress escape sequence detection.
8335
8336 The other way to read escape sequences in a file without decoding is
8337 to explicitly specify some coding system that doesn't use ISO2022's
8338 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8339 inhibit_iso_escape_detection = 0;
8340
8341 {
8342 Lisp_Object args[coding_arg_max];
8343 Lisp_Object plist[14];
8344 int i;
8345
8346 for (i = 0; i < coding_arg_max; i++)
8347 args[i] = Qnil;
8348
8349 plist[0] = intern (":name");
8350 plist[1] = args[coding_arg_name] = Qno_conversion;
8351 plist[2] = intern (":mnemonic");
8352 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8353 plist[4] = intern (":coding-type");
8354 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8355 plist[6] = intern (":ascii-compatible-p");
8356 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8357 plist[8] = intern (":default-char");
8358 plist[9] = args[coding_arg_default_char] = make_number (0);
8359 plist[10] = intern (":docstring");
8360 plist[11] = build_string ("Do no conversion.\n\
8361 \n\
8362 When you visit a file with this coding, the file is read into a\n\
8363 unibyte buffer as is, thus each byte of a file is treated as a\n\
8364 character.");
8365 plist[12] = intern (":eol-type");
8366 plist[13] = args[coding_arg_eol_type] = Qunix;
8367 args[coding_arg_plist] = Flist (14, plist);
8368 Fdefine_coding_system_internal (coding_arg_max, args);
8369 }
8370
8371 setup_coding_system (Qno_conversion, &keyboard_coding);
8372 setup_coding_system (Qno_conversion, &terminal_coding);
8373 setup_coding_system (Qno_conversion, &safe_terminal_coding);
8374 }
8375
8376 char *
8377 emacs_strerror (error_number)
8378 int error_number;
8379 {
8380 char *str;
8381
8382 synchronize_system_messages_locale ();
8383 str = strerror (error_number);
8384
8385 if (! NILP (Vlocale_coding_system))
8386 {
8387 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8388 Vlocale_coding_system,
8389 0);
8390 str = (char *) XSTRING (dec)->data;
8391 }
8392
8393 return str;
8394 }
8395
8396 #endif /* emacs */