(encode_coding_emacs_mule): Pay attention to raw-8-bit chars.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used by Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for a text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwize (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
151
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specifies each bit of this integer.
154
155 Below is the template of these functions. */
156
157 #if 0
158 static int
159 detect_coding_XXX (coding, mask)
160 struct coding_system *coding;
161 int *mask;
162 {
163 unsigned char *src = coding->source;
164 unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int c;
167 int found = 0;
168 ...;
169
170 while (1)
171 {
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
175 /* Check if it conforms to XXX. If not, break the loop. */
176 }
177 /* As the data is invalid for XXX, reset a proper bits. */
178 *mask &= ~CODING_CATEGORY_XXX;
179 return 0;
180 no_more_source:
181 /* The source exausted. */
182 if (!found)
183 /* ASCII characters only. */
184 return 0;
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask &= CODING_CATEGORY_XXX;
187 return 1;
188 }
189 #endif
190
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
197
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
202
203 Below is the template of these functions. */
204
205 #if 0
206 static void
207 decode_coding_XXXX (coding)
208 struct coding_system *coding;
209 {
210 unsigned char *src = coding->source + coding->consumed;
211 unsigned char *src_end = coding->source + coding->src_bytes;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base;
216 /* A buffer to produce decoded characters. */
217 int *charbuf = coding->charbuf;
218 int *charbuf_end = charbuf + coding->charbuf_size;
219 int multibytep = coding->src_multibyte;
220
221 while (1)
222 {
223 src_base = src;
224 if (charbuf < charbuf_end)
225 /* No more room to produce a decoded character. */
226 break;
227 ONE_MORE_BYTE (c);
228 /* Decode it. */
229 }
230
231 no_more_source:
232 if (src_base < src_end
233 && coding->mode & CODING_MODE_LAST_BLOCK)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base < src_end && charbuf < charbuf_end)
237 *charbuf++ = *src_base++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding->consumed = coding->consumed_char = src_base - coding->source;
241 /* Remember how many characters we produced. */
242 coding->charbuf_used = charbuf - coding->charbuf;
243 }
244 #endif
245
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
252
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
257
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
261
262 Below is a template of these functions. */
263 #if 0
264 static void
265 encode_coding_XXX (coding)
266 struct coding_system *coding;
267 {
268 int multibytep = coding->dst_multibyte;
269 int *charbuf = coding->charbuf;
270 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
271 unsigned char *dst = coding->destination + coding->produced;
272 unsigned char *dst_end = coding->destination + coding->dst_bytes;
273 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
274 int produced_chars = 0;
275
276 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 {
278 int c = *charbuf;
279 /* Encode C into DST, and increment DST. */
280 }
281 label_no_more_destination:
282 /* How many chars and bytes we produced. */
283 coding->produced_char += produced_chars;
284 coding->produced = dst - coding->destination;
285 }
286 #endif
287
288 \f
289 /*** 1. Preamble ***/
290
291 #include <config.h>
292 #include <stdio.h>
293
294 #include "lisp.h"
295 #include "buffer.h"
296 #include "character.h"
297 #include "charset.h"
298 #include "ccl.h"
299 #include "composite.h"
300 #include "coding.h"
301 #include "window.h"
302
303 Lisp_Object Vcoding_system_hash_table;
304
305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
306 Lisp_Object Qunix, Qdos;
307 extern Lisp_Object Qmac; /* frame.c */
308 Lisp_Object Qbuffer_file_coding_system;
309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
310 Lisp_Object Qdefault_char;
311 Lisp_Object Qno_conversion, Qundecided;
312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
313 Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
314 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
315 Lisp_Object Qcoding_system_history;
316 Lisp_Object Qvalid_codes;
317
318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
320 Lisp_Object Qstart_process, Qopen_network_stream;
321 Lisp_Object Qtarget_idx;
322
323 Lisp_Object Vselect_safe_coding_system_function;
324
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
327 /* Mnemonic string to indicate format of end-of-line is not yet
328 decided. */
329 Lisp_Object eol_mnemonic_undecided;
330
331 #ifdef emacs
332
333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
334
335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
336
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule, Qraw_text;
340
341 /* Coding-systems are handed between Emacs Lisp programs and C internal
342 routines by the following three variables. */
343 /* Coding-system for reading files and receiving data from process. */
344 Lisp_Object Vcoding_system_for_read;
345 /* Coding-system for writing files and sending data to process. */
346 Lisp_Object Vcoding_system_for_write;
347 /* Coding-system actually used in the latest I/O. */
348 Lisp_Object Vlast_coding_system_used;
349
350 /* A vector of length 256 which contains information about special
351 Latin codes (especially for dealing with Microsoft codes). */
352 Lisp_Object Vlatin_extra_code_table;
353
354 /* Flag to inhibit code conversion of end-of-line format. */
355 int inhibit_eol_conversion;
356
357 /* Flag to inhibit ISO2022 escape sequence detection. */
358 int inhibit_iso_escape_detection;
359
360 /* Flag to make buffer-file-coding-system inherit from process-coding. */
361 int inherit_process_coding_system;
362
363 /* Coding system to be used to encode text for terminal display. */
364 struct coding_system terminal_coding;
365
366 /* Coding system to be used to encode text for terminal display when
367 terminal coding system is nil. */
368 struct coding_system safe_terminal_coding;
369
370 /* Coding system of what is sent from terminal keyboard. */
371 struct coding_system keyboard_coding;
372
373 Lisp_Object Vfile_coding_system_alist;
374 Lisp_Object Vprocess_coding_system_alist;
375 Lisp_Object Vnetwork_coding_system_alist;
376
377 Lisp_Object Vlocale_coding_system;
378
379 #endif /* emacs */
380
381 /* Flag to tell if we look up translation table on character code
382 conversion. */
383 Lisp_Object Venable_character_translation;
384 /* Standard translation table to look up on decoding (reading). */
385 Lisp_Object Vstandard_translation_table_for_decode;
386 /* Standard translation table to look up on encoding (writing). */
387 Lisp_Object Vstandard_translation_table_for_encode;
388
389 Lisp_Object Qtranslation_table;
390 Lisp_Object Qtranslation_table_id;
391 Lisp_Object Qtranslation_table_for_decode;
392 Lisp_Object Qtranslation_table_for_encode;
393
394 /* Alist of charsets vs revision number. */
395 static Lisp_Object Vcharset_revision_table;
396
397 /* Default coding systems used for process I/O. */
398 Lisp_Object Vdefault_process_coding_system;
399
400 /* Global flag to tell that we can't call post-read-conversion and
401 pre-write-conversion functions. Usually the value is zero, but it
402 is set to 1 temporarily while such functions are running. This is
403 to avoid infinite recursive call. */
404 static int inhibit_pre_post_conversion;
405
406 /* Two special coding systems. */
407 Lisp_Object Vsjis_coding_system;
408 Lisp_Object Vbig5_coding_system;
409
410
411 static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
412 static void decode_coding_utf_8 P_ ((struct coding_system *));
413 static int encode_coding_utf_8 P_ ((struct coding_system *));
414
415 static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
416 static void decode_coding_utf_16 P_ ((struct coding_system *));
417 static int encode_coding_utf_16 P_ ((struct coding_system *));
418
419 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
420 static void decode_coding_iso_2022 P_ ((struct coding_system *));
421 static int encode_coding_iso_2022 P_ ((struct coding_system *));
422
423 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
424 static void decode_coding_emacs_mule P_ ((struct coding_system *));
425 static int encode_coding_emacs_mule P_ ((struct coding_system *));
426
427 static int detect_coding_sjis P_ ((struct coding_system *, int *));
428 static void decode_coding_sjis P_ ((struct coding_system *));
429 static int encode_coding_sjis P_ ((struct coding_system *));
430
431 static int detect_coding_big5 P_ ((struct coding_system *, int *));
432 static void decode_coding_big5 P_ ((struct coding_system *));
433 static int encode_coding_big5 P_ ((struct coding_system *));
434
435 static int detect_coding_ccl P_ ((struct coding_system *, int *));
436 static void decode_coding_ccl P_ ((struct coding_system *));
437 static int encode_coding_ccl P_ ((struct coding_system *));
438
439 static void decode_coding_raw_text P_ ((struct coding_system *));
440 static int encode_coding_raw_text P_ ((struct coding_system *));
441
442
443 /* ISO2022 section */
444
445 #define CODING_ISO_INITIAL(coding, reg) \
446 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
447 coding_attr_iso_initial), \
448 reg)))
449
450
451 #define CODING_ISO_REQUEST(coding, charset_id) \
452 ((charset_id <= (coding)->max_charset_id \
453 ? (coding)->safe_charsets[charset_id] \
454 : -1))
455
456
457 #define CODING_ISO_FLAGS(coding) \
458 ((coding)->spec.iso_2022.flags)
459 #define CODING_ISO_DESIGNATION(coding, reg) \
460 ((coding)->spec.iso_2022.current_designation[reg])
461 #define CODING_ISO_INVOCATION(coding, plane) \
462 ((coding)->spec.iso_2022.current_invocation[plane])
463 #define CODING_ISO_SINGLE_SHIFTING(coding) \
464 ((coding)->spec.iso_2022.single_shifting)
465 #define CODING_ISO_BOL(coding) \
466 ((coding)->spec.iso_2022.bol)
467 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
468 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
469
470 /* Control characters of ISO2022. */
471 /* code */ /* function */
472 #define ISO_CODE_LF 0x0A /* line-feed */
473 #define ISO_CODE_CR 0x0D /* carriage-return */
474 #define ISO_CODE_SO 0x0E /* shift-out */
475 #define ISO_CODE_SI 0x0F /* shift-in */
476 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
477 #define ISO_CODE_ESC 0x1B /* escape */
478 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
479 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
480 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
481
482 /* All code (1-byte) of ISO2022 is classified into one of the
483 followings. */
484 enum iso_code_class_type
485 {
486 ISO_control_0, /* Control codes in the range
487 0x00..0x1F and 0x7F, except for the
488 following 5 codes. */
489 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
490 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
491 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
492 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
493 ISO_escape, /* ISO_CODE_SO (0x1B) */
494 ISO_control_1, /* Control codes in the range
495 0x80..0x9F, except for the
496 following 3 codes. */
497 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
498 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
499 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
500 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
501 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
502 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
503 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
504 };
505
506 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
507 `iso-flags' attribute of an iso2022 coding system. */
508
509 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
510 instead of the correct short-form sequence (e.g. ESC $ A). */
511 #define CODING_ISO_FLAG_LONG_FORM 0x0001
512
513 /* If set, reset graphic planes and registers at end-of-line to the
514 initial state. */
515 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
516
517 /* If set, reset graphic planes and registers before any control
518 characters to the initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
520
521 /* If set, encode by 7-bit environment. */
522 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
523
524 /* If set, use locking-shift function. */
525 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
526
527 /* If set, use single-shift function. Overwrite
528 CODING_ISO_FLAG_LOCKING_SHIFT. */
529 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
530
531 /* If set, use designation escape sequence. */
532 #define CODING_ISO_FLAG_DESIGNATION 0x0040
533
534 /* If set, produce revision number sequence. */
535 #define CODING_ISO_FLAG_REVISION 0x0080
536
537 /* If set, produce ISO6429's direction specifying sequence. */
538 #define CODING_ISO_FLAG_DIRECTION 0x0100
539
540 /* If set, assume designation states are reset at beginning of line on
541 output. */
542 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
543
544 /* If set, designation sequence should be placed at beginning of line
545 on output. */
546 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
547
548 /* If set, do not encode unsafe charactes on output. */
549 #define CODING_ISO_FLAG_SAFE 0x0800
550
551 /* If set, extra latin codes (128..159) are accepted as a valid code
552 on input. */
553 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
554
555 #define CODING_ISO_FLAG_COMPOSITION 0x2000
556
557 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
558
559 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
560
561 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
562
563 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
564
565 /* A character to be produced on output if encoding of the original
566 character is prohibited by CODING_ISO_FLAG_SAFE. */
567 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
568
569
570 /* UTF-16 section */
571 #define CODING_UTF_16_BOM(coding) \
572 ((coding)->spec.utf_16.bom)
573
574 #define CODING_UTF_16_ENDIAN(coding) \
575 ((coding)->spec.utf_16.endian)
576
577 #define CODING_UTF_16_SURROGATE(coding) \
578 ((coding)->spec.utf_16.surrogate)
579
580
581 /* CCL section */
582 #define CODING_CCL_DECODER(coding) \
583 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
584 #define CODING_CCL_ENCODER(coding) \
585 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
586 #define CODING_CCL_VALIDS(coding) \
587 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
588 ->data)
589
590 /* Index for each coding category in `coding_category_table' */
591
592 enum coding_category
593 {
594 coding_category_iso_7,
595 coding_category_iso_7_tight,
596 coding_category_iso_8_1,
597 coding_category_iso_8_2,
598 coding_category_iso_7_else,
599 coding_category_iso_8_else,
600 coding_category_utf_8,
601 coding_category_utf_16_auto,
602 coding_category_utf_16_be,
603 coding_category_utf_16_le,
604 coding_category_utf_16_be_nosig,
605 coding_category_utf_16_le_nosig,
606 coding_category_charset,
607 coding_category_sjis,
608 coding_category_big5,
609 coding_category_ccl,
610 coding_category_emacs_mule,
611 /* All above are targets of code detection. */
612 coding_category_raw_text,
613 coding_category_undecided,
614 coding_category_max
615 };
616
617 /* Definitions of flag bits used in detect_coding_XXXX. */
618 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
619 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
620 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
621 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
622 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
623 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
624 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
625 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
626 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
627 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
628 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
629 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
630 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
631 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
632 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
633 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
634
635 /* This value is returned if detect_coding_mask () find nothing other
636 than ASCII characters. */
637 #define CATEGORY_MASK_ANY \
638 (CATEGORY_MASK_ISO_7 \
639 | CATEGORY_MASK_ISO_7_TIGHT \
640 | CATEGORY_MASK_ISO_8_1 \
641 | CATEGORY_MASK_ISO_8_2 \
642 | CATEGORY_MASK_ISO_7_ELSE \
643 | CATEGORY_MASK_ISO_8_ELSE \
644 | CATEGORY_MASK_UTF_8 \
645 | CATEGORY_MASK_UTF_16_BE \
646 | CATEGORY_MASK_UTF_16_LE \
647 | CATEGORY_MASK_UTF_16_BE_NOSIG \
648 | CATEGORY_MASK_UTF_16_LE_NOSIG \
649 | CATEGORY_MASK_CHARSET \
650 | CATEGORY_MASK_SJIS \
651 | CATEGORY_MASK_BIG5 \
652 | CATEGORY_MASK_CCL \
653 | CATEGORY_MASK_EMACS_MULE)
654
655
656 #define CATEGORY_MASK_ISO_7BIT \
657 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
658
659 #define CATEGORY_MASK_ISO_8BIT \
660 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
661
662 #define CATEGORY_MASK_ISO_ELSE \
663 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
664
665 #define CATEGORY_MASK_ISO_ESCAPE \
666 (CATEGORY_MASK_ISO_7 \
667 | CATEGORY_MASK_ISO_7_TIGHT \
668 | CATEGORY_MASK_ISO_7_ELSE \
669 | CATEGORY_MASK_ISO_8_ELSE)
670
671 #define CATEGORY_MASK_ISO \
672 ( CATEGORY_MASK_ISO_7BIT \
673 | CATEGORY_MASK_ISO_8BIT \
674 | CATEGORY_MASK_ISO_ELSE)
675
676 #define CATEGORY_MASK_UTF_16 \
677 (CATEGORY_MASK_UTF_16_BE \
678 | CATEGORY_MASK_UTF_16_LE \
679 | CATEGORY_MASK_UTF_16_BE_NOSIG \
680 | CATEGORY_MASK_UTF_16_LE_NOSIG)
681
682
683 /* List of symbols `coding-category-xxx' ordered by priority. This
684 variable is exposed to Emacs Lisp. */
685 static Lisp_Object Vcoding_category_list;
686
687 /* Table of coding categories (Lisp symbols). This variable is for
688 internal use oly. */
689 static Lisp_Object Vcoding_category_table;
690
691 /* Table of coding-categories ordered by priority. */
692 static enum coding_category coding_priorities[coding_category_max];
693
694 /* Nth element is a coding context for the coding system bound to the
695 Nth coding category. */
696 static struct coding_system coding_categories[coding_category_max];
697
698 static int detected_mask[coding_category_raw_text] =
699 { CATEGORY_MASK_ISO,
700 CATEGORY_MASK_ISO,
701 CATEGORY_MASK_ISO,
702 CATEGORY_MASK_ISO,
703 CATEGORY_MASK_ISO,
704 CATEGORY_MASK_ISO,
705 CATEGORY_MASK_UTF_8,
706 CATEGORY_MASK_UTF_16,
707 CATEGORY_MASK_UTF_16,
708 CATEGORY_MASK_UTF_16,
709 CATEGORY_MASK_UTF_16,
710 CATEGORY_MASK_UTF_16,
711 CATEGORY_MASK_CHARSET,
712 CATEGORY_MASK_SJIS,
713 CATEGORY_MASK_BIG5,
714 CATEGORY_MASK_CCL,
715 CATEGORY_MASK_EMACS_MULE
716 };
717
718 /*** Commonly used macros and functions ***/
719
720 #ifndef min
721 #define min(a, b) ((a) < (b) ? (a) : (b))
722 #endif
723 #ifndef max
724 #define max(a, b) ((a) > (b) ? (a) : (b))
725 #endif
726
727 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
728 do { \
729 attrs = CODING_ID_ATTRS (coding->id); \
730 eol_type = CODING_ID_EOL_TYPE (coding->id); \
731 if (VECTORP (eol_type)) \
732 eol_type = Qunix; \
733 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
734 } while (0)
735
736
737 /* Safely get one byte from the source text pointed by SRC which ends
738 at SRC_END, and set C to that byte. If there are not enough bytes
739 in the source, it jumps to `no_more_source'. The caller
740 should declare and set these variables appropriately in advance:
741 src, src_end, multibytep
742 */
743
744 #define ONE_MORE_BYTE(c) \
745 do { \
746 if (src == src_end) \
747 { \
748 if (src_base < src) \
749 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
750 goto no_more_source; \
751 } \
752 c = *src++; \
753 if (multibytep && (c & 0x80)) \
754 { \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
758 } \
759 consumed_chars++; \
760 } while (0)
761
762
763 #define ONE_MORE_BYTE_NO_CHECK(c) \
764 do { \
765 c = *src++; \
766 if (multibytep && (c & 0x80)) \
767 { \
768 if ((c & 0xFE) != 0xC0) \
769 error ("Undecodable char found"); \
770 c = ((c & 1) << 6) | *src++; \
771 } \
772 consumed_chars++; \
773 } while (0)
774
775
776 /* Store a byte C in the place pointed by DST and increment DST to the
777 next free point, and increment PRODUCED_CHARS. The caller should
778 assure that C is 0..127, and declare and set the variable `dst'
779 appropriately in advance.
780 */
781
782
783 #define EMIT_ONE_ASCII_BYTE(c) \
784 do { \
785 produced_chars++; \
786 *dst++ = (c); \
787 } while (0)
788
789
790 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
791
792 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
793 do { \
794 produced_chars += 2; \
795 *dst++ = (c1), *dst++ = (c2); \
796 } while (0)
797
798
799 /* Store a byte C in the place pointed by DST and increment DST to the
800 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
801 nonzero, store in an appropriate multibyte from. The caller should
802 declare and set the variables `dst' and `multibytep' appropriately
803 in advance. */
804
805 #define EMIT_ONE_BYTE(c) \
806 do { \
807 produced_chars++; \
808 if (multibytep) \
809 { \
810 int ch = (c); \
811 if (ch >= 0x80) \
812 ch = BYTE8_TO_CHAR (ch); \
813 CHAR_STRING_ADVANCE (ch, dst); \
814 } \
815 else \
816 *dst++ = (c); \
817 } while (0)
818
819
820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
821
822 #define EMIT_TWO_BYTES(c1, c2) \
823 do { \
824 produced_chars += 2; \
825 if (multibytep) \
826 { \
827 int ch; \
828 \
829 ch = (c1); \
830 if (ch >= 0x80) \
831 ch = BYTE8_TO_CHAR (ch); \
832 CHAR_STRING_ADVANCE (ch, dst); \
833 ch = (c2); \
834 if (ch >= 0x80) \
835 ch = BYTE8_TO_CHAR (ch); \
836 CHAR_STRING_ADVANCE (ch, dst); \
837 } \
838 else \
839 { \
840 *dst++ = (c1); \
841 *dst++ = (c2); \
842 } \
843 } while (0)
844
845
846 #define EMIT_THREE_BYTES(c1, c2, c3) \
847 do { \
848 EMIT_ONE_BYTE (c1); \
849 EMIT_TWO_BYTES (c2, c3); \
850 } while (0)
851
852
853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
854 do { \
855 EMIT_TWO_BYTES (c1, c2); \
856 EMIT_TWO_BYTES (c3, c4); \
857 } while (0)
858
859
860 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
861 do { \
862 charset_map_loaded = 0; \
863 c = DECODE_CHAR (charset, code); \
864 if (charset_map_loaded) \
865 { \
866 unsigned char *orig = coding->source; \
867 EMACS_INT offset; \
868 \
869 coding_set_source (coding); \
870 offset = coding->source - orig; \
871 src += offset; \
872 src_base += offset; \
873 src_end += offset; \
874 } \
875 } while (0)
876
877
878 #define ASSURE_DESTINATION(bytes) \
879 do { \
880 if (dst + (bytes) >= dst_end) \
881 { \
882 int more_bytes = charbuf_end - charbuf + (bytes); \
883 \
884 dst = alloc_destination (coding, more_bytes, dst); \
885 dst_end = coding->destination + coding->dst_bytes; \
886 } \
887 } while (0)
888
889
890
891 static void
892 coding_set_source (coding)
893 struct coding_system *coding;
894 {
895 if (BUFFERP (coding->src_object))
896 {
897 if (coding->src_pos < 0)
898 coding->source = GAP_END_ADDR + coding->src_pos_byte;
899 else
900 {
901 struct buffer *buf = XBUFFER (coding->src_object);
902 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
903 unsigned char *beg_addr = BUF_BEG_ADDR (buf);
904
905 coding->source = beg_addr + coding->src_pos_byte - 1;
906 if (coding->src_pos_byte >= gpt_byte)
907 coding->source += BUF_GAP_SIZE (buf);
908 }
909 }
910 else if (STRINGP (coding->src_object))
911 {
912 coding->source = (XSTRING (coding->src_object)->data
913 + coding->src_pos_byte);
914 }
915 else
916 /* Otherwise, the source is C string and is never relocated
917 automatically. Thus we don't have to update anything. */
918 ;
919 }
920
921 static void
922 coding_set_destination (coding)
923 struct coding_system *coding;
924 {
925 if (BUFFERP (coding->dst_object))
926 {
927 /* We are sure that coding->dst_pos_byte is before the gap of the
928 buffer. */
929 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
930 + coding->dst_pos_byte - 1);
931 if (coding->src_pos < 0)
932 coding->dst_bytes = (GAP_END_ADDR
933 - (coding->src_bytes - coding->consumed)
934 - coding->destination);
935 else
936 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
937 - coding->destination);
938 }
939 else
940 /* Otherwise, the destination is C string and is never relocated
941 automatically. Thus we don't have to update anything. */
942 ;
943 }
944
945
946 static void
947 coding_alloc_by_realloc (coding, bytes)
948 struct coding_system *coding;
949 EMACS_INT bytes;
950 {
951 coding->destination = (unsigned char *) xrealloc (coding->destination,
952 coding->dst_bytes + bytes);
953 coding->dst_bytes += bytes;
954 }
955
956 static void
957 coding_alloc_by_making_gap (coding, bytes)
958 struct coding_system *coding;
959 EMACS_INT bytes;
960 {
961 if (BUFFERP (coding->dst_object)
962 && EQ (coding->src_object, coding->dst_object))
963 {
964 EMACS_INT add = coding->src_bytes - coding->consumed;
965
966 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
967 make_gap (bytes);
968 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
969 }
970 else
971 {
972 Lisp_Object this_buffer;
973
974 this_buffer = Fcurrent_buffer ();
975 set_buffer_internal (XBUFFER (coding->dst_object));
976 make_gap (bytes);
977 set_buffer_internal (XBUFFER (this_buffer));
978 }
979 }
980
981
982 static unsigned char *
983 alloc_destination (coding, nbytes, dst)
984 struct coding_system *coding;
985 int nbytes;
986 unsigned char *dst;
987 {
988 EMACS_INT offset = dst - coding->destination;
989
990 if (BUFFERP (coding->dst_object))
991 coding_alloc_by_making_gap (coding, nbytes);
992 else
993 coding_alloc_by_realloc (coding, nbytes);
994 coding->result = CODING_RESULT_SUCCESS;
995 coding_set_destination (coding);
996 dst = coding->destination + offset;
997 return dst;
998 }
999
1000 \f
1001 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1002
1003
1004
1005 \f
1006 /*** 3. UTF-8 ***/
1007
1008 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1009 Check if a text is encoded in UTF-8. If it is, return
1010 CATEGORY_MASK_UTF_8, else return 0. */
1011
1012 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1013 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1014 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1015 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1016 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1017 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1018
1019 static int
1020 detect_coding_utf_8 (coding, mask)
1021 struct coding_system *coding;
1022 int *mask;
1023 {
1024 unsigned char *src = coding->source, *src_base = src;
1025 unsigned char *src_end = coding->source + coding->src_bytes;
1026 int multibytep = coding->src_multibyte;
1027 int consumed_chars = 0;
1028 int found = 0;
1029
1030 /* A coding system of this category is always ASCII compatible. */
1031 src += coding->head_ascii;
1032
1033 while (1)
1034 {
1035 int c, c1, c2, c3, c4;
1036
1037 ONE_MORE_BYTE (c);
1038 if (UTF_8_1_OCTET_P (c))
1039 continue;
1040 ONE_MORE_BYTE (c1);
1041 if (! UTF_8_EXTRA_OCTET_P (c1))
1042 break;
1043 if (UTF_8_2_OCTET_LEADING_P (c))
1044 {
1045 found++;
1046 continue;
1047 }
1048 ONE_MORE_BYTE (c2);
1049 if (! UTF_8_EXTRA_OCTET_P (c2))
1050 break;
1051 if (UTF_8_3_OCTET_LEADING_P (c))
1052 {
1053 found++;
1054 continue;
1055 }
1056 ONE_MORE_BYTE (c3);
1057 if (! UTF_8_EXTRA_OCTET_P (c3))
1058 break;
1059 if (UTF_8_4_OCTET_LEADING_P (c))
1060 {
1061 found++;
1062 continue;
1063 }
1064 ONE_MORE_BYTE (c4);
1065 if (! UTF_8_EXTRA_OCTET_P (c4))
1066 break;
1067 if (UTF_8_5_OCTET_LEADING_P (c))
1068 {
1069 found++;
1070 continue;
1071 }
1072 break;
1073 }
1074 *mask &= ~CATEGORY_MASK_UTF_8;
1075 return 0;
1076
1077 no_more_source:
1078 if (! found)
1079 return 0;
1080 *mask &= CATEGORY_MASK_UTF_8;
1081 return 1;
1082 }
1083
1084
1085 /* Fixme: deal with surrogates? */
1086 static void
1087 decode_coding_utf_8 (coding)
1088 struct coding_system *coding;
1089 {
1090 unsigned char *src = coding->source + coding->consumed;
1091 unsigned char *src_end = coding->source + coding->src_bytes;
1092 unsigned char *src_base;
1093 int *charbuf = coding->charbuf;
1094 int *charbuf_end = charbuf + coding->charbuf_size;
1095 int consumed_chars = 0, consumed_chars_base;
1096 int multibytep = coding->src_multibyte;
1097 Lisp_Object attr, eol_type, charset_list;
1098
1099 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1100
1101 while (1)
1102 {
1103 int c, c1, c2, c3, c4, c5;
1104
1105 src_base = src;
1106 consumed_chars_base = consumed_chars;
1107
1108 if (charbuf >= charbuf_end)
1109 break;
1110
1111 ONE_MORE_BYTE (c1);
1112 if (UTF_8_1_OCTET_P(c1))
1113 {
1114 c = c1;
1115 if (c == '\r')
1116 {
1117 if (EQ (eol_type, Qdos))
1118 {
1119 if (src == src_end)
1120 goto no_more_source;
1121 if (*src == '\n')
1122 ONE_MORE_BYTE (c);
1123 }
1124 else if (EQ (eol_type, Qmac))
1125 c = '\n';
1126 }
1127 }
1128 else
1129 {
1130 ONE_MORE_BYTE (c2);
1131 if (! UTF_8_EXTRA_OCTET_P (c2))
1132 goto invalid_code;
1133 if (UTF_8_2_OCTET_LEADING_P (c1))
1134 {
1135 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1136 /* Reject overlong sequences here and below. Encoders
1137 producing them are incorrect, they can be misleading,
1138 and they mess up read/write invariance. */
1139 if (c < 128)
1140 goto invalid_code;
1141 }
1142 else
1143 {
1144 ONE_MORE_BYTE (c3);
1145 if (! UTF_8_EXTRA_OCTET_P (c3))
1146 goto invalid_code;
1147 if (UTF_8_3_OCTET_LEADING_P (c1))
1148 {
1149 c = (((c1 & 0xF) << 12)
1150 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1151 if (c < 0x800)
1152 goto invalid_code;
1153 }
1154 else
1155 {
1156 ONE_MORE_BYTE (c4);
1157 if (! UTF_8_EXTRA_OCTET_P (c4))
1158 goto invalid_code;
1159 if (UTF_8_4_OCTET_LEADING_P (c1))
1160 {
1161 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1162 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1163 if (c < 0x10000)
1164 goto invalid_code;
1165 }
1166 else
1167 {
1168 ONE_MORE_BYTE (c5);
1169 if (! UTF_8_EXTRA_OCTET_P (c5))
1170 goto invalid_code;
1171 if (UTF_8_5_OCTET_LEADING_P (c1))
1172 {
1173 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1174 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1175 | (c5 & 0x3F));
1176 if ((c > MAX_CHAR) || (c < 0x200000))
1177 goto invalid_code;
1178 }
1179 else
1180 goto invalid_code;
1181 }
1182 }
1183 }
1184 }
1185
1186 *charbuf++ = c;
1187 continue;
1188
1189 invalid_code:
1190 src = src_base;
1191 consumed_chars = consumed_chars_base;
1192 ONE_MORE_BYTE (c);
1193 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1194 coding->errors++;
1195 }
1196
1197 no_more_source:
1198 coding->consumed_char += consumed_chars_base;
1199 coding->consumed = src_base - coding->source;
1200 coding->charbuf_used = charbuf - coding->charbuf;
1201 }
1202
1203
1204 static int
1205 encode_coding_utf_8 (coding)
1206 struct coding_system *coding;
1207 {
1208 int multibytep = coding->dst_multibyte;
1209 int *charbuf = coding->charbuf;
1210 int *charbuf_end = charbuf + coding->charbuf_used;
1211 unsigned char *dst = coding->destination + coding->produced;
1212 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1213 int produced_chars = 0;
1214 int c;
1215
1216 if (multibytep)
1217 {
1218 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1219
1220 while (charbuf < charbuf_end)
1221 {
1222 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1223
1224 ASSURE_DESTINATION (safe_room);
1225 c = *charbuf++;
1226 CHAR_STRING_ADVANCE (c, pend);
1227 for (p = str; p < pend; p++)
1228 EMIT_ONE_BYTE (*p);
1229 }
1230 }
1231 else
1232 {
1233 int safe_room = MAX_MULTIBYTE_LENGTH;
1234
1235 while (charbuf < charbuf_end)
1236 {
1237 ASSURE_DESTINATION (safe_room);
1238 c = *charbuf++;
1239 dst += CHAR_STRING (c, dst);
1240 produced_chars++;
1241 }
1242 }
1243 coding->result = CODING_RESULT_SUCCESS;
1244 coding->produced_char += produced_chars;
1245 coding->produced = dst - coding->destination;
1246 return 0;
1247 }
1248
1249
1250 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1251 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1252 Little Endian (otherwise). If it is, return
1253 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1254 else return 0. */
1255
1256 #define UTF_16_HIGH_SURROGATE_P(val) \
1257 (((val) & 0xFC00) == 0xD800)
1258
1259 #define UTF_16_LOW_SURROGATE_P(val) \
1260 (((val) & 0xFC00) == 0xDC00)
1261
1262 #define UTF_16_INVALID_P(val) \
1263 (((val) == 0xFFFE) \
1264 || ((val) == 0xFFFF) \
1265 || UTF_16_LOW_SURROGATE_P (val))
1266
1267
1268 static int
1269 detect_coding_utf_16 (coding, mask)
1270 struct coding_system *coding;
1271 int *mask;
1272 {
1273 unsigned char *src = coding->source, *src_base = src;
1274 unsigned char *src_end = coding->source + coding->src_bytes;
1275 int multibytep = coding->src_multibyte;
1276 int consumed_chars = 0;
1277 int c1, c2;
1278
1279 ONE_MORE_BYTE (c1);
1280 ONE_MORE_BYTE (c2);
1281
1282 if ((c1 == 0xFF) && (c2 == 0xFE))
1283 {
1284 *mask &= CATEGORY_MASK_UTF_16_LE;
1285 return 1;
1286 }
1287 else if ((c1 == 0xFE) && (c2 == 0xFF))
1288 {
1289 *mask &= CATEGORY_MASK_UTF_16_BE;
1290 return 1;
1291 }
1292 no_more_source:
1293 return 0;
1294 }
1295
1296 static void
1297 decode_coding_utf_16 (coding)
1298 struct coding_system *coding;
1299 {
1300 unsigned char *src = coding->source + coding->consumed;
1301 unsigned char *src_end = coding->source + coding->src_bytes;
1302 unsigned char *src_base;
1303 int *charbuf = coding->charbuf;
1304 int *charbuf_end = charbuf + coding->charbuf_size;
1305 int consumed_chars = 0, consumed_chars_base;
1306 int multibytep = coding->src_multibyte;
1307 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1308 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1309 int surrogate = CODING_UTF_16_SURROGATE (coding);
1310 Lisp_Object attr, eol_type, charset_list;
1311
1312 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1313
1314 if (bom != utf_16_without_bom)
1315 {
1316 int c, c1, c2;
1317
1318 src_base = src;
1319 ONE_MORE_BYTE (c1);
1320 ONE_MORE_BYTE (c2);
1321 c = (c1 << 8) | c2;
1322 if (bom == utf_16_with_bom)
1323 {
1324 if (endian == utf_16_big_endian
1325 ? c != 0xFFFE : c != 0xFEFF)
1326 {
1327 /* We are sure that there's enouph room at CHARBUF. */
1328 *charbuf++ = c1;
1329 *charbuf++ = c2;
1330 coding->errors++;
1331 }
1332 }
1333 else
1334 {
1335 if (c == 0xFFFE)
1336 CODING_UTF_16_ENDIAN (coding)
1337 = endian = utf_16_big_endian;
1338 else if (c == 0xFEFF)
1339 CODING_UTF_16_ENDIAN (coding)
1340 = endian = utf_16_little_endian;
1341 else
1342 {
1343 CODING_UTF_16_ENDIAN (coding)
1344 = endian = utf_16_big_endian;
1345 src = src_base;
1346 }
1347 }
1348 CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1349 }
1350
1351 while (1)
1352 {
1353 int c, c1, c2;
1354
1355 src_base = src;
1356 consumed_chars_base = consumed_chars;
1357
1358 if (charbuf + 2 >= charbuf_end)
1359 break;
1360
1361 ONE_MORE_BYTE (c1);
1362 ONE_MORE_BYTE (c2);
1363 c = (endian == utf_16_big_endian
1364 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1365 if (surrogate)
1366 {
1367 if (! UTF_16_LOW_SURROGATE_P (c))
1368 {
1369 if (endian == utf_16_big_endian)
1370 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1371 else
1372 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1373 *charbuf++ = c1;
1374 *charbuf++ = c2;
1375 coding->errors++;
1376 if (UTF_16_HIGH_SURROGATE_P (c))
1377 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1378 else
1379 *charbuf++ = c;
1380 }
1381 else
1382 {
1383 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1384 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1385 *charbuf++ = c;
1386 }
1387 }
1388 else
1389 {
1390 if (UTF_16_HIGH_SURROGATE_P (c))
1391 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1392 else
1393 *charbuf++ = c;
1394 }
1395 }
1396
1397 no_more_source:
1398 coding->consumed_char += consumed_chars_base;
1399 coding->consumed = src_base - coding->source;
1400 coding->charbuf_used = charbuf - coding->charbuf;
1401 }
1402
1403 static int
1404 encode_coding_utf_16 (coding)
1405 struct coding_system *coding;
1406 {
1407 int multibytep = coding->dst_multibyte;
1408 int *charbuf = coding->charbuf;
1409 int *charbuf_end = charbuf + coding->charbuf_used;
1410 unsigned char *dst = coding->destination + coding->produced;
1411 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1412 int safe_room = 8;
1413 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1414 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1415 int produced_chars = 0;
1416 Lisp_Object attrs, eol_type, charset_list;
1417 int c;
1418
1419 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1420
1421 if (bom == utf_16_with_bom)
1422 {
1423 ASSURE_DESTINATION (safe_room);
1424 if (big_endian)
1425 EMIT_TWO_BYTES (0xFF, 0xFE);
1426 else
1427 EMIT_TWO_BYTES (0xFE, 0xFF);
1428 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1429 }
1430
1431 while (charbuf < charbuf_end)
1432 {
1433 ASSURE_DESTINATION (safe_room);
1434 c = *charbuf++;
1435 if (c >= MAX_UNICODE_CHAR)
1436 c = coding->default_char;
1437
1438 if (c < 0x10000)
1439 {
1440 if (big_endian)
1441 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1442 else
1443 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1444 }
1445 else
1446 {
1447 int c1, c2;
1448
1449 c -= 0x10000;
1450 c1 = (c >> 10) + 0xD800;
1451 c2 = (c & 0x3FF) + 0xDC00;
1452 if (big_endian)
1453 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1454 else
1455 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1456 }
1457 }
1458 coding->result = CODING_RESULT_SUCCESS;
1459 coding->produced = dst - coding->destination;
1460 coding->produced_char += produced_chars;
1461 return 0;
1462 }
1463
1464 \f
1465 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1466
1467 /* Emacs' internal format for representation of multiple character
1468 sets is a kind of multi-byte encoding, i.e. characters are
1469 represented by variable-length sequences of one-byte codes.
1470
1471 ASCII characters and control characters (e.g. `tab', `newline') are
1472 represented by one-byte sequences which are their ASCII codes, in
1473 the range 0x00 through 0x7F.
1474
1475 8-bit characters of the range 0x80..0x9F are represented by
1476 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1477 code + 0x20).
1478
1479 8-bit characters of the range 0xA0..0xFF are represented by
1480 one-byte sequences which are their 8-bit code.
1481
1482 The other characters are represented by a sequence of `base
1483 leading-code', optional `extended leading-code', and one or two
1484 `position-code's. The length of the sequence is determined by the
1485 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1486 whereas extended leading-code and position-code take the range 0xA0
1487 through 0xFF. See `charset.h' for more details about leading-code
1488 and position-code.
1489
1490 --- CODE RANGE of Emacs' internal format ---
1491 character set range
1492 ------------- -----
1493 ascii 0x00..0x7F
1494 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1495 eight-bit-graphic 0xA0..0xBF
1496 ELSE 0x81..0x9D + [0xA0..0xFF]+
1497 ---------------------------------------------
1498
1499 As this is the internal character representation, the format is
1500 usually not used externally (i.e. in a file or in a data sent to a
1501 process). But, it is possible to have a text externally in this
1502 format (i.e. by encoding by the coding system `emacs-mule').
1503
1504 In that case, a sequence of one-byte codes has a slightly different
1505 form.
1506
1507 At first, all characters in eight-bit-control are represented by
1508 one-byte sequences which are their 8-bit code.
1509
1510 Next, character composition data are represented by the byte
1511 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1512 where,
1513 METHOD is 0xF0 plus one of composition method (enum
1514 composition_method),
1515
1516 BYTES is 0xA0 plus a byte length of this composition data,
1517
1518 CHARS is 0x20 plus a number of characters composed by this
1519 data,
1520
1521 COMPONENTs are characters of multibye form or composition
1522 rules encoded by two-byte of ASCII codes.
1523
1524 In addition, for backward compatibility, the following formats are
1525 also recognized as composition data on decoding.
1526
1527 0x80 MSEQ ...
1528 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1529
1530 Here,
1531 MSEQ is a multibyte form but in these special format:
1532 ASCII: 0xA0 ASCII_CODE+0x80,
1533 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1534 RULE is a one byte code of the range 0xA0..0xF0 that
1535 represents a composition rule.
1536 */
1537
1538 char emacs_mule_bytes[256];
1539
1540 /* Leading-code followed by extended leading-code. */
1541 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1542 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1543 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1544 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1545
1546
1547 int
1548 emacs_mule_char (coding, src, nbytes, nchars)
1549 struct coding_system *coding;
1550 unsigned char *src;
1551 int *nbytes, *nchars;
1552 {
1553 unsigned char *src_end = coding->source + coding->src_bytes;
1554 int multibytep = coding->src_multibyte;
1555 unsigned char *src_base = src;
1556 struct charset *charset;
1557 unsigned code;
1558 int c;
1559 int consumed_chars = 0;
1560
1561 ONE_MORE_BYTE (c);
1562 switch (emacs_mule_bytes[c])
1563 {
1564 case 2:
1565 if (! (charset = emacs_mule_charset[c]))
1566 goto invalid_code;
1567 ONE_MORE_BYTE (c);
1568 code = c & 0x7F;
1569 break;
1570
1571 case 3:
1572 if (c == LEADING_CODE_PRIVATE_11
1573 || c == LEADING_CODE_PRIVATE_12)
1574 {
1575 ONE_MORE_BYTE (c);
1576 if (! (charset = emacs_mule_charset[c]))
1577 goto invalid_code;
1578 ONE_MORE_BYTE (c);
1579 code = c & 0x7F;
1580 }
1581 else
1582 {
1583 if (! (charset = emacs_mule_charset[c]))
1584 goto invalid_code;
1585 ONE_MORE_BYTE (c);
1586 code = (c & 0x7F) << 8;
1587 ONE_MORE_BYTE (c);
1588 code |= c & 0x7F;
1589 }
1590 break;
1591
1592 case 4:
1593 ONE_MORE_BYTE (c);
1594 if (! (charset = emacs_mule_charset[c]))
1595 goto invalid_code;
1596 ONE_MORE_BYTE (c);
1597 code = (c & 0x7F) << 8;
1598 ONE_MORE_BYTE (c);
1599 code |= c & 0x7F;
1600 break;
1601
1602 case 1:
1603 code = c;
1604 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1605 : code < 0xA0 ? charset_8_bit_control
1606 : charset_8_bit_graphic);
1607 break;
1608
1609 default:
1610 abort ();
1611 }
1612 c = DECODE_CHAR (charset, code);
1613 if (c < 0)
1614 goto invalid_code;
1615 *nbytes = src - src_base;
1616 *nchars = consumed_chars;
1617 return c;
1618
1619 no_more_source:
1620 return -2;
1621
1622 invalid_code:
1623 return -1;
1624 }
1625
1626
1627 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1628 Check if a text is encoded in `emacs-mule'. */
1629
1630 static int
1631 detect_coding_emacs_mule (coding, mask)
1632 struct coding_system *coding;
1633 int *mask;
1634 {
1635 unsigned char *src = coding->source, *src_base = src;
1636 unsigned char *src_end = coding->source + coding->src_bytes;
1637 int multibytep = coding->src_multibyte;
1638 int consumed_chars = 0;
1639 int c;
1640 int found = 0;
1641
1642 /* A coding system of this category is always ASCII compatible. */
1643 src += coding->head_ascii;
1644
1645 while (1)
1646 {
1647 ONE_MORE_BYTE (c);
1648
1649 if (c == 0x80)
1650 {
1651 /* Perhaps the start of composite character. We simple skip
1652 it because analyzing it is too heavy for detecting. But,
1653 at least, we check that the composite character
1654 constitues of more than 4 bytes. */
1655 unsigned char *src_base;
1656
1657 repeat:
1658 src_base = src;
1659 do
1660 {
1661 ONE_MORE_BYTE (c);
1662 }
1663 while (c >= 0xA0);
1664
1665 if (src - src_base <= 4)
1666 break;
1667 found = 1;
1668 if (c == 0x80)
1669 goto repeat;
1670 }
1671
1672 if (c < 0x80)
1673 {
1674 if (c < 0x20
1675 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1676 break;
1677 }
1678 else
1679 {
1680 unsigned char *src_base = src - 1;
1681
1682 do
1683 {
1684 ONE_MORE_BYTE (c);
1685 }
1686 while (c >= 0xA0);
1687 if (src - src_base != emacs_mule_bytes[*src_base])
1688 break;
1689 found = 1;
1690 }
1691 }
1692 *mask &= ~CATEGORY_MASK_EMACS_MULE;
1693 return 0;
1694
1695 no_more_source:
1696 if (!found)
1697 return 0;
1698 *mask &= CATEGORY_MASK_EMACS_MULE;
1699 return 1;
1700 }
1701
1702
1703 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1704
1705 /* Decode a character represented as a component of composition
1706 sequence of Emacs 20/21 style at SRC. Set C to that character and
1707 update SRC to the head of next character (or an encoded composition
1708 rule). If SRC doesn't points a composition component, set C to -1.
1709 If SRC points an invalid byte sequence, global exit by a return
1710 value 0. */
1711
1712 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1713 if (1) \
1714 { \
1715 int c; \
1716 int nbytes, nchars; \
1717 \
1718 if (src == src_end) \
1719 break; \
1720 c = emacs_mule_char (coding, src, &nbytes, &nchars); \
1721 if (c < 0) \
1722 { \
1723 if (c == -2) \
1724 break; \
1725 goto invalid_code; \
1726 } \
1727 *buf++ = c; \
1728 src += nbytes; \
1729 consumed_chars += nchars; \
1730 } \
1731 else
1732
1733
1734 /* Decode a composition rule represented as a component of composition
1735 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1736 and increment BUF. If SRC points an invalid byte sequence, set C
1737 to -1. */
1738
1739 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1740 do { \
1741 int c, gref, nref; \
1742 \
1743 if (src >= src_end) \
1744 goto invalid_code; \
1745 ONE_MORE_BYTE_NO_CHECK (c); \
1746 c -= 0x20; \
1747 if (c < 0 || c >= 81) \
1748 goto invalid_code; \
1749 \
1750 gref = c / 9, nref = c % 9; \
1751 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1752 } while (0)
1753
1754
1755 /* Decode a composition rule represented as a component of composition
1756 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1757 and increment BUF. If SRC points an invalid byte sequence, set C
1758 to -1. */
1759
1760 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1761 do { \
1762 int gref, nref; \
1763 \
1764 if (src + 1>= src_end) \
1765 goto invalid_code; \
1766 ONE_MORE_BYTE_NO_CHECK (gref); \
1767 gref -= 0x20; \
1768 ONE_MORE_BYTE_NO_CHECK (nref); \
1769 nref -= 0x20; \
1770 if (gref < 0 || gref >= 81 \
1771 || nref < 0 || nref >= 81) \
1772 goto invalid_code; \
1773 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1774 } while (0)
1775
1776
1777 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1778 do { \
1779 *buf++ = -5; \
1780 *buf++ = coding->produced_char + char_offset; \
1781 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1782 *buf++ = method; \
1783 *buf++ = nchars; \
1784 } while (0)
1785
1786
1787 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1788 do { \
1789 /* Emacs 21 style format. The first three bytes at SRC are \
1790 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1791 the byte length of this composition information, CHARS is the \
1792 number of characters composed by this composition. */ \
1793 enum composition_method method = c - 0xF2; \
1794 int *charbuf_base = charbuf; \
1795 int consumed_chars_limit; \
1796 int nbytes, nchars; \
1797 \
1798 ONE_MORE_BYTE (c); \
1799 nbytes = c - 0xA0; \
1800 if (nbytes < 3) \
1801 goto invalid_code; \
1802 ONE_MORE_BYTE (c); \
1803 nchars = c - 0xA0; \
1804 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1805 consumed_chars_limit = consumed_chars_base + nbytes; \
1806 if (method != COMPOSITION_RELATIVE) \
1807 { \
1808 int i = 0; \
1809 while (consumed_chars < consumed_chars_limit) \
1810 { \
1811 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1812 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1813 else \
1814 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1815 i++; \
1816 } \
1817 if (consumed_chars < consumed_chars_limit) \
1818 goto invalid_code; \
1819 charbuf_base[0] -= i; \
1820 } \
1821 } while (0)
1822
1823
1824 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1825 do { \
1826 /* Emacs 20 style format for relative composition. */ \
1827 /* Store multibyte form of characters to be composed. */ \
1828 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1829 int *buf = components; \
1830 int i, j; \
1831 \
1832 src = src_base; \
1833 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1834 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1835 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1836 if (i < 2) \
1837 goto invalid_code; \
1838 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1839 for (j = 0; j < i; j++) \
1840 *charbuf++ = components[j]; \
1841 } while (0)
1842
1843
1844 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1845 do { \
1846 /* Emacs 20 style format for rule-base composition. */ \
1847 /* Store multibyte form of characters to be composed. */ \
1848 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1849 int *buf = components; \
1850 int i, j; \
1851 \
1852 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1853 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1854 { \
1855 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1856 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1857 } \
1858 if (i < 1 || (buf - components) % 2 == 0) \
1859 goto invalid_code; \
1860 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1861 goto no_more_source; \
1862 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1863 for (j = 0; j < i; j++) \
1864 *charbuf++ = components[j]; \
1865 for (j = 0; j < i; j += 2) \
1866 *charbuf++ = components[j]; \
1867 } while (0)
1868
1869
1870 static void
1871 decode_coding_emacs_mule (coding)
1872 struct coding_system *coding;
1873 {
1874 unsigned char *src = coding->source + coding->consumed;
1875 unsigned char *src_end = coding->source + coding->src_bytes;
1876 unsigned char *src_base;
1877 int *charbuf = coding->charbuf;
1878 int *charbuf_end = charbuf + coding->charbuf_size;
1879 int consumed_chars = 0, consumed_chars_base;
1880 int char_offset = 0;
1881 int multibytep = coding->src_multibyte;
1882 Lisp_Object attrs, eol_type, charset_list;
1883
1884 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1885
1886 while (1)
1887 {
1888 int c;
1889
1890 src_base = src;
1891 consumed_chars_base = consumed_chars;
1892
1893 if (charbuf >= charbuf_end)
1894 break;
1895
1896 ONE_MORE_BYTE (c);
1897
1898 if (c < 0x80)
1899 {
1900 if (c == '\r')
1901 {
1902 if (EQ (eol_type, Qdos))
1903 {
1904 if (src == src_end)
1905 goto no_more_source;
1906 if (*src == '\n')
1907 ONE_MORE_BYTE (c);
1908 }
1909 else if (EQ (eol_type, Qmac))
1910 c = '\n';
1911 }
1912 *charbuf++ = c;
1913 char_offset++;
1914 }
1915 else if (c == 0x80)
1916 {
1917 if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1918 break;
1919 ONE_MORE_BYTE (c);
1920 if (c - 0xF2 >= COMPOSITION_RELATIVE
1921 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1922 DECODE_EMACS_MULE_21_COMPOSITION (c);
1923 else if (c < 0xC0)
1924 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1925 else if (c == 0xFF)
1926 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1927 else
1928 goto invalid_code;
1929 coding->annotated = 1;
1930 }
1931 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1932 {
1933 int nbytes, nchars;
1934 src = src_base;
1935 consumed_chars = consumed_chars_base;
1936 c = emacs_mule_char (coding, src, &nbytes, &nchars);
1937 if (c < 0)
1938 {
1939 if (c == -2)
1940 break;
1941 goto invalid_code;
1942 }
1943 *charbuf++ = c;
1944 src += nbytes;
1945 consumed_chars += nchars;
1946 char_offset++;
1947 }
1948 continue;
1949
1950 invalid_code:
1951 src = src_base;
1952 consumed_chars = consumed_chars_base;
1953 ONE_MORE_BYTE (c);
1954 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1955 coding->errors++;
1956 }
1957
1958 no_more_source:
1959 coding->consumed_char += consumed_chars_base;
1960 coding->consumed = src_base - coding->source;
1961 coding->charbuf_used = charbuf - coding->charbuf;
1962 }
1963
1964
1965 #define EMACS_MULE_LEADING_CODES(id, codes) \
1966 do { \
1967 if (id < 0xA0) \
1968 codes[0] = id, codes[1] = 0; \
1969 else if (id < 0xE0) \
1970 codes[0] = 0x9A, codes[1] = id; \
1971 else if (id < 0xF0) \
1972 codes[0] = 0x9B, codes[1] = id; \
1973 else if (id < 0xF5) \
1974 codes[0] = 0x9C, codes[1] = id; \
1975 else \
1976 codes[0] = 0x9D, codes[1] = id; \
1977 } while (0);
1978
1979
1980 static int
1981 encode_coding_emacs_mule (coding)
1982 struct coding_system *coding;
1983 {
1984 int multibytep = coding->dst_multibyte;
1985 int *charbuf = coding->charbuf;
1986 int *charbuf_end = charbuf + coding->charbuf_used;
1987 unsigned char *dst = coding->destination + coding->produced;
1988 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1989 int safe_room = 8;
1990 int produced_chars = 0;
1991 Lisp_Object attrs, eol_type, charset_list;
1992 int c;
1993
1994 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1995
1996 while (charbuf < charbuf_end)
1997 {
1998 ASSURE_DESTINATION (safe_room);
1999 c = *charbuf++;
2000 if (ASCII_CHAR_P (c))
2001 EMIT_ONE_ASCII_BYTE (c);
2002 else if (CHAR_BYTE8_P (c))
2003 {
2004 c = CHAR_TO_BYTE8 (c);
2005 EMIT_ONE_BYTE (c);
2006 }
2007 else
2008 {
2009 struct charset *charset;
2010 unsigned code;
2011 int dimension;
2012 int emacs_mule_id;
2013 unsigned char leading_codes[2];
2014
2015 charset = char_charset (c, charset_list, &code);
2016 if (! charset)
2017 {
2018 c = coding->default_char;
2019 if (ASCII_CHAR_P (c))
2020 {
2021 EMIT_ONE_ASCII_BYTE (c);
2022 continue;
2023 }
2024 charset = char_charset (c, charset_list, &code);
2025 }
2026 dimension = CHARSET_DIMENSION (charset);
2027 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2028 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2029 EMIT_ONE_BYTE (leading_codes[0]);
2030 if (leading_codes[1])
2031 EMIT_ONE_BYTE (leading_codes[1]);
2032 if (dimension == 1)
2033 EMIT_ONE_BYTE (code);
2034 else
2035 {
2036 EMIT_ONE_BYTE (code >> 8);
2037 EMIT_ONE_BYTE (code & 0xFF);
2038 }
2039 }
2040 }
2041 coding->result = CODING_RESULT_SUCCESS;
2042 coding->produced_char += produced_chars;
2043 coding->produced = dst - coding->destination;
2044 return 0;
2045 }
2046
2047 \f
2048 /*** 7. ISO2022 handlers ***/
2049
2050 /* The following note describes the coding system ISO2022 briefly.
2051 Since the intention of this note is to help understand the
2052 functions in this file, some parts are NOT ACCURATE or OVERLY
2053 SIMPLIFIED. For thorough understanding, please refer to the
2054 original document of ISO2022.
2055
2056 ISO2022 provides many mechanisms to encode several character sets
2057 in 7-bit and 8-bit environments. For 7-bite environments, all text
2058 is encoded using bytes less than 128. This may make the encoded
2059 text a little bit longer, but the text passes more easily through
2060 several gateways, some of which strip off MSB (Most Signigant Bit).
2061
2062 There are two kinds of character sets: control character set and
2063 graphic character set. The former contains control characters such
2064 as `newline' and `escape' to provide control functions (control
2065 functions are also provided by escape sequences). The latter
2066 contains graphic characters such as 'A' and '-'. Emacs recognizes
2067 two control character sets and many graphic character sets.
2068
2069 Graphic character sets are classified into one of the following
2070 four classes, according to the number of bytes (DIMENSION) and
2071 number of characters in one dimension (CHARS) of the set:
2072 - DIMENSION1_CHARS94
2073 - DIMENSION1_CHARS96
2074 - DIMENSION2_CHARS94
2075 - DIMENSION2_CHARS96
2076
2077 In addition, each character set is assigned an identification tag,
2078 unique for each set, called "final character" (denoted as <F>
2079 hereafter). The <F> of each character set is decided by ECMA(*)
2080 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2081 (0x30..0x3F are for private use only).
2082
2083 Note (*): ECMA = European Computer Manufacturers Association
2084
2085 Here are examples of graphic character set [NAME(<F>)]:
2086 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2087 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2088 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2089 o DIMENSION2_CHARS96 -- none for the moment
2090
2091 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2092 C0 [0x00..0x1F] -- control character plane 0
2093 GL [0x20..0x7F] -- graphic character plane 0
2094 C1 [0x80..0x9F] -- control character plane 1
2095 GR [0xA0..0xFF] -- graphic character plane 1
2096
2097 A control character set is directly designated and invoked to C0 or
2098 C1 by an escape sequence. The most common case is that:
2099 - ISO646's control character set is designated/invoked to C0, and
2100 - ISO6429's control character set is designated/invoked to C1,
2101 and usually these designations/invocations are omitted in encoded
2102 text. In a 7-bit environment, only C0 can be used, and a control
2103 character for C1 is encoded by an appropriate escape sequence to
2104 fit into the environment. All control characters for C1 are
2105 defined to have corresponding escape sequences.
2106
2107 A graphic character set is at first designated to one of four
2108 graphic registers (G0 through G3), then these graphic registers are
2109 invoked to GL or GR. These designations and invocations can be
2110 done independently. The most common case is that G0 is invoked to
2111 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2112 these invocations and designations are omitted in encoded text.
2113 In a 7-bit environment, only GL can be used.
2114
2115 When a graphic character set of CHARS94 is invoked to GL, codes
2116 0x20 and 0x7F of the GL area work as control characters SPACE and
2117 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2118 be used.
2119
2120 There are two ways of invocation: locking-shift and single-shift.
2121 With locking-shift, the invocation lasts until the next different
2122 invocation, whereas with single-shift, the invocation affects the
2123 following character only and doesn't affect the locking-shift
2124 state. Invocations are done by the following control characters or
2125 escape sequences:
2126
2127 ----------------------------------------------------------------------
2128 abbrev function cntrl escape seq description
2129 ----------------------------------------------------------------------
2130 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2131 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2132 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2133 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2134 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2135 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2136 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2137 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2138 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2139 ----------------------------------------------------------------------
2140 (*) These are not used by any known coding system.
2141
2142 Control characters for these functions are defined by macros
2143 ISO_CODE_XXX in `coding.h'.
2144
2145 Designations are done by the following escape sequences:
2146 ----------------------------------------------------------------------
2147 escape sequence description
2148 ----------------------------------------------------------------------
2149 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2150 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2151 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2152 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2153 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2154 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2155 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2156 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2157 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2158 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2159 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2160 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2161 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2162 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2163 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2164 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2165 ----------------------------------------------------------------------
2166
2167 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2168 of dimension 1, chars 94, and final character <F>, etc...
2169
2170 Note (*): Although these designations are not allowed in ISO2022,
2171 Emacs accepts them on decoding, and produces them on encoding
2172 CHARS96 character sets in a coding system which is characterized as
2173 7-bit environment, non-locking-shift, and non-single-shift.
2174
2175 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2176 '(' must be omitted. We refer to this as "short-form" hereafter.
2177
2178 Now you may notice that there are a lot of ways for encoding the
2179 same multilingual text in ISO2022. Actually, there exist many
2180 coding systems such as Compound Text (used in X11's inter client
2181 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2182 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2183 localized platforms), and all of these are variants of ISO2022.
2184
2185 In addition to the above, Emacs handles two more kinds of escape
2186 sequences: ISO6429's direction specification and Emacs' private
2187 sequence for specifying character composition.
2188
2189 ISO6429's direction specification takes the following form:
2190 o CSI ']' -- end of the current direction
2191 o CSI '0' ']' -- end of the current direction
2192 o CSI '1' ']' -- start of left-to-right text
2193 o CSI '2' ']' -- start of right-to-left text
2194 The control character CSI (0x9B: control sequence introducer) is
2195 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2196
2197 Character composition specification takes the following form:
2198 o ESC '0' -- start relative composition
2199 o ESC '1' -- end composition
2200 o ESC '2' -- start rule-base composition (*)
2201 o ESC '3' -- start relative composition with alternate chars (**)
2202 o ESC '4' -- start rule-base composition with alternate chars (**)
2203 Since these are not standard escape sequences of any ISO standard,
2204 the use of them for these meaning is restricted to Emacs only.
2205
2206 (*) This form is used only in Emacs 20.5 and the older versions,
2207 but the newer versions can safely decode it.
2208 (**) This form is used only in Emacs 21.1 and the newer versions,
2209 and the older versions can't decode it.
2210
2211 Here's a list of examples usages of these composition escape
2212 sequences (categorized by `enum composition_method').
2213
2214 COMPOSITION_RELATIVE:
2215 ESC 0 CHAR [ CHAR ] ESC 1
2216 COMPOSITOIN_WITH_RULE:
2217 ESC 2 CHAR [ RULE CHAR ] ESC 1
2218 COMPOSITION_WITH_ALTCHARS:
2219 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2220 COMPOSITION_WITH_RULE_ALTCHARS:
2221 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2222
2223 enum iso_code_class_type iso_code_class[256];
2224
2225 #define SAFE_CHARSET_P(coding, id) \
2226 ((id) <= (coding)->max_charset_id \
2227 && (coding)->safe_charsets[id] >= 0)
2228
2229
2230 #define SHIFT_OUT_OK(category) \
2231 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2232
2233 static void
2234 setup_iso_safe_charsets (attrs)
2235 Lisp_Object attrs;
2236 {
2237 Lisp_Object charset_list, safe_charsets;
2238 Lisp_Object request;
2239 Lisp_Object reg_usage;
2240 Lisp_Object tail;
2241 int reg94, reg96;
2242 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2243 int max_charset_id;
2244
2245 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2246 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2247 && ! EQ (charset_list, Viso_2022_charset_list))
2248 {
2249 CODING_ATTR_CHARSET_LIST (attrs)
2250 = charset_list = Viso_2022_charset_list;
2251 ASET (attrs, coding_attr_safe_charsets, Qnil);
2252 }
2253
2254 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2255 return;
2256
2257 max_charset_id = 0;
2258 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2259 {
2260 int id = XINT (XCAR (tail));
2261 if (max_charset_id < id)
2262 max_charset_id = id;
2263 }
2264
2265 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2266 make_number (255));
2267 request = AREF (attrs, coding_attr_iso_request);
2268 reg_usage = AREF (attrs, coding_attr_iso_usage);
2269 reg94 = XINT (XCAR (reg_usage));
2270 reg96 = XINT (XCDR (reg_usage));
2271
2272 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2273 {
2274 Lisp_Object id;
2275 Lisp_Object reg;
2276 struct charset *charset;
2277
2278 id = XCAR (tail);
2279 charset = CHARSET_FROM_ID (XINT (id));
2280 reg = Fcdr (Fassq (id, request));
2281 if (! NILP (reg))
2282 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2283 else if (charset->iso_chars_96)
2284 {
2285 if (reg96 < 4)
2286 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2287 }
2288 else
2289 {
2290 if (reg94 < 4)
2291 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2292 }
2293 }
2294 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2295 }
2296
2297
2298 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2299 Check if a text is encoded in ISO2022. If it is, returns an
2300 integer in which appropriate flag bits any of:
2301 CATEGORY_MASK_ISO_7
2302 CATEGORY_MASK_ISO_7_TIGHT
2303 CATEGORY_MASK_ISO_8_1
2304 CATEGORY_MASK_ISO_8_2
2305 CATEGORY_MASK_ISO_7_ELSE
2306 CATEGORY_MASK_ISO_8_ELSE
2307 are set. If a code which should never appear in ISO2022 is found,
2308 returns 0. */
2309
2310 static int
2311 detect_coding_iso_2022 (coding, mask)
2312 struct coding_system *coding;
2313 int *mask;
2314 {
2315 unsigned char *src = coding->source, *src_base = src;
2316 unsigned char *src_end = coding->source + coding->src_bytes;
2317 int multibytep = coding->src_multibyte;
2318 int mask_iso = CATEGORY_MASK_ISO;
2319 int mask_found = 0, mask_8bit_found = 0;
2320 int reg[4], shift_out = 0, single_shifting = 0;
2321 int id;
2322 int c, c1;
2323 int consumed_chars = 0;
2324 int i;
2325
2326 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2327 {
2328 struct coding_system *this = &(coding_categories[i]);
2329 Lisp_Object attrs, val;
2330
2331 attrs = CODING_ID_ATTRS (this->id);
2332 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2333 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2334 setup_iso_safe_charsets (attrs);
2335 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2336 this->max_charset_id = XSTRING (val)->size - 1;
2337 this->safe_charsets = (char *) XSTRING (val)->data;
2338 }
2339
2340 /* A coding system of this category is always ASCII compatible. */
2341 src += coding->head_ascii;
2342
2343 reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2344 while (mask_iso && src < src_end)
2345 {
2346 ONE_MORE_BYTE (c);
2347 switch (c)
2348 {
2349 case ISO_CODE_ESC:
2350 if (inhibit_iso_escape_detection)
2351 break;
2352 single_shifting = 0;
2353 ONE_MORE_BYTE (c);
2354 if (c >= '(' && c <= '/')
2355 {
2356 /* Designation sequence for a charset of dimension 1. */
2357 ONE_MORE_BYTE (c1);
2358 if (c1 < ' ' || c1 >= 0x80
2359 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2360 /* Invalid designation sequence. Just ignore. */
2361 break;
2362 reg[(c - '(') % 4] = id;
2363 }
2364 else if (c == '$')
2365 {
2366 /* Designation sequence for a charset of dimension 2. */
2367 ONE_MORE_BYTE (c);
2368 if (c >= '@' && c <= 'B')
2369 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2370 reg[0] = id = iso_charset_table[1][0][c];
2371 else if (c >= '(' && c <= '/')
2372 {
2373 ONE_MORE_BYTE (c1);
2374 if (c1 < ' ' || c1 >= 0x80
2375 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2376 /* Invalid designation sequence. Just ignore. */
2377 break;
2378 reg[(c - '(') % 4] = id;
2379 }
2380 else
2381 /* Invalid designation sequence. Just ignore. */
2382 break;
2383 }
2384 else if (c == 'N' || c == 'O')
2385 {
2386 /* ESC <Fe> for SS2 or SS3. */
2387 mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
2388 break;
2389 }
2390 else if (c >= '0' && c <= '4')
2391 {
2392 /* ESC <Fp> for start/end composition. */
2393 mask_found |= CATEGORY_MASK_ISO;
2394 break;
2395 }
2396 else
2397 {
2398 /* Invalid escape sequence. */
2399 mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2400 break;
2401 }
2402
2403 /* We found a valid designation sequence for CHARSET. */
2404 mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2405 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2406 id))
2407 mask_found |= CATEGORY_MASK_ISO_7;
2408 else
2409 mask_iso &= ~CATEGORY_MASK_ISO_7;
2410 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2411 id))
2412 mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
2413 else
2414 mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2415 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2416 id))
2417 mask_found |= CATEGORY_MASK_ISO_7_ELSE;
2418 else
2419 mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2420 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2421 id))
2422 mask_found |= CATEGORY_MASK_ISO_8_ELSE;
2423 else
2424 mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
2425 break;
2426
2427 case ISO_CODE_SO:
2428 if (inhibit_iso_escape_detection)
2429 break;
2430 single_shifting = 0;
2431 if (shift_out == 0
2432 && (reg[1] >= 0
2433 || SHIFT_OUT_OK (coding_category_iso_7_else)
2434 || SHIFT_OUT_OK (coding_category_iso_8_else)))
2435 {
2436 /* Locking shift out. */
2437 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2438 mask_found |= CATEGORY_MASK_ISO_ELSE;
2439 }
2440 break;
2441
2442 case ISO_CODE_SI:
2443 if (inhibit_iso_escape_detection)
2444 break;
2445 single_shifting = 0;
2446 if (shift_out == 1)
2447 {
2448 /* Locking shift in. */
2449 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2450 mask_found |= CATEGORY_MASK_ISO_ELSE;
2451 }
2452 break;
2453
2454 case ISO_CODE_CSI:
2455 single_shifting = 0;
2456 case ISO_CODE_SS2:
2457 case ISO_CODE_SS3:
2458 {
2459 int newmask = CATEGORY_MASK_ISO_8_ELSE;
2460
2461 if (inhibit_iso_escape_detection)
2462 break;
2463 if (c != ISO_CODE_CSI)
2464 {
2465 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2466 & CODING_ISO_FLAG_SINGLE_SHIFT)
2467 newmask |= CATEGORY_MASK_ISO_8_1;
2468 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2469 & CODING_ISO_FLAG_SINGLE_SHIFT)
2470 newmask |= CATEGORY_MASK_ISO_8_2;
2471 single_shifting = 1;
2472 }
2473 if (VECTORP (Vlatin_extra_code_table)
2474 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2475 {
2476 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2477 & CODING_ISO_FLAG_LATIN_EXTRA)
2478 newmask |= CATEGORY_MASK_ISO_8_1;
2479 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2480 & CODING_ISO_FLAG_LATIN_EXTRA)
2481 newmask |= CATEGORY_MASK_ISO_8_2;
2482 }
2483 mask_iso &= newmask;
2484 mask_found |= newmask;
2485 }
2486 break;
2487
2488 default:
2489 if (c < 0x80)
2490 {
2491 single_shifting = 0;
2492 break;
2493 }
2494 else if (c < 0xA0)
2495 {
2496 single_shifting = 0;
2497 mask_8bit_found = 1;
2498 if (VECTORP (Vlatin_extra_code_table)
2499 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2500 {
2501 int newmask = 0;
2502
2503 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2504 & CODING_ISO_FLAG_LATIN_EXTRA)
2505 newmask |= CATEGORY_MASK_ISO_8_1;
2506 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2507 & CODING_ISO_FLAG_LATIN_EXTRA)
2508 newmask |= CATEGORY_MASK_ISO_8_2;
2509 mask_iso &= newmask;
2510 mask_found |= newmask;
2511 }
2512 else
2513 return 0;
2514 }
2515 else
2516 {
2517 mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2518 | CATEGORY_MASK_ISO_7_ELSE);
2519 mask_found |= CATEGORY_MASK_ISO_8_1;
2520 mask_8bit_found = 1;
2521 /* Check the length of succeeding codes of the range
2522 0xA0..0FF. If the byte length is odd, we exclude
2523 CATEGORY_MASK_ISO_8_2. We can check this only
2524 when we are not single shifting. */
2525 if (!single_shifting
2526 && mask_iso & CATEGORY_MASK_ISO_8_2)
2527 {
2528 int i = 1;
2529 while (src < src_end)
2530 {
2531 ONE_MORE_BYTE (c);
2532 if (c < 0xA0)
2533 break;
2534 i++;
2535 }
2536
2537 if (i & 1 && src < src_end)
2538 mask_iso &= ~CATEGORY_MASK_ISO_8_2;
2539 else
2540 mask_found |= CATEGORY_MASK_ISO_8_2;
2541 }
2542 }
2543 break;
2544 }
2545 }
2546 no_more_source:
2547 if (!mask_iso)
2548 {
2549 *mask &= ~CATEGORY_MASK_ISO;
2550 return 0;
2551 }
2552 if (!mask_found)
2553 return 0;
2554 *mask &= mask_iso & mask_found;
2555 if (! mask_8bit_found)
2556 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2557 return 1;
2558 }
2559
2560
2561 /* Set designation state into CODING. */
2562 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2563 do { \
2564 int id, prev; \
2565 \
2566 if (final < '0' || final >= 128 \
2567 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2568 || !SAFE_CHARSET_P (coding, id)) \
2569 { \
2570 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2571 goto invalid_code; \
2572 } \
2573 prev = CODING_ISO_DESIGNATION (coding, reg); \
2574 if (id == charset_jisx0201_roman) \
2575 { \
2576 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2577 id = charset_ascii; \
2578 } \
2579 else if (id == charset_jisx0208_1978) \
2580 { \
2581 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2582 id = charset_jisx0208; \
2583 } \
2584 CODING_ISO_DESIGNATION (coding, reg) = id; \
2585 /* If there was an invalid designation to REG previously, and this \
2586 designation is ASCII to REG, we should keep this designation \
2587 sequence. */ \
2588 if (prev == -2 && id == charset_ascii) \
2589 goto invalid_code; \
2590 } while (0)
2591
2592
2593 #define MAYBE_FINISH_COMPOSITION() \
2594 do { \
2595 int i; \
2596 if (composition_state == COMPOSING_NO) \
2597 break; \
2598 /* It is assured that we have enough room for producing \
2599 characters stored in the table `components'. */ \
2600 if (charbuf + component_idx > charbuf_end) \
2601 goto no_more_source; \
2602 composition_state = COMPOSING_NO; \
2603 if (method == COMPOSITION_RELATIVE \
2604 || method == COMPOSITION_WITH_ALTCHARS) \
2605 { \
2606 for (i = 0; i < component_idx; i++) \
2607 *charbuf++ = components[i]; \
2608 char_offset += component_idx; \
2609 } \
2610 else \
2611 { \
2612 for (i = 0; i < component_idx; i += 2) \
2613 *charbuf++ = components[i]; \
2614 char_offset += (component_idx / 2) + 1; \
2615 } \
2616 } while (0)
2617
2618
2619 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2620 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2621 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2622 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2623 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2624 */
2625
2626 #define DECODE_COMPOSITION_START(c1) \
2627 do { \
2628 if (c1 == '0' \
2629 && composition_state == COMPOSING_COMPONENT_RULE) \
2630 { \
2631 component_len = component_idx; \
2632 composition_state = COMPOSING_CHAR; \
2633 } \
2634 else \
2635 { \
2636 unsigned char *p; \
2637 \
2638 MAYBE_FINISH_COMPOSITION (); \
2639 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2640 goto no_more_source; \
2641 for (p = src; p < src_end - 1; p++) \
2642 if (*p == ISO_CODE_ESC && p[1] == '1') \
2643 break; \
2644 if (p == src_end - 1) \
2645 { \
2646 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2647 goto invalid_code; \
2648 goto no_more_source; \
2649 } \
2650 \
2651 /* This is surely the start of a composition. */ \
2652 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2653 : c1 == '2' ? COMPOSITION_WITH_RULE \
2654 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2655 : COMPOSITION_WITH_RULE_ALTCHARS); \
2656 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2657 : COMPOSING_COMPONENT_CHAR); \
2658 component_idx = component_len = 0; \
2659 } \
2660 } while (0)
2661
2662
2663 /* Handle compositoin end sequence ESC 1. */
2664
2665 #define DECODE_COMPOSITION_END() \
2666 do { \
2667 int nchars = (component_len > 0 ? component_idx - component_len \
2668 : method == COMPOSITION_RELATIVE ? component_idx \
2669 : (component_idx + 1) / 2); \
2670 int i; \
2671 int *saved_charbuf = charbuf; \
2672 \
2673 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2674 if (method != COMPOSITION_RELATIVE) \
2675 { \
2676 if (component_len == 0) \
2677 for (i = 0; i < component_idx; i++) \
2678 *charbuf++ = components[i]; \
2679 else \
2680 for (i = 0; i < component_len; i++) \
2681 *charbuf++ = components[i]; \
2682 *saved_charbuf = saved_charbuf - charbuf; \
2683 } \
2684 if (method == COMPOSITION_WITH_RULE) \
2685 for (i = 0; i < component_idx; i += 2, char_offset++) \
2686 *charbuf++ = components[i]; \
2687 else \
2688 for (i = component_len; i < component_idx; i++, char_offset++) \
2689 *charbuf++ = components[i]; \
2690 coding->annotated = 1; \
2691 composition_state = COMPOSING_NO; \
2692 } while (0)
2693
2694
2695 /* Decode a composition rule from the byte C1 (and maybe one more byte
2696 from SRC) and store one encoded composition rule in
2697 coding->cmp_data. */
2698
2699 #define DECODE_COMPOSITION_RULE(c1) \
2700 do { \
2701 (c1) -= 32; \
2702 if (c1 < 81) /* old format (before ver.21) */ \
2703 { \
2704 int gref = (c1) / 9; \
2705 int nref = (c1) % 9; \
2706 if (gref == 4) gref = 10; \
2707 if (nref == 4) nref = 10; \
2708 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2709 } \
2710 else if (c1 < 93) /* new format (after ver.21) */ \
2711 { \
2712 ONE_MORE_BYTE (c2); \
2713 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2714 } \
2715 else \
2716 c1 = 0; \
2717 } while (0)
2718
2719
2720 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2721
2722 static void
2723 decode_coding_iso_2022 (coding)
2724 struct coding_system *coding;
2725 {
2726 unsigned char *src = coding->source + coding->consumed;
2727 unsigned char *src_end = coding->source + coding->src_bytes;
2728 unsigned char *src_base;
2729 int *charbuf = coding->charbuf;
2730 int *charbuf_end = charbuf + coding->charbuf_size - 4;
2731 int consumed_chars = 0, consumed_chars_base;
2732 int char_offset = 0;
2733 int multibytep = coding->src_multibyte;
2734 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2735 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2736 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2737 struct charset *charset;
2738 int c;
2739 /* For handling composition sequence. */
2740 #define COMPOSING_NO 0
2741 #define COMPOSING_CHAR 1
2742 #define COMPOSING_RULE 2
2743 #define COMPOSING_COMPONENT_CHAR 3
2744 #define COMPOSING_COMPONENT_RULE 4
2745
2746 int composition_state = COMPOSING_NO;
2747 enum composition_method method;
2748 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2749 int component_idx;
2750 int component_len;
2751 Lisp_Object attrs, eol_type, charset_list;
2752
2753 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2754 setup_iso_safe_charsets (attrs);
2755
2756 while (1)
2757 {
2758 int c1, c2;
2759
2760 src_base = src;
2761 consumed_chars_base = consumed_chars;
2762
2763 if (charbuf >= charbuf_end)
2764 break;
2765
2766 ONE_MORE_BYTE (c1);
2767
2768 /* We produce no character or one character. */
2769 switch (iso_code_class [c1])
2770 {
2771 case ISO_0x20_or_0x7F:
2772 if (composition_state != COMPOSING_NO)
2773 {
2774 if (composition_state == COMPOSING_RULE
2775 || composition_state == COMPOSING_COMPONENT_RULE)
2776 {
2777 DECODE_COMPOSITION_RULE (c1);
2778 components[component_idx++] = c1;
2779 composition_state--;
2780 continue;
2781 }
2782 }
2783 if (charset_id_0 < 0
2784 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2785 /* This is SPACE or DEL. */
2786 charset = CHARSET_FROM_ID (charset_ascii);
2787 else
2788 charset = CHARSET_FROM_ID (charset_id_0);
2789 break;
2790
2791 case ISO_graphic_plane_0:
2792 if (composition_state != COMPOSING_NO)
2793 {
2794 if (composition_state == COMPOSING_RULE
2795 || composition_state == COMPOSING_COMPONENT_RULE)
2796 {
2797 DECODE_COMPOSITION_RULE (c1);
2798 components[component_idx++] = c1;
2799 composition_state--;
2800 continue;
2801 }
2802 }
2803 charset = CHARSET_FROM_ID (charset_id_0);
2804 break;
2805
2806 case ISO_0xA0_or_0xFF:
2807 if (charset_id_1 < 0
2808 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2809 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2810 goto invalid_code;
2811 /* This is a graphic character, we fall down ... */
2812
2813 case ISO_graphic_plane_1:
2814 if (charset_id_1 < 0)
2815 goto invalid_code;
2816 charset = CHARSET_FROM_ID (charset_id_1);
2817 break;
2818
2819 case ISO_carriage_return:
2820 if (c1 == '\r')
2821 {
2822 if (EQ (eol_type, Qdos))
2823 {
2824 if (src == src_end)
2825 goto no_more_source;
2826 if (*src == '\n')
2827 ONE_MORE_BYTE (c1);
2828 }
2829 else if (EQ (eol_type, Qmac))
2830 c1 = '\n';
2831 }
2832 /* fall through */
2833
2834 case ISO_control_0:
2835 MAYBE_FINISH_COMPOSITION ();
2836 charset = CHARSET_FROM_ID (charset_ascii);
2837 break;
2838
2839 case ISO_control_1:
2840 MAYBE_FINISH_COMPOSITION ();
2841 goto invalid_code;
2842
2843 case ISO_shift_out:
2844 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2845 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2846 goto invalid_code;
2847 CODING_ISO_INVOCATION (coding, 0) = 1;
2848 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2849 continue;
2850
2851 case ISO_shift_in:
2852 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2853 goto invalid_code;
2854 CODING_ISO_INVOCATION (coding, 0) = 0;
2855 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2856 continue;
2857
2858 case ISO_single_shift_2_7:
2859 case ISO_single_shift_2:
2860 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2861 goto invalid_code;
2862 /* SS2 is handled as an escape sequence of ESC 'N' */
2863 c1 = 'N';
2864 goto label_escape_sequence;
2865
2866 case ISO_single_shift_3:
2867 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2868 goto invalid_code;
2869 /* SS2 is handled as an escape sequence of ESC 'O' */
2870 c1 = 'O';
2871 goto label_escape_sequence;
2872
2873 case ISO_control_sequence_introducer:
2874 /* CSI is handled as an escape sequence of ESC '[' ... */
2875 c1 = '[';
2876 goto label_escape_sequence;
2877
2878 case ISO_escape:
2879 ONE_MORE_BYTE (c1);
2880 label_escape_sequence:
2881 /* Escape sequences handled here are invocation,
2882 designation, direction specification, and character
2883 composition specification. */
2884 switch (c1)
2885 {
2886 case '&': /* revision of following character set */
2887 ONE_MORE_BYTE (c1);
2888 if (!(c1 >= '@' && c1 <= '~'))
2889 goto invalid_code;
2890 ONE_MORE_BYTE (c1);
2891 if (c1 != ISO_CODE_ESC)
2892 goto invalid_code;
2893 ONE_MORE_BYTE (c1);
2894 goto label_escape_sequence;
2895
2896 case '$': /* designation of 2-byte character set */
2897 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2898 goto invalid_code;
2899 ONE_MORE_BYTE (c1);
2900 if (c1 >= '@' && c1 <= 'B')
2901 { /* designation of JISX0208.1978, GB2312.1980,
2902 or JISX0208.1980 */
2903 DECODE_DESIGNATION (0, 2, 0, c1);
2904 }
2905 else if (c1 >= 0x28 && c1 <= 0x2B)
2906 { /* designation of DIMENSION2_CHARS94 character set */
2907 ONE_MORE_BYTE (c2);
2908 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2909 }
2910 else if (c1 >= 0x2C && c1 <= 0x2F)
2911 { /* designation of DIMENSION2_CHARS96 character set */
2912 ONE_MORE_BYTE (c2);
2913 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2914 }
2915 else
2916 goto invalid_code;
2917 /* We must update these variables now. */
2918 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2919 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2920 continue;
2921
2922 case 'n': /* invocation of locking-shift-2 */
2923 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2924 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2925 goto invalid_code;
2926 CODING_ISO_INVOCATION (coding, 0) = 2;
2927 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2928 continue;
2929
2930 case 'o': /* invocation of locking-shift-3 */
2931 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2932 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2933 goto invalid_code;
2934 CODING_ISO_INVOCATION (coding, 0) = 3;
2935 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2936 continue;
2937
2938 case 'N': /* invocation of single-shift-2 */
2939 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2940 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2941 goto invalid_code;
2942 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2943 ONE_MORE_BYTE (c1);
2944 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2945 goto invalid_code;
2946 break;
2947
2948 case 'O': /* invocation of single-shift-3 */
2949 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2950 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2951 goto invalid_code;
2952 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2953 ONE_MORE_BYTE (c1);
2954 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2955 goto invalid_code;
2956 break;
2957
2958 case '0': case '2': case '3': case '4': /* start composition */
2959 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2960 goto invalid_code;
2961 DECODE_COMPOSITION_START (c1);
2962 continue;
2963
2964 case '1': /* end composition */
2965 if (composition_state == COMPOSING_NO)
2966 goto invalid_code;
2967 DECODE_COMPOSITION_END ();
2968 continue;
2969
2970 case '[': /* specification of direction */
2971 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2972 goto invalid_code;
2973 /* For the moment, nested direction is not supported.
2974 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2975 left-to-right, and nozero means right-to-left. */
2976 ONE_MORE_BYTE (c1);
2977 switch (c1)
2978 {
2979 case ']': /* end of the current direction */
2980 coding->mode &= ~CODING_MODE_DIRECTION;
2981
2982 case '0': /* end of the current direction */
2983 case '1': /* start of left-to-right direction */
2984 ONE_MORE_BYTE (c1);
2985 if (c1 == ']')
2986 coding->mode &= ~CODING_MODE_DIRECTION;
2987 else
2988 goto invalid_code;
2989 break;
2990
2991 case '2': /* start of right-to-left direction */
2992 ONE_MORE_BYTE (c1);
2993 if (c1 == ']')
2994 coding->mode |= CODING_MODE_DIRECTION;
2995 else
2996 goto invalid_code;
2997 break;
2998
2999 default:
3000 goto invalid_code;
3001 }
3002 continue;
3003
3004 default:
3005 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3006 goto invalid_code;
3007 if (c1 >= 0x28 && c1 <= 0x2B)
3008 { /* designation of DIMENSION1_CHARS94 character set */
3009 ONE_MORE_BYTE (c2);
3010 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3011 }
3012 else if (c1 >= 0x2C && c1 <= 0x2F)
3013 { /* designation of DIMENSION1_CHARS96 character set */
3014 ONE_MORE_BYTE (c2);
3015 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3016 }
3017 else
3018 goto invalid_code;
3019 /* We must update these variables now. */
3020 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3021 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3022 continue;
3023 }
3024 }
3025
3026 /* Now we know CHARSET and 1st position code C1 of a character.
3027 Produce a decoded character while getting 2nd position code
3028 C2 if necessary. */
3029 c1 &= 0x7F;
3030 if (CHARSET_DIMENSION (charset) > 1)
3031 {
3032 ONE_MORE_BYTE (c2);
3033 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3034 /* C2 is not in a valid range. */
3035 goto invalid_code;
3036 c1 = (c1 << 8) | (c2 & 0x7F);
3037 if (CHARSET_DIMENSION (charset) > 2)
3038 {
3039 ONE_MORE_BYTE (c2);
3040 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3041 /* C2 is not in a valid range. */
3042 goto invalid_code;
3043 c1 = (c1 << 8) | (c2 & 0x7F);
3044 }
3045 }
3046
3047 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3048 if (c < 0)
3049 {
3050 MAYBE_FINISH_COMPOSITION ();
3051 for (; src_base < src; src_base++, char_offset++)
3052 {
3053 if (ASCII_BYTE_P (*src_base))
3054 *charbuf++ = *src_base;
3055 else
3056 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3057 }
3058 }
3059 else if (composition_state == COMPOSING_NO)
3060 {
3061 *charbuf++ = c;
3062 char_offset++;
3063 }
3064 else
3065 {
3066 components[component_idx++] = c;
3067 if (method == COMPOSITION_WITH_RULE
3068 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3069 && composition_state == COMPOSING_COMPONENT_CHAR))
3070 composition_state++;
3071 }
3072 continue;
3073
3074 invalid_code:
3075 MAYBE_FINISH_COMPOSITION ();
3076 src = src_base;
3077 consumed_chars = consumed_chars_base;
3078 ONE_MORE_BYTE (c);
3079 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3080 coding->errors++;
3081 }
3082
3083 no_more_source:
3084 coding->consumed_char += consumed_chars_base;
3085 coding->consumed = src_base - coding->source;
3086 coding->charbuf_used = charbuf - coding->charbuf;
3087 }
3088
3089
3090 /* ISO2022 encoding stuff. */
3091
3092 /*
3093 It is not enough to say just "ISO2022" on encoding, we have to
3094 specify more details. In Emacs, each coding system of ISO2022
3095 variant has the following specifications:
3096 1. Initial designation to G0 thru G3.
3097 2. Allows short-form designation?
3098 3. ASCII should be designated to G0 before control characters?
3099 4. ASCII should be designated to G0 at end of line?
3100 5. 7-bit environment or 8-bit environment?
3101 6. Use locking-shift?
3102 7. Use Single-shift?
3103 And the following two are only for Japanese:
3104 8. Use ASCII in place of JIS0201-1976-Roman?
3105 9. Use JISX0208-1983 in place of JISX0208-1978?
3106 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3107 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3108 details.
3109 */
3110
3111 /* Produce codes (escape sequence) for designating CHARSET to graphic
3112 register REG at DST, and increment DST. If <final-char> of CHARSET is
3113 '@', 'A', or 'B' and the coding system CODING allows, produce
3114 designation sequence of short-form. */
3115
3116 #define ENCODE_DESIGNATION(charset, reg, coding) \
3117 do { \
3118 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3119 char *intermediate_char_94 = "()*+"; \
3120 char *intermediate_char_96 = ",-./"; \
3121 int revision = -1; \
3122 int c; \
3123 \
3124 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3125 revision = XINT (CHARSET_ISO_REVISION (charset)); \
3126 \
3127 if (revision >= 0) \
3128 { \
3129 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3130 EMIT_ONE_BYTE ('@' + revision); \
3131 } \
3132 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3133 if (CHARSET_DIMENSION (charset) == 1) \
3134 { \
3135 if (! CHARSET_ISO_CHARS_96 (charset)) \
3136 c = intermediate_char_94[reg]; \
3137 else \
3138 c = intermediate_char_96[reg]; \
3139 EMIT_ONE_ASCII_BYTE (c); \
3140 } \
3141 else \
3142 { \
3143 EMIT_ONE_ASCII_BYTE ('$'); \
3144 if (! CHARSET_ISO_CHARS_96 (charset)) \
3145 { \
3146 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3147 || reg != 0 \
3148 || final_char < '@' || final_char > 'B') \
3149 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3150 } \
3151 else \
3152 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3153 } \
3154 EMIT_ONE_ASCII_BYTE (final_char); \
3155 \
3156 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3157 } while (0)
3158
3159
3160 /* The following two macros produce codes (control character or escape
3161 sequence) for ISO2022 single-shift functions (single-shift-2 and
3162 single-shift-3). */
3163
3164 #define ENCODE_SINGLE_SHIFT_2 \
3165 do { \
3166 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3167 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3168 else \
3169 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3170 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3171 } while (0)
3172
3173
3174 #define ENCODE_SINGLE_SHIFT_3 \
3175 do { \
3176 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3177 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3178 else \
3179 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3180 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3181 } while (0)
3182
3183
3184 /* The following four macros produce codes (control character or
3185 escape sequence) for ISO2022 locking-shift functions (shift-in,
3186 shift-out, locking-shift-2, and locking-shift-3). */
3187
3188 #define ENCODE_SHIFT_IN \
3189 do { \
3190 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3191 CODING_ISO_INVOCATION (coding, 0) = 0; \
3192 } while (0)
3193
3194
3195 #define ENCODE_SHIFT_OUT \
3196 do { \
3197 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3198 CODING_ISO_INVOCATION (coding, 0) = 1; \
3199 } while (0)
3200
3201
3202 #define ENCODE_LOCKING_SHIFT_2 \
3203 do { \
3204 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3205 CODING_ISO_INVOCATION (coding, 0) = 2; \
3206 } while (0)
3207
3208
3209 #define ENCODE_LOCKING_SHIFT_3 \
3210 do { \
3211 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3212 CODING_ISO_INVOCATION (coding, 0) = 3; \
3213 } while (0)
3214
3215
3216 /* Produce codes for a DIMENSION1 character whose character set is
3217 CHARSET and whose position-code is C1. Designation and invocation
3218 sequences are also produced in advance if necessary. */
3219
3220 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3221 do { \
3222 int id = CHARSET_ID (charset); \
3223 \
3224 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3225 && id == charset_ascii) \
3226 { \
3227 id = charset_jisx0201_roman; \
3228 charset = CHARSET_FROM_ID (id); \
3229 } \
3230 \
3231 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3232 { \
3233 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3234 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3235 else \
3236 EMIT_ONE_BYTE (c1 | 0x80); \
3237 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3238 break; \
3239 } \
3240 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3241 { \
3242 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3243 break; \
3244 } \
3245 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3246 { \
3247 EMIT_ONE_BYTE (c1 | 0x80); \
3248 break; \
3249 } \
3250 else \
3251 /* Since CHARSET is not yet invoked to any graphic planes, we \
3252 must invoke it, or, at first, designate it to some graphic \
3253 register. Then repeat the loop to actually produce the \
3254 character. */ \
3255 dst = encode_invocation_designation (charset, coding, dst, \
3256 &produced_chars); \
3257 } while (1)
3258
3259
3260 /* Produce codes for a DIMENSION2 character whose character set is
3261 CHARSET and whose position-codes are C1 and C2. Designation and
3262 invocation codes are also produced in advance if necessary. */
3263
3264 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3265 do { \
3266 int id = CHARSET_ID (charset); \
3267 \
3268 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3269 && id == charset_jisx0208) \
3270 { \
3271 id = charset_jisx0208_1978; \
3272 charset = CHARSET_FROM_ID (id); \
3273 } \
3274 \
3275 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3276 { \
3277 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3278 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3279 else \
3280 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3281 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3282 break; \
3283 } \
3284 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3285 { \
3286 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3287 break; \
3288 } \
3289 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3290 { \
3291 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3292 break; \
3293 } \
3294 else \
3295 /* Since CHARSET is not yet invoked to any graphic planes, we \
3296 must invoke it, or, at first, designate it to some graphic \
3297 register. Then repeat the loop to actually produce the \
3298 character. */ \
3299 dst = encode_invocation_designation (charset, coding, dst, \
3300 &produced_chars); \
3301 } while (1)
3302
3303
3304 #define ENCODE_ISO_CHARACTER(charset, c) \
3305 do { \
3306 int code = ENCODE_CHAR ((charset),(c)); \
3307 \
3308 if (CHARSET_DIMENSION (charset) == 1) \
3309 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3310 else \
3311 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3312 } while (0)
3313
3314
3315 /* Produce designation and invocation codes at a place pointed by DST
3316 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3317 Return new DST. */
3318
3319 unsigned char *
3320 encode_invocation_designation (charset, coding, dst, p_nchars)
3321 struct charset *charset;
3322 struct coding_system *coding;
3323 unsigned char *dst;
3324 int *p_nchars;
3325 {
3326 int multibytep = coding->dst_multibyte;
3327 int produced_chars = *p_nchars;
3328 int reg; /* graphic register number */
3329 int id = CHARSET_ID (charset);
3330
3331 /* At first, check designations. */
3332 for (reg = 0; reg < 4; reg++)
3333 if (id == CODING_ISO_DESIGNATION (coding, reg))
3334 break;
3335
3336 if (reg >= 4)
3337 {
3338 /* CHARSET is not yet designated to any graphic registers. */
3339 /* At first check the requested designation. */
3340 reg = CODING_ISO_REQUEST (coding, id);
3341 if (reg < 0)
3342 /* Since CHARSET requests no special designation, designate it
3343 to graphic register 0. */
3344 reg = 0;
3345
3346 ENCODE_DESIGNATION (charset, reg, coding);
3347 }
3348
3349 if (CODING_ISO_INVOCATION (coding, 0) != reg
3350 && CODING_ISO_INVOCATION (coding, 1) != reg)
3351 {
3352 /* Since the graphic register REG is not invoked to any graphic
3353 planes, invoke it to graphic plane 0. */
3354 switch (reg)
3355 {
3356 case 0: /* graphic register 0 */
3357 ENCODE_SHIFT_IN;
3358 break;
3359
3360 case 1: /* graphic register 1 */
3361 ENCODE_SHIFT_OUT;
3362 break;
3363
3364 case 2: /* graphic register 2 */
3365 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3366 ENCODE_SINGLE_SHIFT_2;
3367 else
3368 ENCODE_LOCKING_SHIFT_2;
3369 break;
3370
3371 case 3: /* graphic register 3 */
3372 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3373 ENCODE_SINGLE_SHIFT_3;
3374 else
3375 ENCODE_LOCKING_SHIFT_3;
3376 break;
3377 }
3378 }
3379
3380 *p_nchars = produced_chars;
3381 return dst;
3382 }
3383
3384 /* The following three macros produce codes for indicating direction
3385 of text. */
3386 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3387 do { \
3388 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3389 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3390 else \
3391 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3392 } while (0)
3393
3394
3395 #define ENCODE_DIRECTION_R2L() \
3396 do { \
3397 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3398 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3399 } while (0)
3400
3401
3402 #define ENCODE_DIRECTION_L2R() \
3403 do { \
3404 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3405 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3406 } while (0)
3407
3408
3409 /* Produce codes for designation and invocation to reset the graphic
3410 planes and registers to initial state. */
3411 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3412 do { \
3413 int reg; \
3414 struct charset *charset; \
3415 \
3416 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3417 ENCODE_SHIFT_IN; \
3418 for (reg = 0; reg < 4; reg++) \
3419 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3420 && (CODING_ISO_DESIGNATION (coding, reg) \
3421 != CODING_ISO_INITIAL (coding, reg))) \
3422 { \
3423 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3424 ENCODE_DESIGNATION (charset, reg, coding); \
3425 } \
3426 } while (0)
3427
3428
3429 /* Produce designation sequences of charsets in the line started from
3430 SRC to a place pointed by DST, and return updated DST.
3431
3432 If the current block ends before any end-of-line, we may fail to
3433 find all the necessary designations. */
3434
3435 static unsigned char *
3436 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3437 struct coding_system *coding;
3438 int *charbuf, *charbuf_end;
3439 unsigned char *dst;
3440 {
3441 struct charset *charset;
3442 /* Table of charsets to be designated to each graphic register. */
3443 int r[4];
3444 int c, found = 0, reg;
3445 int produced_chars = 0;
3446 int multibytep = coding->dst_multibyte;
3447 Lisp_Object attrs;
3448 Lisp_Object charset_list;
3449
3450 attrs = CODING_ID_ATTRS (coding->id);
3451 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3452 if (EQ (charset_list, Qiso_2022))
3453 charset_list = Viso_2022_charset_list;
3454
3455 for (reg = 0; reg < 4; reg++)
3456 r[reg] = -1;
3457
3458 while (found < 4)
3459 {
3460 int id;
3461
3462 c = *charbuf++;
3463 if (c == '\n')
3464 break;
3465 charset = char_charset (c, charset_list, NULL);
3466 id = CHARSET_ID (charset);
3467 reg = CODING_ISO_REQUEST (coding, id);
3468 if (reg >= 0 && r[reg] < 0)
3469 {
3470 found++;
3471 r[reg] = id;
3472 }
3473 }
3474
3475 if (found)
3476 {
3477 for (reg = 0; reg < 4; reg++)
3478 if (r[reg] >= 0
3479 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3480 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3481 }
3482
3483 return dst;
3484 }
3485
3486 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3487
3488 static int
3489 encode_coding_iso_2022 (coding)
3490 struct coding_system *coding;
3491 {
3492 int multibytep = coding->dst_multibyte;
3493 int *charbuf = coding->charbuf;
3494 int *charbuf_end = charbuf + coding->charbuf_used;
3495 unsigned char *dst = coding->destination + coding->produced;
3496 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3497 int safe_room = 16;
3498 int bol_designation
3499 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3500 && CODING_ISO_BOL (coding));
3501 int produced_chars = 0;
3502 Lisp_Object attrs, eol_type, charset_list;
3503 int ascii_compatible;
3504 int c;
3505
3506 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3507 setup_iso_safe_charsets (attrs);
3508 coding->safe_charsets
3509 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3510
3511 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3512
3513 while (charbuf < charbuf_end)
3514 {
3515 ASSURE_DESTINATION (safe_room);
3516
3517 if (bol_designation)
3518 {
3519 unsigned char *dst_prev = dst;
3520
3521 /* We have to produce designation sequences if any now. */
3522 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3523 bol_designation = 0;
3524 /* We are sure that designation sequences are all ASCII bytes. */
3525 produced_chars += dst - dst_prev;
3526 }
3527
3528 c = *charbuf++;
3529
3530 /* Now encode the character C. */
3531 if (c < 0x20 || c == 0x7F)
3532 {
3533 if (c == '\n'
3534 || (c == '\r' && EQ (eol_type, Qmac)))
3535 {
3536 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3537 ENCODE_RESET_PLANE_AND_REGISTER ();
3538 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3539 {
3540 int i;
3541
3542 for (i = 0; i < 4; i++)
3543 CODING_ISO_DESIGNATION (coding, i)
3544 = CODING_ISO_INITIAL (coding, i);
3545 }
3546 bol_designation
3547 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3548 }
3549 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3550 ENCODE_RESET_PLANE_AND_REGISTER ();
3551 EMIT_ONE_ASCII_BYTE (c);
3552 }
3553 else if (ASCII_CHAR_P (c))
3554 {
3555 if (ascii_compatible)
3556 EMIT_ONE_ASCII_BYTE (c);
3557 else
3558 {
3559 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3560 ENCODE_ISO_CHARACTER (charset, c);
3561 }
3562 }
3563 else if (CHAR_BYTE8_P (c))
3564 {
3565 c = CHAR_TO_BYTE8 (c);
3566 EMIT_ONE_BYTE (c);
3567 }
3568 else
3569 {
3570 struct charset *charset = char_charset (c, charset_list, NULL);
3571
3572 if (!charset)
3573 {
3574 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3575 {
3576 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3577 charset = CHARSET_FROM_ID (charset_ascii);
3578 }
3579 else
3580 {
3581 c = coding->default_char;
3582 charset = char_charset (c, charset_list, NULL);
3583 }
3584 }
3585 ENCODE_ISO_CHARACTER (charset, c);
3586 }
3587 }
3588
3589 if (coding->mode & CODING_MODE_LAST_BLOCK
3590 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3591 {
3592 ASSURE_DESTINATION (safe_room);
3593 ENCODE_RESET_PLANE_AND_REGISTER ();
3594 }
3595 coding->result = CODING_RESULT_SUCCESS;
3596 CODING_ISO_BOL (coding) = bol_designation;
3597 coding->produced_char += produced_chars;
3598 coding->produced = dst - coding->destination;
3599 return 0;
3600 }
3601
3602 \f
3603 /*** 8,9. SJIS and BIG5 handlers ***/
3604
3605 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3606 quite widely. So, for the moment, Emacs supports them in the bare
3607 C code. But, in the future, they may be supported only by CCL. */
3608
3609 /* SJIS is a coding system encoding three character sets: ASCII, right
3610 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3611 as is. A character of charset katakana-jisx0201 is encoded by
3612 "position-code + 0x80". A character of charset japanese-jisx0208
3613 is encoded in 2-byte but two position-codes are divided and shifted
3614 so that it fit in the range below.
3615
3616 --- CODE RANGE of SJIS ---
3617 (character set) (range)
3618 ASCII 0x00 .. 0x7F
3619 KATAKANA-JISX0201 0xA0 .. 0xDF
3620 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3621 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3622 -------------------------------
3623
3624 */
3625
3626 /* BIG5 is a coding system encoding two character sets: ASCII and
3627 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3628 character set and is encoded in two-byte.
3629
3630 --- CODE RANGE of BIG5 ---
3631 (character set) (range)
3632 ASCII 0x00 .. 0x7F
3633 Big5 (1st byte) 0xA1 .. 0xFE
3634 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3635 --------------------------
3636
3637 */
3638
3639 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3640 Check if a text is encoded in SJIS. If it is, return
3641 CATEGORY_MASK_SJIS, else return 0. */
3642
3643 static int
3644 detect_coding_sjis (coding, mask)
3645 struct coding_system *coding;
3646 int *mask;
3647 {
3648 unsigned char *src = coding->source, *src_base = src;
3649 unsigned char *src_end = coding->source + coding->src_bytes;
3650 int multibytep = coding->src_multibyte;
3651 int consumed_chars = 0;
3652 int found = 0;
3653 int c;
3654
3655 /* A coding system of this category is always ASCII compatible. */
3656 src += coding->head_ascii;
3657
3658 while (1)
3659 {
3660 ONE_MORE_BYTE (c);
3661 if (c < 0x80)
3662 continue;
3663 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3664 {
3665 ONE_MORE_BYTE (c);
3666 if (c < 0x40 || c == 0x7F || c > 0xFC)
3667 break;
3668 found = 1;
3669 }
3670 else if (c >= 0xA0 && c < 0xE0)
3671 found = 1;
3672 else
3673 break;
3674 }
3675 *mask &= ~CATEGORY_MASK_SJIS;
3676 return 0;
3677
3678 no_more_source:
3679 if (!found)
3680 return 0;
3681 *mask &= CATEGORY_MASK_SJIS;
3682 return 1;
3683 }
3684
3685 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3686 Check if a text is encoded in BIG5. If it is, return
3687 CATEGORY_MASK_BIG5, else return 0. */
3688
3689 static int
3690 detect_coding_big5 (coding, mask)
3691 struct coding_system *coding;
3692 int *mask;
3693 {
3694 unsigned char *src = coding->source, *src_base = src;
3695 unsigned char *src_end = coding->source + coding->src_bytes;
3696 int multibytep = coding->src_multibyte;
3697 int consumed_chars = 0;
3698 int found = 0;
3699 int c;
3700
3701 /* A coding system of this category is always ASCII compatible. */
3702 src += coding->head_ascii;
3703
3704 while (1)
3705 {
3706 ONE_MORE_BYTE (c);
3707 if (c < 0x80)
3708 continue;
3709 if (c >= 0xA1)
3710 {
3711 ONE_MORE_BYTE (c);
3712 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3713 return 0;
3714 found = 1;
3715 }
3716 else
3717 break;
3718 }
3719 *mask &= ~CATEGORY_MASK_BIG5;
3720 return 0;
3721
3722 no_more_source:
3723 if (!found)
3724 return 0;
3725 *mask &= CATEGORY_MASK_BIG5;
3726 return 1;
3727 }
3728
3729 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3730 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3731
3732 static void
3733 decode_coding_sjis (coding)
3734 struct coding_system *coding;
3735 {
3736 unsigned char *src = coding->source + coding->consumed;
3737 unsigned char *src_end = coding->source + coding->src_bytes;
3738 unsigned char *src_base;
3739 int *charbuf = coding->charbuf;
3740 int *charbuf_end = charbuf + coding->charbuf_size;
3741 int consumed_chars = 0, consumed_chars_base;
3742 int multibytep = coding->src_multibyte;
3743 struct charset *charset_roman, *charset_kanji, *charset_kana;
3744 Lisp_Object attrs, eol_type, charset_list, val;
3745
3746 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3747
3748 val = charset_list;
3749 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3750 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3751 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
3752
3753 while (1)
3754 {
3755 int c, c1;
3756
3757 src_base = src;
3758 consumed_chars_base = consumed_chars;
3759
3760 if (charbuf >= charbuf_end)
3761 break;
3762
3763 ONE_MORE_BYTE (c);
3764
3765 if (c == '\r')
3766 {
3767 if (EQ (eol_type, Qdos))
3768 {
3769 if (src == src_end)
3770 goto no_more_source;
3771 if (*src == '\n')
3772 ONE_MORE_BYTE (c);
3773 }
3774 else if (EQ (eol_type, Qmac))
3775 c = '\n';
3776 }
3777 else
3778 {
3779 struct charset *charset;
3780
3781 if (c < 0x80)
3782 charset = charset_roman;
3783 else
3784 {
3785 if (c >= 0xF0)
3786 goto invalid_code;
3787 if (c < 0xA0 || c >= 0xE0)
3788 {
3789 /* SJIS -> JISX0208 */
3790 ONE_MORE_BYTE (c1);
3791 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3792 goto invalid_code;
3793 c = (c << 8) | c1;
3794 SJIS_TO_JIS (c);
3795 charset = charset_kanji;
3796 }
3797 else
3798 /* SJIS -> JISX0201-Kana */
3799 charset = charset_kana;
3800 }
3801 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3802 }
3803 *charbuf++ = c;
3804 continue;
3805
3806 invalid_code:
3807 src = src_base;
3808 consumed_chars = consumed_chars_base;
3809 ONE_MORE_BYTE (c);
3810 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3811 coding->errors++;
3812 }
3813
3814 no_more_source:
3815 coding->consumed_char += consumed_chars_base;
3816 coding->consumed = src_base - coding->source;
3817 coding->charbuf_used = charbuf - coding->charbuf;
3818 }
3819
3820 static void
3821 decode_coding_big5 (coding)
3822 struct coding_system *coding;
3823 {
3824 unsigned char *src = coding->source + coding->consumed;
3825 unsigned char *src_end = coding->source + coding->src_bytes;
3826 unsigned char *src_base;
3827 int *charbuf = coding->charbuf;
3828 int *charbuf_end = charbuf + coding->charbuf_size;
3829 int consumed_chars = 0, consumed_chars_base;
3830 int multibytep = coding->src_multibyte;
3831 struct charset *charset_roman, *charset_big5;
3832 Lisp_Object attrs, eol_type, charset_list, val;
3833
3834 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3835 val = charset_list;
3836 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3837 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3838
3839 while (1)
3840 {
3841 int c, c1;
3842
3843 src_base = src;
3844 consumed_chars_base = consumed_chars;
3845
3846 if (charbuf >= charbuf_end)
3847 break;
3848
3849 ONE_MORE_BYTE (c);
3850
3851 if (c == '\r')
3852 {
3853 if (EQ (eol_type, Qdos))
3854 {
3855 if (src == src_end)
3856 goto no_more_source;
3857 if (*src == '\n')
3858 ONE_MORE_BYTE (c);
3859 }
3860 else if (EQ (eol_type, Qmac))
3861 c = '\n';
3862 }
3863 else
3864 {
3865 struct charset *charset;
3866 if (c < 0x80)
3867 charset = charset_roman;
3868 else
3869 {
3870 /* BIG5 -> Big5 */
3871 if (c < 0xA1 || c > 0xFE)
3872 goto invalid_code;
3873 ONE_MORE_BYTE (c1);
3874 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3875 goto invalid_code;
3876 c = c << 8 | c1;
3877 charset = charset_big5;
3878 }
3879 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3880 }
3881
3882 *charbuf++ = c;
3883 continue;
3884
3885 invalid_code:
3886 src = src_base;
3887 consumed_chars = consumed_chars_base;
3888 ONE_MORE_BYTE (c);
3889 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3890 coding->errors++;
3891 }
3892
3893 no_more_source:
3894 coding->consumed_char += consumed_chars_base;
3895 coding->consumed = src_base - coding->source;
3896 coding->charbuf_used = charbuf - coding->charbuf;
3897 }
3898
3899 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3900 This function can encode charsets `ascii', `katakana-jisx0201',
3901 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3902 are sure that all these charsets are registered as official charset
3903 (i.e. do not have extended leading-codes). Characters of other
3904 charsets are produced without any encoding. If SJIS_P is 1, encode
3905 SJIS text, else encode BIG5 text. */
3906
3907 static int
3908 encode_coding_sjis (coding)
3909 struct coding_system *coding;
3910 {
3911 int multibytep = coding->dst_multibyte;
3912 int *charbuf = coding->charbuf;
3913 int *charbuf_end = charbuf + coding->charbuf_used;
3914 unsigned char *dst = coding->destination + coding->produced;
3915 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3916 int safe_room = 4;
3917 int produced_chars = 0;
3918 Lisp_Object attrs, eol_type, charset_list, val;
3919 int ascii_compatible;
3920 struct charset *charset_roman, *charset_kanji, *charset_kana;
3921 int c;
3922
3923 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3924 val = charset_list;
3925 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3926 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3927 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3928
3929 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3930
3931 while (charbuf < charbuf_end)
3932 {
3933 ASSURE_DESTINATION (safe_room);
3934 c = *charbuf++;
3935 /* Now encode the character C. */
3936 if (ASCII_CHAR_P (c) && ascii_compatible)
3937 EMIT_ONE_ASCII_BYTE (c);
3938 else if (CHAR_BYTE8_P (c))
3939 {
3940 c = CHAR_TO_BYTE8 (c);
3941 EMIT_ONE_BYTE (c);
3942 }
3943 else
3944 {
3945 unsigned code;
3946 struct charset *charset = char_charset (c, charset_list, &code);
3947
3948 if (!charset)
3949 {
3950 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3951 {
3952 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3953 charset = CHARSET_FROM_ID (charset_ascii);
3954 }
3955 else
3956 {
3957 c = coding->default_char;
3958 charset = char_charset (c, charset_list, &code);
3959 }
3960 }
3961 if (code == CHARSET_INVALID_CODE (charset))
3962 abort ();
3963 if (charset == charset_kanji)
3964 {
3965 int c1, c2;
3966 JIS_TO_SJIS (code);
3967 c1 = code >> 8, c2 = code & 0xFF;
3968 EMIT_TWO_BYTES (c1, c2);
3969 }
3970 else if (charset == charset_kana)
3971 EMIT_ONE_BYTE (code | 0x80);
3972 else
3973 EMIT_ONE_ASCII_BYTE (code & 0x7F);
3974 }
3975 }
3976 coding->result = CODING_RESULT_SUCCESS;
3977 coding->produced_char += produced_chars;
3978 coding->produced = dst - coding->destination;
3979 return 0;
3980 }
3981
3982 static int
3983 encode_coding_big5 (coding)
3984 struct coding_system *coding;
3985 {
3986 int multibytep = coding->dst_multibyte;
3987 int *charbuf = coding->charbuf;
3988 int *charbuf_end = charbuf + coding->charbuf_used;
3989 unsigned char *dst = coding->destination + coding->produced;
3990 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3991 int safe_room = 4;
3992 int produced_chars = 0;
3993 Lisp_Object attrs, eol_type, charset_list, val;
3994 int ascii_compatible;
3995 struct charset *charset_roman, *charset_big5;
3996 int c;
3997
3998 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3999 val = charset_list;
4000 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4001 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4002 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4003
4004 while (charbuf < charbuf_end)
4005 {
4006 ASSURE_DESTINATION (safe_room);
4007 c = *charbuf++;
4008 /* Now encode the character C. */
4009 if (ASCII_CHAR_P (c) && ascii_compatible)
4010 EMIT_ONE_ASCII_BYTE (c);
4011 else if (CHAR_BYTE8_P (c))
4012 {
4013 c = CHAR_TO_BYTE8 (c);
4014 EMIT_ONE_BYTE (c);
4015 }
4016 else
4017 {
4018 unsigned code;
4019 struct charset *charset = char_charset (c, charset_list, &code);
4020
4021 if (! charset)
4022 {
4023 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4024 {
4025 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4026 charset = CHARSET_FROM_ID (charset_ascii);
4027 }
4028 else
4029 {
4030 c = coding->default_char;
4031 charset = char_charset (c, charset_list, &code);
4032 }
4033 }
4034 if (code == CHARSET_INVALID_CODE (charset))
4035 abort ();
4036 if (charset == charset_big5)
4037 {
4038 int c1, c2;
4039
4040 c1 = code >> 8, c2 = code & 0xFF;
4041 EMIT_TWO_BYTES (c1, c2);
4042 }
4043 else
4044 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4045 }
4046 }
4047 coding->result = CODING_RESULT_SUCCESS;
4048 coding->produced_char += produced_chars;
4049 coding->produced = dst - coding->destination;
4050 return 0;
4051 }
4052
4053 \f
4054 /*** 10. CCL handlers ***/
4055
4056 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4057 Check if a text is encoded in a coding system of which
4058 encoder/decoder are written in CCL program. If it is, return
4059 CATEGORY_MASK_CCL, else return 0. */
4060
4061 static int
4062 detect_coding_ccl (coding, mask)
4063 struct coding_system *coding;
4064 int *mask;
4065 {
4066 unsigned char *src = coding->source, *src_base = src;
4067 unsigned char *src_end = coding->source + coding->src_bytes;
4068 int multibytep = coding->src_multibyte;
4069 int consumed_chars = 0;
4070 int found = 0;
4071 unsigned char *valids = CODING_CCL_VALIDS (coding);
4072 int head_ascii = coding->head_ascii;
4073 Lisp_Object attrs;
4074
4075 coding = &coding_categories[coding_category_ccl];
4076 attrs = CODING_ID_ATTRS (coding->id);
4077 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4078 src += head_ascii;
4079
4080 while (1)
4081 {
4082 int c;
4083 ONE_MORE_BYTE (c);
4084 if (! valids[c])
4085 break;
4086 if (!found && valids[c] > 1)
4087 found = 1;
4088 }
4089 *mask &= ~CATEGORY_MASK_CCL;
4090 return 0;
4091
4092 no_more_source:
4093 if (!found)
4094 return 0;
4095 *mask &= CATEGORY_MASK_CCL;
4096 return 1;
4097 }
4098
4099 static void
4100 decode_coding_ccl (coding)
4101 struct coding_system *coding;
4102 {
4103 unsigned char *src = coding->source + coding->consumed;
4104 unsigned char *src_end = coding->source + coding->src_bytes;
4105 int *charbuf = coding->charbuf;
4106 int *charbuf_end = charbuf + coding->charbuf_size;
4107 int consumed_chars = 0;
4108 int multibytep = coding->src_multibyte;
4109 struct ccl_program ccl;
4110 int source_charbuf[1024];
4111 int source_byteidx[1024];
4112
4113 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4114
4115 while (src < src_end)
4116 {
4117 unsigned char *p = src;
4118 int *source, *source_end;
4119 int i = 0;
4120
4121 if (multibytep)
4122 while (i < 1024 && p < src_end)
4123 {
4124 source_byteidx[i] = p - src;
4125 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4126 }
4127 else
4128 while (i < 1024 && p < src_end)
4129 source_charbuf[i++] = *p++;
4130
4131 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4132 ccl.last_block = 1;
4133
4134 source = source_charbuf;
4135 source_end = source + i;
4136 while (source < source_end)
4137 {
4138 ccl_driver (&ccl, source, charbuf,
4139 source_end - source, charbuf_end - charbuf);
4140 source += ccl.consumed;
4141 charbuf += ccl.produced;
4142 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4143 break;
4144 }
4145 if (source < source_end)
4146 src += source_byteidx[source - source_charbuf];
4147 else
4148 src = p;
4149 consumed_chars += source - source_charbuf;
4150
4151 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4152 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4153 break;
4154 }
4155
4156 switch (ccl.status)
4157 {
4158 case CCL_STAT_SUSPEND_BY_SRC:
4159 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4160 break;
4161 case CCL_STAT_SUSPEND_BY_DST:
4162 break;
4163 case CCL_STAT_QUIT:
4164 case CCL_STAT_INVALID_CMD:
4165 coding->result = CODING_RESULT_INTERRUPT;
4166 break;
4167 default:
4168 coding->result = CODING_RESULT_SUCCESS;
4169 break;
4170 }
4171 coding->consumed_char += consumed_chars;
4172 coding->consumed = src - coding->source;
4173 coding->charbuf_used = charbuf - coding->charbuf;
4174 }
4175
4176 static int
4177 encode_coding_ccl (coding)
4178 struct coding_system *coding;
4179 {
4180 struct ccl_program ccl;
4181 int multibytep = coding->dst_multibyte;
4182 int *charbuf = coding->charbuf;
4183 int *charbuf_end = charbuf + coding->charbuf_used;
4184 unsigned char *dst = coding->destination + coding->produced;
4185 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4186 unsigned char *adjusted_dst_end = dst_end - 1;
4187 int destination_charbuf[1024];
4188 int i, produced_chars = 0;
4189
4190 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4191
4192 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4193 ccl.dst_multibyte = coding->dst_multibyte;
4194
4195 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4196 {
4197 int dst_bytes = dst_end - dst;
4198 if (dst_bytes > 1024)
4199 dst_bytes = 1024;
4200
4201 ccl_driver (&ccl, charbuf, destination_charbuf,
4202 charbuf_end - charbuf, dst_bytes);
4203 charbuf += ccl.consumed;
4204 if (multibytep)
4205 for (i = 0; i < ccl.produced; i++)
4206 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4207 else
4208 {
4209 for (i = 0; i < ccl.produced; i++)
4210 *dst++ = destination_charbuf[i] & 0xFF;
4211 produced_chars += ccl.produced;
4212 }
4213 }
4214
4215 switch (ccl.status)
4216 {
4217 case CCL_STAT_SUSPEND_BY_SRC:
4218 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4219 break;
4220 case CCL_STAT_SUSPEND_BY_DST:
4221 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4222 break;
4223 case CCL_STAT_QUIT:
4224 case CCL_STAT_INVALID_CMD:
4225 coding->result = CODING_RESULT_INTERRUPT;
4226 break;
4227 default:
4228 coding->result = CODING_RESULT_SUCCESS;
4229 break;
4230 }
4231
4232 coding->produced_char += produced_chars;
4233 coding->produced = dst - coding->destination;
4234 return 0;
4235 }
4236
4237
4238 \f
4239 /*** 10, 11. no-conversion handlers ***/
4240
4241 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4242
4243 static void
4244 decode_coding_raw_text (coding)
4245 struct coding_system *coding;
4246 {
4247 coding->chars_at_source = 1;
4248 coding->consumed_char = 0;
4249 coding->consumed = 0;
4250 coding->result = CODING_RESULT_SUCCESS;
4251 }
4252
4253 static int
4254 encode_coding_raw_text (coding)
4255 struct coding_system *coding;
4256 {
4257 int multibytep = coding->dst_multibyte;
4258 int *charbuf = coding->charbuf;
4259 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4260 unsigned char *dst = coding->destination + coding->produced;
4261 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4262 int produced_chars = 0;
4263 int c;
4264
4265 if (multibytep)
4266 {
4267 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4268
4269 if (coding->src_multibyte)
4270 while (charbuf < charbuf_end)
4271 {
4272 ASSURE_DESTINATION (safe_room);
4273 c = *charbuf++;
4274 if (ASCII_CHAR_P (c))
4275 EMIT_ONE_ASCII_BYTE (c);
4276 else if (CHAR_BYTE8_P (c))
4277 {
4278 c = CHAR_TO_BYTE8 (c);
4279 EMIT_ONE_BYTE (c);
4280 }
4281 else
4282 {
4283 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4284
4285 CHAR_STRING_ADVANCE (c, p1);
4286 while (p0 < p1)
4287 EMIT_ONE_BYTE (*p0);
4288 }
4289 }
4290 else
4291 while (charbuf < charbuf_end)
4292 {
4293 ASSURE_DESTINATION (safe_room);
4294 c = *charbuf++;
4295 EMIT_ONE_BYTE (c);
4296 }
4297 }
4298 else
4299 {
4300 if (coding->src_multibyte)
4301 {
4302 int safe_room = MAX_MULTIBYTE_LENGTH;
4303
4304 while (charbuf < charbuf_end)
4305 {
4306 ASSURE_DESTINATION (safe_room);
4307 c = *charbuf++;
4308 if (ASCII_CHAR_P (c))
4309 *dst++ = c;
4310 else if (CHAR_BYTE8_P (c))
4311 *dst++ = CHAR_TO_BYTE8 (c);
4312 else
4313 CHAR_STRING_ADVANCE (c, dst);
4314 produced_chars++;
4315 }
4316 }
4317 else
4318 {
4319 ASSURE_DESTINATION (charbuf_end - charbuf);
4320 while (charbuf < charbuf_end && dst < dst_end)
4321 *dst++ = *charbuf++;
4322 produced_chars = dst - (coding->destination + coding->dst_bytes);
4323 }
4324 }
4325 coding->result = CODING_RESULT_SUCCESS;
4326 coding->produced_char += produced_chars;
4327 coding->produced = dst - coding->destination;
4328 return 0;
4329 }
4330
4331 static int
4332 detect_coding_charset (coding, mask)
4333 struct coding_system *coding;
4334 int *mask;
4335 {
4336 unsigned char *src = coding->source, *src_base = src;
4337 unsigned char *src_end = coding->source + coding->src_bytes;
4338 int multibytep = coding->src_multibyte;
4339 int consumed_chars = 0;
4340 Lisp_Object attrs, valids;
4341
4342 coding = &coding_categories[coding_category_charset];
4343 attrs = CODING_ID_ATTRS (coding->id);
4344 valids = AREF (attrs, coding_attr_charset_valids);
4345
4346 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4347 src += coding->head_ascii;
4348
4349 while (1)
4350 {
4351 int c;
4352
4353 ONE_MORE_BYTE (c);
4354 if (NILP (AREF (valids, c)))
4355 break;
4356 }
4357 *mask &= ~CATEGORY_MASK_CHARSET;
4358 return 0;
4359
4360 no_more_source:
4361 *mask &= CATEGORY_MASK_CHARSET;
4362 return 1;
4363 }
4364
4365 static void
4366 decode_coding_charset (coding)
4367 struct coding_system *coding;
4368 {
4369 unsigned char *src = coding->source + coding->consumed;
4370 unsigned char *src_end = coding->source + coding->src_bytes;
4371 unsigned char *src_base;
4372 int *charbuf = coding->charbuf;
4373 int *charbuf_end = charbuf + coding->charbuf_size;
4374 int consumed_chars = 0, consumed_chars_base;
4375 int multibytep = coding->src_multibyte;
4376 Lisp_Object attrs, eol_type, charset_list, valids;
4377
4378 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4379 valids = AREF (attrs, coding_attr_charset_valids);
4380
4381 while (1)
4382 {
4383 int c;
4384
4385 src_base = src;
4386 consumed_chars_base = consumed_chars;
4387
4388 if (charbuf >= charbuf_end)
4389 break;
4390
4391 ONE_MORE_BYTE (c);
4392 if (c == '\r')
4393 {
4394 /* Here we assume that no charset maps '\r' to something
4395 else. */
4396 if (EQ (eol_type, Qdos))
4397 {
4398 if (src < src_end
4399 && *src == '\n')
4400 ONE_MORE_BYTE (c);
4401 }
4402 else if (EQ (eol_type, Qmac))
4403 c = '\n';
4404 }
4405 else
4406 {
4407 Lisp_Object val;
4408 struct charset *charset;
4409 int dim;
4410 int len = 1;
4411 unsigned code = c;
4412
4413 val = AREF (valids, c);
4414 if (NILP (val))
4415 goto invalid_code;
4416 if (INTEGERP (val))
4417 {
4418 charset = CHARSET_FROM_ID (XFASTINT (val));
4419 dim = CHARSET_DIMENSION (charset);
4420 while (len < dim)
4421 {
4422 ONE_MORE_BYTE (c);
4423 code = (code << 8) | c;
4424 len++;
4425 }
4426 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4427 charset, code, c);
4428 }
4429 else
4430 {
4431 /* VAL is a list of charset IDs. It is assured that the
4432 list is sorted by charset dimensions (smaller one
4433 comes first). */
4434 while (CONSP (val))
4435 {
4436 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4437 dim = CHARSET_DIMENSION (charset);
4438 while (len < dim)
4439 {
4440 ONE_MORE_BYTE (c);
4441 code = (code << 8) | c;
4442 len++;
4443 }
4444 CODING_DECODE_CHAR (coding, src, src_base,
4445 src_end, charset, code, c);
4446 if (c >= 0)
4447 break;
4448 val = XCDR (val);
4449 }
4450 }
4451 if (c < 0)
4452 goto invalid_code;
4453 }
4454 *charbuf++ = c;
4455 continue;
4456
4457 invalid_code:
4458 src = src_base;
4459 consumed_chars = consumed_chars_base;
4460 ONE_MORE_BYTE (c);
4461 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4462 coding->errors++;
4463 }
4464
4465 no_more_source:
4466 coding->consumed_char += consumed_chars_base;
4467 coding->consumed = src_base - coding->source;
4468 coding->charbuf_used = charbuf - coding->charbuf;
4469 }
4470
4471 static int
4472 encode_coding_charset (coding)
4473 struct coding_system *coding;
4474 {
4475 int multibytep = coding->dst_multibyte;
4476 int *charbuf = coding->charbuf;
4477 int *charbuf_end = charbuf + coding->charbuf_used;
4478 unsigned char *dst = coding->destination + coding->produced;
4479 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4480 int safe_room = MAX_MULTIBYTE_LENGTH;
4481 int produced_chars = 0;
4482 Lisp_Object attrs, eol_type, charset_list;
4483 int ascii_compatible;
4484 int c;
4485
4486 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4487 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4488
4489 while (charbuf < charbuf_end)
4490 {
4491 struct charset *charset;
4492 unsigned code;
4493
4494 ASSURE_DESTINATION (safe_room);
4495 c = *charbuf++;
4496 if (ascii_compatible && ASCII_CHAR_P (c))
4497 EMIT_ONE_ASCII_BYTE (c);
4498 else if (CHAR_BYTE8_P (c))
4499 {
4500 c = CHAR_TO_BYTE8 (c);
4501 EMIT_ONE_BYTE (c);
4502 }
4503 else
4504 {
4505 charset = char_charset (c, charset_list, &code);
4506 if (charset)
4507 {
4508 if (CHARSET_DIMENSION (charset) == 1)
4509 EMIT_ONE_BYTE (code);
4510 else if (CHARSET_DIMENSION (charset) == 2)
4511 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4512 else if (CHARSET_DIMENSION (charset) == 3)
4513 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4514 else
4515 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4516 (code >> 8) & 0xFF, code & 0xFF);
4517 }
4518 else
4519 {
4520 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4521 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4522 else
4523 c = coding->default_char;
4524 EMIT_ONE_BYTE (c);
4525 }
4526 }
4527 }
4528
4529 coding->result = CODING_RESULT_SUCCESS;
4530 coding->produced_char += produced_chars;
4531 coding->produced = dst - coding->destination;
4532 return 0;
4533 }
4534
4535 \f
4536 /*** 7. C library functions ***/
4537
4538 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4539 has a property `coding-system'. The value of this property is a
4540 vector of length 5 (called as coding-vector). Among elements of
4541 this vector, the first (element[0]) and the fifth (element[4])
4542 carry important information for decoding/encoding. Before
4543 decoding/encoding, this information should be set in fields of a
4544 structure of type `coding_system'.
4545
4546 A value of property `coding-system' can be a symbol of another
4547 subsidiary coding-system. In that case, Emacs gets coding-vector
4548 from that symbol.
4549
4550 `element[0]' contains information to be set in `coding->type'. The
4551 value and its meaning is as follows:
4552
4553 0 -- coding_type_emacs_mule
4554 1 -- coding_type_sjis
4555 2 -- coding_type_iso_2022
4556 3 -- coding_type_big5
4557 4 -- coding_type_ccl encoder/decoder written in CCL
4558 nil -- coding_type_no_conversion
4559 t -- coding_type_undecided (automatic conversion on decoding,
4560 no-conversion on encoding)
4561
4562 `element[4]' contains information to be set in `coding->flags' and
4563 `coding->spec'. The meaning varies by `coding->type'.
4564
4565 If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4566 of length 32 (of which the first 13 sub-elements are used now).
4567 Meanings of these sub-elements are:
4568
4569 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4570 If the value is an integer of valid charset, the charset is
4571 assumed to be designated to graphic register N initially.
4572
4573 If the value is minus, it is a minus value of charset which
4574 reserves graphic register N, which means that the charset is
4575 not designated initially but should be designated to graphic
4576 register N just before encoding a character in that charset.
4577
4578 If the value is nil, graphic register N is never used on
4579 encoding.
4580
4581 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4582 Each value takes t or nil. See the section ISO2022 of
4583 `coding.h' for more information.
4584
4585 If `coding->type' is `coding_type_big5', element[4] is t to denote
4586 BIG5-ETen or nil to denote BIG5-HKU.
4587
4588 If `coding->type' takes the other value, element[4] is ignored.
4589
4590 Emacs Lisp's coding system also carries information about format of
4591 end-of-line in a value of property `eol-type'. If the value is
4592 integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If
4593 it is not integer, it should be a vector of subsidiary coding
4594 systems of which property `eol-type' has one of above values.
4595
4596 */
4597
4598 /* Setup coding context CODING from information about CODING_SYSTEM.
4599 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4600 CODING_SYSTEM is invalid, signal an error. */
4601
4602 void
4603 setup_coding_system (coding_system, coding)
4604 Lisp_Object coding_system;
4605 struct coding_system *coding;
4606 {
4607 Lisp_Object attrs;
4608 Lisp_Object eol_type;
4609 Lisp_Object coding_type;
4610 Lisp_Object val;
4611
4612 if (NILP (coding_system))
4613 coding_system = Qno_conversion;
4614
4615 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4616
4617 attrs = CODING_ID_ATTRS (coding->id);
4618 eol_type = CODING_ID_EOL_TYPE (coding->id);
4619
4620 coding->mode = 0;
4621 coding->head_ascii = -1;
4622 coding->common_flags
4623 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4624
4625 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4626 coding->max_charset_id = XSTRING (val)->size - 1;
4627 coding->safe_charsets = (char *) XSTRING (val)->data;
4628 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4629
4630 coding_type = CODING_ATTR_TYPE (attrs);
4631 if (EQ (coding_type, Qundecided))
4632 {
4633 coding->detector = NULL;
4634 coding->decoder = decode_coding_raw_text;
4635 coding->encoder = encode_coding_raw_text;
4636 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4637 }
4638 else if (EQ (coding_type, Qiso_2022))
4639 {
4640 int i;
4641 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4642
4643 /* Invoke graphic register 0 to plane 0. */
4644 CODING_ISO_INVOCATION (coding, 0) = 0;
4645 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4646 CODING_ISO_INVOCATION (coding, 1)
4647 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4648 /* Setup the initial status of designation. */
4649 for (i = 0; i < 4; i++)
4650 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4651 /* Not single shifting initially. */
4652 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4653 /* Beginning of buffer should also be regarded as bol. */
4654 CODING_ISO_BOL (coding) = 1;
4655 coding->detector = detect_coding_iso_2022;
4656 coding->decoder = decode_coding_iso_2022;
4657 coding->encoder = encode_coding_iso_2022;
4658 if (flags & CODING_ISO_FLAG_SAFE)
4659 coding->mode |= CODING_MODE_SAFE_ENCODING;
4660 coding->common_flags
4661 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4662 | CODING_REQUIRE_FLUSHING_MASK);
4663 if (flags & CODING_ISO_FLAG_COMPOSITION)
4664 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4665 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4666 {
4667 setup_iso_safe_charsets (attrs);
4668 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4669 coding->max_charset_id = XSTRING (val)->size - 1;
4670 coding->safe_charsets = (char *) XSTRING (val)->data;
4671 }
4672 CODING_ISO_FLAGS (coding) = flags;
4673 }
4674 else if (EQ (coding_type, Qcharset))
4675 {
4676 coding->detector = detect_coding_charset;
4677 coding->decoder = decode_coding_charset;
4678 coding->encoder = encode_coding_charset;
4679 coding->common_flags
4680 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4681 }
4682 else if (EQ (coding_type, Qutf_8))
4683 {
4684 coding->detector = detect_coding_utf_8;
4685 coding->decoder = decode_coding_utf_8;
4686 coding->encoder = encode_coding_utf_8;
4687 coding->common_flags
4688 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4689 }
4690 else if (EQ (coding_type, Qutf_16))
4691 {
4692 val = AREF (attrs, coding_attr_utf_16_bom);
4693 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4694 : EQ (val, Qt) ? utf_16_with_bom
4695 : utf_16_without_bom);
4696 val = AREF (attrs, coding_attr_utf_16_endian);
4697 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4698 : utf_16_little_endian);
4699 CODING_UTF_16_SURROGATE (coding) = 0;
4700 coding->detector = detect_coding_utf_16;
4701 coding->decoder = decode_coding_utf_16;
4702 coding->encoder = encode_coding_utf_16;
4703 coding->common_flags
4704 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4705 }
4706 else if (EQ (coding_type, Qccl))
4707 {
4708 coding->detector = detect_coding_ccl;
4709 coding->decoder = decode_coding_ccl;
4710 coding->encoder = encode_coding_ccl;
4711 coding->common_flags
4712 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4713 | CODING_REQUIRE_FLUSHING_MASK);
4714 }
4715 else if (EQ (coding_type, Qemacs_mule))
4716 {
4717 coding->detector = detect_coding_emacs_mule;
4718 coding->decoder = decode_coding_emacs_mule;
4719 coding->encoder = encode_coding_emacs_mule;
4720 coding->common_flags
4721 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4722 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4723 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4724 {
4725 Lisp_Object tail, safe_charsets;
4726 int max_charset_id = 0;
4727
4728 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4729 tail = XCDR (tail))
4730 if (max_charset_id < XFASTINT (XCAR (tail)))
4731 max_charset_id = XFASTINT (XCAR (tail));
4732 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4733 make_number (255));
4734 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4735 tail = XCDR (tail))
4736 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4737 coding->max_charset_id = max_charset_id;
4738 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4739 }
4740 }
4741 else if (EQ (coding_type, Qshift_jis))
4742 {
4743 coding->detector = detect_coding_sjis;
4744 coding->decoder = decode_coding_sjis;
4745 coding->encoder = encode_coding_sjis;
4746 coding->common_flags
4747 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4748 }
4749 else if (EQ (coding_type, Qbig5))
4750 {
4751 coding->detector = detect_coding_big5;
4752 coding->decoder = decode_coding_big5;
4753 coding->encoder = encode_coding_big5;
4754 coding->common_flags
4755 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4756 }
4757 else /* EQ (coding_type, Qraw_text) */
4758 {
4759 coding->detector = NULL;
4760 coding->decoder = decode_coding_raw_text;
4761 coding->encoder = encode_coding_raw_text;
4762 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4763 }
4764
4765 return;
4766 }
4767
4768 /* Return raw-text or one of its subsidiaries that has the same
4769 eol_type as CODING-SYSTEM. */
4770
4771 Lisp_Object
4772 raw_text_coding_system (coding_system)
4773 Lisp_Object coding_system;
4774 {
4775 Lisp_Object spec, attrs;
4776 Lisp_Object eol_type, raw_text_eol_type;
4777
4778 spec = CODING_SYSTEM_SPEC (coding_system);
4779 attrs = AREF (spec, 0);
4780
4781 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4782 return coding_system;
4783
4784 eol_type = AREF (spec, 2);
4785 if (VECTORP (eol_type))
4786 return Qraw_text;
4787 spec = CODING_SYSTEM_SPEC (Qraw_text);
4788 raw_text_eol_type = AREF (spec, 2);
4789 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4790 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4791 : AREF (raw_text_eol_type, 2));
4792 }
4793
4794
4795 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4796 does, return one of the subsidiary that has the same eol-spec as
4797 PARENT. Otherwise, return CODING_SYSTEM. */
4798
4799 Lisp_Object
4800 coding_inherit_eol_type (coding_system, parent)
4801 Lisp_Object coding_system, parent;
4802 {
4803 Lisp_Object spec, attrs, eol_type;
4804
4805 spec = CODING_SYSTEM_SPEC (coding_system);
4806 attrs = AREF (spec, 0);
4807 eol_type = AREF (spec, 2);
4808 if (VECTORP (eol_type))
4809 {
4810 Lisp_Object parent_spec;
4811 Lisp_Object parent_eol_type;
4812
4813 parent_spec
4814 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4815 parent_eol_type = AREF (parent_spec, 2);
4816 if (EQ (parent_eol_type, Qunix))
4817 coding_system = AREF (eol_type, 0);
4818 else if (EQ (parent_eol_type, Qdos))
4819 coding_system = AREF (eol_type, 1);
4820 else if (EQ (parent_eol_type, Qmac))
4821 coding_system = AREF (eol_type, 2);
4822 }
4823 return coding_system;
4824 }
4825
4826 /* Emacs has a mechanism to automatically detect a coding system if it
4827 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4828 it's impossible to distinguish some coding systems accurately
4829 because they use the same range of codes. So, at first, coding
4830 systems are categorized into 7, those are:
4831
4832 o coding-category-emacs-mule
4833
4834 The category for a coding system which has the same code range
4835 as Emacs' internal format. Assigned the coding-system (Lisp
4836 symbol) `emacs-mule' by default.
4837
4838 o coding-category-sjis
4839
4840 The category for a coding system which has the same code range
4841 as SJIS. Assigned the coding-system (Lisp
4842 symbol) `japanese-shift-jis' by default.
4843
4844 o coding-category-iso-7
4845
4846 The category for a coding system which has the same code range
4847 as ISO2022 of 7-bit environment. This doesn't use any locking
4848 shift and single shift functions. This can encode/decode all
4849 charsets. Assigned the coding-system (Lisp symbol)
4850 `iso-2022-7bit' by default.
4851
4852 o coding-category-iso-7-tight
4853
4854 Same as coding-category-iso-7 except that this can
4855 encode/decode only the specified charsets.
4856
4857 o coding-category-iso-8-1
4858
4859 The category for a coding system which has the same code range
4860 as ISO2022 of 8-bit environment and graphic plane 1 used only
4861 for DIMENSION1 charset. This doesn't use any locking shift
4862 and single shift functions. Assigned the coding-system (Lisp
4863 symbol) `iso-latin-1' by default.
4864
4865 o coding-category-iso-8-2
4866
4867 The category for a coding system which has the same code range
4868 as ISO2022 of 8-bit environment and graphic plane 1 used only
4869 for DIMENSION2 charset. This doesn't use any locking shift
4870 and single shift functions. Assigned the coding-system (Lisp
4871 symbol) `japanese-iso-8bit' by default.
4872
4873 o coding-category-iso-7-else
4874
4875 The category for a coding system which has the same code range
4876 as ISO2022 of 7-bit environemnt but uses locking shift or
4877 single shift functions. Assigned the coding-system (Lisp
4878 symbol) `iso-2022-7bit-lock' by default.
4879
4880 o coding-category-iso-8-else
4881
4882 The category for a coding system which has the same code range
4883 as ISO2022 of 8-bit environemnt but uses locking shift or
4884 single shift functions. Assigned the coding-system (Lisp
4885 symbol) `iso-2022-8bit-ss2' by default.
4886
4887 o coding-category-big5
4888
4889 The category for a coding system which has the same code range
4890 as BIG5. Assigned the coding-system (Lisp symbol)
4891 `cn-big5' by default.
4892
4893 o coding-category-utf-8
4894
4895 The category for a coding system which has the same code range
4896 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4897 symbol) `utf-8' by default.
4898
4899 o coding-category-utf-16-be
4900
4901 The category for a coding system in which a text has an
4902 Unicode signature (cf. Unicode Standard) in the order of BIG
4903 endian at the head. Assigned the coding-system (Lisp symbol)
4904 `utf-16-be' by default.
4905
4906 o coding-category-utf-16-le
4907
4908 The category for a coding system in which a text has an
4909 Unicode signature (cf. Unicode Standard) in the order of
4910 LITTLE endian at the head. Assigned the coding-system (Lisp
4911 symbol) `utf-16-le' by default.
4912
4913 o coding-category-ccl
4914
4915 The category for a coding system of which encoder/decoder is
4916 written in CCL programs. The default value is nil, i.e., no
4917 coding system is assigned.
4918
4919 o coding-category-binary
4920
4921 The category for a coding system not categorized in any of the
4922 above. Assigned the coding-system (Lisp symbol)
4923 `no-conversion' by default.
4924
4925 Each of them is a Lisp symbol and the value is an actual
4926 `coding-system's (this is also a Lisp symbol) assigned by a user.
4927 What Emacs does actually is to detect a category of coding system.
4928 Then, it uses a `coding-system' assigned to it. If Emacs can't
4929 decide only one possible category, it selects a category of the
4930 highest priority. Priorities of categories are also specified by a
4931 user in a Lisp variable `coding-category-list'.
4932
4933 */
4934
4935 #define EOL_SEEN_NONE 0
4936 #define EOL_SEEN_LF 1
4937 #define EOL_SEEN_CR 2
4938 #define EOL_SEEN_CRLF 4
4939
4940 /* Detect how end-of-line of a text of length CODING->src_bytes
4941 pointed by CODING->source is encoded. Return one of
4942 EOL_SEEN_XXX. */
4943
4944 #define MAX_EOL_CHECK_COUNT 3
4945
4946 static int
4947 detect_eol (coding, source, src_bytes)
4948 struct coding_system *coding;
4949 unsigned char *source;
4950 EMACS_INT src_bytes;
4951 {
4952 Lisp_Object attrs, coding_type;
4953 unsigned char *src = source, *src_end = src + src_bytes;
4954 unsigned char c;
4955 int total = 0;
4956 int eol_seen = EOL_SEEN_NONE;
4957
4958 attrs = CODING_ID_ATTRS (coding->id);
4959 coding_type = CODING_ATTR_TYPE (attrs);
4960
4961 if (EQ (coding_type, Qccl))
4962 {
4963 int msb, lsb;
4964
4965 msb = coding->spec.utf_16.endian == utf_16_little_endian;
4966 lsb = 1 - msb;
4967
4968 while (src + 1 < src_end)
4969 {
4970 c = src[lsb];
4971 if (src[msb] == 0 && (c == '\n' || c == '\r'))
4972 {
4973 int this_eol;
4974
4975 if (c == '\n')
4976 this_eol = EOL_SEEN_LF;
4977 else if (src + 3 >= src_end
4978 || src[msb + 2] != 0
4979 || src[lsb + 2] != '\n')
4980 this_eol = EOL_SEEN_CR;
4981 else
4982 this_eol = EOL_SEEN_CRLF;
4983
4984 if (eol_seen == EOL_SEEN_NONE)
4985 /* This is the first end-of-line. */
4986 eol_seen = this_eol;
4987 else if (eol_seen != this_eol)
4988 {
4989 /* The found type is different from what found before. */
4990 eol_seen = EOL_SEEN_LF;
4991 break;
4992 }
4993 if (++total == MAX_EOL_CHECK_COUNT)
4994 break;
4995 }
4996 src += 2;
4997 }
4998 }
4999 else
5000 {
5001 while (src < src_end)
5002 {
5003 c = *src++;
5004 if (c == '\n' || c == '\r')
5005 {
5006 int this_eol;
5007
5008 if (c == '\n')
5009 this_eol = EOL_SEEN_LF;
5010 else if (src >= src_end || *src != '\n')
5011 this_eol = EOL_SEEN_CR;
5012 else
5013 this_eol = EOL_SEEN_CRLF, src++;
5014
5015 if (eol_seen == EOL_SEEN_NONE)
5016 /* This is the first end-of-line. */
5017 eol_seen = this_eol;
5018 else if (eol_seen != this_eol)
5019 {
5020 /* The found type is different from what found before. */
5021 eol_seen = EOL_SEEN_LF;
5022 break;
5023 }
5024 if (++total == MAX_EOL_CHECK_COUNT)
5025 break;
5026 }
5027 }
5028 }
5029 return eol_seen;
5030 }
5031
5032
5033 static void
5034 adjust_coding_eol_type (coding, eol_seen)
5035 struct coding_system *coding;
5036 int eol_seen;
5037 {
5038 Lisp_Object eol_type;
5039
5040 eol_type = CODING_ID_EOL_TYPE (coding->id);
5041 if (eol_seen & EOL_SEEN_LF)
5042 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5043 else if (eol_type & EOL_SEEN_CRLF)
5044 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5045 else if (eol_type & EOL_SEEN_CR)
5046 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5047 }
5048
5049 /* Detect how a text specified in CODING is encoded. If a coding
5050 system is detected, update fields of CODING by the detected coding
5051 system. */
5052
5053 void
5054 detect_coding (coding)
5055 struct coding_system *coding;
5056 {
5057 unsigned char *src, *src_end;
5058 Lisp_Object attrs, coding_type;
5059
5060 coding->consumed = coding->consumed_char = 0;
5061 coding->produced = coding->produced_char = 0;
5062 coding_set_source (coding);
5063
5064 src_end = coding->source + coding->src_bytes;
5065
5066 /* If we have not yet decided the text encoding type, detect it
5067 now. */
5068 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5069 {
5070 int mask = CATEGORY_MASK_ANY;
5071 int c, i;
5072
5073 for (src = coding->source; src < src_end; src++)
5074 {
5075 c = *src;
5076 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5077 || c == ISO_CODE_SI
5078 || c == ISO_CODE_SO)))
5079 break;
5080 }
5081 coding->head_ascii = src - (coding->source + coding->consumed);
5082
5083 if (coding->head_ascii < coding->src_bytes)
5084 {
5085 int detected = 0;
5086
5087 for (i = 0; i < coding_category_raw_text; i++)
5088 {
5089 enum coding_category category = coding_priorities[i];
5090 struct coding_system *this = coding_categories + category;
5091
5092 if (category >= coding_category_raw_text
5093 || detected & (1 << category))
5094 continue;
5095
5096 if (this->id < 0)
5097 {
5098 /* No coding system of this category is defined. */
5099 mask &= ~(1 << category);
5100 }
5101 else
5102 {
5103 detected |= detected_mask[category];
5104 if ((*(this->detector)) (coding, &mask))
5105 break;
5106 }
5107 }
5108 if (! mask)
5109 setup_coding_system (Qraw_text, coding);
5110 else if (mask != CATEGORY_MASK_ANY)
5111 for (i = 0; i < coding_category_raw_text; i++)
5112 {
5113 enum coding_category category = coding_priorities[i];
5114 struct coding_system *this = coding_categories + category;
5115
5116 if (mask & (1 << category))
5117 {
5118 setup_coding_system (CODING_ID_NAME (this->id), coding);
5119 break;
5120 }
5121 }
5122 }
5123 }
5124
5125 attrs = CODING_ID_ATTRS (coding->id);
5126 coding_type = CODING_ATTR_TYPE (attrs);
5127
5128 /* If we have not yet decided the EOL type, detect it now. But, the
5129 detection is impossible for a CCL based coding system, in which
5130 case, we detct the EOL type after decoding. */
5131 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5132 && ! EQ (coding_type, Qccl))
5133 {
5134 int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
5135
5136 if (eol_seen != EOL_SEEN_NONE)
5137 adjust_coding_eol_type (coding, eol_seen);
5138 }
5139 }
5140
5141
5142 static void
5143 decode_eol (coding)
5144 struct coding_system *coding;
5145 {
5146 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5147 {
5148 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5149 unsigned char *pend = p + coding->produced;
5150 int eol_seen = EOL_SEEN_NONE;
5151
5152 for (; p < pend; p++)
5153 {
5154 if (*p == '\n')
5155 eol_seen |= EOL_SEEN_LF;
5156 else if (*p == '\r')
5157 {
5158 if (p + 1 < pend && *(p + 1) == '\n')
5159 {
5160 eol_seen |= EOL_SEEN_CRLF;
5161 p++;
5162 }
5163 else
5164 eol_seen |= EOL_SEEN_CR;
5165 }
5166 }
5167 if (eol_seen != EOL_SEEN_NONE)
5168 adjust_coding_eol_type (coding, eol_seen);
5169 }
5170
5171 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5172 {
5173 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5174 unsigned char *pend = p + coding->produced;
5175
5176 for (; p < pend; p++)
5177 if (*p == '\r')
5178 *p = '\n';
5179 }
5180 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5181 {
5182 unsigned char *p, *pbeg, *pend;
5183 Lisp_Object undo_list;
5184
5185 move_gap_both (coding->dst_pos + coding->produced_char,
5186 coding->dst_pos_byte + coding->produced);
5187 undo_list = current_buffer->undo_list;
5188 current_buffer->undo_list = Qt;
5189 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5190 current_buffer->undo_list = undo_list;
5191 pbeg = GPT_ADDR;
5192 pend = pbeg + coding->produced;
5193
5194 for (p = pend - 1; p >= pbeg; p--)
5195 if (*p == '\r')
5196 {
5197 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5198 pend--;
5199 }
5200 coding->produced_char -= coding->produced - (pend - pbeg);
5201 coding->produced = pend - pbeg;
5202 insert_from_gap (coding->produced_char, coding->produced);
5203 }
5204 }
5205
5206 static void
5207 translate_chars (coding, table)
5208 struct coding_system *coding;
5209 Lisp_Object table;
5210 {
5211 int *charbuf = coding->charbuf;
5212 int *charbuf_end = charbuf + coding->charbuf_used;
5213 int c;
5214
5215 if (coding->chars_at_source)
5216 return;
5217
5218 while (charbuf < charbuf_end)
5219 {
5220 c = *charbuf;
5221 if (c < 0)
5222 charbuf += c;
5223 else
5224 *charbuf++ = translate_char (table, c);
5225 }
5226 }
5227
5228 static int
5229 produce_chars (coding)
5230 struct coding_system *coding;
5231 {
5232 unsigned char *dst = coding->destination + coding->produced;
5233 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5234 int produced;
5235 int produced_chars = 0;
5236
5237 if (! coding->chars_at_source)
5238 {
5239 /* Characters are in coding->charbuf. */
5240 int *buf = coding->charbuf;
5241 int *buf_end = buf + coding->charbuf_used;
5242 unsigned char *adjusted_dst_end;
5243
5244 if (BUFFERP (coding->src_object)
5245 && EQ (coding->src_object, coding->dst_object))
5246 dst_end = coding->source + coding->consumed;
5247 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5248
5249 while (buf < buf_end)
5250 {
5251 int c = *buf++;
5252
5253 if (dst >= adjusted_dst_end)
5254 {
5255 dst = alloc_destination (coding,
5256 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5257 dst);
5258 dst_end = coding->destination + coding->dst_bytes;
5259 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5260 }
5261 if (c >= 0)
5262 {
5263 if (coding->dst_multibyte
5264 || ! CHAR_BYTE8_P (c))
5265 CHAR_STRING_ADVANCE (c, dst);
5266 else
5267 *dst++ = CHAR_TO_BYTE8 (c);
5268 produced_chars++;
5269 }
5270 else
5271 /* This is an annotation data. */
5272 buf -= c + 1;
5273 }
5274 }
5275 else
5276 {
5277 unsigned char *src = coding->source;
5278 unsigned char *src_end = src + coding->src_bytes;
5279 Lisp_Object eol_type;
5280
5281 eol_type = CODING_ID_EOL_TYPE (coding->id);
5282
5283 if (coding->src_multibyte != coding->dst_multibyte)
5284 {
5285 if (coding->src_multibyte)
5286 {
5287 int multibytep = 1;
5288 int consumed_chars;
5289
5290 while (1)
5291 {
5292 unsigned char *src_base = src;
5293 int c;
5294
5295 ONE_MORE_BYTE (c);
5296 if (c == '\r')
5297 {
5298 if (EQ (eol_type, Qdos))
5299 {
5300 if (src < src_end
5301 && *src == '\n')
5302 c = *src++;
5303 }
5304 else if (EQ (eol_type, Qmac))
5305 c = '\n';
5306 }
5307 if (dst == dst_end)
5308 {
5309 coding->consumed = src - coding->source;
5310
5311 if (EQ (coding->src_object, coding->dst_object))
5312 dst_end = src;
5313 if (dst == dst_end)
5314 {
5315 dst = alloc_destination (coding, src_end - src + 1,
5316 dst);
5317 dst_end = coding->destination + coding->dst_bytes;
5318 coding_set_source (coding);
5319 src = coding->source + coding->consumed;
5320 src_end = coding->source + coding->src_bytes;
5321 }
5322 }
5323 *dst++ = c;
5324 produced_chars++;
5325 }
5326 no_more_source:
5327 ;
5328 }
5329 else
5330 while (src < src_end)
5331 {
5332 int multibytep = 1;
5333 int c = *src++;
5334
5335 if (c == '\r')
5336 {
5337 if (EQ (eol_type, Qdos))
5338 {
5339 if (src < src_end
5340 && *src == '\n')
5341 c = *src++;
5342 }
5343 else if (EQ (eol_type, Qmac))
5344 c = '\n';
5345 }
5346 if (dst >= dst_end - 1)
5347 {
5348 coding->consumed = src - coding->source;
5349
5350 if (EQ (coding->src_object, coding->dst_object))
5351 dst_end = src;
5352 if (dst >= dst_end - 1)
5353 {
5354 dst = alloc_destination (coding, src_end - src + 2,
5355 dst);
5356 dst_end = coding->destination + coding->dst_bytes;
5357 coding_set_source (coding);
5358 src = coding->source + coding->consumed;
5359 src_end = coding->source + coding->src_bytes;
5360 }
5361 }
5362 EMIT_ONE_BYTE (c);
5363 }
5364 }
5365 else
5366 {
5367 if (!EQ (coding->src_object, coding->dst_object))
5368 {
5369 int require = coding->src_bytes - coding->dst_bytes;
5370
5371 if (require > 0)
5372 {
5373 EMACS_INT offset = src - coding->source;
5374
5375 dst = alloc_destination (coding, require, dst);
5376 coding_set_source (coding);
5377 src = coding->source + offset;
5378 src_end = coding->source + coding->src_bytes;
5379 }
5380 }
5381 produced_chars = coding->src_chars;
5382 while (src < src_end)
5383 {
5384 int c = *src++;
5385
5386 if (c == '\r')
5387 {
5388 if (EQ (eol_type, Qdos))
5389 {
5390 if (src < src_end
5391 && *src == '\n')
5392 c = *src++;
5393 produced_chars--;
5394 }
5395 else if (EQ (eol_type, Qmac))
5396 c = '\n';
5397 }
5398 *dst++ = c;
5399 }
5400 }
5401 coding->consumed = coding->src_bytes;
5402 coding->consumed_char = coding->src_chars;
5403 }
5404
5405 produced = dst - (coding->destination + coding->produced);
5406 if (BUFFERP (coding->dst_object))
5407 insert_from_gap (produced_chars, produced);
5408 coding->produced += produced;
5409 coding->produced_char += produced_chars;
5410 return produced_chars;
5411 }
5412
5413 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5414 or
5415 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5416 */
5417
5418 static INLINE void
5419 produce_composition (coding, charbuf)
5420 struct coding_system *coding;
5421 int *charbuf;
5422 {
5423 Lisp_Object buffer;
5424 int len;
5425 EMACS_INT pos;
5426 enum composition_method method;
5427 int cmp_len;
5428 Lisp_Object components;
5429
5430 buffer = coding->dst_object;
5431 len = -charbuf[0];
5432 pos = coding->dst_pos + charbuf[1];
5433 method = (enum composition_method) (charbuf[3]);
5434 cmp_len = charbuf[4];
5435
5436 if (method == COMPOSITION_RELATIVE)
5437 components = Qnil;
5438 else
5439 {
5440 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5441 int i;
5442
5443 len -= 5;
5444 charbuf += 5;
5445 for (i = 0; i < len; i++)
5446 args[i] = make_number (charbuf[i]);
5447 components = (method == COMPOSITION_WITH_ALTCHARS
5448 ? Fstring (len, args) : Fvector (len, args));
5449 }
5450 compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5451 }
5452
5453 static int *
5454 save_composition_data (buf, buf_end, prop)
5455 int *buf, *buf_end;
5456 Lisp_Object prop;
5457 {
5458 enum composition_method method = COMPOSITION_METHOD (prop);
5459 int cmp_len = COMPOSITION_LENGTH (prop);
5460
5461 if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5462 return NULL;
5463
5464 buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5465 buf[2] = method;
5466 buf[3] = cmp_len;
5467
5468 if (method == COMPOSITION_RELATIVE)
5469 buf[0] = 4;
5470 else
5471 {
5472 Lisp_Object components;
5473 int len, i;
5474
5475 components = COMPOSITION_COMPONENTS (prop);
5476 if (VECTORP (components))
5477 {
5478 len = XVECTOR (components)->size;
5479 for (i = 0; i < len; i++)
5480 buf[4 + i] = XINT (AREF (components, i));
5481 }
5482 else if (STRINGP (components))
5483 {
5484 int i_byte;
5485
5486 len = XSTRING (components)->size;
5487 i = i_byte = 0;
5488 while (i < len)
5489 FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5490 }
5491 else if (INTEGERP (components))
5492 {
5493 len = 1;
5494 buf[4] = XINT (components);
5495 }
5496 else if (CONSP (components))
5497 {
5498 for (len = 0; CONSP (components);
5499 len++, components = XCDR (components))
5500 buf[4 + len] = XINT (XCAR (components));
5501 }
5502 else
5503 abort ();
5504 buf[0] = 4 + len;
5505 }
5506 return (buf + buf[0]);
5507 }
5508
5509 #define CHARBUF_SIZE 0x4000
5510
5511 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5512 do { \
5513 int size = CHARBUF_SIZE;; \
5514 \
5515 coding->charbuf = NULL; \
5516 while (size > 1024) \
5517 { \
5518 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5519 if (coding->charbuf) \
5520 break; \
5521 size >>= 1; \
5522 } \
5523 if (! coding->charbuf) \
5524 { \
5525 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5526 return coding->result; \
5527 } \
5528 coding->charbuf_size = size; \
5529 } while (0)
5530
5531
5532 static void
5533 produce_annotation (coding)
5534 struct coding_system *coding;
5535 {
5536 int *charbuf = coding->charbuf;
5537 int *charbuf_end = charbuf + coding->charbuf_used;
5538
5539 while (charbuf < charbuf_end)
5540 {
5541 if (*charbuf >= 0)
5542 charbuf++;
5543 else
5544 {
5545 int len = -*charbuf;
5546 switch (charbuf[2])
5547 {
5548 case CODING_ANNOTATE_COMPOSITION_MASK:
5549 produce_composition (coding, charbuf);
5550 break;
5551 default:
5552 abort ();
5553 }
5554 charbuf += len;
5555 }
5556 }
5557 }
5558
5559 /* Decode the data at CODING->src_object into CODING->dst_object.
5560 CODING->src_object is a buffer, a string, or nil.
5561 CODING->dst_object is a buffer.
5562
5563 If CODING->src_object is a buffer, it must be the current buffer.
5564 In this case, if CODING->src_pos is positive, it is a position of
5565 the source text in the buffer, otherwise, the source text is in the
5566 gap area of the buffer, and CODING->src_pos specifies the offset of
5567 the text from GPT (which must be the same as PT). If this is the
5568 same buffer as CODING->dst_object, CODING->src_pos must be
5569 negative.
5570
5571 If CODING->src_object is a string, CODING->src_pos in an index to
5572 that string.
5573
5574 If CODING->src_object is nil, CODING->source must already point to
5575 the non-relocatable memory area. In this case, CODING->src_pos is
5576 an offset from CODING->source.
5577
5578 The decoded data is inserted at the current point of the buffer
5579 CODING->dst_object.
5580 */
5581
5582 static int
5583 decode_coding (coding)
5584 struct coding_system *coding;
5585 {
5586 Lisp_Object attrs;
5587
5588 if (BUFFERP (coding->src_object)
5589 && coding->src_pos > 0
5590 && coding->src_pos < GPT
5591 && coding->src_pos + coding->src_chars > GPT)
5592 move_gap_both (coding->src_pos, coding->src_pos_byte);
5593
5594 if (BUFFERP (coding->dst_object))
5595 {
5596 if (current_buffer != XBUFFER (coding->dst_object))
5597 set_buffer_internal (XBUFFER (coding->dst_object));
5598 if (GPT != PT)
5599 move_gap_both (PT, PT_BYTE);
5600 }
5601
5602 coding->consumed = coding->consumed_char = 0;
5603 coding->produced = coding->produced_char = 0;
5604 coding->chars_at_source = 0;
5605 coding->result = CODING_RESULT_SUCCESS;
5606 coding->errors = 0;
5607
5608 ALLOC_CONVERSION_WORK_AREA (coding);
5609
5610 attrs = CODING_ID_ATTRS (coding->id);
5611
5612 do
5613 {
5614 coding_set_source (coding);
5615 coding->annotated = 0;
5616 (*(coding->decoder)) (coding);
5617 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5618 translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5619 coding_set_destination (coding);
5620 produce_chars (coding);
5621 if (coding->annotated)
5622 produce_annotation (coding);
5623 }
5624 while (coding->consumed < coding->src_bytes
5625 && ! coding->result);
5626
5627 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5628 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5629 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5630 decode_eol (coding);
5631
5632 coding->carryover_bytes = 0;
5633 if (coding->consumed < coding->src_bytes)
5634 {
5635 int nbytes = coding->src_bytes - coding->consumed;
5636 unsigned char *src;
5637
5638 coding_set_source (coding);
5639 coding_set_destination (coding);
5640 src = coding->source + coding->consumed;
5641
5642 if (coding->mode & CODING_MODE_LAST_BLOCK)
5643 {
5644 /* Flush out unprocessed data as binary chars. We are sure
5645 that the number of data is less than the size of
5646 coding->charbuf. */
5647 int *charbuf = coding->charbuf;
5648
5649 while (nbytes-- > 0)
5650 {
5651 int c = *src++;
5652 *charbuf++ = (c & 0x80 ? - c : c);
5653 }
5654 produce_chars (coding);
5655 }
5656 else
5657 {
5658 /* Record unprocessed bytes in coding->carryover. We are
5659 sure that the number of data is less than the size of
5660 coding->carryover. */
5661 unsigned char *p = coding->carryover;
5662
5663 coding->carryover_bytes = nbytes;
5664 while (nbytes-- > 0)
5665 *p++ = *src++;
5666 }
5667 coding->consumed = coding->src_bytes;
5668 }
5669
5670 return coding->result;
5671 }
5672
5673 static void
5674 consume_chars (coding)
5675 struct coding_system *coding;
5676 {
5677 int *buf = coding->charbuf;
5678 /* -1 is to compensate for CRLF. */
5679 int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5680 unsigned char *src = coding->source + coding->consumed;
5681 int pos = coding->src_pos + coding->consumed_char;
5682 int end_pos = coding->src_pos + coding->src_chars;
5683 int multibytep = coding->src_multibyte;
5684 Lisp_Object eol_type;
5685 int c;
5686 int start, end, stop;
5687 Lisp_Object object, prop;
5688
5689 eol_type = CODING_ID_EOL_TYPE (coding->id);
5690 if (VECTORP (eol_type))
5691 eol_type = Qunix;
5692
5693 object = coding->src_object;
5694
5695 /* Note: composition handling is not yet implemented. */
5696 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5697
5698 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5699 && find_composition (pos, end_pos, &start, &end, &prop, object)
5700 && end <= end_pos
5701 && (start >= pos
5702 || (find_composition (end, end_pos, &start, &end, &prop, object)
5703 && end <= end_pos)))
5704 stop = start;
5705 else
5706 stop = end_pos;
5707
5708 while (buf < buf_end)
5709 {
5710 if (pos == stop)
5711 {
5712 int *p;
5713
5714 if (pos == end_pos)
5715 break;
5716 p = save_composition_data (buf, buf_end, prop);
5717 if (p == NULL)
5718 break;
5719 buf = p;
5720 if (find_composition (end, end_pos, &start, &end, &prop, object)
5721 && end <= end_pos)
5722 stop = start;
5723 else
5724 stop = end_pos;
5725 }
5726
5727 if (! multibytep)
5728 c = *src++;
5729 else
5730 c = STRING_CHAR_ADVANCE (src);
5731 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5732 c = '\n';
5733 if (! EQ (eol_type, Qunix))
5734 {
5735 if (c == '\n')
5736 {
5737 if (EQ (eol_type, Qdos))
5738 *buf++ = '\r';
5739 else
5740 c = '\r';
5741 }
5742 }
5743 *buf++ = c;
5744 pos++;
5745 }
5746
5747 coding->consumed = src - coding->source;
5748 coding->consumed_char = pos - coding->src_pos;
5749 coding->charbuf_used = buf - coding->charbuf;
5750 coding->chars_at_source = 0;
5751 }
5752
5753
5754 /* Encode the text at CODING->src_object into CODING->dst_object.
5755 CODING->src_object is a buffer or a string.
5756 CODING->dst_object is a buffer or nil.
5757
5758 If CODING->src_object is a buffer, it must be the current buffer.
5759 In this case, if CODING->src_pos is positive, it is a position of
5760 the source text in the buffer, otherwise. the source text is in the
5761 gap area of the buffer, and coding->src_pos specifies the offset of
5762 the text from GPT (which must be the same as PT). If this is the
5763 same buffer as CODING->dst_object, CODING->src_pos must be
5764 negative and CODING should not have `pre-write-conversion'.
5765
5766 If CODING->src_object is a string, CODING should not have
5767 `pre-write-conversion'.
5768
5769 If CODING->dst_object is a buffer, the encoded data is inserted at
5770 the current point of that buffer.
5771
5772 If CODING->dst_object is nil, the encoded data is placed at the
5773 memory area specified by CODING->destination. */
5774
5775 static int
5776 encode_coding (coding)
5777 struct coding_system *coding;
5778 {
5779 Lisp_Object attrs;
5780
5781 attrs = CODING_ID_ATTRS (coding->id);
5782
5783 if (BUFFERP (coding->dst_object))
5784 {
5785 set_buffer_internal (XBUFFER (coding->dst_object));
5786 coding->dst_multibyte
5787 = ! NILP (current_buffer->enable_multibyte_characters);
5788 }
5789
5790 coding->consumed = coding->consumed_char = 0;
5791 coding->produced = coding->produced_char = 0;
5792 coding->result = CODING_RESULT_SUCCESS;
5793 coding->errors = 0;
5794
5795 ALLOC_CONVERSION_WORK_AREA (coding);
5796
5797 do {
5798 coding_set_source (coding);
5799 consume_chars (coding);
5800
5801 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5802 translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5803
5804 coding_set_destination (coding);
5805 (*(coding->encoder)) (coding);
5806 } while (coding->consumed_char < coding->src_chars);
5807
5808 if (BUFFERP (coding->dst_object))
5809 insert_from_gap (coding->produced_char, coding->produced);
5810
5811 return (coding->result);
5812 }
5813
5814 /* Work buffer */
5815
5816 /* List of currently used working buffer. */
5817 Lisp_Object Vcode_conversion_work_buf_list;
5818
5819 /* A working buffer used by the top level conversion. */
5820 Lisp_Object Vcode_conversion_reused_work_buf;
5821
5822
5823 /* Return a working buffer that can be freely used by the following
5824 code conversion. MULTIBYTEP specifies the multibyteness of the
5825 buffer. */
5826
5827 Lisp_Object
5828 make_conversion_work_buffer (multibytep)
5829 int multibytep;
5830 {
5831 struct buffer *current = current_buffer;
5832 Lisp_Object buf;
5833
5834 if (NILP (Vcode_conversion_work_buf_list))
5835 {
5836 if (NILP (Vcode_conversion_reused_work_buf))
5837 Vcode_conversion_reused_work_buf
5838 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5839 Vcode_conversion_work_buf_list
5840 = Fcons (Vcode_conversion_reused_work_buf, Qnil);
5841 }
5842 else
5843 {
5844 int depth = Flength (Vcode_conversion_work_buf_list);
5845 char str[128];
5846
5847 sprintf (str, " *code-conversion-work*<%d>", depth);
5848 Vcode_conversion_work_buf_list
5849 = Fcons (Fget_buffer_create (build_string (str)),
5850 Vcode_conversion_work_buf_list);
5851 }
5852
5853 buf = XCAR (Vcode_conversion_work_buf_list);
5854 set_buffer_internal (XBUFFER (buf));
5855 current_buffer->undo_list = Qt;
5856 Ferase_buffer ();
5857 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5858 set_buffer_internal (current);
5859 return buf;
5860 }
5861
5862 static struct coding_system *saved_coding;
5863
5864 Lisp_Object
5865 code_conversion_restore (info)
5866 Lisp_Object info;
5867 {
5868 int depth = Flength (Vcode_conversion_work_buf_list);
5869 Lisp_Object buf;
5870
5871 if (depth > 0)
5872 {
5873 buf = XCAR (Vcode_conversion_work_buf_list);
5874 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5875 if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5876 Fkill_buffer (buf);
5877 }
5878
5879 if (saved_coding->dst_object == Qt
5880 && saved_coding->destination)
5881 xfree (saved_coding->destination);
5882
5883 return save_excursion_restore (info);
5884 }
5885
5886
5887 int
5888 decode_coding_gap (coding, chars, bytes)
5889 struct coding_system *coding;
5890 EMACS_INT chars, bytes;
5891 {
5892 int count = specpdl_ptr - specpdl;
5893
5894 saved_coding = coding;
5895 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5896
5897 coding->src_object = Fcurrent_buffer ();
5898 coding->src_chars = chars;
5899 coding->src_bytes = bytes;
5900 coding->src_pos = -chars;
5901 coding->src_pos_byte = -bytes;
5902 coding->src_multibyte = chars < bytes;
5903 coding->dst_object = coding->src_object;
5904 coding->dst_pos = PT;
5905 coding->dst_pos_byte = PT_BYTE;
5906 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
5907
5908 if (CODING_REQUIRE_DETECTION (coding))
5909 detect_coding (coding);
5910
5911 decode_coding (coding);
5912
5913 unbind_to (count, Qnil);
5914 return coding->result;
5915 }
5916
5917 int
5918 encode_coding_gap (coding, chars, bytes)
5919 struct coding_system *coding;
5920 EMACS_INT chars, bytes;
5921 {
5922 int count = specpdl_ptr - specpdl;
5923 Lisp_Object buffer;
5924
5925 saved_coding = coding;
5926 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5927
5928 buffer = Fcurrent_buffer ();
5929 coding->src_object = buffer;
5930 coding->src_chars = chars;
5931 coding->src_bytes = bytes;
5932 coding->src_pos = -chars;
5933 coding->src_pos_byte = -bytes;
5934 coding->src_multibyte = chars < bytes;
5935 coding->dst_object = coding->src_object;
5936 coding->dst_pos = PT;
5937 coding->dst_pos_byte = PT_BYTE;
5938
5939 encode_coding (coding);
5940
5941 unbind_to (count, Qnil);
5942 return coding->result;
5943 }
5944
5945
5946 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5947 SRC_OBJECT into DST_OBJECT by coding context CODING.
5948
5949 SRC_OBJECT is a buffer, a string, or Qnil.
5950
5951 If it is a buffer, the text is at point of the buffer. FROM and TO
5952 are positions in the buffer.
5953
5954 If it is a string, the text is at the beginning of the string.
5955 FROM and TO are indices to the string.
5956
5957 If it is nil, the text is at coding->source. FROM and TO are
5958 indices to coding->source.
5959
5960 DST_OBJECT is a buffer, Qt, or Qnil.
5961
5962 If it is a buffer, the decoded text is inserted at point of the
5963 buffer. If the buffer is the same as SRC_OBJECT, the source text
5964 is deleted.
5965
5966 If it is Qt, a string is made from the decoded text, and
5967 set in CODING->dst_object.
5968
5969 If it is Qnil, the decoded text is stored at CODING->destination.
5970 The called must allocate CODING->dst_bytes bytes at
5971 CODING->destination by xmalloc. If the decoded text is longer than
5972 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5973 */
5974
5975 void
5976 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5977 dst_object)
5978 struct coding_system *coding;
5979 Lisp_Object src_object;
5980 EMACS_INT from, from_byte, to, to_byte;
5981 Lisp_Object dst_object;
5982 {
5983 int count = specpdl_ptr - specpdl;
5984 unsigned char *destination;
5985 EMACS_INT dst_bytes;
5986 EMACS_INT chars = to - from;
5987 EMACS_INT bytes = to_byte - from_byte;
5988 Lisp_Object attrs;
5989
5990 saved_coding = coding;
5991 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5992
5993 if (NILP (dst_object))
5994 {
5995 destination = coding->destination;
5996 dst_bytes = coding->dst_bytes;
5997 }
5998
5999 coding->src_object = src_object;
6000 coding->src_chars = chars;
6001 coding->src_bytes = bytes;
6002 coding->src_multibyte = chars < bytes;
6003
6004 if (STRINGP (src_object))
6005 {
6006 coding->src_pos = from;
6007 coding->src_pos_byte = from_byte;
6008 }
6009 else if (BUFFERP (src_object))
6010 {
6011 set_buffer_internal (XBUFFER (src_object));
6012 if (from != GPT)
6013 move_gap_both (from, from_byte);
6014 if (EQ (src_object, dst_object))
6015 {
6016 TEMP_SET_PT_BOTH (from, from_byte);
6017 del_range_both (from, from_byte, to, to_byte, 1);
6018 coding->src_pos = -chars;
6019 coding->src_pos_byte = -bytes;
6020 }
6021 else
6022 {
6023 coding->src_pos = from;
6024 coding->src_pos_byte = from_byte;
6025 }
6026 }
6027
6028 if (CODING_REQUIRE_DETECTION (coding))
6029 detect_coding (coding);
6030 attrs = CODING_ID_ATTRS (coding->id);
6031
6032 if (! NILP (CODING_ATTR_POST_READ (attrs))
6033 || EQ (dst_object, Qt))
6034 {
6035 coding->dst_object = make_conversion_work_buffer (1);
6036 coding->dst_pos = BEG;
6037 coding->dst_pos_byte = BEG_BYTE;
6038 coding->dst_multibyte = 1;
6039 }
6040 else if (BUFFERP (dst_object))
6041 {
6042 coding->dst_object = dst_object;
6043 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6044 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6045 coding->dst_multibyte
6046 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6047 }
6048 else
6049 {
6050 coding->dst_object = Qnil;
6051 coding->dst_multibyte = 1;
6052 }
6053
6054 decode_coding (coding);
6055
6056 if (BUFFERP (coding->dst_object))
6057 set_buffer_internal (XBUFFER (coding->dst_object));
6058
6059 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6060 {
6061 struct gcpro gcpro1, gcpro2;
6062 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6063 Lisp_Object val;
6064
6065 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6066 GCPRO2 (coding->src_object, coding->dst_object);
6067 val = call1 (CODING_ATTR_POST_READ (attrs),
6068 make_number (coding->produced_char));
6069 UNGCPRO;
6070 CHECK_NATNUM (val);
6071 coding->produced_char += Z - prev_Z;
6072 coding->produced += Z_BYTE - prev_Z_BYTE;
6073 }
6074
6075 if (EQ (dst_object, Qt))
6076 {
6077 coding->dst_object = Fbuffer_string ();
6078 }
6079 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6080 {
6081 set_buffer_internal (XBUFFER (coding->dst_object));
6082 if (dst_bytes < coding->produced)
6083 {
6084 destination
6085 = (unsigned char *) xrealloc (destination, coding->produced);
6086 if (! destination)
6087 {
6088 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6089 unbind_to (count, Qnil);
6090 return;
6091 }
6092 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6093 move_gap_both (BEGV, BEGV_BYTE);
6094 bcopy (BEGV_ADDR, destination, coding->produced);
6095 coding->destination = destination;
6096 }
6097 }
6098
6099 unbind_to (count, Qnil);
6100 }
6101
6102
6103 void
6104 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6105 dst_object)
6106 struct coding_system *coding;
6107 Lisp_Object src_object;
6108 EMACS_INT from, from_byte, to, to_byte;
6109 Lisp_Object dst_object;
6110 {
6111 int count = specpdl_ptr - specpdl;
6112 EMACS_INT chars = to - from;
6113 EMACS_INT bytes = to_byte - from_byte;
6114 Lisp_Object attrs;
6115
6116 saved_coding = coding;
6117 record_unwind_protect (code_conversion_restore, save_excursion_save ());
6118
6119 coding->src_object = src_object;
6120 coding->src_chars = chars;
6121 coding->src_bytes = bytes;
6122 coding->src_multibyte = chars < bytes;
6123
6124 attrs = CODING_ID_ATTRS (coding->id);
6125
6126 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6127 {
6128 coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
6129 set_buffer_internal (XBUFFER (coding->src_object));
6130 if (STRINGP (src_object))
6131 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6132 else if (BUFFERP (src_object))
6133 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6134 else
6135 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6136
6137 if (EQ (src_object, dst_object))
6138 {
6139 set_buffer_internal (XBUFFER (src_object));
6140 del_range_both (from, from_byte, to, to_byte, 1);
6141 set_buffer_internal (XBUFFER (coding->src_object));
6142 }
6143
6144 call2 (CODING_ATTR_PRE_WRITE (attrs),
6145 make_number (BEG), make_number (Z));
6146 coding->src_object = Fcurrent_buffer ();
6147 if (BEG != GPT)
6148 move_gap_both (BEG, BEG_BYTE);
6149 coding->src_chars = Z - BEG;
6150 coding->src_bytes = Z_BYTE - BEG_BYTE;
6151 coding->src_pos = BEG;
6152 coding->src_pos_byte = BEG_BYTE;
6153 coding->src_multibyte = Z < Z_BYTE;
6154 }
6155 else if (STRINGP (src_object))
6156 {
6157 coding->src_pos = from;
6158 coding->src_pos_byte = from_byte;
6159 }
6160 else if (BUFFERP (src_object))
6161 {
6162 set_buffer_internal (XBUFFER (src_object));
6163 if (from != GPT)
6164 move_gap_both (from, from_byte);
6165 if (EQ (src_object, dst_object))
6166 {
6167 del_range_both (from, from_byte, to, to_byte, 1);
6168 coding->src_pos = -chars;
6169 coding->src_pos_byte = -bytes;
6170 }
6171 else
6172 {
6173 coding->src_pos = from;
6174 coding->src_pos_byte = from_byte;
6175 }
6176 }
6177
6178 if (BUFFERP (dst_object))
6179 {
6180 coding->dst_object = dst_object;
6181 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6182 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6183 coding->dst_multibyte
6184 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6185 }
6186 else if (EQ (dst_object, Qt))
6187 {
6188 coding->dst_object = Qnil;
6189 coding->dst_bytes = coding->src_chars;
6190 if (coding->dst_bytes == 0)
6191 coding->dst_bytes = 1;
6192 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6193 coding->dst_multibyte = 0;
6194 }
6195 else
6196 {
6197 coding->dst_object = Qnil;
6198 coding->dst_multibyte = 0;
6199 }
6200
6201 encode_coding (coding);
6202
6203 if (EQ (dst_object, Qt))
6204 {
6205 if (BUFFERP (coding->dst_object))
6206 coding->dst_object = Fbuffer_string ();
6207 else
6208 {
6209 coding->dst_object
6210 = make_unibyte_string ((char *) coding->destination,
6211 coding->produced);
6212 xfree (coding->destination);
6213 }
6214 }
6215
6216 unbind_to (count, Qnil);
6217 }
6218
6219
6220 Lisp_Object
6221 preferred_coding_system ()
6222 {
6223 int id = coding_categories[coding_priorities[0]].id;
6224
6225 return CODING_ID_NAME (id);
6226 }
6227
6228 \f
6229 #ifdef emacs
6230 /*** 8. Emacs Lisp library functions ***/
6231
6232 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6233 doc: /* Return t if OBJECT is nil or a coding-system.
6234 See the documentation of `define-coding-system' for information
6235 about coding-system objects. */)
6236 (obj)
6237 Lisp_Object obj;
6238 {
6239 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6240 }
6241
6242 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6243 Sread_non_nil_coding_system, 1, 1, 0,
6244 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6245 (prompt)
6246 Lisp_Object prompt;
6247 {
6248 Lisp_Object val;
6249 do
6250 {
6251 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6252 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6253 }
6254 while (XSTRING (val)->size == 0);
6255 return (Fintern (val, Qnil));
6256 }
6257
6258 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6259 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6260 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6261 (prompt, default_coding_system)
6262 Lisp_Object prompt, default_coding_system;
6263 {
6264 Lisp_Object val;
6265 if (SYMBOLP (default_coding_system))
6266 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6267 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6268 Qt, Qnil, Qcoding_system_history,
6269 default_coding_system, Qnil);
6270 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6271 }
6272
6273 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6274 1, 1, 0,
6275 doc: /* Check validity of CODING-SYSTEM.
6276 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6277 It is valid if it is a symbol with a non-nil `coding-system' property.
6278 The value of property should be a vector of length 5. */)
6279 (coding_system)
6280 Lisp_Object coding_system;
6281 {
6282 CHECK_SYMBOL (coding_system);
6283 if (!NILP (Fcoding_system_p (coding_system)))
6284 return coding_system;
6285 while (1)
6286 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6287 }
6288
6289 \f
6290 Lisp_Object
6291 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6292 unsigned char *src;
6293 int src_bytes, highest;
6294 int multibytep;
6295 Lisp_Object coding_system;
6296 {
6297 unsigned char *src_end = src + src_bytes;
6298 int mask = CATEGORY_MASK_ANY;
6299 int detected = 0;
6300 int c, i;
6301 Lisp_Object attrs, eol_type;
6302 Lisp_Object val;
6303 struct coding_system coding;
6304
6305 if (NILP (coding_system))
6306 coding_system = Qundecided;
6307 setup_coding_system (coding_system, &coding);
6308 attrs = CODING_ID_ATTRS (coding.id);
6309 eol_type = CODING_ID_EOL_TYPE (coding.id);
6310
6311 coding.source = src;
6312 coding.src_bytes = src_bytes;
6313 coding.src_multibyte = multibytep;
6314 coding.consumed = 0;
6315
6316 if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
6317 {
6318 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6319 }
6320 else
6321 {
6322 coding_system = Qnil;
6323 for (; src < src_end; src++)
6324 {
6325 c = *src;
6326 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6327 || c == ISO_CODE_SI
6328 || c == ISO_CODE_SO)))
6329 break;
6330 }
6331 coding.head_ascii = src - coding.source;
6332
6333 if (src < src_end)
6334 for (i = 0; i < coding_category_raw_text; i++)
6335 {
6336 enum coding_category category = coding_priorities[i];
6337 struct coding_system *this = coding_categories + category;
6338
6339 if (category >= coding_category_raw_text
6340 || detected & (1 << category))
6341 continue;
6342
6343 if (this->id < 0)
6344 {
6345 /* No coding system of this category is defined. */
6346 mask &= ~(1 << category);
6347 }
6348 else
6349 {
6350 detected |= detected_mask[category];
6351 if ((*(coding_categories[category].detector)) (&coding, &mask)
6352 && highest)
6353 {
6354 mask &= detected_mask[category];
6355 break;
6356 }
6357 }
6358 }
6359 }
6360
6361 if (!mask)
6362 val = Fcons (make_number (coding_category_raw_text), Qnil);
6363 else if (mask == CATEGORY_MASK_ANY)
6364 val = Fcons (make_number (coding_category_undecided), Qnil);
6365 else if (highest)
6366 {
6367 for (i = 0; i < coding_category_raw_text; i++)
6368 if (mask & (1 << coding_priorities[i]))
6369 {
6370 val = Fcons (make_number (coding_priorities[i]), Qnil);
6371 break;
6372 }
6373 }
6374 else
6375 {
6376 val = Qnil;
6377 for (i = coding_category_raw_text - 1; i >= 0; i--)
6378 if (mask & (1 << coding_priorities[i]))
6379 val = Fcons (make_number (coding_priorities[i]), val);
6380 }
6381
6382 {
6383 int one_byte_eol = -1, two_byte_eol = -1;
6384 Lisp_Object tail;
6385
6386 for (tail = val; CONSP (tail); tail = XCDR (tail))
6387 {
6388 struct coding_system *this
6389 = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6390 : &coding);
6391 int this_eol;
6392
6393 attrs = CODING_ID_ATTRS (this->id);
6394 eol_type = CODING_ID_EOL_TYPE (this->id);
6395 XSETCAR (tail, CODING_ID_NAME (this->id));
6396 if (VECTORP (eol_type))
6397 {
6398 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6399 {
6400 if (two_byte_eol < 0)
6401 two_byte_eol = detect_eol (this, coding.source, src_bytes);
6402 this_eol = two_byte_eol;
6403 }
6404 else
6405 {
6406 if (one_byte_eol < 0)
6407 one_byte_eol =detect_eol (this, coding.source, src_bytes);
6408 this_eol = one_byte_eol;
6409 }
6410 if (this_eol == EOL_SEEN_LF)
6411 XSETCAR (tail, AREF (eol_type, 0));
6412 else if (this_eol == EOL_SEEN_CRLF)
6413 XSETCAR (tail, AREF (eol_type, 1));
6414 else if (this_eol == EOL_SEEN_CR)
6415 XSETCAR (tail, AREF (eol_type, 2));
6416 }
6417 }
6418 }
6419
6420 return (highest ? XCAR (val) : val);
6421 }
6422
6423
6424 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6425 2, 3, 0,
6426 doc: /* Detect coding system of the text in the region between START and END.
6427 Return a list of possible coding systems ordered by priority.
6428
6429 If only ASCII characters are found, it returns a list of single element
6430 `undecided' or its subsidiary coding system according to a detected
6431 end-of-line format.
6432
6433 If optional argument HIGHEST is non-nil, return the coding system of
6434 highest priority. */)
6435 (start, end, highest)
6436 Lisp_Object start, end, highest;
6437 {
6438 int from, to;
6439 int from_byte, to_byte;
6440
6441 CHECK_NUMBER_COERCE_MARKER (start);
6442 CHECK_NUMBER_COERCE_MARKER (end);
6443
6444 validate_region (&start, &end);
6445 from = XINT (start), to = XINT (end);
6446 from_byte = CHAR_TO_BYTE (from);
6447 to_byte = CHAR_TO_BYTE (to);
6448
6449 if (from < GPT && to >= GPT)
6450 move_gap_both (to, to_byte);
6451
6452 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6453 to_byte - from_byte,
6454 !NILP (highest),
6455 !NILP (current_buffer
6456 ->enable_multibyte_characters),
6457 Qnil);
6458 }
6459
6460 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6461 1, 2, 0,
6462 doc: /* Detect coding system of the text in STRING.
6463 Return a list of possible coding systems ordered by priority.
6464
6465 If only ASCII characters are found, it returns a list of single element
6466 `undecided' or its subsidiary coding system according to a detected
6467 end-of-line format.
6468
6469 If optional argument HIGHEST is non-nil, return the coding system of
6470 highest priority. */)
6471 (string, highest)
6472 Lisp_Object string, highest;
6473 {
6474 CHECK_STRING (string);
6475
6476 return detect_coding_system (XSTRING (string)->data,
6477 STRING_BYTES (XSTRING (string)),
6478 !NILP (highest),
6479 STRING_MULTIBYTE (string),
6480 Qnil);
6481 }
6482
6483
6484 static INLINE int
6485 char_encodable_p (c, attrs)
6486 int c;
6487 Lisp_Object attrs;
6488 {
6489 Lisp_Object tail;
6490 struct charset *charset;
6491
6492 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6493 CONSP (tail); tail = XCDR (tail))
6494 {
6495 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6496 if (CHAR_CHARSET_P (c, charset))
6497 break;
6498 }
6499 return (! NILP (tail));
6500 }
6501
6502
6503 /* Return a list of coding systems that safely encode the text between
6504 START and END. If EXCLUDE is non-nil, it is a list of coding
6505 systems not to check. The returned list doesn't contain any such
6506 coding systems. In any case, If the text contains only ASCII or is
6507 unibyte, return t. */
6508
6509 DEFUN ("find-coding-systems-region-internal",
6510 Ffind_coding_systems_region_internal,
6511 Sfind_coding_systems_region_internal, 2, 3, 0,
6512 doc: /* Internal use only. */)
6513 (start, end, exclude)
6514 Lisp_Object start, end, exclude;
6515 {
6516 Lisp_Object coding_attrs_list, safe_codings;
6517 EMACS_INT start_byte, end_byte;
6518 unsigned char *p, *pbeg, *pend;
6519 int c;
6520 Lisp_Object tail, elt;
6521
6522 if (STRINGP (start))
6523 {
6524 if (!STRING_MULTIBYTE (start)
6525 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6526 return Qt;
6527 start_byte = 0;
6528 end_byte = STRING_BYTES (XSTRING (start));
6529 }
6530 else
6531 {
6532 CHECK_NUMBER_COERCE_MARKER (start);
6533 CHECK_NUMBER_COERCE_MARKER (end);
6534 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6535 args_out_of_range (start, end);
6536 if (NILP (current_buffer->enable_multibyte_characters))
6537 return Qt;
6538 start_byte = CHAR_TO_BYTE (XINT (start));
6539 end_byte = CHAR_TO_BYTE (XINT (end));
6540 if (XINT (end) - XINT (start) == end_byte - start_byte)
6541 return Qt;
6542
6543 if (start < GPT && end > GPT)
6544 {
6545 if ((GPT - start) < (end - GPT))
6546 move_gap_both (start, start_byte);
6547 else
6548 move_gap_both (end, end_byte);
6549 }
6550 }
6551
6552 coding_attrs_list = Qnil;
6553 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6554 if (NILP (exclude)
6555 || NILP (Fmemq (XCAR (tail), exclude)))
6556 {
6557 Lisp_Object attrs;
6558
6559 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6560 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6561 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6562 coding_attrs_list = Fcons (attrs, coding_attrs_list);
6563 }
6564
6565 if (STRINGP (start))
6566 p = pbeg = XSTRING (start)->data;
6567 else
6568 p = pbeg = BYTE_POS_ADDR (start_byte);
6569 pend = p + (end_byte - start_byte);
6570
6571 while (p < pend && ASCII_BYTE_P (*p)) p++;
6572 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6573
6574 while (p < pend)
6575 {
6576 if (ASCII_BYTE_P (*p))
6577 p++;
6578 else
6579 {
6580 c = STRING_CHAR_ADVANCE (p);
6581
6582 charset_map_loaded = 0;
6583 for (tail = coding_attrs_list; CONSP (tail);)
6584 {
6585 elt = XCAR (tail);
6586 if (NILP (elt))
6587 tail = XCDR (tail);
6588 else if (char_encodable_p (c, elt))
6589 tail = XCDR (tail);
6590 else if (CONSP (XCDR (tail)))
6591 {
6592 XSETCAR (tail, XCAR (XCDR (tail)));
6593 XSETCDR (tail, XCDR (XCDR (tail)));
6594 }
6595 else
6596 {
6597 XSETCAR (tail, Qnil);
6598 tail = XCDR (tail);
6599 }
6600 }
6601 if (charset_map_loaded)
6602 {
6603 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6604
6605 if (STRINGP (start))
6606 pbeg = XSTRING (start)->data;
6607 else
6608 pbeg = BYTE_POS_ADDR (start_byte);
6609 p = pbeg + p_offset;
6610 pend = pbeg + pend_offset;
6611 }
6612 }
6613 }
6614
6615 safe_codings = Qnil;
6616 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6617 if (! NILP (XCAR (tail)))
6618 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6619
6620 return safe_codings;
6621 }
6622
6623
6624 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6625 Scheck_coding_systems_region, 3, 3, 0,
6626 doc: /* Check if the region is encodable by coding systems.
6627
6628 START and END are buffer positions specifying the region.
6629 CODING-SYSTEM-LIST is a list of coding systems to check.
6630
6631 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6632 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6633 whole region, POS0, POS1, ... are buffer positions where non-encodable
6634 characters are found.
6635
6636 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6637 value is nil.
6638
6639 START may be a string. In that case, check if the string is
6640 encodable, and the value contains indices to the string instead of
6641 buffer positions. END is ignored. */)
6642 (start, end, coding_system_list)
6643 Lisp_Object start, end, coding_system_list;
6644 {
6645 Lisp_Object list;
6646 EMACS_INT start_byte, end_byte;
6647 int pos;
6648 unsigned char *p, *pbeg, *pend;
6649 int c;
6650 Lisp_Object tail, elt;
6651
6652 if (STRINGP (start))
6653 {
6654 if (!STRING_MULTIBYTE (start)
6655 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6656 return Qnil;
6657 start_byte = 0;
6658 end_byte = STRING_BYTES (XSTRING (start));
6659 pos = 0;
6660 }
6661 else
6662 {
6663 CHECK_NUMBER_COERCE_MARKER (start);
6664 CHECK_NUMBER_COERCE_MARKER (end);
6665 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6666 args_out_of_range (start, end);
6667 if (NILP (current_buffer->enable_multibyte_characters))
6668 return Qnil;
6669 start_byte = CHAR_TO_BYTE (XINT (start));
6670 end_byte = CHAR_TO_BYTE (XINT (end));
6671 if (XINT (end) - XINT (start) == end_byte - start_byte)
6672 return Qt;
6673
6674 if (start < GPT && end > GPT)
6675 {
6676 if ((GPT - start) < (end - GPT))
6677 move_gap_both (start, start_byte);
6678 else
6679 move_gap_both (end, end_byte);
6680 }
6681 pos = start;
6682 }
6683
6684 list = Qnil;
6685 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6686 {
6687 elt = XCAR (tail);
6688 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6689 Qnil)),
6690 list);
6691 }
6692
6693 if (STRINGP (start))
6694 p = pbeg = XSTRING (start)->data;
6695 else
6696 p = pbeg = BYTE_POS_ADDR (start_byte);
6697 pend = p + (end_byte - start_byte);
6698
6699 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6700 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6701
6702 while (p < pend)
6703 {
6704 if (ASCII_BYTE_P (*p))
6705 p++;
6706 else
6707 {
6708 c = STRING_CHAR_ADVANCE (p);
6709
6710 charset_map_loaded = 0;
6711 for (tail = list; CONSP (tail); tail = XCDR (tail))
6712 {
6713 elt = XCDR (XCAR (tail));
6714 if (! char_encodable_p (c, XCAR (elt)))
6715 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6716 }
6717 if (charset_map_loaded)
6718 {
6719 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6720
6721 if (STRINGP (start))
6722 pbeg = XSTRING (start)->data;
6723 else
6724 pbeg = BYTE_POS_ADDR (start_byte);
6725 p = pbeg + p_offset;
6726 pend = pbeg + pend_offset;
6727 }
6728 }
6729 pos++;
6730 }
6731
6732 tail = list;
6733 list = Qnil;
6734 for (; CONSP (tail); tail = XCDR (tail))
6735 {
6736 elt = XCAR (tail);
6737 if (CONSP (XCDR (XCDR (elt))))
6738 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6739 list);
6740 }
6741
6742 return list;
6743 }
6744
6745
6746
6747 Lisp_Object
6748 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6749 Lisp_Object start, end, coding_system, dst_object;
6750 int encodep, norecord;
6751 {
6752 struct coding_system coding;
6753 EMACS_INT from, from_byte, to, to_byte;
6754 Lisp_Object src_object;
6755
6756 CHECK_NUMBER_COERCE_MARKER (start);
6757 CHECK_NUMBER_COERCE_MARKER (end);
6758 if (NILP (coding_system))
6759 coding_system = Qno_conversion;
6760 else
6761 CHECK_CODING_SYSTEM (coding_system);
6762 src_object = Fcurrent_buffer ();
6763 if (NILP (dst_object))
6764 dst_object = src_object;
6765 else if (! EQ (dst_object, Qt))
6766 CHECK_BUFFER (dst_object);
6767
6768 validate_region (&start, &end);
6769 from = XFASTINT (start);
6770 from_byte = CHAR_TO_BYTE (from);
6771 to = XFASTINT (end);
6772 to_byte = CHAR_TO_BYTE (to);
6773
6774 setup_coding_system (coding_system, &coding);
6775 coding.mode |= CODING_MODE_LAST_BLOCK;
6776
6777 if (encodep)
6778 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6779 dst_object);
6780 else
6781 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6782 dst_object);
6783 if (! norecord)
6784 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6785
6786 if (coding.result != CODING_RESULT_SUCCESS)
6787 error ("Code conversion error: %d", coding.result);
6788
6789 return (BUFFERP (dst_object)
6790 ? make_number (coding.produced_char)
6791 : coding.dst_object);
6792 }
6793
6794
6795 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6796 3, 4, "r\nzCoding system: ",
6797 doc: /* Decode the current region from the specified coding system.
6798 When called from a program, takes four arguments:
6799 START, END, CODING-SYSTEM, and DESTINATION.
6800 START and END are buffer positions.
6801
6802 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6803 If nil, the region between START and END is replace by the decoded text.
6804 If buffer, the decoded text is inserted in the buffer.
6805 If t, the decoded text is returned.
6806
6807 This function sets `last-coding-system-used' to the precise coding system
6808 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6809 not fully specified.)
6810 It returns the length of the decoded text. */)
6811 (start, end, coding_system, destination)
6812 Lisp_Object start, end, coding_system, destination;
6813 {
6814 return code_convert_region (start, end, coding_system, destination, 0, 0);
6815 }
6816
6817 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6818 3, 4, "r\nzCoding system: ",
6819 doc: /* Encode the current region by specified coding system.
6820 When called from a program, takes three arguments:
6821 START, END, and CODING-SYSTEM. START and END are buffer positions.
6822
6823 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6824 If nil, the region between START and END is replace by the encoded text.
6825 If buffer, the encoded text is inserted in the buffer.
6826 If t, the encoded text is returned.
6827
6828 This function sets `last-coding-system-used' to the precise coding system
6829 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6830 not fully specified.)
6831 It returns the length of the encoded text. */)
6832 (start, end, coding_system, destination)
6833 Lisp_Object start, end, coding_system, destination;
6834 {
6835 return code_convert_region (start, end, coding_system, destination, 1, 0);
6836 }
6837
6838 Lisp_Object
6839 code_convert_string (string, coding_system, dst_object,
6840 encodep, nocopy, norecord)
6841 Lisp_Object string, coding_system, dst_object;
6842 int encodep, nocopy, norecord;
6843 {
6844 struct coding_system coding;
6845 EMACS_INT chars, bytes;
6846
6847 CHECK_STRING (string);
6848 if (NILP (coding_system))
6849 {
6850 if (! norecord)
6851 Vlast_coding_system_used = Qno_conversion;
6852 if (NILP (dst_object))
6853 return (nocopy ? Fcopy_sequence (string) : string);
6854 }
6855
6856 if (NILP (coding_system))
6857 coding_system = Qno_conversion;
6858 else
6859 CHECK_CODING_SYSTEM (coding_system);
6860 if (NILP (dst_object))
6861 dst_object = Qt;
6862 else if (! EQ (dst_object, Qt))
6863 CHECK_BUFFER (dst_object);
6864
6865 setup_coding_system (coding_system, &coding);
6866 coding.mode |= CODING_MODE_LAST_BLOCK;
6867 chars = XSTRING (string)->size;
6868 bytes = STRING_BYTES (XSTRING (string));
6869 if (encodep)
6870 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6871 else
6872 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6873 if (! norecord)
6874 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6875
6876 if (coding.result != CODING_RESULT_SUCCESS)
6877 error ("Code conversion error: %d", coding.result);
6878
6879 return (BUFFERP (dst_object)
6880 ? make_number (coding.produced_char)
6881 : coding.dst_object);
6882 }
6883
6884
6885 /* Encode or decode STRING according to CODING_SYSTEM.
6886 Do not set Vlast_coding_system_used.
6887
6888 This function is called only from macros DECODE_FILE and
6889 ENCODE_FILE, thus we ignore character composition. */
6890
6891 Lisp_Object
6892 code_convert_string_norecord (string, coding_system, encodep)
6893 Lisp_Object string, coding_system;
6894 int encodep;
6895 {
6896 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
6897 }
6898
6899
6900 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6901 2, 4, 0,
6902 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6903
6904 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6905 if the decoding operation is trivial.
6906
6907 Optional fourth arg BUFFER non-nil meant that the decoded text is
6908 inserted in BUFFER instead of returned as a astring. In this case,
6909 the return value is BUFFER.
6910
6911 This function sets `last-coding-system-used' to the precise coding system
6912 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6913 not fully specified. */)
6914 (string, coding_system, nocopy, buffer)
6915 Lisp_Object string, coding_system, nocopy, buffer;
6916 {
6917 return code_convert_string (string, coding_system, buffer,
6918 0, ! NILP (nocopy), 0);
6919 }
6920
6921 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6922 2, 4, 0,
6923 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6924
6925 Optional third arg NOCOPY non-nil means it is OK to return STRING
6926 itself if the encoding operation is trivial.
6927
6928 Optional fourth arg BUFFER non-nil meant that the encoded text is
6929 inserted in BUFFER instead of returned as a astring. In this case,
6930 the return value is BUFFER.
6931
6932 This function sets `last-coding-system-used' to the precise coding system
6933 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6934 not fully specified.) */)
6935 (string, coding_system, nocopy, buffer)
6936 Lisp_Object string, coding_system, nocopy, buffer;
6937 {
6938 return code_convert_string (string, coding_system, buffer,
6939 nocopy, ! NILP (nocopy), 1);
6940 }
6941
6942 \f
6943 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6944 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6945 Return the corresponding character. */)
6946 (code)
6947 Lisp_Object code;
6948 {
6949 Lisp_Object spec, attrs, val;
6950 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6951 int c;
6952
6953 CHECK_NATNUM (code);
6954 c = XFASTINT (code);
6955 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6956 attrs = AREF (spec, 0);
6957
6958 if (ASCII_BYTE_P (c)
6959 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6960 return code;
6961
6962 val = CODING_ATTR_CHARSET_LIST (attrs);
6963 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6964 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6965 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
6966
6967 if (c <= 0x7F)
6968 charset = charset_roman;
6969 else if (c >= 0xA0 && c < 0xDF)
6970 {
6971 charset = charset_kana;
6972 c -= 0x80;
6973 }
6974 else
6975 {
6976 int s1 = c >> 8, s2 = c & 0xFF;
6977
6978 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6979 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6980 error ("Invalid code: %d", code);
6981 SJIS_TO_JIS (c);
6982 charset = charset_kanji;
6983 }
6984 c = DECODE_CHAR (charset, c);
6985 if (c < 0)
6986 error ("Invalid code: %d", code);
6987 return make_number (c);
6988 }
6989
6990
6991 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6992 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6993 Return the corresponding code in SJIS. */)
6994 (ch)
6995 Lisp_Object ch;
6996 {
6997 Lisp_Object spec, attrs, charset_list;
6998 int c;
6999 struct charset *charset;
7000 unsigned code;
7001
7002 CHECK_CHARACTER (ch);
7003 c = XFASTINT (ch);
7004 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7005 attrs = AREF (spec, 0);
7006
7007 if (ASCII_CHAR_P (c)
7008 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7009 return ch;
7010
7011 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7012 charset = char_charset (c, charset_list, &code);
7013 if (code == CHARSET_INVALID_CODE (charset))
7014 error ("Can't encode by shift_jis encoding: %d", c);
7015 JIS_TO_SJIS (code);
7016
7017 return make_number (code);
7018 }
7019
7020 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7021 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7022 Return the corresponding character. */)
7023 (code)
7024 Lisp_Object code;
7025 {
7026 Lisp_Object spec, attrs, val;
7027 struct charset *charset_roman, *charset_big5, *charset;
7028 int c;
7029
7030 CHECK_NATNUM (code);
7031 c = XFASTINT (code);
7032 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7033 attrs = AREF (spec, 0);
7034
7035 if (ASCII_BYTE_P (c)
7036 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7037 return code;
7038
7039 val = CODING_ATTR_CHARSET_LIST (attrs);
7040 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7041 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7042
7043 if (c <= 0x7F)
7044 charset = charset_roman;
7045 else
7046 {
7047 int b1 = c >> 8, b2 = c & 0x7F;
7048 if (b1 < 0xA1 || b1 > 0xFE
7049 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7050 error ("Invalid code: %d", code);
7051 charset = charset_big5;
7052 }
7053 c = DECODE_CHAR (charset, (unsigned )c);
7054 if (c < 0)
7055 error ("Invalid code: %d", code);
7056 return make_number (c);
7057 }
7058
7059 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7060 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7061 Return the corresponding character code in Big5. */)
7062 (ch)
7063 Lisp_Object ch;
7064 {
7065 Lisp_Object spec, attrs, charset_list;
7066 struct charset *charset;
7067 int c;
7068 unsigned code;
7069
7070 CHECK_CHARACTER (ch);
7071 c = XFASTINT (ch);
7072 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7073 attrs = AREF (spec, 0);
7074 if (ASCII_CHAR_P (c)
7075 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7076 return ch;
7077
7078 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7079 charset = char_charset (c, charset_list, &code);
7080 if (code == CHARSET_INVALID_CODE (charset))
7081 error ("Can't encode by Big5 encoding: %d", c);
7082
7083 return make_number (code);
7084 }
7085
7086 \f
7087 DEFUN ("set-terminal-coding-system-internal",
7088 Fset_terminal_coding_system_internal,
7089 Sset_terminal_coding_system_internal, 1, 1, 0,
7090 doc: /* Internal use only. */)
7091 (coding_system)
7092 Lisp_Object coding_system;
7093 {
7094 CHECK_SYMBOL (coding_system);
7095 setup_coding_system (Fcheck_coding_system (coding_system),
7096 &terminal_coding);
7097
7098 /* We had better not send unsafe characters to terminal. */
7099 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7100 /* Characer composition should be disabled. */
7101 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7102 terminal_coding.src_multibyte = 1;
7103 terminal_coding.dst_multibyte = 0;
7104 return Qnil;
7105 }
7106
7107 DEFUN ("set-safe-terminal-coding-system-internal",
7108 Fset_safe_terminal_coding_system_internal,
7109 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7110 doc: /* Internal use only. */)
7111 (coding_system)
7112 Lisp_Object coding_system;
7113 {
7114 CHECK_SYMBOL (coding_system);
7115 setup_coding_system (Fcheck_coding_system (coding_system),
7116 &safe_terminal_coding);
7117 /* Characer composition should be disabled. */
7118 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7119 safe_terminal_coding.src_multibyte = 1;
7120 safe_terminal_coding.dst_multibyte = 0;
7121 return Qnil;
7122 }
7123
7124 DEFUN ("terminal-coding-system",
7125 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7126 doc: /* Return coding system specified for terminal output. */)
7127 ()
7128 {
7129 return CODING_ID_NAME (terminal_coding.id);
7130 }
7131
7132 DEFUN ("set-keyboard-coding-system-internal",
7133 Fset_keyboard_coding_system_internal,
7134 Sset_keyboard_coding_system_internal, 1, 1, 0,
7135 doc: /* Internal use only. */)
7136 (coding_system)
7137 Lisp_Object coding_system;
7138 {
7139 CHECK_SYMBOL (coding_system);
7140 setup_coding_system (Fcheck_coding_system (coding_system),
7141 &keyboard_coding);
7142 /* Characer composition should be disabled. */
7143 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7144 return Qnil;
7145 }
7146
7147 DEFUN ("keyboard-coding-system",
7148 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7149 doc: /* Return coding system specified for decoding keyboard input. */)
7150 ()
7151 {
7152 return CODING_ID_NAME (keyboard_coding.id);
7153 }
7154
7155 \f
7156 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7157 Sfind_operation_coding_system, 1, MANY, 0,
7158 doc: /* Choose a coding system for an operation based on the target name.
7159 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7160 DECODING-SYSTEM is the coding system to use for decoding
7161 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7162 for encoding (in case OPERATION does encoding).
7163
7164 The first argument OPERATION specifies an I/O primitive:
7165 For file I/O, `insert-file-contents' or `write-region'.
7166 For process I/O, `call-process', `call-process-region', or `start-process'.
7167 For network I/O, `open-network-stream'.
7168
7169 The remaining arguments should be the same arguments that were passed
7170 to the primitive. Depending on which primitive, one of those arguments
7171 is selected as the TARGET. For example, if OPERATION does file I/O,
7172 whichever argument specifies the file name is TARGET.
7173
7174 TARGET has a meaning which depends on OPERATION:
7175 For file I/O, TARGET is a file name.
7176 For process I/O, TARGET is a process name.
7177 For network I/O, TARGET is a service name or a port number
7178
7179 This function looks up what specified for TARGET in,
7180 `file-coding-system-alist', `process-coding-system-alist',
7181 or `network-coding-system-alist' depending on OPERATION.
7182 They may specify a coding system, a cons of coding systems,
7183 or a function symbol to call.
7184 In the last case, we call the function with one argument,
7185 which is a list of all the arguments given to this function.
7186
7187 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7188 (nargs, args)
7189 int nargs;
7190 Lisp_Object *args;
7191 {
7192 Lisp_Object operation, target_idx, target, val;
7193 register Lisp_Object chain;
7194
7195 if (nargs < 2)
7196 error ("Too few arguments");
7197 operation = args[0];
7198 if (!SYMBOLP (operation)
7199 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7200 error ("Invalid first arguement");
7201 if (nargs < 1 + XINT (target_idx))
7202 error ("Too few arguments for operation: %s",
7203 XSYMBOL (operation)->name->data);
7204 target = args[XINT (target_idx) + 1];
7205 if (!(STRINGP (target)
7206 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7207 error ("Invalid %dth argument", XINT (target_idx) + 1);
7208
7209 chain = ((EQ (operation, Qinsert_file_contents)
7210 || EQ (operation, Qwrite_region))
7211 ? Vfile_coding_system_alist
7212 : (EQ (operation, Qopen_network_stream)
7213 ? Vnetwork_coding_system_alist
7214 : Vprocess_coding_system_alist));
7215 if (NILP (chain))
7216 return Qnil;
7217
7218 for (; CONSP (chain); chain = XCDR (chain))
7219 {
7220 Lisp_Object elt;
7221
7222 elt = XCAR (chain);
7223 if (CONSP (elt)
7224 && ((STRINGP (target)
7225 && STRINGP (XCAR (elt))
7226 && fast_string_match (XCAR (elt), target) >= 0)
7227 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7228 {
7229 val = XCDR (elt);
7230 /* Here, if VAL is both a valid coding system and a valid
7231 function symbol, we return VAL as a coding system. */
7232 if (CONSP (val))
7233 return val;
7234 if (! SYMBOLP (val))
7235 return Qnil;
7236 if (! NILP (Fcoding_system_p (val)))
7237 return Fcons (val, val);
7238 if (! NILP (Ffboundp (val)))
7239 {
7240 val = call1 (val, Flist (nargs, args));
7241 if (CONSP (val))
7242 return val;
7243 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7244 return Fcons (val, val);
7245 }
7246 return Qnil;
7247 }
7248 }
7249 return Qnil;
7250 }
7251
7252 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7253 Sset_coding_system_priority, 1, MANY, 0,
7254 doc: /* Assign higher priority to the coding systems given as arguments.
7255 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
7256 (nargs, args)
7257 int nargs;
7258 Lisp_Object *args;
7259 {
7260 int i, j;
7261 int changed[coding_category_max];
7262 enum coding_category priorities[coding_category_max];
7263
7264 bzero (changed, sizeof changed);
7265
7266 for (i = j = 0; i < nargs; i++)
7267 {
7268 enum coding_category category;
7269 Lisp_Object spec, attrs;
7270
7271 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7272 attrs = AREF (spec, 0);
7273 category = XINT (CODING_ATTR_CATEGORY (attrs));
7274 if (changed[category])
7275 /* Ignore this coding system because a coding system of the
7276 same category already had a higher priority. */
7277 continue;
7278 changed[category] = 1;
7279 priorities[j++] = category;
7280 if (coding_categories[category].id >= 0
7281 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7282 setup_coding_system (args[i], &coding_categories[category]);
7283 }
7284
7285 /* Now we have decided top J priorities. Reflect the order of the
7286 original priorities to the remaining priorities. */
7287
7288 for (i = j, j = 0; i < coding_category_max; i++, j++)
7289 {
7290 while (j < coding_category_max
7291 && changed[coding_priorities[j]])
7292 j++;
7293 if (j == coding_category_max)
7294 abort ();
7295 priorities[i] = coding_priorities[j];
7296 }
7297
7298 bcopy (priorities, coding_priorities, sizeof priorities);
7299 return Qnil;
7300 }
7301
7302 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7303 Scoding_system_priority_list, 0, 1, 0,
7304 doc: /* Return a list of coding systems ordered by their priorities.
7305 HIGHESTP non-nil means just return the highest priority one. */)
7306 (highestp)
7307 Lisp_Object highestp;
7308 {
7309 int i;
7310 Lisp_Object val;
7311
7312 for (i = 0, val = Qnil; i < coding_category_max; i++)
7313 {
7314 enum coding_category category = coding_priorities[i];
7315 int id = coding_categories[category].id;
7316 Lisp_Object attrs;
7317
7318 if (id < 0)
7319 continue;
7320 attrs = CODING_ID_ATTRS (id);
7321 if (! NILP (highestp))
7322 return CODING_ATTR_BASE_NAME (attrs);
7323 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7324 }
7325 return Fnreverse (val);
7326 }
7327
7328 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7329
7330 static Lisp_Object
7331 make_subsidiaries (base)
7332 Lisp_Object base;
7333 {
7334 Lisp_Object subsidiaries;
7335 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7336 char *buf = (char *) alloca (base_name_len + 6);
7337 int i;
7338
7339 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7340 subsidiaries = Fmake_vector (make_number (3), Qnil);
7341 for (i = 0; i < 3; i++)
7342 {
7343 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7344 ASET (subsidiaries, i, intern (buf));
7345 }
7346 return subsidiaries;
7347 }
7348
7349
7350 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7351 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7352 doc: /* For internal use only.
7353 usage: (define-coding-system-internal ...) */)
7354 (nargs, args)
7355 int nargs;
7356 Lisp_Object *args;
7357 {
7358 Lisp_Object name;
7359 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7360 Lisp_Object attrs; /* Vector of attributes. */
7361 Lisp_Object eol_type;
7362 Lisp_Object aliases;
7363 Lisp_Object coding_type, charset_list, safe_charsets;
7364 enum coding_category category;
7365 Lisp_Object tail, val;
7366 int max_charset_id = 0;
7367 int i;
7368
7369 if (nargs < coding_arg_max)
7370 goto short_args;
7371
7372 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7373
7374 name = args[coding_arg_name];
7375 CHECK_SYMBOL (name);
7376 CODING_ATTR_BASE_NAME (attrs) = name;
7377
7378 val = args[coding_arg_mnemonic];
7379 if (! STRINGP (val))
7380 CHECK_CHARACTER (val);
7381 CODING_ATTR_MNEMONIC (attrs) = val;
7382
7383 coding_type = args[coding_arg_coding_type];
7384 CHECK_SYMBOL (coding_type);
7385 CODING_ATTR_TYPE (attrs) = coding_type;
7386
7387 charset_list = args[coding_arg_charset_list];
7388 if (SYMBOLP (charset_list))
7389 {
7390 if (EQ (charset_list, Qiso_2022))
7391 {
7392 if (! EQ (coding_type, Qiso_2022))
7393 error ("Invalid charset-list");
7394 charset_list = Viso_2022_charset_list;
7395 }
7396 else if (EQ (charset_list, Qemacs_mule))
7397 {
7398 if (! EQ (coding_type, Qemacs_mule))
7399 error ("Invalid charset-list");
7400 charset_list = Vemacs_mule_charset_list;
7401 }
7402 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7403 if (max_charset_id < XFASTINT (XCAR (tail)))
7404 max_charset_id = XFASTINT (XCAR (tail));
7405 }
7406 else
7407 {
7408 charset_list = Fcopy_sequence (charset_list);
7409 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7410 {
7411 struct charset *charset;
7412
7413 val = Fcar (tail);
7414 CHECK_CHARSET_GET_CHARSET (val, charset);
7415 if (EQ (coding_type, Qiso_2022)
7416 ? CHARSET_ISO_FINAL (charset) < 0
7417 : EQ (coding_type, Qemacs_mule)
7418 ? CHARSET_EMACS_MULE_ID (charset) < 0
7419 : 0)
7420 error ("Can't handle charset `%s'",
7421 XSYMBOL (CHARSET_NAME (charset))->name->data);
7422
7423 XCAR (tail) = make_number (charset->id);
7424 if (max_charset_id < charset->id)
7425 max_charset_id = charset->id;
7426 }
7427 }
7428 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7429
7430 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7431 make_number (255));
7432 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7433 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7434 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7435
7436 val = args[coding_arg_decode_translation_table];
7437 if (! NILP (val))
7438 CHECK_CHAR_TABLE (val);
7439 CODING_ATTR_DECODE_TBL (attrs) = val;
7440
7441 val = args[coding_arg_encode_translation_table];
7442 if (! NILP (val))
7443 CHECK_CHAR_TABLE (val);
7444 CODING_ATTR_ENCODE_TBL (attrs) = val;
7445
7446 val = args[coding_arg_post_read_conversion];
7447 CHECK_SYMBOL (val);
7448 CODING_ATTR_POST_READ (attrs) = val;
7449
7450 val = args[coding_arg_pre_write_conversion];
7451 CHECK_SYMBOL (val);
7452 CODING_ATTR_PRE_WRITE (attrs) = val;
7453
7454 val = args[coding_arg_default_char];
7455 if (NILP (val))
7456 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7457 else
7458 {
7459 CHECK_CHARACTER (val);
7460 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7461 }
7462
7463 val = args[coding_arg_plist];
7464 CHECK_LIST (val);
7465 CODING_ATTR_PLIST (attrs) = val;
7466
7467 if (EQ (coding_type, Qcharset))
7468 {
7469 /* Generate a lisp vector of 256 elements. Each element is nil,
7470 integer, or a list of charset IDs.
7471
7472 If Nth element is nil, the byte code N is invalid in this
7473 coding system.
7474
7475 If Nth element is a number NUM, N is the first byte of a
7476 charset whose ID is NUM.
7477
7478 If Nth element is a list of charset IDs, N is the first byte
7479 of one of them. The list is sorted by dimensions of the
7480 charsets. A charset of smaller dimension comes firtst.
7481 */
7482 val = Fmake_vector (make_number (256), Qnil);
7483
7484 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7485 {
7486 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7487 int dim = CHARSET_DIMENSION (charset);
7488 int idx = (dim - 1) * 4;
7489
7490 for (i = charset->code_space[idx];
7491 i <= charset->code_space[idx + 1]; i++)
7492 {
7493 Lisp_Object tmp, tmp2;
7494 int dim2;
7495
7496 tmp = AREF (val, i);
7497 if (NILP (tmp))
7498 tmp = XCAR (tail);
7499 else if (NUMBERP (tmp))
7500 {
7501 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
7502 if (dim < dim2)
7503 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
7504 else
7505 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
7506 }
7507 else
7508 {
7509 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
7510 {
7511 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
7512 if (dim < dim2)
7513 break;
7514 }
7515 if (NILP (tmp2))
7516 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
7517 else
7518 {
7519 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
7520 XSETCAR (tmp2, XCAR (tail));
7521 }
7522 }
7523 ASET (val, i, tmp);
7524 }
7525 }
7526 ASET (attrs, coding_attr_charset_valids, val);
7527 category = coding_category_charset;
7528 }
7529 else if (EQ (coding_type, Qccl))
7530 {
7531 Lisp_Object valids;
7532
7533 if (nargs < coding_arg_ccl_max)
7534 goto short_args;
7535
7536 val = args[coding_arg_ccl_decoder];
7537 CHECK_CCL_PROGRAM (val);
7538 if (VECTORP (val))
7539 val = Fcopy_sequence (val);
7540 ASET (attrs, coding_attr_ccl_decoder, val);
7541
7542 val = args[coding_arg_ccl_encoder];
7543 CHECK_CCL_PROGRAM (val);
7544 if (VECTORP (val))
7545 val = Fcopy_sequence (val);
7546 ASET (attrs, coding_attr_ccl_encoder, val);
7547
7548 val = args[coding_arg_ccl_valids];
7549 valids = Fmake_string (make_number (256), make_number (0));
7550 for (tail = val; !NILP (tail); tail = Fcdr (tail))
7551 {
7552 val = Fcar (tail);
7553 if (INTEGERP (val))
7554 ASET (valids, XINT (val), 1);
7555 else
7556 {
7557 int from, to;
7558
7559 CHECK_CONS (val);
7560 CHECK_NUMBER (XCAR (val));
7561 CHECK_NUMBER (XCDR (val));
7562 from = XINT (XCAR (val));
7563 to = XINT (XCDR (val));
7564 for (i = from; i <= to; i++)
7565 ASET (valids, i, 1);
7566 }
7567 }
7568 ASET (attrs, coding_attr_ccl_valids, valids);
7569
7570 category = coding_category_ccl;
7571 }
7572 else if (EQ (coding_type, Qutf_16))
7573 {
7574 Lisp_Object bom, endian;
7575
7576 if (nargs < coding_arg_utf16_max)
7577 goto short_args;
7578
7579 bom = args[coding_arg_utf16_bom];
7580 if (! NILP (bom) && ! EQ (bom, Qt))
7581 {
7582 CHECK_CONS (bom);
7583 CHECK_CODING_SYSTEM (XCAR (bom));
7584 CHECK_CODING_SYSTEM (XCDR (bom));
7585 }
7586 ASET (attrs, coding_attr_utf_16_bom, bom);
7587
7588 endian = args[coding_arg_utf16_endian];
7589 ASET (attrs, coding_attr_utf_16_endian, endian);
7590
7591 category = (CONSP (bom)
7592 ? coding_category_utf_16_auto
7593 : NILP (bom)
7594 ? (NILP (endian)
7595 ? coding_category_utf_16_be_nosig
7596 : coding_category_utf_16_le_nosig)
7597 : (NILP (endian)
7598 ? coding_category_utf_16_be
7599 : coding_category_utf_16_le));
7600 }
7601 else if (EQ (coding_type, Qiso_2022))
7602 {
7603 Lisp_Object initial, reg_usage, request, flags;
7604 int i, id;
7605
7606 if (nargs < coding_arg_iso2022_max)
7607 goto short_args;
7608
7609 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7610 CHECK_VECTOR (initial);
7611 for (i = 0; i < 4; i++)
7612 {
7613 val = Faref (initial, make_number (i));
7614 if (! NILP (val))
7615 {
7616 CHECK_CHARSET_GET_ID (val, id);
7617 ASET (initial, i, make_number (id));
7618 }
7619 else
7620 ASET (initial, i, make_number (-1));
7621 }
7622
7623 reg_usage = args[coding_arg_iso2022_reg_usage];
7624 CHECK_CONS (reg_usage);
7625 CHECK_NATNUM (XCAR (reg_usage));
7626 CHECK_NATNUM (XCDR (reg_usage));
7627
7628 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7629 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
7630 {
7631 int id;
7632
7633 val = Fcar (tail);
7634 CHECK_CONS (val);
7635 CHECK_CHARSET_GET_ID (XCAR (val), id);
7636 CHECK_NATNUM (XCDR (val));
7637 if (XINT (XCDR (val)) >= 4)
7638 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7639 XCAR (val) = make_number (id);
7640 }
7641
7642 flags = args[coding_arg_iso2022_flags];
7643 CHECK_NATNUM (flags);
7644 i = XINT (flags);
7645 if (EQ (args[coding_arg_charset_list], Qiso_2022))
7646 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7647
7648 ASET (attrs, coding_attr_iso_initial, initial);
7649 ASET (attrs, coding_attr_iso_usage, reg_usage);
7650 ASET (attrs, coding_attr_iso_request, request);
7651 ASET (attrs, coding_attr_iso_flags, flags);
7652 setup_iso_safe_charsets (attrs);
7653
7654 if (i & CODING_ISO_FLAG_SEVEN_BITS)
7655 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7656 | CODING_ISO_FLAG_SINGLE_SHIFT))
7657 ? coding_category_iso_7_else
7658 : EQ (args[coding_arg_charset_list], Qiso_2022)
7659 ? coding_category_iso_7
7660 : coding_category_iso_7_tight);
7661 else
7662 {
7663 int id = XINT (AREF (initial, 1));
7664
7665 category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7666 | CODING_ISO_FLAG_SINGLE_SHIFT))
7667 || EQ (args[coding_arg_charset_list], Qiso_2022)
7668 || id < 0)
7669 ? coding_category_iso_8_else
7670 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7671 ? coding_category_iso_8_1
7672 : coding_category_iso_8_2);
7673 }
7674 }
7675 else if (EQ (coding_type, Qemacs_mule))
7676 {
7677 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7678 ASET (attrs, coding_attr_emacs_mule_full, Qt);
7679
7680 category = coding_category_emacs_mule;
7681 }
7682 else if (EQ (coding_type, Qshift_jis))
7683 {
7684
7685 struct charset *charset;
7686
7687 if (XINT (Flength (charset_list)) != 3)
7688 error ("There should be just three charsets");
7689
7690 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7691 if (CHARSET_DIMENSION (charset) != 1)
7692 error ("Dimension of charset %s is not one",
7693 XSYMBOL (CHARSET_NAME (charset))->name->data);
7694
7695 charset_list = XCDR (charset_list);
7696 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7697 if (CHARSET_DIMENSION (charset) != 1)
7698 error ("Dimension of charset %s is not one",
7699 XSYMBOL (CHARSET_NAME (charset))->name->data);
7700
7701 charset_list = XCDR (charset_list);
7702 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7703 if (CHARSET_DIMENSION (charset) != 2)
7704 error ("Dimension of charset %s is not two",
7705 XSYMBOL (CHARSET_NAME (charset))->name->data);
7706
7707 category = coding_category_sjis;
7708 Vsjis_coding_system = name;
7709 }
7710 else if (EQ (coding_type, Qbig5))
7711 {
7712 struct charset *charset;
7713
7714 if (XINT (Flength (charset_list)) != 2)
7715 error ("There should be just two charsets");
7716
7717 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7718 if (CHARSET_DIMENSION (charset) != 1)
7719 error ("Dimension of charset %s is not one",
7720 XSYMBOL (CHARSET_NAME (charset))->name->data);
7721
7722 charset_list = XCDR (charset_list);
7723 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7724 if (CHARSET_DIMENSION (charset) != 2)
7725 error ("Dimension of charset %s is not two",
7726 XSYMBOL (CHARSET_NAME (charset))->name->data);
7727
7728 category = coding_category_big5;
7729 Vbig5_coding_system = name;
7730 }
7731 else if (EQ (coding_type, Qraw_text))
7732 category = coding_category_raw_text;
7733 else if (EQ (coding_type, Qutf_8))
7734 category = coding_category_utf_8;
7735 else if (EQ (coding_type, Qundecided))
7736 category = coding_category_undecided;
7737 else
7738 error ("Invalid coding system type: %s",
7739 XSYMBOL (coding_type)->name->data);
7740
7741 CODING_ATTR_CATEGORY (attrs) = make_number (category);
7742
7743 eol_type = args[coding_arg_eol_type];
7744 if (! NILP (eol_type)
7745 && ! EQ (eol_type, Qunix)
7746 && ! EQ (eol_type, Qdos)
7747 && ! EQ (eol_type, Qmac))
7748 error ("Invalid eol-type");
7749
7750 aliases = Fcons (name, Qnil);
7751
7752 if (NILP (eol_type))
7753 {
7754 eol_type = make_subsidiaries (name);
7755 for (i = 0; i < 3; i++)
7756 {
7757 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7758
7759 this_name = AREF (eol_type, i);
7760 this_aliases = Fcons (this_name, Qnil);
7761 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7762 this_spec = Fmake_vector (make_number (3), attrs);
7763 ASET (this_spec, 1, this_aliases);
7764 ASET (this_spec, 2, this_eol_type);
7765 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7766 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7767 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7768 Vcoding_system_alist);
7769 }
7770 }
7771
7772 spec_vec = Fmake_vector (make_number (3), attrs);
7773 ASET (spec_vec, 1, aliases);
7774 ASET (spec_vec, 2, eol_type);
7775
7776 Fputhash (name, spec_vec, Vcoding_system_hash_table);
7777 Vcoding_system_list = Fcons (name, Vcoding_system_list);
7778 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7779 Vcoding_system_alist);
7780
7781 {
7782 int id = coding_categories[category].id;
7783
7784 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7785 setup_coding_system (name, &coding_categories[category]);
7786 }
7787
7788 return Qnil;
7789
7790 short_args:
7791 return Fsignal (Qwrong_number_of_arguments,
7792 Fcons (intern ("define-coding-system-internal"),
7793 make_number (nargs)));
7794 }
7795
7796 /* Fixme: should this record the alias relationships for
7797 diagnostics? */
7798 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7799 Sdefine_coding_system_alias, 2, 2, 0,
7800 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7801 (alias, coding_system)
7802 Lisp_Object alias, coding_system;
7803 {
7804 Lisp_Object spec, aliases, eol_type;
7805
7806 CHECK_SYMBOL (alias);
7807 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7808 aliases = AREF (spec, 1);
7809 while (!NILP (XCDR (aliases)))
7810 aliases = XCDR (aliases);
7811 XCDR (aliases) = Fcons (alias, Qnil);
7812
7813 eol_type = AREF (spec, 2);
7814 if (VECTORP (eol_type))
7815 {
7816 Lisp_Object subsidiaries;
7817 int i;
7818
7819 subsidiaries = make_subsidiaries (alias);
7820 for (i = 0; i < 3; i++)
7821 Fdefine_coding_system_alias (AREF (subsidiaries, i),
7822 AREF (eol_type, i));
7823
7824 ASET (spec, 2, subsidiaries);
7825 }
7826
7827 Fputhash (alias, spec, Vcoding_system_hash_table);
7828 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
7829 Vcoding_system_alist);
7830
7831 return Qnil;
7832 }
7833
7834 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7835 1, 1, 0,
7836 doc: /* Return the base of CODING-SYSTEM.
7837 Any alias or subsidiary coding system is not a base coding system. */)
7838 (coding_system)
7839 Lisp_Object coding_system;
7840 {
7841 Lisp_Object spec, attrs;
7842
7843 if (NILP (coding_system))
7844 return (Qno_conversion);
7845 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7846 attrs = AREF (spec, 0);
7847 return CODING_ATTR_BASE_NAME (attrs);
7848 }
7849
7850 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7851 1, 1, 0,
7852 doc: "Return the property list of CODING-SYSTEM.")
7853 (coding_system)
7854 Lisp_Object coding_system;
7855 {
7856 Lisp_Object spec, attrs;
7857
7858 if (NILP (coding_system))
7859 coding_system = Qno_conversion;
7860 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7861 attrs = AREF (spec, 0);
7862 return CODING_ATTR_PLIST (attrs);
7863 }
7864
7865
7866 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7867 1, 1, 0,
7868 doc: /* Return the list of aliases of CODING-SYSTEM. */)
7869 (coding_system)
7870 Lisp_Object coding_system;
7871 {
7872 Lisp_Object spec;
7873
7874 if (NILP (coding_system))
7875 coding_system = Qno_conversion;
7876 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7877 return AREF (spec, 1);
7878 }
7879
7880 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7881 Scoding_system_eol_type, 1, 1, 0,
7882 doc: /* Return eol-type of CODING-SYSTEM.
7883 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7884
7885 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7886 and CR respectively.
7887
7888 A vector value indicates that a format of end-of-line should be
7889 detected automatically. Nth element of the vector is the subsidiary
7890 coding system whose eol-type is N. */)
7891 (coding_system)
7892 Lisp_Object coding_system;
7893 {
7894 Lisp_Object spec, eol_type;
7895 int n;
7896
7897 if (NILP (coding_system))
7898 coding_system = Qno_conversion;
7899 if (! CODING_SYSTEM_P (coding_system))
7900 return Qnil;
7901 spec = CODING_SYSTEM_SPEC (coding_system);
7902 eol_type = AREF (spec, 2);
7903 if (VECTORP (eol_type))
7904 return Fcopy_sequence (eol_type);
7905 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7906 return make_number (n);
7907 }
7908
7909 #endif /* emacs */
7910
7911 \f
7912 /*** 9. Post-amble ***/
7913
7914 void
7915 init_coding_once ()
7916 {
7917 int i;
7918
7919 for (i = 0; i < coding_category_max; i++)
7920 {
7921 coding_categories[i].id = -1;
7922 coding_priorities[i] = i;
7923 }
7924
7925 /* ISO2022 specific initialize routine. */
7926 for (i = 0; i < 0x20; i++)
7927 iso_code_class[i] = ISO_control_0;
7928 for (i = 0x21; i < 0x7F; i++)
7929 iso_code_class[i] = ISO_graphic_plane_0;
7930 for (i = 0x80; i < 0xA0; i++)
7931 iso_code_class[i] = ISO_control_1;
7932 for (i = 0xA1; i < 0xFF; i++)
7933 iso_code_class[i] = ISO_graphic_plane_1;
7934 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7935 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7936 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7937 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7938 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7939 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7940 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7941 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7942 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7943 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7944
7945 inhibit_pre_post_conversion = 0;
7946
7947 for (i = 0; i < 256; i++)
7948 {
7949 emacs_mule_bytes[i] = 1;
7950 }
7951 emacs_mule_bytes[LEADING_CODE_PRIVATE_11] = 3;
7952 emacs_mule_bytes[LEADING_CODE_PRIVATE_12] = 3;
7953 emacs_mule_bytes[LEADING_CODE_PRIVATE_21] = 4;
7954 emacs_mule_bytes[LEADING_CODE_PRIVATE_22] = 4;
7955 }
7956
7957 #ifdef emacs
7958
7959 void
7960 syms_of_coding ()
7961 {
7962 staticpro (&Vcoding_system_hash_table);
7963 Vcoding_system_hash_table = Fmakehash (Qeq);
7964
7965 staticpro (&Vsjis_coding_system);
7966 Vsjis_coding_system = Qnil;
7967
7968 staticpro (&Vbig5_coding_system);
7969 Vbig5_coding_system = Qnil;
7970
7971 staticpro (&Vcode_conversion_work_buf_list);
7972 Vcode_conversion_work_buf_list = Qnil;
7973
7974 staticpro (&Vcode_conversion_reused_work_buf);
7975 Vcode_conversion_reused_work_buf = Qnil;
7976
7977 DEFSYM (Qcharset, "charset");
7978 DEFSYM (Qtarget_idx, "target-idx");
7979 DEFSYM (Qcoding_system_history, "coding-system-history");
7980 Fset (Qcoding_system_history, Qnil);
7981
7982 /* Target FILENAME is the first argument. */
7983 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7984 /* Target FILENAME is the third argument. */
7985 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7986
7987 DEFSYM (Qcall_process, "call-process");
7988 /* Target PROGRAM is the first argument. */
7989 Fput (Qcall_process, Qtarget_idx, make_number (0));
7990
7991 DEFSYM (Qcall_process_region, "call-process-region");
7992 /* Target PROGRAM is the third argument. */
7993 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7994
7995 DEFSYM (Qstart_process, "start-process");
7996 /* Target PROGRAM is the third argument. */
7997 Fput (Qstart_process, Qtarget_idx, make_number (2));
7998
7999 DEFSYM (Qopen_network_stream, "open-network-stream");
8000 /* Target SERVICE is the fourth argument. */
8001 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8002
8003 DEFSYM (Qcoding_system, "coding-system");
8004 DEFSYM (Qcoding_aliases, "coding-aliases");
8005
8006 DEFSYM (Qeol_type, "eol-type");
8007 DEFSYM (Qunix, "unix");
8008 DEFSYM (Qdos, "dos");
8009
8010 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8011 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8012 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8013 DEFSYM (Qdefault_char, "default-char");
8014 DEFSYM (Qundecided, "undecided");
8015 DEFSYM (Qno_conversion, "no-conversion");
8016 DEFSYM (Qraw_text, "raw-text");
8017
8018 DEFSYM (Qiso_2022, "iso-2022");
8019
8020 DEFSYM (Qutf_8, "utf-8");
8021
8022 DEFSYM (Qutf_16, "utf-16");
8023 DEFSYM (Qutf_16_be, "utf-16-be");
8024 DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
8025 DEFSYM (Qutf_16_le, "utf-16-l3");
8026 DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
8027 DEFSYM (Qsignature, "signature");
8028 DEFSYM (Qendian, "endian");
8029 DEFSYM (Qbig, "big");
8030 DEFSYM (Qlittle, "little");
8031
8032 DEFSYM (Qshift_jis, "shift-jis");
8033 DEFSYM (Qbig5, "big5");
8034
8035 DEFSYM (Qcoding_system_p, "coding-system-p");
8036
8037 DEFSYM (Qcoding_system_error, "coding-system-error");
8038 Fput (Qcoding_system_error, Qerror_conditions,
8039 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8040 Fput (Qcoding_system_error, Qerror_message,
8041 build_string ("Invalid coding system"));
8042
8043 /* Intern this now in case it isn't already done.
8044 Setting this variable twice is harmless.
8045 But don't staticpro it here--that is done in alloc.c. */
8046 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8047
8048 DEFSYM (Qtranslation_table, "translation-table");
8049 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8050 DEFSYM (Qtranslation_table_id, "translation-table-id");
8051 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8052 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8053
8054 DEFSYM (Qvalid_codes, "valid-codes");
8055
8056 DEFSYM (Qemacs_mule, "emacs-mule");
8057
8058 Vcoding_category_table
8059 = Fmake_vector (make_number (coding_category_max), Qnil);
8060 staticpro (&Vcoding_category_table);
8061 /* Followings are target of code detection. */
8062 ASET (Vcoding_category_table, coding_category_iso_7,
8063 intern ("coding-category-iso-7"));
8064 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8065 intern ("coding-category-iso-7-tight"));
8066 ASET (Vcoding_category_table, coding_category_iso_8_1,
8067 intern ("coding-category-iso-8-1"));
8068 ASET (Vcoding_category_table, coding_category_iso_8_2,
8069 intern ("coding-category-iso-8-2"));
8070 ASET (Vcoding_category_table, coding_category_iso_7_else,
8071 intern ("coding-category-iso-7-else"));
8072 ASET (Vcoding_category_table, coding_category_iso_8_else,
8073 intern ("coding-category-iso-8-else"));
8074 ASET (Vcoding_category_table, coding_category_utf_8,
8075 intern ("coding-category-utf-8"));
8076 ASET (Vcoding_category_table, coding_category_utf_16_be,
8077 intern ("coding-category-utf-16-be"));
8078 ASET (Vcoding_category_table, coding_category_utf_16_le,
8079 intern ("coding-category-utf-16-le"));
8080 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8081 intern ("coding-category-utf-16-be-nosig"));
8082 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8083 intern ("coding-category-utf-16-le-nosig"));
8084 ASET (Vcoding_category_table, coding_category_charset,
8085 intern ("coding-category-charset"));
8086 ASET (Vcoding_category_table, coding_category_sjis,
8087 intern ("coding-category-sjis"));
8088 ASET (Vcoding_category_table, coding_category_big5,
8089 intern ("coding-category-big5"));
8090 ASET (Vcoding_category_table, coding_category_ccl,
8091 intern ("coding-category-ccl"));
8092 ASET (Vcoding_category_table, coding_category_emacs_mule,
8093 intern ("coding-category-emacs-mule"));
8094 /* Followings are NOT target of code detection. */
8095 ASET (Vcoding_category_table, coding_category_raw_text,
8096 intern ("coding-category-raw-text"));
8097 ASET (Vcoding_category_table, coding_category_undecided,
8098 intern ("coding-category-undecided"));
8099
8100 defsubr (&Scoding_system_p);
8101 defsubr (&Sread_coding_system);
8102 defsubr (&Sread_non_nil_coding_system);
8103 defsubr (&Scheck_coding_system);
8104 defsubr (&Sdetect_coding_region);
8105 defsubr (&Sdetect_coding_string);
8106 defsubr (&Sfind_coding_systems_region_internal);
8107 defsubr (&Scheck_coding_systems_region);
8108 defsubr (&Sdecode_coding_region);
8109 defsubr (&Sencode_coding_region);
8110 defsubr (&Sdecode_coding_string);
8111 defsubr (&Sencode_coding_string);
8112 defsubr (&Sdecode_sjis_char);
8113 defsubr (&Sencode_sjis_char);
8114 defsubr (&Sdecode_big5_char);
8115 defsubr (&Sencode_big5_char);
8116 defsubr (&Sset_terminal_coding_system_internal);
8117 defsubr (&Sset_safe_terminal_coding_system_internal);
8118 defsubr (&Sterminal_coding_system);
8119 defsubr (&Sset_keyboard_coding_system_internal);
8120 defsubr (&Skeyboard_coding_system);
8121 defsubr (&Sfind_operation_coding_system);
8122 defsubr (&Sset_coding_system_priority);
8123 defsubr (&Sdefine_coding_system_internal);
8124 defsubr (&Sdefine_coding_system_alias);
8125 defsubr (&Scoding_system_base);
8126 defsubr (&Scoding_system_plist);
8127 defsubr (&Scoding_system_aliases);
8128 defsubr (&Scoding_system_eol_type);
8129 defsubr (&Scoding_system_priority_list);
8130
8131 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8132 doc: /* List of coding systems.
8133
8134 Do not alter the value of this variable manually. This variable should be
8135 updated by the functions `define-coding-system' and
8136 `define-coding-system-alias'. */);
8137 Vcoding_system_list = Qnil;
8138
8139 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8140 doc: /* Alist of coding system names.
8141 Each element is one element list of coding system name.
8142 This variable is given to `completing-read' as TABLE argument.
8143
8144 Do not alter the value of this variable manually. This variable should be
8145 updated by the functions `make-coding-system' and
8146 `define-coding-system-alias'. */);
8147 Vcoding_system_alist = Qnil;
8148
8149 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8150 doc: /* List of coding-categories (symbols) ordered by priority.
8151
8152 On detecting a coding system, Emacs tries code detection algorithms
8153 associated with each coding-category one by one in this order. When
8154 one algorithm agrees with a byte sequence of source text, the coding
8155 system bound to the corresponding coding-category is selected. */);
8156 {
8157 int i;
8158
8159 Vcoding_category_list = Qnil;
8160 for (i = coding_category_max - 1; i >= 0; i--)
8161 Vcoding_category_list
8162 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8163 Vcoding_category_list);
8164 }
8165
8166 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8167 doc: /* Specify the coding system for read operations.
8168 It is useful to bind this variable with `let', but do not set it globally.
8169 If the value is a coding system, it is used for decoding on read operation.
8170 If not, an appropriate element is used from one of the coding system alists:
8171 There are three such tables, `file-coding-system-alist',
8172 `process-coding-system-alist', and `network-coding-system-alist'. */);
8173 Vcoding_system_for_read = Qnil;
8174
8175 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8176 doc: /* Specify the coding system for write operations.
8177 Programs bind this variable with `let', but you should not set it globally.
8178 If the value is a coding system, it is used for encoding of output,
8179 when writing it to a file and when sending it to a file or subprocess.
8180
8181 If this does not specify a coding system, an appropriate element
8182 is used from one of the coding system alists:
8183 There are three such tables, `file-coding-system-alist',
8184 `process-coding-system-alist', and `network-coding-system-alist'.
8185 For output to files, if the above procedure does not specify a coding system,
8186 the value of `buffer-file-coding-system' is used. */);
8187 Vcoding_system_for_write = Qnil;
8188
8189 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8190 doc: /*
8191 Coding system used in the latest file or process I/O. */);
8192 Vlast_coding_system_used = Qnil;
8193
8194 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8195 doc: /*
8196 *Non-nil means always inhibit code conversion of end-of-line format.
8197 See info node `Coding Systems' and info node `Text and Binary' concerning
8198 such conversion. */);
8199 inhibit_eol_conversion = 0;
8200
8201 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8202 doc: /*
8203 Non-nil means process buffer inherits coding system of process output.
8204 Bind it to t if the process output is to be treated as if it were a file
8205 read from some filesystem. */);
8206 inherit_process_coding_system = 0;
8207
8208 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8209 doc: /*
8210 Alist to decide a coding system to use for a file I/O operation.
8211 The format is ((PATTERN . VAL) ...),
8212 where PATTERN is a regular expression matching a file name,
8213 VAL is a coding system, a cons of coding systems, or a function symbol.
8214 If VAL is a coding system, it is used for both decoding and encoding
8215 the file contents.
8216 If VAL is a cons of coding systems, the car part is used for decoding,
8217 and the cdr part is used for encoding.
8218 If VAL is a function symbol, the function must return a coding system
8219 or a cons of coding systems which are used as above. The function gets
8220 the arguments with which `find-operation-coding-systems' was called.
8221
8222 See also the function `find-operation-coding-system'
8223 and the variable `auto-coding-alist'. */);
8224 Vfile_coding_system_alist = Qnil;
8225
8226 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8227 doc: /*
8228 Alist to decide a coding system to use for a process I/O operation.
8229 The format is ((PATTERN . VAL) ...),
8230 where PATTERN is a regular expression matching a program name,
8231 VAL is a coding system, a cons of coding systems, or a function symbol.
8232 If VAL is a coding system, it is used for both decoding what received
8233 from the program and encoding what sent to the program.
8234 If VAL is a cons of coding systems, the car part is used for decoding,
8235 and the cdr part is used for encoding.
8236 If VAL is a function symbol, the function must return a coding system
8237 or a cons of coding systems which are used as above.
8238
8239 See also the function `find-operation-coding-system'. */);
8240 Vprocess_coding_system_alist = Qnil;
8241
8242 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8243 doc: /*
8244 Alist to decide a coding system to use for a network I/O operation.
8245 The format is ((PATTERN . VAL) ...),
8246 where PATTERN is a regular expression matching a network service name
8247 or is a port number to connect to,
8248 VAL is a coding system, a cons of coding systems, or a function symbol.
8249 If VAL is a coding system, it is used for both decoding what received
8250 from the network stream and encoding what sent to the network stream.
8251 If VAL is a cons of coding systems, the car part is used for decoding,
8252 and the cdr part is used for encoding.
8253 If VAL is a function symbol, the function must return a coding system
8254 or a cons of coding systems which are used as above.
8255
8256 See also the function `find-operation-coding-system'. */);
8257 Vnetwork_coding_system_alist = Qnil;
8258
8259 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8260 doc: /* Coding system to use with system messages.
8261 Also used for decoding keyboard input on X Window system. */);
8262 Vlocale_coding_system = Qnil;
8263
8264 /* The eol mnemonics are reset in startup.el system-dependently. */
8265 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8266 doc: /*
8267 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8268 eol_mnemonic_unix = build_string (":");
8269
8270 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8271 doc: /*
8272 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8273 eol_mnemonic_dos = build_string ("\\");
8274
8275 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8276 doc: /*
8277 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8278 eol_mnemonic_mac = build_string ("/");
8279
8280 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8281 doc: /*
8282 *String displayed in mode line when end-of-line format is not yet determined. */);
8283 eol_mnemonic_undecided = build_string (":");
8284
8285 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8286 doc: /*
8287 *Non-nil enables character translation while encoding and decoding. */);
8288 Venable_character_translation = Qt;
8289
8290 DEFVAR_LISP ("standard-translation-table-for-decode",
8291 &Vstandard_translation_table_for_decode,
8292 doc: /* Table for translating characters while decoding. */);
8293 Vstandard_translation_table_for_decode = Qnil;
8294
8295 DEFVAR_LISP ("standard-translation-table-for-encode",
8296 &Vstandard_translation_table_for_encode,
8297 doc: /* Table for translating characters while encoding. */);
8298 Vstandard_translation_table_for_encode = Qnil;
8299
8300 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8301 doc: /* Alist of charsets vs revision numbers.
8302 While encoding, if a charset (car part of an element) is found,
8303 designate it with the escape sequence identifying revision (cdr part
8304 of the element). */);
8305 Vcharset_revision_table = Qnil;
8306
8307 DEFVAR_LISP ("default-process-coding-system",
8308 &Vdefault_process_coding_system,
8309 doc: /* Cons of coding systems used for process I/O by default.
8310 The car part is used for decoding a process output,
8311 the cdr part is used for encoding a text to be sent to a process. */);
8312 Vdefault_process_coding_system = Qnil;
8313
8314 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8315 doc: /*
8316 Table of extra Latin codes in the range 128..159 (inclusive).
8317 This is a vector of length 256.
8318 If Nth element is non-nil, the existence of code N in a file
8319 \(or output of subprocess) doesn't prevent it to be detected as
8320 a coding system of ISO 2022 variant which has a flag
8321 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8322 or reading output of a subprocess.
8323 Only 128th through 159th elements has a meaning. */);
8324 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8325
8326 DEFVAR_LISP ("select-safe-coding-system-function",
8327 &Vselect_safe_coding_system_function,
8328 doc: /*
8329 Function to call to select safe coding system for encoding a text.
8330
8331 If set, this function is called to force a user to select a proper
8332 coding system which can encode the text in the case that a default
8333 coding system used in each operation can't encode the text.
8334
8335 The default value is `select-safe-coding-system' (which see). */);
8336 Vselect_safe_coding_system_function = Qnil;
8337
8338 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8339 &inhibit_iso_escape_detection,
8340 doc: /*
8341 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8342
8343 By default, on reading a file, Emacs tries to detect how the text is
8344 encoded. This code detection is sensitive to escape sequences. If
8345 the sequence is valid as ISO2022, the code is determined as one of
8346 the ISO2022 encodings, and the file is decoded by the corresponding
8347 coding system (e.g. `iso-2022-7bit').
8348
8349 However, there may be a case that you want to read escape sequences in
8350 a file as is. In such a case, you can set this variable to non-nil.
8351 Then, as the code detection ignores any escape sequences, no file is
8352 detected as encoded in some ISO2022 encoding. The result is that all
8353 escape sequences become visible in a buffer.
8354
8355 The default value is nil, and it is strongly recommended not to change
8356 it. That is because many Emacs Lisp source files that contain
8357 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8358 in Emacs's distribution, and they won't be decoded correctly on
8359 reading if you suppress escape sequence detection.
8360
8361 The other way to read escape sequences in a file without decoding is
8362 to explicitly specify some coding system that doesn't use ISO2022's
8363 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8364 inhibit_iso_escape_detection = 0;
8365
8366 {
8367 Lisp_Object args[coding_arg_max];
8368 Lisp_Object plist[14];
8369 int i;
8370
8371 for (i = 0; i < coding_arg_max; i++)
8372 args[i] = Qnil;
8373
8374 plist[0] = intern (":name");
8375 plist[1] = args[coding_arg_name] = Qno_conversion;
8376 plist[2] = intern (":mnemonic");
8377 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8378 plist[4] = intern (":coding-type");
8379 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8380 plist[6] = intern (":ascii-compatible-p");
8381 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8382 plist[8] = intern (":default-char");
8383 plist[9] = args[coding_arg_default_char] = make_number (0);
8384 plist[10] = intern (":docstring");
8385 plist[11] = build_string ("Do no conversion.\n\
8386 \n\
8387 When you visit a file with this coding, the file is read into a\n\
8388 unibyte buffer as is, thus each byte of a file is treated as a\n\
8389 character.");
8390 plist[12] = intern (":eol-type");
8391 plist[13] = args[coding_arg_eol_type] = Qunix;
8392 args[coding_arg_plist] = Flist (14, plist);
8393 Fdefine_coding_system_internal (coding_arg_max, args);
8394 }
8395
8396 setup_coding_system (Qno_conversion, &keyboard_coding);
8397 setup_coding_system (Qno_conversion, &terminal_coding);
8398 setup_coding_system (Qno_conversion, &safe_terminal_coding);
8399 }
8400
8401 char *
8402 emacs_strerror (error_number)
8403 int error_number;
8404 {
8405 char *str;
8406
8407 synchronize_system_messages_locale ();
8408 str = strerror (error_number);
8409
8410 if (! NILP (Vlocale_coding_system))
8411 {
8412 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8413 Vlocale_coding_system,
8414 0);
8415 str = (char *) XSTRING (dec)->data;
8416 }
8417
8418 return str;
8419 }
8420
8421 #endif /* emacs */