*** empty log message ***
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 Coding system is an object for a encoding mechanism that contains
50 information about how to convert byte sequence to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from a
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the mechanism of encoding. Here's a brief descrition about type.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by code converter defined for each
75 character set.
76
77 o Old Emacs' internal format (emacs-mule)
78
79 The coding system adopted by an old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used by Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode a text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for a text containing raw eight-bit data. Emacs
112 treat each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How end-of-line of a text is encoded depends on a system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text characters encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwize (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
151
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specifies each bit of this integer.
154
155 Below is the template of these functions. */
156
157 #if 0
158 static int
159 detect_coding_XXX (coding, mask)
160 struct coding_system *coding;
161 int *mask;
162 {
163 unsigned char *src = coding->source;
164 unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int c;
167 int found = 0;
168 ...;
169
170 while (1)
171 {
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
175 /* Check if it conforms to XXX. If not, break the loop. */
176 }
177 /* As the data is invalid for XXX, reset a proper bits. */
178 *mask &= ~CODING_CATEGORY_XXX;
179 return 0;
180 no_more_source:
181 /* The source exausted. */
182 if (!found)
183 /* ASCII characters only. */
184 return 0;
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask &= CODING_CATEGORY_XXX;
187 return 1;
188 }
189 #endif
190
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
197
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
202
203 Below is the template of these functions. */
204
205 #if 0
206 static void
207 decode_coding_XXXX (coding)
208 struct coding_system *coding;
209 {
210 unsigned char *src = coding->source + coding->consumed;
211 unsigned char *src_end = coding->source + coding->src_bytes;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base;
216 /* A buffer to produce decoded characters. */
217 int *charbuf = coding->charbuf;
218 int *charbuf_end = charbuf + coding->charbuf_size;
219 int multibytep = coding->src_multibyte;
220
221 while (1)
222 {
223 src_base = src;
224 if (charbuf < charbuf_end)
225 /* No more room to produce a decoded character. */
226 break;
227 ONE_MORE_BYTE (c);
228 /* Decode it. */
229 }
230
231 no_more_source:
232 if (src_base < src_end
233 && coding->mode & CODING_MODE_LAST_BLOCK)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base < src_end && charbuf < charbuf_end)
237 *charbuf++ = *src_base++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding->consumed = coding->consumed_char = src_base - coding->source;
241 /* Remember how many characters we produced. */
242 coding->charbuf_used = charbuf - coding->charbuf;
243 }
244 #endif
245
246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
252
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
257
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
261
262 Below is a template of these functions. */
263 #if 0
264 static void
265 encode_coding_XXX (coding)
266 struct coding_system *coding;
267 {
268 int multibytep = coding->dst_multibyte;
269 int *charbuf = coding->charbuf;
270 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
271 unsigned char *dst = coding->destination + coding->produced;
272 unsigned char *dst_end = coding->destination + coding->dst_bytes;
273 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
274 int produced_chars = 0;
275
276 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 {
278 int c = *charbuf;
279 /* Encode C into DST, and increment DST. */
280 }
281 label_no_more_destination:
282 /* How many chars and bytes we produced. */
283 coding->produced_char += produced_chars;
284 coding->produced = dst - coding->destination;
285 }
286 #endif
287
288 \f
289 /*** 1. Preamble ***/
290
291 #include <config.h>
292 #include <stdio.h>
293
294 #include "lisp.h"
295 #include "buffer.h"
296 #include "character.h"
297 #include "charset.h"
298 #include "ccl.h"
299 #include "composite.h"
300 #include "coding.h"
301 #include "window.h"
302
303 Lisp_Object Vcoding_system_hash_table;
304
305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
306 Lisp_Object Qunix, Qdos, Qmac;
307 Lisp_Object Qbuffer_file_coding_system;
308 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
309 Lisp_Object Qdefault_char;
310 Lisp_Object Qno_conversion, Qundecided;
311 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
312 Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
313 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
314 Lisp_Object Qcoding_system_history;
315 Lisp_Object Qvalid_codes;
316
317 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
318 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
319 Lisp_Object Qstart_process, Qopen_network_stream;
320 Lisp_Object Qtarget_idx;
321
322 Lisp_Object Vselect_safe_coding_system_function;
323
324 /* Mnemonic string for each format of end-of-line. */
325 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
326 /* Mnemonic string to indicate format of end-of-line is not yet
327 decided. */
328 Lisp_Object eol_mnemonic_undecided;
329
330 #ifdef emacs
331
332 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
333
334 Lisp_Object Qcoding_system_p, Qcoding_system_error;
335
336 /* Coding system emacs-mule and raw-text are for converting only
337 end-of-line format. */
338 Lisp_Object Qemacs_mule, Qraw_text;
339
340 /* Coding-systems are handed between Emacs Lisp programs and C internal
341 routines by the following three variables. */
342 /* Coding-system for reading files and receiving data from process. */
343 Lisp_Object Vcoding_system_for_read;
344 /* Coding-system for writing files and sending data to process. */
345 Lisp_Object Vcoding_system_for_write;
346 /* Coding-system actually used in the latest I/O. */
347 Lisp_Object Vlast_coding_system_used;
348
349 /* A vector of length 256 which contains information about special
350 Latin codes (especially for dealing with Microsoft codes). */
351 Lisp_Object Vlatin_extra_code_table;
352
353 /* Flag to inhibit code conversion of end-of-line format. */
354 int inhibit_eol_conversion;
355
356 /* Flag to inhibit ISO2022 escape sequence detection. */
357 int inhibit_iso_escape_detection;
358
359 /* Flag to make buffer-file-coding-system inherit from process-coding. */
360 int inherit_process_coding_system;
361
362 /* Coding system to be used to encode text for terminal display. */
363 struct coding_system terminal_coding;
364
365 /* Coding system to be used to encode text for terminal display when
366 terminal coding system is nil. */
367 struct coding_system safe_terminal_coding;
368
369 /* Coding system of what is sent from terminal keyboard. */
370 struct coding_system keyboard_coding;
371
372 Lisp_Object Vfile_coding_system_alist;
373 Lisp_Object Vprocess_coding_system_alist;
374 Lisp_Object Vnetwork_coding_system_alist;
375
376 Lisp_Object Vlocale_coding_system;
377
378 #endif /* emacs */
379
380 /* Flag to tell if we look up translation table on character code
381 conversion. */
382 Lisp_Object Venable_character_translation;
383 /* Standard translation table to look up on decoding (reading). */
384 Lisp_Object Vstandard_translation_table_for_decode;
385 /* Standard translation table to look up on encoding (writing). */
386 Lisp_Object Vstandard_translation_table_for_encode;
387
388 Lisp_Object Qtranslation_table;
389 Lisp_Object Qtranslation_table_id;
390 Lisp_Object Qtranslation_table_for_decode;
391 Lisp_Object Qtranslation_table_for_encode;
392
393 /* Alist of charsets vs revision number. */
394 static Lisp_Object Vcharset_revision_table;
395
396 /* Default coding systems used for process I/O. */
397 Lisp_Object Vdefault_process_coding_system;
398
399 /* Global flag to tell that we can't call post-read-conversion and
400 pre-write-conversion functions. Usually the value is zero, but it
401 is set to 1 temporarily while such functions are running. This is
402 to avoid infinite recursive call. */
403 static int inhibit_pre_post_conversion;
404
405 /* Char-table containing safe coding systems of each character. */
406 Lisp_Object Vchar_coding_system_table;
407 Lisp_Object Qchar_coding_system;
408
409 /* Two special coding systems. */
410 Lisp_Object Vsjis_coding_system;
411 Lisp_Object Vbig5_coding_system;
412
413
414 static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
415 static void decode_coding_utf_8 P_ ((struct coding_system *));
416 static int encode_coding_utf_8 P_ ((struct coding_system *));
417
418 static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
419 static void decode_coding_utf_16 P_ ((struct coding_system *));
420 static int encode_coding_utf_16 P_ ((struct coding_system *));
421
422 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
423 static void decode_coding_iso_2022 P_ ((struct coding_system *));
424 static int encode_coding_iso_2022 P_ ((struct coding_system *));
425
426 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
427 static void decode_coding_emacs_mule P_ ((struct coding_system *));
428 static int encode_coding_emacs_mule P_ ((struct coding_system *));
429
430 static int detect_coding_sjis P_ ((struct coding_system *, int *));
431 static void decode_coding_sjis P_ ((struct coding_system *));
432 static int encode_coding_sjis P_ ((struct coding_system *));
433
434 static int detect_coding_big5 P_ ((struct coding_system *, int *));
435 static void decode_coding_big5 P_ ((struct coding_system *));
436 static int encode_coding_big5 P_ ((struct coding_system *));
437
438 static int detect_coding_ccl P_ ((struct coding_system *, int *));
439 static void decode_coding_ccl P_ ((struct coding_system *));
440 static int encode_coding_ccl P_ ((struct coding_system *));
441
442 static void decode_coding_raw_text P_ ((struct coding_system *));
443 static int encode_coding_raw_text P_ ((struct coding_system *));
444
445
446 /* ISO2022 section */
447
448 #define CODING_ISO_INITIAL(coding, reg) \
449 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
450 coding_attr_iso_initial), \
451 reg)))
452
453
454 #define CODING_ISO_REQUEST(coding, charset_id) \
455 ((charset_id <= (coding)->max_charset_id \
456 ? (coding)->safe_charsets[charset_id] \
457 : -1))
458
459
460 #define CODING_ISO_FLAGS(coding) \
461 ((coding)->spec.iso_2022.flags)
462 #define CODING_ISO_DESIGNATION(coding, reg) \
463 ((coding)->spec.iso_2022.current_designation[reg])
464 #define CODING_ISO_INVOCATION(coding, plane) \
465 ((coding)->spec.iso_2022.current_invocation[plane])
466 #define CODING_ISO_SINGLE_SHIFTING(coding) \
467 ((coding)->spec.iso_2022.single_shifting)
468 #define CODING_ISO_BOL(coding) \
469 ((coding)->spec.iso_2022.bol)
470 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
471 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
472
473 /* Control characters of ISO2022. */
474 /* code */ /* function */
475 #define ISO_CODE_LF 0x0A /* line-feed */
476 #define ISO_CODE_CR 0x0D /* carriage-return */
477 #define ISO_CODE_SO 0x0E /* shift-out */
478 #define ISO_CODE_SI 0x0F /* shift-in */
479 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
480 #define ISO_CODE_ESC 0x1B /* escape */
481 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
482 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
483 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
484
485 /* All code (1-byte) of ISO2022 is classified into one of the
486 followings. */
487 enum iso_code_class_type
488 {
489 ISO_control_0, /* Control codes in the range
490 0x00..0x1F and 0x7F, except for the
491 following 5 codes. */
492 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
493 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
494 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
495 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
496 ISO_escape, /* ISO_CODE_SO (0x1B) */
497 ISO_control_1, /* Control codes in the range
498 0x80..0x9F, except for the
499 following 3 codes. */
500 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
501 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
502 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
503 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
504 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
505 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
506 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
507 };
508
509 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
510 `iso-flags' attribute of an iso2022 coding system. */
511
512 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
513 instead of the correct short-form sequence (e.g. ESC $ A). */
514 #define CODING_ISO_FLAG_LONG_FORM 0x0001
515
516 /* If set, reset graphic planes and registers at end-of-line to the
517 initial state. */
518 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
519
520 /* If set, reset graphic planes and registers before any control
521 characters to the initial state. */
522 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
523
524 /* If set, encode by 7-bit environment. */
525 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
526
527 /* If set, use locking-shift function. */
528 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
529
530 /* If set, use single-shift function. Overwrite
531 CODING_ISO_FLAG_LOCKING_SHIFT. */
532 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
533
534 /* If set, use designation escape sequence. */
535 #define CODING_ISO_FLAG_DESIGNATION 0x0040
536
537 /* If set, produce revision number sequence. */
538 #define CODING_ISO_FLAG_REVISION 0x0080
539
540 /* If set, produce ISO6429's direction specifying sequence. */
541 #define CODING_ISO_FLAG_DIRECTION 0x0100
542
543 /* If set, assume designation states are reset at beginning of line on
544 output. */
545 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
546
547 /* If set, designation sequence should be placed at beginning of line
548 on output. */
549 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
550
551 /* If set, do not encode unsafe charactes on output. */
552 #define CODING_ISO_FLAG_SAFE 0x0800
553
554 /* If set, extra latin codes (128..159) are accepted as a valid code
555 on input. */
556 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
557
558 #define CODING_ISO_FLAG_COMPOSITION 0x2000
559
560 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
561
562 #define CODING_ISO_FLAG_FULL_SUPPORT 0x8000
563
564 /* A character to be produced on output if encoding of the original
565 character is prohibited by CODING_ISO_FLAG_SAFE. */
566 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
567
568
569 /* UTF-16 section */
570 #define CODING_UTF_16_BOM(coding) \
571 ((coding)->spec.utf_16.bom)
572
573 #define CODING_UTF_16_ENDIAN(coding) \
574 ((coding)->spec.utf_16.endian)
575
576 #define CODING_UTF_16_SURROGATE(coding) \
577 ((coding)->spec.utf_16.surrogate)
578
579
580 /* CCL section */
581 #define CODING_CCL_DECODER(coding) \
582 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
583 #define CODING_CCL_ENCODER(coding) \
584 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
585 #define CODING_CCL_VALIDS(coding) \
586 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
587 ->data)
588
589 /* Index for each coding category in `coding_category_table' */
590
591 enum coding_category
592 {
593 coding_category_iso_7,
594 coding_category_iso_7_tight,
595 coding_category_iso_8_1,
596 coding_category_iso_8_2,
597 coding_category_iso_7_else,
598 coding_category_iso_8_else,
599 coding_category_utf_8,
600 coding_category_utf_16_auto,
601 coding_category_utf_16_be,
602 coding_category_utf_16_le,
603 coding_category_utf_16_be_nosig,
604 coding_category_utf_16_le_nosig,
605 coding_category_charset,
606 coding_category_sjis,
607 coding_category_big5,
608 coding_category_ccl,
609 coding_category_emacs_mule,
610 /* All above are targets of code detection. */
611 coding_category_raw_text,
612 coding_category_undecided,
613 coding_category_max
614 };
615
616 /* Definitions of flag bits used in detect_coding_XXXX. */
617 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
618 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
619 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
620 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
621 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
622 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
623 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
624 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
625 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
626 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
627 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
628 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
629 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
630 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
631 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
632 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
633
634 /* This value is returned if detect_coding_mask () find nothing other
635 than ASCII characters. */
636 #define CATEGORY_MASK_ANY \
637 (CATEGORY_MASK_ISO_7 \
638 | CATEGORY_MASK_ISO_7_TIGHT \
639 | CATEGORY_MASK_ISO_8_1 \
640 | CATEGORY_MASK_ISO_8_2 \
641 | CATEGORY_MASK_ISO_7_ELSE \
642 | CATEGORY_MASK_ISO_8_ELSE \
643 | CATEGORY_MASK_UTF_8 \
644 | CATEGORY_MASK_UTF_16_BE \
645 | CATEGORY_MASK_UTF_16_LE \
646 | CATEGORY_MASK_UTF_16_BE_NOSIG \
647 | CATEGORY_MASK_UTF_16_LE_NOSIG \
648 | CATEGORY_MASK_CHARSET \
649 | CATEGORY_MASK_SJIS \
650 | CATEGORY_MASK_BIG5 \
651 | CATEGORY_MASK_CCL \
652 | CATEGORY_MASK_EMACS_MULE)
653
654
655 #define CATEGORY_MASK_ISO_7BIT \
656 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
657
658 #define CATEGORY_MASK_ISO_8BIT \
659 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
660
661 #define CATEGORY_MASK_ISO_ELSE \
662 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
663
664 #define CATEGORY_MASK_ISO_ESCAPE \
665 (CATEGORY_MASK_ISO_7 \
666 | CATEGORY_MASK_ISO_7_TIGHT \
667 | CATEGORY_MASK_ISO_7_ELSE \
668 | CATEGORY_MASK_ISO_8_ELSE)
669
670 #define CATEGORY_MASK_ISO \
671 ( CATEGORY_MASK_ISO_7BIT \
672 | CATEGORY_MASK_ISO_8BIT \
673 | CATEGORY_MASK_ISO_ELSE)
674
675 #define CATEGORY_MASK_UTF_16 \
676 (CATEGORY_MASK_UTF_16_BE \
677 | CATEGORY_MASK_UTF_16_LE \
678 | CATEGORY_MASK_UTF_16_BE_NOSIG \
679 | CATEGORY_MASK_UTF_16_LE_NOSIG)
680
681
682 /* List of symbols `coding-category-xxx' ordered by priority. This
683 variable is exposed to Emacs Lisp. */
684 static Lisp_Object Vcoding_category_list;
685
686 /* Table of coding categories (Lisp symbols). This variable is for
687 internal use oly. */
688 static Lisp_Object Vcoding_category_table;
689
690 /* Table of coding-categories ordered by priority. */
691 static enum coding_category coding_priorities[coding_category_max];
692
693 /* Nth element is a coding context for the coding system bound to the
694 Nth coding category. */
695 static struct coding_system coding_categories[coding_category_max];
696
697 static int detected_mask[coding_category_raw_text] =
698 { CATEGORY_MASK_ISO,
699 CATEGORY_MASK_ISO,
700 CATEGORY_MASK_ISO,
701 CATEGORY_MASK_ISO,
702 CATEGORY_MASK_ISO,
703 CATEGORY_MASK_ISO,
704 CATEGORY_MASK_UTF_8,
705 CATEGORY_MASK_UTF_16,
706 CATEGORY_MASK_UTF_16,
707 CATEGORY_MASK_UTF_16,
708 CATEGORY_MASK_UTF_16,
709 CATEGORY_MASK_UTF_16,
710 CATEGORY_MASK_CHARSET,
711 CATEGORY_MASK_SJIS,
712 CATEGORY_MASK_BIG5,
713 CATEGORY_MASK_CCL,
714 CATEGORY_MASK_EMACS_MULE
715 };
716
717 /*** Commonly used macros and functions ***/
718
719 #ifndef min
720 #define min(a, b) ((a) < (b) ? (a) : (b))
721 #endif
722 #ifndef max
723 #define max(a, b) ((a) > (b) ? (a) : (b))
724 #endif
725
726 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
727 do { \
728 attrs = CODING_ID_ATTRS (coding->id); \
729 eol_type = CODING_ID_EOL_TYPE (coding->id); \
730 if (VECTORP (eol_type)) \
731 eol_type = Qunix; \
732 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
733 } while (0)
734
735
736 /* Safely get one byte from the source text pointed by SRC which ends
737 at SRC_END, and set C to that byte. If there are not enough bytes
738 in the source, it jumps to `no_more_source'. The caller
739 should declare and set these variables appropriately in advance:
740 src, src_end, multibytep
741 */
742
743 #define ONE_MORE_BYTE(c) \
744 do { \
745 if (src == src_end) \
746 { \
747 if (src_base < src) \
748 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
749 goto no_more_source; \
750 } \
751 c = *src++; \
752 if (multibytep && (c & 0x80)) \
753 { \
754 if ((c & 0xFE) != 0xC0) \
755 error ("Undecodable char found"); \
756 c = ((c & 1) << 6) | *src++; \
757 } \
758 consumed_chars++; \
759 } while (0)
760
761
762 #define ONE_MORE_BYTE_NO_CHECK(c) \
763 do { \
764 c = *src++; \
765 if (multibytep && (c & 0x80)) \
766 { \
767 if ((c & 0xFE) != 0xC0) \
768 error ("Undecodable char found"); \
769 c = ((c & 1) << 6) | *src++; \
770 } \
771 } while (0)
772
773
774 /* Store a byte C in the place pointed by DST and increment DST to the
775 next free point, and increment PRODUCED_CHARS. The caller should
776 assure that C is 0..127, and declare and set the variable `dst'
777 appropriately in advance.
778 */
779
780
781 #define EMIT_ONE_ASCII_BYTE(c) \
782 do { \
783 produced_chars++; \
784 *dst++ = (c); \
785 } while (0)
786
787
788 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
789
790 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
791 do { \
792 produced_chars += 2; \
793 *dst++ = (c1), *dst++ = (c2); \
794 } while (0)
795
796
797 /* Store a byte C in the place pointed by DST and increment DST to the
798 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
799 nonzero, store in an appropriate multibyte from. The caller should
800 declare and set the variables `dst' and `multibytep' appropriately
801 in advance. */
802
803 #define EMIT_ONE_BYTE(c) \
804 do { \
805 produced_chars++; \
806 if (multibytep) \
807 { \
808 int ch = (c); \
809 if (ch >= 0x80) \
810 ch = BYTE8_TO_CHAR (ch); \
811 CHAR_STRING_ADVANCE (ch, dst); \
812 } \
813 else \
814 *dst++ = (c); \
815 } while (0)
816
817
818 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
819
820 #define EMIT_TWO_BYTES(c1, c2) \
821 do { \
822 produced_chars += 2; \
823 if (multibytep) \
824 { \
825 int ch; \
826 \
827 ch = (c1); \
828 if (ch >= 0x80) \
829 ch = BYTE8_TO_CHAR (ch); \
830 CHAR_STRING_ADVANCE (ch, dst); \
831 ch = (c2); \
832 if (ch >= 0x80) \
833 ch = BYTE8_TO_CHAR (ch); \
834 CHAR_STRING_ADVANCE (ch, dst); \
835 } \
836 else \
837 { \
838 *dst++ = (c1); \
839 *dst++ = (c2); \
840 } \
841 } while (0)
842
843
844 #define EMIT_THREE_BYTES(c1, c2, c3) \
845 do { \
846 EMIT_ONE_BYTE (c1); \
847 EMIT_TWO_BYTES (c2, c3); \
848 } while (0)
849
850
851 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
852 do { \
853 EMIT_TWO_BYTES (c1, c2); \
854 EMIT_TWO_BYTES (c3, c4); \
855 } while (0)
856
857
858 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
859 do { \
860 charset_map_loaded = 0; \
861 c = DECODE_CHAR (charset, code); \
862 if (charset_map_loaded) \
863 { \
864 unsigned char *orig = coding->source; \
865 EMACS_INT offset; \
866 \
867 coding_set_source (coding); \
868 offset = coding->source - orig; \
869 src += offset; \
870 src_base += offset; \
871 src_end += offset; \
872 } \
873 } while (0)
874
875
876 #define ASSURE_DESTINATION(bytes) \
877 do { \
878 if (dst + (bytes) >= dst_end) \
879 { \
880 int more_bytes = charbuf_end - charbuf + (bytes); \
881 \
882 dst = alloc_destination (coding, more_bytes, dst); \
883 dst_end = coding->destination + coding->dst_bytes; \
884 } \
885 } while (0)
886
887
888
889 static void
890 coding_set_source (coding)
891 struct coding_system *coding;
892 {
893 if (BUFFERP (coding->src_object))
894 {
895 if (coding->src_pos < 0)
896 coding->source = GAP_END_ADDR + coding->src_pos_byte;
897 else
898 {
899 struct buffer *buf = XBUFFER (coding->src_object);
900 EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
901 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
902 unsigned char *beg_addr = BUF_BEG_ADDR (buf);
903
904 coding->source = beg_addr + coding->src_pos_byte - 1;
905 if (coding->src_pos_byte >= gpt_byte)
906 coding->source += BUF_GAP_SIZE (buf);
907 }
908 }
909 else if (STRINGP (coding->src_object))
910 {
911 coding->source = (XSTRING (coding->src_object)->data
912 + coding->src_pos_byte);
913 }
914 else
915 /* Otherwise, the source is C string and is never relocated
916 automatically. Thus we don't have to update anything. */
917 ;
918 }
919
920 static void
921 coding_set_destination (coding)
922 struct coding_system *coding;
923 {
924 if (BUFFERP (coding->dst_object))
925 {
926 /* We are sure that coding->dst_pos_byte is before the gap of the
927 buffer. */
928 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
929 + coding->dst_pos_byte - 1);
930 if (coding->src_pos < 0)
931 /* The source and destination is in the same buffer. */
932 coding->dst_bytes = (GAP_END_ADDR
933 - (coding->src_bytes - coding->consumed)
934 - coding->destination);
935 else
936 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
937 - coding->destination);
938 }
939 else
940 /* Otherwise, the destination is C string and is never relocated
941 automatically. Thus we don't have to update anything. */
942 ;
943 }
944
945
946 static void
947 coding_alloc_by_realloc (coding, bytes)
948 struct coding_system *coding;
949 EMACS_INT bytes;
950 {
951 coding->destination = (unsigned char *) xrealloc (coding->destination,
952 coding->dst_bytes + bytes);
953 coding->dst_bytes += bytes;
954 }
955
956 static void
957 coding_alloc_by_making_gap (coding, bytes)
958 struct coding_system *coding;
959 EMACS_INT bytes;
960 {
961 Lisp_Object this_buffer;
962
963 this_buffer = Fcurrent_buffer ();
964 if (EQ (this_buffer, coding->dst_object))
965 {
966 EMACS_INT add = coding->src_bytes - coding->consumed;
967
968 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
969 make_gap (bytes);
970 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
971 }
972 else
973 {
974 set_buffer_internal (XBUFFER (coding->dst_object));
975 make_gap (bytes);
976 set_buffer_internal (XBUFFER (this_buffer));
977 }
978 }
979
980
981 static unsigned char *
982 alloc_destination (coding, nbytes, dst)
983 struct coding_system *coding;
984 int nbytes;
985 unsigned char *dst;
986 {
987 EMACS_INT offset = dst - coding->destination;
988
989 if (BUFFERP (coding->dst_object))
990 coding_alloc_by_making_gap (coding, nbytes);
991 else
992 coding_alloc_by_realloc (coding, nbytes);
993 coding->result = CODING_RESULT_SUCCESS;
994 coding_set_destination (coding);
995 dst = coding->destination + offset;
996 return dst;
997 }
998
999 \f
1000 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1001
1002
1003
1004 \f
1005 /*** 3. UTF-8 ***/
1006
1007 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1008 Check if a text is encoded in UTF-8. If it is, return
1009 CATEGORY_MASK_UTF_8, else return 0. */
1010
1011 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1012 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1013 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1014 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1015 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1016 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1017
1018 static int
1019 detect_coding_utf_8 (coding, mask)
1020 struct coding_system *coding;
1021 int *mask;
1022 {
1023 unsigned char *src = coding->source, *src_base = src;
1024 unsigned char *src_end = coding->source + coding->src_bytes;
1025 int multibytep = coding->src_multibyte;
1026 int consumed_chars = 0;
1027 int found = 0;
1028
1029 /* A coding system of this category is always ASCII compatible. */
1030 src += coding->head_ascii;
1031
1032 while (1)
1033 {
1034 int c, c1, c2, c3, c4;
1035
1036 ONE_MORE_BYTE (c);
1037 if (UTF_8_1_OCTET_P (c))
1038 continue;
1039 ONE_MORE_BYTE (c1);
1040 if (! UTF_8_EXTRA_OCTET_P (c1))
1041 break;
1042 if (UTF_8_2_OCTET_LEADING_P (c))
1043 {
1044 found++;
1045 continue;
1046 }
1047 ONE_MORE_BYTE (c2);
1048 if (! UTF_8_EXTRA_OCTET_P (c2))
1049 break;
1050 if (UTF_8_3_OCTET_LEADING_P (c))
1051 {
1052 found++;
1053 continue;
1054 }
1055 ONE_MORE_BYTE (c3);
1056 if (! UTF_8_EXTRA_OCTET_P (c3))
1057 break;
1058 if (UTF_8_4_OCTET_LEADING_P (c))
1059 {
1060 found++;
1061 continue;
1062 }
1063 ONE_MORE_BYTE (c4);
1064 if (! UTF_8_EXTRA_OCTET_P (c4))
1065 break;
1066 if (UTF_8_5_OCTET_LEADING_P (c))
1067 {
1068 found++;
1069 continue;
1070 }
1071 break;
1072 }
1073 *mask &= ~CATEGORY_MASK_UTF_8;
1074 return 0;
1075
1076 no_more_source:
1077 if (! found)
1078 return 0;
1079 *mask &= CATEGORY_MASK_UTF_8;
1080 return 1;
1081 }
1082
1083
1084 static void
1085 decode_coding_utf_8 (coding)
1086 struct coding_system *coding;
1087 {
1088 unsigned char *src = coding->source + coding->consumed;
1089 unsigned char *src_end = coding->source + coding->src_bytes;
1090 unsigned char *src_base;
1091 int *charbuf = coding->charbuf;
1092 int *charbuf_end = charbuf + coding->charbuf_size;
1093 int consumed_chars = 0, consumed_chars_base;
1094 int multibytep = coding->src_multibyte;
1095 Lisp_Object attr, eol_type, charset_list;
1096
1097 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1098
1099 while (1)
1100 {
1101 int c, c1, c2, c3, c4, c5;
1102
1103 src_base = src;
1104 consumed_chars_base = consumed_chars;
1105
1106 if (charbuf >= charbuf_end)
1107 break;
1108
1109 ONE_MORE_BYTE (c1);
1110 if (UTF_8_1_OCTET_P(c1))
1111 {
1112 c = c1;
1113 if (c == '\r')
1114 {
1115 if (EQ (eol_type, Qdos))
1116 {
1117 if (src == src_end)
1118 goto no_more_source;
1119 if (*src == '\n')
1120 ONE_MORE_BYTE (c);
1121 }
1122 else if (EQ (eol_type, Qmac))
1123 c = '\n';
1124 }
1125 }
1126 else
1127 {
1128 ONE_MORE_BYTE (c2);
1129 if (! UTF_8_EXTRA_OCTET_P (c2))
1130 goto invalid_code;
1131 if (UTF_8_2_OCTET_LEADING_P (c1))
1132 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1133 else
1134 {
1135 ONE_MORE_BYTE (c3);
1136 if (! UTF_8_EXTRA_OCTET_P (c3))
1137 goto invalid_code;
1138 if (UTF_8_3_OCTET_LEADING_P (c1))
1139 c = (((c1 & 0xF) << 12)
1140 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1141 else
1142 {
1143 ONE_MORE_BYTE (c4);
1144 if (! UTF_8_EXTRA_OCTET_P (c4))
1145 goto invalid_code;
1146 if (UTF_8_4_OCTET_LEADING_P (c1))
1147 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1148 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1149 else
1150 {
1151 ONE_MORE_BYTE (c5);
1152 if (! UTF_8_EXTRA_OCTET_P (c5))
1153 goto invalid_code;
1154 if (UTF_8_5_OCTET_LEADING_P (c1))
1155 {
1156 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1157 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1158 | (c5 & 0x3F));
1159 if (c > MAX_CHAR)
1160 goto invalid_code;
1161 }
1162 else
1163 goto invalid_code;
1164 }
1165 }
1166 }
1167 }
1168
1169 *charbuf++ = c;
1170 continue;
1171
1172 invalid_code:
1173 src = src_base;
1174 consumed_chars = consumed_chars_base;
1175 ONE_MORE_BYTE (c);
1176 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1177 coding->errors++;
1178 }
1179
1180 no_more_source:
1181 coding->consumed_char += consumed_chars_base;
1182 coding->consumed = src_base - coding->source;
1183 coding->charbuf_used = charbuf - coding->charbuf;
1184 }
1185
1186
1187 static int
1188 encode_coding_utf_8 (coding)
1189 struct coding_system *coding;
1190 {
1191 int multibytep = coding->dst_multibyte;
1192 int *charbuf = coding->charbuf;
1193 int *charbuf_end = charbuf + coding->charbuf_used;
1194 unsigned char *dst = coding->destination + coding->produced;
1195 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1196 int produced_chars = 0;
1197 int c;
1198
1199 if (multibytep)
1200 {
1201 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1202
1203 while (charbuf < charbuf_end)
1204 {
1205 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1206
1207 ASSURE_DESTINATION (safe_room);
1208 c = *charbuf++;
1209 CHAR_STRING_ADVANCE (c, pend);
1210 for (p = str; p < pend; p++)
1211 EMIT_ONE_BYTE (*p);
1212 }
1213 }
1214 else
1215 {
1216 int safe_room = MAX_MULTIBYTE_LENGTH;
1217
1218 while (charbuf < charbuf_end)
1219 {
1220 ASSURE_DESTINATION (safe_room);
1221 c = *charbuf++;
1222 dst += CHAR_STRING (c, dst);
1223 produced_chars++;
1224 }
1225 }
1226 coding->result = CODING_RESULT_SUCCESS;
1227 coding->produced_char += produced_chars;
1228 coding->produced = dst - coding->destination;
1229 return 0;
1230 }
1231
1232
1233 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1234 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1235 Little Endian (otherwise). If it is, return
1236 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1237 else return 0. */
1238
1239 #define UTF_16_HIGH_SURROGATE_P(val) \
1240 (((val) & 0xFC00) == 0xD800)
1241
1242 #define UTF_16_LOW_SURROGATE_P(val) \
1243 (((val) & 0xFC00) == 0xDC00)
1244
1245 #define UTF_16_INVALID_P(val) \
1246 (((val) == 0xFFFE) \
1247 || ((val) == 0xFFFF) \
1248 || UTF_16_LOW_SURROGATE_P (val))
1249
1250
1251 static int
1252 detect_coding_utf_16 (coding, mask)
1253 struct coding_system *coding;
1254 int *mask;
1255 {
1256 unsigned char *src = coding->source, *src_base = src;
1257 unsigned char *src_end = coding->source + coding->src_bytes;
1258 int multibytep = coding->src_multibyte;
1259 int consumed_chars = 0;
1260 int c1, c2;
1261
1262 ONE_MORE_BYTE (c1);
1263 ONE_MORE_BYTE (c2);
1264
1265 if ((c1 == 0xFF) && (c2 == 0xFE))
1266 {
1267 *mask &= CATEGORY_MASK_UTF_16_LE;
1268 return 1;
1269 }
1270 else if ((c1 == 0xFE) && (c2 == 0xFF))
1271 {
1272 *mask &= CATEGORY_MASK_UTF_16_BE;
1273 return 1;
1274 }
1275 no_more_source:
1276 return 0;
1277 }
1278
1279 static void
1280 decode_coding_utf_16 (coding)
1281 struct coding_system *coding;
1282 {
1283 unsigned char *src = coding->source + coding->consumed;
1284 unsigned char *src_end = coding->source + coding->src_bytes;
1285 unsigned char *src_base;
1286 int *charbuf = coding->charbuf;
1287 int *charbuf_end = charbuf + coding->charbuf_size;
1288 int consumed_chars = 0, consumed_chars_base;
1289 int multibytep = coding->src_multibyte;
1290 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1291 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1292 int surrogate = CODING_UTF_16_SURROGATE (coding);
1293 Lisp_Object attr, eol_type, charset_list;
1294
1295 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1296
1297 if (bom != utf_16_without_bom)
1298 {
1299 int c, c1, c2;
1300
1301 src_base = src;
1302 ONE_MORE_BYTE (c1);
1303 ONE_MORE_BYTE (c2);
1304 c = (c1 << 8) | c2;
1305 if (bom == utf_16_with_bom)
1306 {
1307 if (endian == utf_16_big_endian
1308 ? c != 0xFFFE : c != 0xFEFF)
1309 {
1310 /* We are sure that there's enouph room at CHARBUF. */
1311 *charbuf++ = c1;
1312 *charbuf++ = c2;
1313 coding->errors++;
1314 }
1315 }
1316 else
1317 {
1318 if (c == 0xFFFE)
1319 CODING_UTF_16_ENDIAN (coding)
1320 = endian = utf_16_big_endian;
1321 else if (c == 0xFEFF)
1322 CODING_UTF_16_ENDIAN (coding)
1323 = endian = utf_16_little_endian;
1324 else
1325 {
1326 CODING_UTF_16_ENDIAN (coding)
1327 = endian = utf_16_big_endian;
1328 src = src_base;
1329 }
1330 }
1331 CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1332 }
1333
1334 while (1)
1335 {
1336 int c, c1, c2;
1337
1338 src_base = src;
1339 consumed_chars_base = consumed_chars;
1340
1341 if (charbuf + 2 >= charbuf_end)
1342 break;
1343
1344 ONE_MORE_BYTE (c1);
1345 ONE_MORE_BYTE (c2);
1346 c = (endian == utf_16_big_endian
1347 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1348 if (surrogate)
1349 {
1350 if (! UTF_16_LOW_SURROGATE_P (c))
1351 {
1352 if (endian == utf_16_big_endian)
1353 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1354 else
1355 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1356 *charbuf++ = c1;
1357 *charbuf++ = c2;
1358 coding->errors++;
1359 if (UTF_16_HIGH_SURROGATE_P (c))
1360 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1361 else
1362 *charbuf++ = c;
1363 }
1364 else
1365 {
1366 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1367 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1368 *charbuf++ = c;
1369 }
1370 }
1371 else
1372 {
1373 if (UTF_16_HIGH_SURROGATE_P (c))
1374 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1375 else
1376 *charbuf++ = c;
1377 }
1378 }
1379
1380 no_more_source:
1381 coding->consumed_char += consumed_chars_base;
1382 coding->consumed = src_base - coding->source;
1383 coding->charbuf_used = charbuf - coding->charbuf;
1384 }
1385
1386 static int
1387 encode_coding_utf_16 (coding)
1388 struct coding_system *coding;
1389 {
1390 int multibytep = coding->dst_multibyte;
1391 int *charbuf = coding->charbuf;
1392 int *charbuf_end = charbuf + coding->charbuf_used;
1393 unsigned char *dst = coding->destination + coding->produced;
1394 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1395 int safe_room = 8;
1396 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1397 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1398 int produced_chars = 0;
1399 Lisp_Object attrs, eol_type, charset_list;
1400 int c;
1401
1402 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1403
1404 if (bom == utf_16_with_bom)
1405 {
1406 ASSURE_DESTINATION (safe_room);
1407 if (big_endian)
1408 EMIT_TWO_BYTES (0xFF, 0xFE);
1409 else
1410 EMIT_TWO_BYTES (0xFE, 0xFF);
1411 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1412 }
1413
1414 while (charbuf < charbuf_end)
1415 {
1416 ASSURE_DESTINATION (safe_room);
1417 c = *charbuf++;
1418 if (c >= MAX_UNICODE_CHAR)
1419 c = coding->default_char;
1420
1421 if (c < 0x10000)
1422 {
1423 if (big_endian)
1424 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1425 else
1426 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1427 }
1428 else
1429 {
1430 int c1, c2;
1431
1432 c -= 0x10000;
1433 c1 = (c >> 10) + 0xD800;
1434 c2 = (c & 0x3FF) + 0xDC00;
1435 if (big_endian)
1436 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1437 else
1438 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1439 }
1440 }
1441 coding->result = CODING_RESULT_SUCCESS;
1442 coding->produced = dst - coding->destination;
1443 coding->produced_char += produced_chars;
1444 return 0;
1445 }
1446
1447 \f
1448 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1449
1450 /* Emacs' internal format for representation of multiple character
1451 sets is a kind of multi-byte encoding, i.e. characters are
1452 represented by variable-length sequences of one-byte codes.
1453
1454 ASCII characters and control characters (e.g. `tab', `newline') are
1455 represented by one-byte sequences which are their ASCII codes, in
1456 the range 0x00 through 0x7F.
1457
1458 8-bit characters of the range 0x80..0x9F are represented by
1459 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1460 code + 0x20).
1461
1462 8-bit characters of the range 0xA0..0xFF are represented by
1463 one-byte sequences which are their 8-bit code.
1464
1465 The other characters are represented by a sequence of `base
1466 leading-code', optional `extended leading-code', and one or two
1467 `position-code's. The length of the sequence is determined by the
1468 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1469 whereas extended leading-code and position-code take the range 0xA0
1470 through 0xFF. See `charset.h' for more details about leading-code
1471 and position-code.
1472
1473 --- CODE RANGE of Emacs' internal format ---
1474 character set range
1475 ------------- -----
1476 ascii 0x00..0x7F
1477 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1478 eight-bit-graphic 0xA0..0xBF
1479 ELSE 0x81..0x9D + [0xA0..0xFF]+
1480 ---------------------------------------------
1481
1482 As this is the internal character representation, the format is
1483 usually not used externally (i.e. in a file or in a data sent to a
1484 process). But, it is possible to have a text externally in this
1485 format (i.e. by encoding by the coding system `emacs-mule').
1486
1487 In that case, a sequence of one-byte codes has a slightly different
1488 form.
1489
1490 At first, all characters in eight-bit-control are represented by
1491 one-byte sequences which are their 8-bit code.
1492
1493 Next, character composition data are represented by the byte
1494 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1495 where,
1496 METHOD is 0xF0 plus one of composition method (enum
1497 composition_method),
1498
1499 BYTES is 0xA0 plus a byte length of this composition data,
1500
1501 CHARS is 0x20 plus a number of characters composed by this
1502 data,
1503
1504 COMPONENTs are characters of multibye form or composition
1505 rules encoded by two-byte of ASCII codes.
1506
1507 In addition, for backward compatibility, the following formats are
1508 also recognized as composition data on decoding.
1509
1510 0x80 MSEQ ...
1511 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1512
1513 Here,
1514 MSEQ is a multibyte form but in these special format:
1515 ASCII: 0xA0 ASCII_CODE+0x80,
1516 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1517 RULE is a one byte code of the range 0xA0..0xF0 that
1518 represents a composition rule.
1519 */
1520
1521 char emacs_mule_bytes[256];
1522
1523 /* Leading-code followed by extended leading-code. */
1524 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1525 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1526 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1527 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1528
1529
1530 int
1531 emacs_mule_char (coding, composition, nbytes, nchars)
1532 struct coding_system *coding;
1533 int composition;
1534 int *nbytes, *nchars;
1535 {
1536 unsigned char *src = coding->source + coding->consumed;
1537 unsigned char *src_end = coding->source + coding->src_bytes;
1538 int multibytep = coding->src_multibyte;
1539 unsigned char *src_base = src;
1540 struct charset *charset;
1541 unsigned code;
1542 int c;
1543 int consumed_chars = 0;
1544
1545 ONE_MORE_BYTE (c);
1546 if (composition)
1547 {
1548 c -= 0x20;
1549 if (c == 0x80)
1550 {
1551 ONE_MORE_BYTE (c);
1552 if (c < 0xA0)
1553 goto invalid_code;
1554 *nbytes = src - src_base;
1555 *nchars = consumed_chars;
1556 return (c - 0x80);
1557 }
1558 }
1559
1560 switch (emacs_mule_bytes[c])
1561 {
1562 case 2:
1563 if (! (charset = emacs_mule_charset[c]))
1564 goto invalid_code;
1565 ONE_MORE_BYTE (c);
1566 code = c & 0x7F;
1567 break;
1568
1569 case 3:
1570 if (c == LEADING_CODE_PRIVATE_11
1571 || c == LEADING_CODE_PRIVATE_12)
1572 {
1573 ONE_MORE_BYTE (c);
1574 if (! (charset = emacs_mule_charset[c]))
1575 goto invalid_code;
1576 ONE_MORE_BYTE (c);
1577 code = c & 0x7F;
1578 }
1579 else
1580 {
1581 if (! (charset = emacs_mule_charset[c]))
1582 goto invalid_code;
1583 ONE_MORE_BYTE (c);
1584 code = (c & 0x7F) << 7;
1585 ONE_MORE_BYTE (c);
1586 code |= c & 0x7F;
1587 }
1588 break;
1589
1590 case 4:
1591 if (! (charset = emacs_mule_charset[c]))
1592 goto invalid_code;
1593 ONE_MORE_BYTE (c);
1594 code = (c & 0x7F) << 7;
1595 ONE_MORE_BYTE (c);
1596 code |= c & 0x7F;
1597 break;
1598
1599 case 1:
1600 code = c;
1601 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1602 : code < 0xA0 ? charset_8_bit_control
1603 : charset_8_bit_graphic);
1604 break;
1605
1606 default:
1607 abort ();
1608 }
1609 c = DECODE_CHAR (charset, code);
1610 if (c < 0)
1611 goto invalid_code;
1612 *nbytes = src - src_base;
1613 *nchars = consumed_chars;
1614 return c;
1615
1616 no_more_source:
1617 return -2;
1618
1619 invalid_code:
1620 return -1;
1621 }
1622
1623
1624 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1625 Check if a text is encoded in `emacs-mule'. */
1626
1627 static int
1628 detect_coding_emacs_mule (coding, mask)
1629 struct coding_system *coding;
1630 int *mask;
1631 {
1632 unsigned char *src = coding->source, *src_base = src;
1633 unsigned char *src_end = coding->source + coding->src_bytes;
1634 int multibytep = coding->src_multibyte;
1635 int consumed_chars = 0;
1636 int c;
1637 int found = 0;
1638
1639 /* A coding system of this category is always ASCII compatible. */
1640 src += coding->head_ascii;
1641
1642 while (1)
1643 {
1644 ONE_MORE_BYTE (c);
1645
1646 if (c == 0x80)
1647 {
1648 /* Perhaps the start of composite character. We simple skip
1649 it because analyzing it is too heavy for detecting. But,
1650 at least, we check that the composite character
1651 constitues of more than 4 bytes. */
1652 unsigned char *src_base;
1653
1654 repeat:
1655 src_base = src;
1656 do
1657 {
1658 ONE_MORE_BYTE (c);
1659 }
1660 while (c >= 0xA0);
1661
1662 if (src - src_base <= 4)
1663 break;
1664 found = 1;
1665 if (c == 0x80)
1666 goto repeat;
1667 }
1668
1669 if (c < 0x80)
1670 {
1671 if (c < 0x20
1672 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1673 break;
1674 }
1675 else
1676 {
1677 unsigned char *src_base = src - 1;
1678
1679 do
1680 {
1681 ONE_MORE_BYTE (c);
1682 }
1683 while (c >= 0xA0);
1684 if (src - src_base != emacs_mule_bytes[*src_base])
1685 break;
1686 found = 1;
1687 }
1688 }
1689 *mask &= ~CATEGORY_MASK_EMACS_MULE;
1690 return 0;
1691
1692 no_more_source:
1693 if (!found)
1694 return 0;
1695 *mask &= CATEGORY_MASK_EMACS_MULE;
1696 return 1;
1697 }
1698
1699
1700 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1701
1702 /* Decode a character represented as a component of composition
1703 sequence of Emacs 20/21 style at SRC. Set C to that character and
1704 update SRC to the head of next character (or an encoded composition
1705 rule). If SRC doesn't points a composition component, set C to -1.
1706 If SRC points an invalid byte sequence, global exit by a return
1707 value 0. */
1708
1709 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1710 if (1) \
1711 { \
1712 int c; \
1713 int nbytes, nchars; \
1714 \
1715 if (src == src_end) \
1716 break; \
1717 c = emacs_mule_char (coding, 1, &nbytes, &nchars); \
1718 if (c < 0) \
1719 { \
1720 if (c == -2) \
1721 break; \
1722 goto invalid_code; \
1723 } \
1724 *buf++ = c; \
1725 src += nbytes; \
1726 consumed_chars += nchars; \
1727 } \
1728 else
1729
1730
1731 /* Decode a composition rule represented as a component of composition
1732 sequence of Emacs 20 style at SRC. Set C to the rule. If SRC
1733 points an invalid byte sequence, set C to -1. */
1734
1735 #define DECODE_EMACS_MULE_COMPOSITION_RULE(buf) \
1736 do { \
1737 int c, gref, nref; \
1738 \
1739 if (src < src_end) \
1740 goto invalid_code; \
1741 ONE_MORE_BYTE_NO_CHECK (c); \
1742 c -= 0xA0; \
1743 if (c < 0 || c >= 81) \
1744 goto invalid_code; \
1745 \
1746 gref = c / 9, nref = c % 9; \
1747 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1748 } while (0)
1749
1750
1751 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1752 do { \
1753 *buf++ = -5; \
1754 *buf++ = coding->produced_char + char_offset; \
1755 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1756 *buf++ = method; \
1757 *buf++ = nchars; \
1758 } while (0)
1759
1760
1761 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1762 do { \
1763 /* Emacs 21 style format. The first three bytes at SRC are \
1764 (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1765 the byte length of this composition information, CHARS is the \
1766 number of characters composed by this composition. */ \
1767 enum composition_method method = c - 0xF0; \
1768 int consumed_chars_limit; \
1769 int nbytes, nchars; \
1770 \
1771 ONE_MORE_BYTE (c); \
1772 nbytes = c - 0xA0; \
1773 if (nbytes < 3) \
1774 goto invalid_code; \
1775 ONE_MORE_BYTE (c); \
1776 nchars = c - 0xA0; \
1777 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1778 consumed_chars_limit = consumed_chars_base + nbytes; \
1779 if (method != COMPOSITION_RELATIVE) \
1780 { \
1781 int i = 0; \
1782 while (consumed_chars < consumed_chars_limit) \
1783 { \
1784 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1785 DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf); \
1786 else \
1787 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1788 } \
1789 if (consumed_chars < consumed_chars_limit) \
1790 goto invalid_code; \
1791 } \
1792 } while (0)
1793
1794
1795 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1796 do { \
1797 /* Emacs 20 style format for relative composition. */ \
1798 /* Store multibyte form of characters to be composed. */ \
1799 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1800 int *buf = components; \
1801 int i, j; \
1802 \
1803 src = src_base; \
1804 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1805 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1806 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1807 if (i < 2) \
1808 goto invalid_code; \
1809 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1810 for (j = 0; j < i; j++) \
1811 *charbuf++ = components[j]; \
1812 } while (0)
1813
1814
1815 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1816 do { \
1817 /* Emacs 20 style format for rule-base composition. */ \
1818 /* Store multibyte form of characters to be composed. */ \
1819 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1820 int *buf = components; \
1821 int i, j; \
1822 \
1823 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1824 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1825 { \
1826 DECODE_EMACS_MULE_COMPOSITION_RULE (buf); \
1827 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1828 } \
1829 if (i < 1 || (buf - components) % 2 == 0) \
1830 goto invalid_code; \
1831 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1832 goto no_more_source; \
1833 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1834 for (j = 0; j < i; j++) \
1835 *charbuf++ = components[j]; \
1836 for (j = 0; j < i; j += 2) \
1837 *charbuf++ = components[j]; \
1838 } while (0)
1839
1840
1841 static void
1842 decode_coding_emacs_mule (coding)
1843 struct coding_system *coding;
1844 {
1845 unsigned char *src = coding->source + coding->consumed;
1846 unsigned char *src_end = coding->source + coding->src_bytes;
1847 unsigned char *src_base;
1848 int *charbuf = coding->charbuf;
1849 int *charbuf_end = charbuf + coding->charbuf_size;
1850 int consumed_chars = 0, consumed_chars_base;
1851 int char_offset = 0;
1852 int multibytep = coding->src_multibyte;
1853 Lisp_Object attrs, eol_type, charset_list;
1854
1855 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1856
1857 while (1)
1858 {
1859 int c;
1860
1861 src_base = src;
1862 consumed_chars_base = consumed_chars;
1863
1864 if (charbuf >= charbuf_end)
1865 break;
1866
1867 ONE_MORE_BYTE (c);
1868
1869 if (c < 0x80)
1870 {
1871 if (c == '\r')
1872 {
1873 if (EQ (eol_type, Qdos))
1874 {
1875 if (src == src_end)
1876 goto no_more_source;
1877 if (*src == '\n')
1878 ONE_MORE_BYTE (c);
1879 }
1880 else if (EQ (eol_type, Qmac))
1881 c = '\n';
1882 }
1883 *charbuf++ = c;
1884 char_offset++;
1885 }
1886 else if (c == 0x80)
1887 {
1888 if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1889 break;
1890 ONE_MORE_BYTE (c);
1891 if (c - 0xF0 >= COMPOSITION_RELATIVE
1892 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
1893 DECODE_EMACS_MULE_21_COMPOSITION (c);
1894 else if (c < 0xC0)
1895 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1896 else if (c == 0xFF)
1897 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1898 else
1899 goto invalid_code;
1900 }
1901 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1902 {
1903 int nbytes, nchars;
1904 src--;
1905 c = emacs_mule_char (coding, 0, &nbytes, &nchars);
1906 if (c < 0)
1907 {
1908 if (c == -2)
1909 break;
1910 goto invalid_code;
1911 }
1912 *charbuf++ = c;
1913 char_offset++;
1914 }
1915 continue;
1916
1917 invalid_code:
1918 src = src_base;
1919 consumed_chars = consumed_chars_base;
1920 ONE_MORE_BYTE (c);
1921 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1922 coding->errors++;
1923 }
1924
1925 no_more_source:
1926 coding->consumed_char += consumed_chars_base;
1927 coding->consumed = src_base - coding->source;
1928 coding->charbuf_used = charbuf - coding->charbuf;
1929 }
1930
1931
1932 #define EMACS_MULE_LEADING_CODES(id, codes) \
1933 do { \
1934 if (id < 0xA0) \
1935 codes[0] = id, codes[1] = 0; \
1936 else if (id < 0xE0) \
1937 codes[0] = 0x9A, codes[1] = id; \
1938 else if (id < 0xF0) \
1939 codes[0] = 0x9B, codes[1] = id; \
1940 else if (id < 0xF5) \
1941 codes[0] = 0x9C, codes[1] = id; \
1942 else \
1943 codes[0] = 0x9D, codes[1] = id; \
1944 } while (0);
1945
1946
1947 static int
1948 encode_coding_emacs_mule (coding)
1949 struct coding_system *coding;
1950 {
1951 int multibytep = coding->dst_multibyte;
1952 int *charbuf = coding->charbuf;
1953 int *charbuf_end = charbuf + coding->charbuf_used;
1954 unsigned char *dst = coding->destination + coding->produced;
1955 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1956 int safe_room = 8;
1957 int produced_chars = 0;
1958 Lisp_Object attrs, eol_type, charset_list;
1959 int c;
1960
1961 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1962
1963 while (charbuf < charbuf_end)
1964 {
1965 ASSURE_DESTINATION (safe_room);
1966 c = *charbuf++;
1967 if (ASCII_CHAR_P (c))
1968 EMIT_ONE_ASCII_BYTE (c);
1969 else
1970 {
1971 struct charset *charset;
1972 unsigned code;
1973 int dimension;
1974 int emacs_mule_id;
1975 unsigned char leading_codes[2];
1976
1977 charset = char_charset (c, charset_list, &code);
1978 if (! charset)
1979 {
1980 c = coding->default_char;
1981 if (ASCII_CHAR_P (c))
1982 {
1983 EMIT_ONE_ASCII_BYTE (c);
1984 continue;
1985 }
1986 charset = char_charset (c, charset_list, &code);
1987 }
1988 dimension = CHARSET_DIMENSION (charset);
1989 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
1990 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
1991 EMIT_ONE_BYTE (leading_codes[0]);
1992 if (leading_codes[1])
1993 EMIT_ONE_BYTE (leading_codes[1]);
1994 if (dimension == 1)
1995 EMIT_ONE_BYTE (code);
1996 else
1997 {
1998 EMIT_ONE_BYTE (code >> 8);
1999 EMIT_ONE_BYTE (code & 0xFF);
2000 }
2001 }
2002 }
2003 coding->result = CODING_RESULT_SUCCESS;
2004 coding->produced_char += produced_chars;
2005 coding->produced = dst - coding->destination;
2006 return 0;
2007 }
2008
2009 \f
2010 /*** 7. ISO2022 handlers ***/
2011
2012 /* The following note describes the coding system ISO2022 briefly.
2013 Since the intention of this note is to help understand the
2014 functions in this file, some parts are NOT ACCURATE or OVERLY
2015 SIMPLIFIED. For thorough understanding, please refer to the
2016 original document of ISO2022.
2017
2018 ISO2022 provides many mechanisms to encode several character sets
2019 in 7-bit and 8-bit environments. For 7-bite environments, all text
2020 is encoded using bytes less than 128. This may make the encoded
2021 text a little bit longer, but the text passes more easily through
2022 several gateways, some of which strip off MSB (Most Signigant Bit).
2023
2024 There are two kinds of character sets: control character set and
2025 graphic character set. The former contains control characters such
2026 as `newline' and `escape' to provide control functions (control
2027 functions are also provided by escape sequences). The latter
2028 contains graphic characters such as 'A' and '-'. Emacs recognizes
2029 two control character sets and many graphic character sets.
2030
2031 Graphic character sets are classified into one of the following
2032 four classes, according to the number of bytes (DIMENSION) and
2033 number of characters in one dimension (CHARS) of the set:
2034 - DIMENSION1_CHARS94
2035 - DIMENSION1_CHARS96
2036 - DIMENSION2_CHARS94
2037 - DIMENSION2_CHARS96
2038
2039 In addition, each character set is assigned an identification tag,
2040 unique for each set, called "final character" (denoted as <F>
2041 hereafter). The <F> of each character set is decided by ECMA(*)
2042 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2043 (0x30..0x3F are for private use only).
2044
2045 Note (*): ECMA = European Computer Manufacturers Association
2046
2047 Here are examples of graphic character set [NAME(<F>)]:
2048 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2049 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2050 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2051 o DIMENSION2_CHARS96 -- none for the moment
2052
2053 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2054 C0 [0x00..0x1F] -- control character plane 0
2055 GL [0x20..0x7F] -- graphic character plane 0
2056 C1 [0x80..0x9F] -- control character plane 1
2057 GR [0xA0..0xFF] -- graphic character plane 1
2058
2059 A control character set is directly designated and invoked to C0 or
2060 C1 by an escape sequence. The most common case is that:
2061 - ISO646's control character set is designated/invoked to C0, and
2062 - ISO6429's control character set is designated/invoked to C1,
2063 and usually these designations/invocations are omitted in encoded
2064 text. In a 7-bit environment, only C0 can be used, and a control
2065 character for C1 is encoded by an appropriate escape sequence to
2066 fit into the environment. All control characters for C1 are
2067 defined to have corresponding escape sequences.
2068
2069 A graphic character set is at first designated to one of four
2070 graphic registers (G0 through G3), then these graphic registers are
2071 invoked to GL or GR. These designations and invocations can be
2072 done independently. The most common case is that G0 is invoked to
2073 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2074 these invocations and designations are omitted in encoded text.
2075 In a 7-bit environment, only GL can be used.
2076
2077 When a graphic character set of CHARS94 is invoked to GL, codes
2078 0x20 and 0x7F of the GL area work as control characters SPACE and
2079 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2080 be used.
2081
2082 There are two ways of invocation: locking-shift and single-shift.
2083 With locking-shift, the invocation lasts until the next different
2084 invocation, whereas with single-shift, the invocation affects the
2085 following character only and doesn't affect the locking-shift
2086 state. Invocations are done by the following control characters or
2087 escape sequences:
2088
2089 ----------------------------------------------------------------------
2090 abbrev function cntrl escape seq description
2091 ----------------------------------------------------------------------
2092 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2093 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2094 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2095 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2096 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2097 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2098 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2099 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2100 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2101 ----------------------------------------------------------------------
2102 (*) These are not used by any known coding system.
2103
2104 Control characters for these functions are defined by macros
2105 ISO_CODE_XXX in `coding.h'.
2106
2107 Designations are done by the following escape sequences:
2108 ----------------------------------------------------------------------
2109 escape sequence description
2110 ----------------------------------------------------------------------
2111 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2112 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2113 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2114 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2115 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2116 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2117 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2118 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2119 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2120 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2121 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2122 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2123 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2124 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2125 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2126 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2127 ----------------------------------------------------------------------
2128
2129 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2130 of dimension 1, chars 94, and final character <F>, etc...
2131
2132 Note (*): Although these designations are not allowed in ISO2022,
2133 Emacs accepts them on decoding, and produces them on encoding
2134 CHARS96 character sets in a coding system which is characterized as
2135 7-bit environment, non-locking-shift, and non-single-shift.
2136
2137 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2138 '(' must be omitted. We refer to this as "short-form" hereafter.
2139
2140 Now you may notice that there are a lot of ways for encoding the
2141 same multilingual text in ISO2022. Actually, there exist many
2142 coding systems such as Compound Text (used in X11's inter client
2143 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2144 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2145 localized platforms), and all of these are variants of ISO2022.
2146
2147 In addition to the above, Emacs handles two more kinds of escape
2148 sequences: ISO6429's direction specification and Emacs' private
2149 sequence for specifying character composition.
2150
2151 ISO6429's direction specification takes the following form:
2152 o CSI ']' -- end of the current direction
2153 o CSI '0' ']' -- end of the current direction
2154 o CSI '1' ']' -- start of left-to-right text
2155 o CSI '2' ']' -- start of right-to-left text
2156 The control character CSI (0x9B: control sequence introducer) is
2157 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2158
2159 Character composition specification takes the following form:
2160 o ESC '0' -- start relative composition
2161 o ESC '1' -- end composition
2162 o ESC '2' -- start rule-base composition (*)
2163 o ESC '3' -- start relative composition with alternate chars (**)
2164 o ESC '4' -- start rule-base composition with alternate chars (**)
2165 Since these are not standard escape sequences of any ISO standard,
2166 the use of them for these meaning is restricted to Emacs only.
2167
2168 (*) This form is used only in Emacs 20.5 and the older versions,
2169 but the newer versions can safely decode it.
2170 (**) This form is used only in Emacs 21.1 and the newer versions,
2171 and the older versions can't decode it.
2172
2173 Here's a list of examples usages of these composition escape
2174 sequences (categorized by `enum composition_method').
2175
2176 COMPOSITION_RELATIVE:
2177 ESC 0 CHAR [ CHAR ] ESC 1
2178 COMPOSITOIN_WITH_RULE:
2179 ESC 2 CHAR [ RULE CHAR ] ESC 1
2180 COMPOSITION_WITH_ALTCHARS:
2181 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2182 COMPOSITION_WITH_RULE_ALTCHARS:
2183 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2184
2185 enum iso_code_class_type iso_code_class[256];
2186
2187 #define SAFE_CHARSET_P(coding, id) \
2188 ((id) <= (coding)->max_charset_id \
2189 && (coding)->safe_charsets[id] >= 0)
2190
2191
2192 #define SHIFT_OUT_OK(category) \
2193 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2194
2195 static void
2196 setup_iso_safe_charsets (Lisp_Object attrs)
2197 {
2198 Lisp_Object charset_list, safe_charsets;
2199 Lisp_Object request;
2200 Lisp_Object reg_usage;
2201 Lisp_Object tail;
2202 int reg94, reg96;
2203 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2204 int max_charset_id;
2205
2206 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2207 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2208 && ! EQ (charset_list, Viso_2022_charset_list))
2209 {
2210 CODING_ATTR_CHARSET_LIST (attrs)
2211 = charset_list = Viso_2022_charset_list;
2212 ASET (attrs, coding_attr_safe_charsets, Qnil);
2213 }
2214
2215 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2216 return;
2217
2218 max_charset_id = 0;
2219 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2220 {
2221 int id = XINT (XCAR (tail));
2222 if (max_charset_id < id)
2223 max_charset_id = id;
2224 }
2225
2226 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2227 make_number (255));
2228 request = AREF (attrs, coding_attr_iso_request);
2229 reg_usage = AREF (attrs, coding_attr_iso_usage);
2230 reg94 = XINT (XCAR (reg_usage));
2231 reg96 = XINT (XCDR (reg_usage));
2232
2233 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2234 {
2235 Lisp_Object id;
2236 Lisp_Object reg;
2237 struct charset *charset;
2238
2239 id = XCAR (tail);
2240 charset = CHARSET_FROM_ID (XINT (id));
2241 reg = Fcdr (Fassq (request, id));
2242 if (! NILP (reg))
2243 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2244 else if (charset->iso_chars_96)
2245 {
2246 if (reg96 < 4)
2247 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2248 }
2249 else
2250 {
2251 if (reg94 < 4)
2252 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2253 }
2254 }
2255 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2256 }
2257
2258
2259 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2260 Check if a text is encoded in ISO2022. If it is, returns an
2261 integer in which appropriate flag bits any of:
2262 CATEGORY_MASK_ISO_7
2263 CATEGORY_MASK_ISO_7_TIGHT
2264 CATEGORY_MASK_ISO_8_1
2265 CATEGORY_MASK_ISO_8_2
2266 CATEGORY_MASK_ISO_7_ELSE
2267 CATEGORY_MASK_ISO_8_ELSE
2268 are set. If a code which should never appear in ISO2022 is found,
2269 returns 0. */
2270
2271 static int
2272 detect_coding_iso_2022 (coding, mask)
2273 struct coding_system *coding;
2274 int *mask;
2275 {
2276 unsigned char *src = coding->source, *src_base = src;
2277 unsigned char *src_end = coding->source + coding->src_bytes;
2278 int multibytep = coding->src_multibyte;
2279 int mask_iso = CATEGORY_MASK_ISO;
2280 int mask_found = 0, mask_8bit_found = 0;
2281 int reg[4], shift_out = 0, single_shifting = 0;
2282 int id;
2283 int c, c1;
2284 int consumed_chars = 0;
2285 int i;
2286
2287 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2288 {
2289 struct coding_system *this = &(coding_categories[i]);
2290 Lisp_Object attrs, val;
2291
2292 attrs = CODING_ID_ATTRS (this->id);
2293 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2294 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2295 setup_iso_safe_charsets (attrs);
2296 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2297 this->max_charset_id = XSTRING (val)->size - 1;
2298 this->safe_charsets = (char *) XSTRING (val)->data;
2299 }
2300
2301 /* A coding system of this category is always ASCII compatible. */
2302 src += coding->head_ascii;
2303
2304 reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2305 while (mask_iso && src < src_end)
2306 {
2307 ONE_MORE_BYTE (c);
2308 switch (c)
2309 {
2310 case ISO_CODE_ESC:
2311 if (inhibit_iso_escape_detection)
2312 break;
2313 single_shifting = 0;
2314 ONE_MORE_BYTE (c);
2315 if (c >= '(' && c <= '/')
2316 {
2317 /* Designation sequence for a charset of dimension 1. */
2318 ONE_MORE_BYTE (c1);
2319 if (c1 < ' ' || c1 >= 0x80
2320 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2321 /* Invalid designation sequence. Just ignore. */
2322 break;
2323 reg[(c - '(') % 4] = id;
2324 }
2325 else if (c == '$')
2326 {
2327 /* Designation sequence for a charset of dimension 2. */
2328 ONE_MORE_BYTE (c);
2329 if (c >= '@' && c <= 'B')
2330 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2331 reg[0] = id = iso_charset_table[1][0][c];
2332 else if (c >= '(' && c <= '/')
2333 {
2334 ONE_MORE_BYTE (c1);
2335 if (c1 < ' ' || c1 >= 0x80
2336 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2337 /* Invalid designation sequence. Just ignore. */
2338 break;
2339 reg[(c - '(') % 4] = id;
2340 }
2341 else
2342 /* Invalid designation sequence. Just ignore. */
2343 break;
2344 }
2345 else if (c == 'N' || c == 'O')
2346 {
2347 /* ESC <Fe> for SS2 or SS3. */
2348 mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
2349 break;
2350 }
2351 else if (c >= '0' && c <= '4')
2352 {
2353 /* ESC <Fp> for start/end composition. */
2354 mask_found |= CATEGORY_MASK_ISO;
2355 break;
2356 }
2357 else
2358 {
2359 /* Invalid escape sequence. */
2360 mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2361 break;
2362 }
2363
2364 /* We found a valid designation sequence for CHARSET. */
2365 mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2366 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2367 id))
2368 mask_found |= CATEGORY_MASK_ISO_7;
2369 else
2370 mask_iso &= ~CATEGORY_MASK_ISO_7;
2371 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2372 id))
2373 mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
2374 else
2375 mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2376 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2377 id))
2378 mask_found |= CATEGORY_MASK_ISO_7_ELSE;
2379 else
2380 mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2381 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2382 id))
2383 mask_found |= CATEGORY_MASK_ISO_8_ELSE;
2384 else
2385 mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
2386 break;
2387
2388 case ISO_CODE_SO:
2389 if (inhibit_iso_escape_detection)
2390 break;
2391 single_shifting = 0;
2392 if (shift_out == 0
2393 && (reg[1] >= 0
2394 || SHIFT_OUT_OK (coding_category_iso_7_else)
2395 || SHIFT_OUT_OK (coding_category_iso_8_else)))
2396 {
2397 /* Locking shift out. */
2398 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2399 mask_found |= CATEGORY_MASK_ISO_ELSE;
2400 }
2401 break;
2402
2403 case ISO_CODE_SI:
2404 if (inhibit_iso_escape_detection)
2405 break;
2406 single_shifting = 0;
2407 if (shift_out == 1)
2408 {
2409 /* Locking shift in. */
2410 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2411 mask_found |= CATEGORY_MASK_ISO_ELSE;
2412 }
2413 break;
2414
2415 case ISO_CODE_CSI:
2416 single_shifting = 0;
2417 case ISO_CODE_SS2:
2418 case ISO_CODE_SS3:
2419 {
2420 int newmask = CATEGORY_MASK_ISO_8_ELSE;
2421
2422 if (inhibit_iso_escape_detection)
2423 break;
2424 if (c != ISO_CODE_CSI)
2425 {
2426 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2427 & CODING_ISO_FLAG_SINGLE_SHIFT)
2428 newmask |= CATEGORY_MASK_ISO_8_1;
2429 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2430 & CODING_ISO_FLAG_SINGLE_SHIFT)
2431 newmask |= CATEGORY_MASK_ISO_8_2;
2432 single_shifting = 1;
2433 }
2434 if (VECTORP (Vlatin_extra_code_table)
2435 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2436 {
2437 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2438 & CODING_ISO_FLAG_LATIN_EXTRA)
2439 newmask |= CATEGORY_MASK_ISO_8_1;
2440 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2441 & CODING_ISO_FLAG_LATIN_EXTRA)
2442 newmask |= CATEGORY_MASK_ISO_8_2;
2443 }
2444 mask_iso &= newmask;
2445 mask_found |= newmask;
2446 }
2447 break;
2448
2449 default:
2450 if (c < 0x80)
2451 {
2452 single_shifting = 0;
2453 break;
2454 }
2455 else if (c < 0xA0)
2456 {
2457 single_shifting = 0;
2458 mask_8bit_found = 1;
2459 if (VECTORP (Vlatin_extra_code_table)
2460 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2461 {
2462 int newmask = 0;
2463
2464 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2465 & CODING_ISO_FLAG_LATIN_EXTRA)
2466 newmask |= CATEGORY_MASK_ISO_8_1;
2467 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2468 & CODING_ISO_FLAG_LATIN_EXTRA)
2469 newmask |= CATEGORY_MASK_ISO_8_2;
2470 mask_iso &= newmask;
2471 mask_found |= newmask;
2472 }
2473 else
2474 return 0;
2475 }
2476 else
2477 {
2478 mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2479 | CATEGORY_MASK_ISO_7_ELSE);
2480 mask_found |= CATEGORY_MASK_ISO_8_1;
2481 mask_8bit_found = 1;
2482 /* Check the length of succeeding codes of the range
2483 0xA0..0FF. If the byte length is odd, we exclude
2484 CATEGORY_MASK_ISO_8_2. We can check this only
2485 when we are not single shifting. */
2486 if (!single_shifting
2487 && mask_iso & CATEGORY_MASK_ISO_8_2)
2488 {
2489 int i = 1;
2490 while (src < src_end)
2491 {
2492 ONE_MORE_BYTE (c);
2493 if (c < 0xA0)
2494 break;
2495 i++;
2496 }
2497
2498 if (i & 1 && src < src_end)
2499 mask_iso &= ~CATEGORY_MASK_ISO_8_2;
2500 else
2501 mask_found |= CATEGORY_MASK_ISO_8_2;
2502 }
2503 }
2504 break;
2505 }
2506 }
2507 no_more_source:
2508 if (!mask_iso)
2509 {
2510 *mask &= ~CATEGORY_MASK_ISO;
2511 return 0;
2512 }
2513 if (!mask_found)
2514 return 0;
2515 *mask &= mask_iso & mask_found;
2516 if (! mask_8bit_found)
2517 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2518 return 1;
2519 }
2520
2521
2522 /* Set designation state into CODING. */
2523 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2524 do { \
2525 int id, prev; \
2526 \
2527 if (final < '0' || final >= 128 \
2528 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2529 || !SAFE_CHARSET_P (coding, id)) \
2530 { \
2531 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2532 goto invalid_code; \
2533 } \
2534 prev = CODING_ISO_DESIGNATION (coding, reg); \
2535 CODING_ISO_DESIGNATION (coding, reg) = id; \
2536 /* If there was an invalid designation to REG previously, and this \
2537 designation is ASCII to REG, we should keep this designation \
2538 sequence. */ \
2539 if (prev == -2 && id == charset_ascii) \
2540 goto invalid_code; \
2541 } while (0)
2542
2543
2544 #define MAYBE_FINISH_COMPOSITION() \
2545 do { \
2546 int i; \
2547 if (composition_state == COMPOSING_NO) \
2548 break; \
2549 /* It is assured that we have enough room for producing \
2550 characters stored in the table `components'. */ \
2551 if (charbuf + component_idx > charbuf_end) \
2552 goto no_more_source; \
2553 composition_state = COMPOSING_NO; \
2554 if (method == COMPOSITION_RELATIVE \
2555 || method == COMPOSITION_WITH_ALTCHARS) \
2556 { \
2557 for (i = 0; i < component_idx; i++) \
2558 *charbuf++ = components[i]; \
2559 char_offset += component_idx; \
2560 } \
2561 else \
2562 { \
2563 for (i = 0; i < component_idx; i += 2) \
2564 *charbuf++ = components[i]; \
2565 char_offset += (component_idx / 2) + 1; \
2566 } \
2567 } while (0)
2568
2569
2570 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2571 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2572 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2573 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2574 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2575 */
2576
2577 #define DECODE_COMPOSITION_START(c1) \
2578 do { \
2579 if (c1 == '0' \
2580 && composition_state == COMPOSING_COMPONENT_CHAR) \
2581 { \
2582 component_len = component_idx; \
2583 composition_state = COMPOSING_CHAR; \
2584 } \
2585 else \
2586 { \
2587 unsigned char *p; \
2588 \
2589 MAYBE_FINISH_COMPOSITION (); \
2590 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2591 goto no_more_source; \
2592 for (p = src; p < src_end - 1; p++) \
2593 if (*p == ISO_CODE_ESC && p[1] == '1') \
2594 break; \
2595 if (p == src_end - 1) \
2596 { \
2597 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2598 goto invalid_code; \
2599 goto no_more_source; \
2600 } \
2601 \
2602 /* This is surely the start of a composition. */ \
2603 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2604 : c1 == '2' ? COMPOSITION_WITH_RULE \
2605 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2606 : COMPOSITION_WITH_RULE_ALTCHARS); \
2607 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2608 : COMPOSING_COMPONENT_CHAR); \
2609 component_idx = component_len = 0; \
2610 } \
2611 } while (0)
2612
2613
2614 /* Handle compositoin end sequence ESC 1. */
2615
2616 #define DECODE_COMPOSITION_END() \
2617 do { \
2618 int nchars = (component_len > 0 ? component_idx - component_len \
2619 : method == COMPOSITION_RELATIVE ? component_idx \
2620 : (component_idx + 1) / 2); \
2621 int i; \
2622 int *saved_charbuf = charbuf; \
2623 \
2624 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2625 if (method != COMPOSITION_RELATIVE) \
2626 { \
2627 if (component_len == 0) \
2628 for (i = 0; i < component_idx; i++) \
2629 *charbuf++ = components[i]; \
2630 else \
2631 for (i = 0; i < component_len; i++) \
2632 *charbuf++ = components[i]; \
2633 *saved_charbuf = saved_charbuf - charbuf; \
2634 } \
2635 if (method == COMPOSITION_WITH_RULE) \
2636 for (i = 0; i < component_idx; i += 2, char_offset++) \
2637 *charbuf++ = components[i]; \
2638 else \
2639 for (i = component_len; i < component_idx; i++, char_offset++) \
2640 *charbuf++ = components[i]; \
2641 coding->annotated = 1; \
2642 composition_state = COMPOSING_NO; \
2643 } while (0)
2644
2645
2646 /* Decode a composition rule from the byte C1 (and maybe one more byte
2647 from SRC) and store one encoded composition rule in
2648 coding->cmp_data. */
2649
2650 #define DECODE_COMPOSITION_RULE(c1) \
2651 do { \
2652 (c1) -= 32; \
2653 if (c1 < 81) /* old format (before ver.21) */ \
2654 { \
2655 int gref = (c1) / 9; \
2656 int nref = (c1) % 9; \
2657 if (gref == 4) gref = 10; \
2658 if (nref == 4) nref = 10; \
2659 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2660 } \
2661 else if (c1 < 93) /* new format (after ver.21) */ \
2662 { \
2663 ONE_MORE_BYTE (c2); \
2664 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2665 } \
2666 else \
2667 c1 = 0; \
2668 } while (0)
2669
2670
2671 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2672
2673 static void
2674 decode_coding_iso_2022 (coding)
2675 struct coding_system *coding;
2676 {
2677 unsigned char *src = coding->source + coding->consumed;
2678 unsigned char *src_end = coding->source + coding->src_bytes;
2679 unsigned char *src_base;
2680 int *charbuf = coding->charbuf;
2681 int *charbuf_end = charbuf + coding->charbuf_size - 4;
2682 int consumed_chars = 0, consumed_chars_base;
2683 int char_offset = 0;
2684 int multibytep = coding->src_multibyte;
2685 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2686 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2687 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2688 struct charset *charset;
2689 int c;
2690 /* For handling composition sequence. */
2691 #define COMPOSING_NO 0
2692 #define COMPOSING_CHAR 1
2693 #define COMPOSING_RULE 2
2694 #define COMPOSING_COMPONENT_CHAR 3
2695 #define COMPOSING_COMPONENT_RULE 4
2696
2697 int composition_state = COMPOSING_NO;
2698 enum composition_method method;
2699 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2700 int component_idx;
2701 int component_len;
2702 Lisp_Object attrs, eol_type, charset_list;
2703
2704 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2705 setup_iso_safe_charsets (attrs);
2706
2707 while (1)
2708 {
2709 int c1, c2;
2710
2711 src_base = src;
2712 consumed_chars_base = consumed_chars;
2713
2714 if (charbuf >= charbuf_end)
2715 break;
2716
2717 ONE_MORE_BYTE (c1);
2718
2719 /* We produce no character or one character. */
2720 switch (iso_code_class [c1])
2721 {
2722 case ISO_0x20_or_0x7F:
2723 if (composition_state != COMPOSING_NO)
2724 {
2725 if (composition_state == COMPOSING_RULE
2726 || composition_state == COMPOSING_COMPONENT_RULE)
2727 {
2728 DECODE_COMPOSITION_RULE (c1);
2729 components[component_idx++] = c1;
2730 composition_state--;
2731 continue;
2732 }
2733 else if (method == COMPOSITION_WITH_RULE)
2734 composition_state = COMPOSING_RULE;
2735 else if (method == COMPOSITION_WITH_RULE_ALTCHARS
2736 && composition_state == COMPOSING_COMPONENT_CHAR)
2737 composition_state = COMPOSING_COMPONENT_CHAR;
2738 }
2739 if (charset_id_0 < 0
2740 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2741 {
2742 /* This is SPACE or DEL. */
2743 charset = CHARSET_FROM_ID (charset_ascii);
2744 break;
2745 }
2746 /* This is a graphic character, we fall down ... */
2747
2748 case ISO_graphic_plane_0:
2749 if (composition_state == COMPOSING_RULE)
2750 {
2751 DECODE_COMPOSITION_RULE (c1);
2752 components[component_idx++] = c1;
2753 composition_state = COMPOSING_CHAR;
2754 }
2755 charset = CHARSET_FROM_ID (charset_id_0);
2756 break;
2757
2758 case ISO_0xA0_or_0xFF:
2759 if (charset_id_1 < 0
2760 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2761 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2762 goto invalid_code;
2763 /* This is a graphic character, we fall down ... */
2764
2765 case ISO_graphic_plane_1:
2766 if (charset_id_1 < 0)
2767 goto invalid_code;
2768 charset = CHARSET_FROM_ID (charset_id_1);
2769 break;
2770
2771 case ISO_carriage_return:
2772 if (c1 == '\r')
2773 {
2774 if (EQ (eol_type, Qdos))
2775 {
2776 if (src == src_end)
2777 goto no_more_source;
2778 if (*src == '\n')
2779 ONE_MORE_BYTE (c1);
2780 }
2781 else if (EQ (eol_type, Qmac))
2782 c1 = '\n';
2783 }
2784 /* fall through */
2785
2786 case ISO_control_0:
2787 MAYBE_FINISH_COMPOSITION ();
2788 charset = CHARSET_FROM_ID (charset_ascii);
2789 break;
2790
2791 case ISO_control_1:
2792 MAYBE_FINISH_COMPOSITION ();
2793 goto invalid_code;
2794
2795 case ISO_shift_out:
2796 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2797 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2798 goto invalid_code;
2799 CODING_ISO_INVOCATION (coding, 0) = 1;
2800 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2801 continue;
2802
2803 case ISO_shift_in:
2804 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2805 goto invalid_code;
2806 CODING_ISO_INVOCATION (coding, 0) = 0;
2807 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2808 continue;
2809
2810 case ISO_single_shift_2_7:
2811 case ISO_single_shift_2:
2812 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2813 goto invalid_code;
2814 /* SS2 is handled as an escape sequence of ESC 'N' */
2815 c1 = 'N';
2816 goto label_escape_sequence;
2817
2818 case ISO_single_shift_3:
2819 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2820 goto invalid_code;
2821 /* SS2 is handled as an escape sequence of ESC 'O' */
2822 c1 = 'O';
2823 goto label_escape_sequence;
2824
2825 case ISO_control_sequence_introducer:
2826 /* CSI is handled as an escape sequence of ESC '[' ... */
2827 c1 = '[';
2828 goto label_escape_sequence;
2829
2830 case ISO_escape:
2831 ONE_MORE_BYTE (c1);
2832 label_escape_sequence:
2833 /* Escape sequences handled here are invocation,
2834 designation, direction specification, and character
2835 composition specification. */
2836 switch (c1)
2837 {
2838 case '&': /* revision of following character set */
2839 ONE_MORE_BYTE (c1);
2840 if (!(c1 >= '@' && c1 <= '~'))
2841 goto invalid_code;
2842 ONE_MORE_BYTE (c1);
2843 if (c1 != ISO_CODE_ESC)
2844 goto invalid_code;
2845 ONE_MORE_BYTE (c1);
2846 goto label_escape_sequence;
2847
2848 case '$': /* designation of 2-byte character set */
2849 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2850 goto invalid_code;
2851 ONE_MORE_BYTE (c1);
2852 if (c1 >= '@' && c1 <= 'B')
2853 { /* designation of JISX0208.1978, GB2312.1980,
2854 or JISX0208.1980 */
2855 DECODE_DESIGNATION (0, 2, 0, c1);
2856 }
2857 else if (c1 >= 0x28 && c1 <= 0x2B)
2858 { /* designation of DIMENSION2_CHARS94 character set */
2859 ONE_MORE_BYTE (c2);
2860 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2861 }
2862 else if (c1 >= 0x2C && c1 <= 0x2F)
2863 { /* designation of DIMENSION2_CHARS96 character set */
2864 ONE_MORE_BYTE (c2);
2865 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2866 }
2867 else
2868 goto invalid_code;
2869 /* We must update these variables now. */
2870 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2871 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2872 continue;
2873
2874 case 'n': /* invocation of locking-shift-2 */
2875 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2876 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2877 goto invalid_code;
2878 CODING_ISO_INVOCATION (coding, 0) = 2;
2879 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2880 continue;
2881
2882 case 'o': /* invocation of locking-shift-3 */
2883 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2884 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2885 goto invalid_code;
2886 CODING_ISO_INVOCATION (coding, 0) = 3;
2887 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2888 continue;
2889
2890 case 'N': /* invocation of single-shift-2 */
2891 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2892 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2893 goto invalid_code;
2894 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2895 ONE_MORE_BYTE (c1);
2896 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2897 goto invalid_code;
2898 break;
2899
2900 case 'O': /* invocation of single-shift-3 */
2901 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2902 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2903 goto invalid_code;
2904 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2905 ONE_MORE_BYTE (c1);
2906 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2907 goto invalid_code;
2908 break;
2909
2910 case '0': case '2': case '3': case '4': /* start composition */
2911 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2912 goto invalid_code;
2913 DECODE_COMPOSITION_START (c1);
2914 continue;
2915
2916 case '1': /* end composition */
2917 if (composition_state == COMPOSING_NO)
2918 goto invalid_code;
2919 DECODE_COMPOSITION_END ();
2920 continue;
2921
2922 case '[': /* specification of direction */
2923 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2924 goto invalid_code;
2925 /* For the moment, nested direction is not supported.
2926 So, `coding->mode & CODING_MODE_DIRECTION' zero means
2927 left-to-right, and nozero means right-to-left. */
2928 ONE_MORE_BYTE (c1);
2929 switch (c1)
2930 {
2931 case ']': /* end of the current direction */
2932 coding->mode &= ~CODING_MODE_DIRECTION;
2933
2934 case '0': /* end of the current direction */
2935 case '1': /* start of left-to-right direction */
2936 ONE_MORE_BYTE (c1);
2937 if (c1 == ']')
2938 coding->mode &= ~CODING_MODE_DIRECTION;
2939 else
2940 goto invalid_code;
2941 break;
2942
2943 case '2': /* start of right-to-left direction */
2944 ONE_MORE_BYTE (c1);
2945 if (c1 == ']')
2946 coding->mode |= CODING_MODE_DIRECTION;
2947 else
2948 goto invalid_code;
2949 break;
2950
2951 default:
2952 goto invalid_code;
2953 }
2954 continue;
2955
2956 default:
2957 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2958 goto invalid_code;
2959 if (c1 >= 0x28 && c1 <= 0x2B)
2960 { /* designation of DIMENSION1_CHARS94 character set */
2961 ONE_MORE_BYTE (c2);
2962 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
2963 }
2964 else if (c1 >= 0x2C && c1 <= 0x2F)
2965 { /* designation of DIMENSION1_CHARS96 character set */
2966 ONE_MORE_BYTE (c2);
2967 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
2968 }
2969 else
2970 goto invalid_code;
2971 /* We must update these variables now. */
2972 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2973 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2974 continue;
2975 }
2976 }
2977
2978 /* Now we know CHARSET and 1st position code C1 of a character.
2979 Produce a decoded character while getting 2nd position code
2980 C2 if necessary. */
2981 c1 &= 0x7F;
2982 if (CHARSET_DIMENSION (charset) > 1)
2983 {
2984 ONE_MORE_BYTE (c2);
2985 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2986 /* C2 is not in a valid range. */
2987 goto invalid_code;
2988 c1 = (c1 << 8) | (c2 & 0x7F);
2989 if (CHARSET_DIMENSION (charset) > 2)
2990 {
2991 ONE_MORE_BYTE (c2);
2992 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2993 /* C2 is not in a valid range. */
2994 goto invalid_code;
2995 c1 = (c1 << 8) | (c2 & 0x7F);
2996 }
2997 }
2998
2999 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3000 if (c < 0)
3001 {
3002 MAYBE_FINISH_COMPOSITION ();
3003 for (; src_base < src; src_base++, char_offset++)
3004 {
3005 if (ASCII_BYTE_P (*src_base))
3006 *charbuf++ = *src_base;
3007 else
3008 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3009 }
3010 }
3011 else if (composition_state == COMPOSING_NO)
3012 {
3013 *charbuf++ = c;
3014 char_offset++;
3015 }
3016 else
3017 components[component_idx++] = c;
3018 continue;
3019
3020 invalid_code:
3021 MAYBE_FINISH_COMPOSITION ();
3022 src = src_base;
3023 consumed_chars = consumed_chars_base;
3024 ONE_MORE_BYTE (c);
3025 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3026 coding->errors++;
3027 }
3028
3029 no_more_source:
3030 coding->consumed_char += consumed_chars_base;
3031 coding->consumed = src_base - coding->source;
3032 coding->charbuf_used = charbuf - coding->charbuf;
3033 }
3034
3035
3036 /* ISO2022 encoding stuff. */
3037
3038 /*
3039 It is not enough to say just "ISO2022" on encoding, we have to
3040 specify more details. In Emacs, each coding system of ISO2022
3041 variant has the following specifications:
3042 1. Initial designation to G0 thru G3.
3043 2. Allows short-form designation?
3044 3. ASCII should be designated to G0 before control characters?
3045 4. ASCII should be designated to G0 at end of line?
3046 5. 7-bit environment or 8-bit environment?
3047 6. Use locking-shift?
3048 7. Use Single-shift?
3049 And the following two are only for Japanese:
3050 8. Use ASCII in place of JIS0201-1976-Roman?
3051 9. Use JISX0208-1983 in place of JISX0208-1978?
3052 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3053 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3054 details.
3055 */
3056
3057 /* Produce codes (escape sequence) for designating CHARSET to graphic
3058 register REG at DST, and increment DST. If <final-char> of CHARSET is
3059 '@', 'A', or 'B' and the coding system CODING allows, produce
3060 designation sequence of short-form. */
3061
3062 #define ENCODE_DESIGNATION(charset, reg, coding) \
3063 do { \
3064 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3065 char *intermediate_char_94 = "()*+"; \
3066 char *intermediate_char_96 = ",-./"; \
3067 int revision = -1; \
3068 int c; \
3069 \
3070 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3071 revision = XINT (CHARSET_ISO_REVISION (charset)); \
3072 \
3073 if (revision >= 0) \
3074 { \
3075 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3076 EMIT_ONE_BYTE ('@' + revision); \
3077 } \
3078 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3079 if (CHARSET_DIMENSION (charset) == 1) \
3080 { \
3081 if (! CHARSET_ISO_CHARS_96 (charset)) \
3082 c = intermediate_char_94[reg]; \
3083 else \
3084 c = intermediate_char_96[reg]; \
3085 EMIT_ONE_ASCII_BYTE (c); \
3086 } \
3087 else \
3088 { \
3089 EMIT_ONE_ASCII_BYTE ('$'); \
3090 if (! CHARSET_ISO_CHARS_96 (charset)) \
3091 { \
3092 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3093 || reg != 0 \
3094 || final_char < '@' || final_char > 'B') \
3095 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3096 } \
3097 else \
3098 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3099 } \
3100 EMIT_ONE_ASCII_BYTE (final_char); \
3101 \
3102 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3103 } while (0)
3104
3105
3106 /* The following two macros produce codes (control character or escape
3107 sequence) for ISO2022 single-shift functions (single-shift-2 and
3108 single-shift-3). */
3109
3110 #define ENCODE_SINGLE_SHIFT_2 \
3111 do { \
3112 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3113 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3114 else \
3115 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3116 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3117 } while (0)
3118
3119
3120 #define ENCODE_SINGLE_SHIFT_3 \
3121 do { \
3122 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3123 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3124 else \
3125 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3126 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3127 } while (0)
3128
3129
3130 /* The following four macros produce codes (control character or
3131 escape sequence) for ISO2022 locking-shift functions (shift-in,
3132 shift-out, locking-shift-2, and locking-shift-3). */
3133
3134 #define ENCODE_SHIFT_IN \
3135 do { \
3136 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3137 CODING_ISO_INVOCATION (coding, 0) = 0; \
3138 } while (0)
3139
3140
3141 #define ENCODE_SHIFT_OUT \
3142 do { \
3143 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3144 CODING_ISO_INVOCATION (coding, 0) = 1; \
3145 } while (0)
3146
3147
3148 #define ENCODE_LOCKING_SHIFT_2 \
3149 do { \
3150 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3151 CODING_ISO_INVOCATION (coding, 0) = 2; \
3152 } while (0)
3153
3154
3155 #define ENCODE_LOCKING_SHIFT_3 \
3156 do { \
3157 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3158 CODING_ISO_INVOCATION (coding, 0) = 3; \
3159 } while (0)
3160
3161
3162 /* Produce codes for a DIMENSION1 character whose character set is
3163 CHARSET and whose position-code is C1. Designation and invocation
3164 sequences are also produced in advance if necessary. */
3165
3166 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3167 do { \
3168 int id = CHARSET_ID (charset); \
3169 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3170 { \
3171 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3172 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3173 else \
3174 EMIT_ONE_BYTE (c1 | 0x80); \
3175 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3176 break; \
3177 } \
3178 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3179 { \
3180 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3181 break; \
3182 } \
3183 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3184 { \
3185 EMIT_ONE_BYTE (c1 | 0x80); \
3186 break; \
3187 } \
3188 else \
3189 /* Since CHARSET is not yet invoked to any graphic planes, we \
3190 must invoke it, or, at first, designate it to some graphic \
3191 register. Then repeat the loop to actually produce the \
3192 character. */ \
3193 dst = encode_invocation_designation (charset, coding, dst, \
3194 &produced_chars); \
3195 } while (1)
3196
3197
3198 /* Produce codes for a DIMENSION2 character whose character set is
3199 CHARSET and whose position-codes are C1 and C2. Designation and
3200 invocation codes are also produced in advance if necessary. */
3201
3202 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3203 do { \
3204 int id = CHARSET_ID (charset); \
3205 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3206 { \
3207 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3208 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3209 else \
3210 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3211 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3212 break; \
3213 } \
3214 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3215 { \
3216 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3217 break; \
3218 } \
3219 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3220 { \
3221 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3222 break; \
3223 } \
3224 else \
3225 /* Since CHARSET is not yet invoked to any graphic planes, we \
3226 must invoke it, or, at first, designate it to some graphic \
3227 register. Then repeat the loop to actually produce the \
3228 character. */ \
3229 dst = encode_invocation_designation (charset, coding, dst, \
3230 &produced_chars); \
3231 } while (1)
3232
3233
3234 #define ENCODE_ISO_CHARACTER(charset, c) \
3235 do { \
3236 int code = ENCODE_CHAR ((charset),(c)); \
3237 \
3238 if (CHARSET_DIMENSION (charset) == 1) \
3239 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3240 else \
3241 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3242 } while (0)
3243
3244
3245 /* Produce designation and invocation codes at a place pointed by DST
3246 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3247 Return new DST. */
3248
3249 unsigned char *
3250 encode_invocation_designation (charset, coding, dst, p_nchars)
3251 struct charset *charset;
3252 struct coding_system *coding;
3253 unsigned char *dst;
3254 int *p_nchars;
3255 {
3256 int multibytep = coding->dst_multibyte;
3257 int produced_chars = *p_nchars;
3258 int reg; /* graphic register number */
3259 int id = CHARSET_ID (charset);
3260
3261 /* At first, check designations. */
3262 for (reg = 0; reg < 4; reg++)
3263 if (id == CODING_ISO_DESIGNATION (coding, reg))
3264 break;
3265
3266 if (reg >= 4)
3267 {
3268 /* CHARSET is not yet designated to any graphic registers. */
3269 /* At first check the requested designation. */
3270 reg = CODING_ISO_REQUEST (coding, id);
3271 if (reg < 0)
3272 /* Since CHARSET requests no special designation, designate it
3273 to graphic register 0. */
3274 reg = 0;
3275
3276 ENCODE_DESIGNATION (charset, reg, coding);
3277 }
3278
3279 if (CODING_ISO_INVOCATION (coding, 0) != reg
3280 && CODING_ISO_INVOCATION (coding, 1) != reg)
3281 {
3282 /* Since the graphic register REG is not invoked to any graphic
3283 planes, invoke it to graphic plane 0. */
3284 switch (reg)
3285 {
3286 case 0: /* graphic register 0 */
3287 ENCODE_SHIFT_IN;
3288 break;
3289
3290 case 1: /* graphic register 1 */
3291 ENCODE_SHIFT_OUT;
3292 break;
3293
3294 case 2: /* graphic register 2 */
3295 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3296 ENCODE_SINGLE_SHIFT_2;
3297 else
3298 ENCODE_LOCKING_SHIFT_2;
3299 break;
3300
3301 case 3: /* graphic register 3 */
3302 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3303 ENCODE_SINGLE_SHIFT_3;
3304 else
3305 ENCODE_LOCKING_SHIFT_3;
3306 break;
3307 }
3308 }
3309
3310 *p_nchars = produced_chars;
3311 return dst;
3312 }
3313
3314 /* The following three macros produce codes for indicating direction
3315 of text. */
3316 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3317 do { \
3318 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3319 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3320 else \
3321 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3322 } while (0)
3323
3324
3325 #define ENCODE_DIRECTION_R2L() \
3326 do { \
3327 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3328 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3329 } while (0)
3330
3331
3332 #define ENCODE_DIRECTION_L2R() \
3333 do { \
3334 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3335 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3336 } while (0)
3337
3338
3339 /* Produce codes for designation and invocation to reset the graphic
3340 planes and registers to initial state. */
3341 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3342 do { \
3343 int reg; \
3344 struct charset *charset; \
3345 \
3346 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3347 ENCODE_SHIFT_IN; \
3348 for (reg = 0; reg < 4; reg++) \
3349 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3350 && (CODING_ISO_DESIGNATION (coding, reg) \
3351 != CODING_ISO_INITIAL (coding, reg))) \
3352 { \
3353 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3354 ENCODE_DESIGNATION (charset, reg, coding); \
3355 } \
3356 } while (0)
3357
3358
3359 /* Produce designation sequences of charsets in the line started from
3360 SRC to a place pointed by DST, and return updated DST.
3361
3362 If the current block ends before any end-of-line, we may fail to
3363 find all the necessary designations. */
3364
3365 static unsigned char *
3366 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3367 struct coding_system *coding;
3368 int *charbuf, *charbuf_end;
3369 unsigned char *dst;
3370 {
3371 struct charset *charset;
3372 /* Table of charsets to be designated to each graphic register. */
3373 int r[4];
3374 int c, found = 0, reg;
3375 int produced_chars = 0;
3376 int multibytep = coding->dst_multibyte;
3377 Lisp_Object attrs;
3378 Lisp_Object charset_list;
3379
3380 attrs = CODING_ID_ATTRS (coding->id);
3381 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3382 if (EQ (charset_list, Qiso_2022))
3383 charset_list = Viso_2022_charset_list;
3384
3385 for (reg = 0; reg < 4; reg++)
3386 r[reg] = -1;
3387
3388 while (found < 4)
3389 {
3390 int id;
3391
3392 c = *charbuf++;
3393 if (c == '\n')
3394 break;
3395 charset = char_charset (c, charset_list, NULL);
3396 id = CHARSET_ID (charset);
3397 reg = CODING_ISO_REQUEST (coding, id);
3398 if (reg >= 0 && r[reg] < 0)
3399 {
3400 found++;
3401 r[reg] = id;
3402 }
3403 }
3404
3405 if (found)
3406 {
3407 for (reg = 0; reg < 4; reg++)
3408 if (r[reg] >= 0
3409 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3410 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3411 }
3412
3413 return dst;
3414 }
3415
3416 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3417
3418 static int
3419 encode_coding_iso_2022 (coding)
3420 struct coding_system *coding;
3421 {
3422 int multibytep = coding->dst_multibyte;
3423 int *charbuf = coding->charbuf;
3424 int *charbuf_end = charbuf + coding->charbuf_used;
3425 unsigned char *dst = coding->destination + coding->produced;
3426 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3427 int safe_room = 16;
3428 int bol_designation
3429 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3430 && CODING_ISO_BOL (coding));
3431 int produced_chars = 0;
3432 Lisp_Object attrs, eol_type, charset_list;
3433 int ascii_compatible;
3434 int c;
3435
3436 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3437
3438 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3439
3440 while (charbuf < charbuf_end)
3441 {
3442 ASSURE_DESTINATION (safe_room);
3443
3444 if (bol_designation)
3445 {
3446 unsigned char *dst_prev = dst;
3447
3448 /* We have to produce designation sequences if any now. */
3449 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3450 bol_designation = 0;
3451 /* We are sure that designation sequences are all ASCII bytes. */
3452 produced_chars += dst - dst_prev;
3453 }
3454
3455 c = *charbuf++;
3456
3457 /* Now encode the character C. */
3458 if (c < 0x20 || c == 0x7F)
3459 {
3460 if (c == '\n'
3461 || (c == '\r' && EQ (eol_type, Qmac)))
3462 {
3463 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3464 ENCODE_RESET_PLANE_AND_REGISTER ();
3465 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3466 {
3467 int i;
3468
3469 for (i = 0; i < 4; i++)
3470 CODING_ISO_DESIGNATION (coding, i)
3471 = CODING_ISO_INITIAL (coding, i);
3472 }
3473 bol_designation
3474 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3475 }
3476 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3477 ENCODE_RESET_PLANE_AND_REGISTER ();
3478 EMIT_ONE_ASCII_BYTE (c);
3479 }
3480 else if (ASCII_CHAR_P (c))
3481 {
3482 if (ascii_compatible)
3483 EMIT_ONE_ASCII_BYTE (c);
3484 else
3485 ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii), c);
3486 }
3487 else
3488 {
3489 struct charset *charset = char_charset (c, charset_list, NULL);
3490
3491 if (!charset)
3492 {
3493 c = coding->default_char;
3494 charset = char_charset (c, charset_list, NULL);
3495 }
3496 ENCODE_ISO_CHARACTER (charset, c);
3497 }
3498 }
3499
3500 if (coding->mode & CODING_MODE_LAST_BLOCK
3501 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3502 {
3503 ASSURE_DESTINATION (safe_room);
3504 ENCODE_RESET_PLANE_AND_REGISTER ();
3505 }
3506 coding->result = CODING_RESULT_SUCCESS;
3507 CODING_ISO_BOL (coding) = bol_designation;
3508 coding->produced_char += produced_chars;
3509 coding->produced = dst - coding->destination;
3510 return 0;
3511 }
3512
3513 \f
3514 /*** 8,9. SJIS and BIG5 handlers ***/
3515
3516 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3517 quite widely. So, for the moment, Emacs supports them in the bare
3518 C code. But, in the future, they may be supported only by CCL. */
3519
3520 /* SJIS is a coding system encoding three character sets: ASCII, right
3521 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3522 as is. A character of charset katakana-jisx0201 is encoded by
3523 "position-code + 0x80". A character of charset japanese-jisx0208
3524 is encoded in 2-byte but two position-codes are divided and shifted
3525 so that it fit in the range below.
3526
3527 --- CODE RANGE of SJIS ---
3528 (character set) (range)
3529 ASCII 0x00 .. 0x7F
3530 KATAKANA-JISX0201 0xA0 .. 0xDF
3531 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3532 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3533 -------------------------------
3534
3535 */
3536
3537 /* BIG5 is a coding system encoding two character sets: ASCII and
3538 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3539 character set and is encoded in two-byte.
3540
3541 --- CODE RANGE of BIG5 ---
3542 (character set) (range)
3543 ASCII 0x00 .. 0x7F
3544 Big5 (1st byte) 0xA1 .. 0xFE
3545 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3546 --------------------------
3547
3548 */
3549
3550 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3551 Check if a text is encoded in SJIS. If it is, return
3552 CATEGORY_MASK_SJIS, else return 0. */
3553
3554 static int
3555 detect_coding_sjis (coding, mask)
3556 struct coding_system *coding;
3557 int *mask;
3558 {
3559 unsigned char *src = coding->source, *src_base = src;
3560 unsigned char *src_end = coding->source + coding->src_bytes;
3561 int multibytep = coding->src_multibyte;
3562 int consumed_chars = 0;
3563 int found = 0;
3564 int c;
3565
3566 /* A coding system of this category is always ASCII compatible. */
3567 src += coding->head_ascii;
3568
3569 while (1)
3570 {
3571 ONE_MORE_BYTE (c);
3572 if (c < 0x80)
3573 continue;
3574 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3575 {
3576 ONE_MORE_BYTE (c);
3577 if (c < 0x40 || c == 0x7F || c > 0xFC)
3578 break;
3579 found = 1;
3580 }
3581 else if (c >= 0xA0 && c < 0xE0)
3582 found = 1;
3583 else
3584 break;
3585 }
3586 *mask &= ~CATEGORY_MASK_SJIS;
3587 return 0;
3588
3589 no_more_source:
3590 if (!found)
3591 return 0;
3592 *mask &= CATEGORY_MASK_SJIS;
3593 return 1;
3594 }
3595
3596 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3597 Check if a text is encoded in BIG5. If it is, return
3598 CATEGORY_MASK_BIG5, else return 0. */
3599
3600 static int
3601 detect_coding_big5 (coding, mask)
3602 struct coding_system *coding;
3603 int *mask;
3604 {
3605 unsigned char *src = coding->source, *src_base = src;
3606 unsigned char *src_end = coding->source + coding->src_bytes;
3607 int multibytep = coding->src_multibyte;
3608 int consumed_chars = 0;
3609 int found = 0;
3610 int c;
3611
3612 /* A coding system of this category is always ASCII compatible. */
3613 src += coding->head_ascii;
3614
3615 while (1)
3616 {
3617 ONE_MORE_BYTE (c);
3618 if (c < 0x80)
3619 continue;
3620 if (c >= 0xA1)
3621 {
3622 ONE_MORE_BYTE (c);
3623 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3624 return 0;
3625 found = 1;
3626 }
3627 else
3628 break;
3629 }
3630 *mask &= ~CATEGORY_MASK_BIG5;
3631 return 0;
3632
3633 no_more_source:
3634 if (!found)
3635 return 0;
3636 *mask &= CATEGORY_MASK_BIG5;
3637 return 1;
3638 }
3639
3640 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3641 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3642
3643 static void
3644 decode_coding_sjis (coding)
3645 struct coding_system *coding;
3646 {
3647 unsigned char *src = coding->source + coding->consumed;
3648 unsigned char *src_end = coding->source + coding->src_bytes;
3649 unsigned char *src_base;
3650 int *charbuf = coding->charbuf;
3651 int *charbuf_end = charbuf + coding->charbuf_size;
3652 int consumed_chars = 0, consumed_chars_base;
3653 int multibytep = coding->src_multibyte;
3654 struct charset *charset_roman, *charset_kanji, *charset_kana;
3655 Lisp_Object attrs, eol_type, charset_list, val;
3656
3657 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3658
3659 val = charset_list;
3660 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3661 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3662 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
3663
3664 while (1)
3665 {
3666 int c, c1;
3667
3668 src_base = src;
3669 consumed_chars_base = consumed_chars;
3670
3671 if (charbuf >= charbuf_end)
3672 break;
3673
3674 ONE_MORE_BYTE (c);
3675
3676 if (c == '\r')
3677 {
3678 if (EQ (eol_type, Qdos))
3679 {
3680 if (src == src_end)
3681 goto no_more_source;
3682 if (*src == '\n')
3683 ONE_MORE_BYTE (c);
3684 }
3685 else if (EQ (eol_type, Qmac))
3686 c = '\n';
3687 }
3688 else
3689 {
3690 struct charset *charset;
3691
3692 if (c < 0x80)
3693 charset = charset_roman;
3694 else
3695 {
3696 if (c >= 0xF0)
3697 goto invalid_code;
3698 if (c < 0xA0 || c >= 0xE0)
3699 {
3700 /* SJIS -> JISX0208 */
3701 ONE_MORE_BYTE (c1);
3702 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3703 goto invalid_code;
3704 c = (c << 8) | c1;
3705 SJIS_TO_JIS (c);
3706 charset = charset_kanji;
3707 }
3708 else
3709 /* SJIS -> JISX0201-Kana */
3710 charset = charset_kana;
3711 }
3712 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3713 }
3714 *charbuf++ = c;
3715 continue;
3716
3717 invalid_code:
3718 src = src_base;
3719 consumed_chars = consumed_chars_base;
3720 ONE_MORE_BYTE (c);
3721 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3722 coding->errors++;
3723 }
3724
3725 no_more_source:
3726 coding->consumed_char += consumed_chars_base;
3727 coding->consumed = src_base - coding->source;
3728 coding->charbuf_used = charbuf - coding->charbuf;
3729 }
3730
3731 static void
3732 decode_coding_big5 (coding)
3733 struct coding_system *coding;
3734 {
3735 unsigned char *src = coding->source + coding->consumed;
3736 unsigned char *src_end = coding->source + coding->src_bytes;
3737 unsigned char *src_base;
3738 int *charbuf = coding->charbuf;
3739 int *charbuf_end = charbuf + coding->charbuf_size;
3740 int consumed_chars = 0, consumed_chars_base;
3741 int multibytep = coding->src_multibyte;
3742 struct charset *charset_roman, *charset_big5;
3743 Lisp_Object attrs, eol_type, charset_list, val;
3744
3745 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3746 val = charset_list;
3747 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3748 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3749
3750 while (1)
3751 {
3752 int c, c1;
3753
3754 src_base = src;
3755 consumed_chars_base = consumed_chars;
3756
3757 if (charbuf >= charbuf_end)
3758 break;
3759
3760 ONE_MORE_BYTE (c);
3761
3762 if (c == '\r')
3763 {
3764 if (EQ (eol_type, Qdos))
3765 {
3766 if (src == src_end)
3767 goto no_more_source;
3768 if (*src == '\n')
3769 ONE_MORE_BYTE (c);
3770 }
3771 else if (EQ (eol_type, Qmac))
3772 c = '\n';
3773 }
3774 else
3775 {
3776 struct charset *charset;
3777 if (c < 0x80)
3778 charset = charset_roman;
3779 else
3780 {
3781 /* BIG5 -> Big5 */
3782 if (c < 0xA1 || c > 0xFE)
3783 goto invalid_code;
3784 ONE_MORE_BYTE (c1);
3785 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3786 goto invalid_code;
3787 c = c << 8 | c1;
3788 charset = charset_big5;
3789 }
3790 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3791 }
3792
3793 *charbuf++ = c;
3794 continue;
3795
3796 invalid_code:
3797 src = src_base;
3798 consumed_chars = consumed_chars_base;
3799 ONE_MORE_BYTE (c);
3800 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3801 coding->errors++;
3802 }
3803
3804 no_more_source:
3805 coding->consumed_char += consumed_chars_base;
3806 coding->consumed = src_base - coding->source;
3807 coding->charbuf_used = charbuf - coding->charbuf;
3808 }
3809
3810 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3811 This function can encode charsets `ascii', `katakana-jisx0201',
3812 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3813 are sure that all these charsets are registered as official charset
3814 (i.e. do not have extended leading-codes). Characters of other
3815 charsets are produced without any encoding. If SJIS_P is 1, encode
3816 SJIS text, else encode BIG5 text. */
3817
3818 static int
3819 encode_coding_sjis (coding)
3820 struct coding_system *coding;
3821 {
3822 int multibytep = coding->dst_multibyte;
3823 int *charbuf = coding->charbuf;
3824 int *charbuf_end = charbuf + coding->charbuf_used;
3825 unsigned char *dst = coding->destination + coding->produced;
3826 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3827 int safe_room = 4;
3828 int produced_chars = 0;
3829 Lisp_Object attrs, eol_type, charset_list, val;
3830 int ascii_compatible;
3831 struct charset *charset_roman, *charset_kanji, *charset_kana;
3832 int c;
3833
3834 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3835 val = charset_list;
3836 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3837 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3838 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3839
3840 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3841
3842 while (charbuf < charbuf_end)
3843 {
3844 ASSURE_DESTINATION (safe_room);
3845 c = *charbuf++;
3846 /* Now encode the character C. */
3847 if (ASCII_CHAR_P (c) && ascii_compatible)
3848 EMIT_ONE_ASCII_BYTE (c);
3849 else
3850 {
3851 unsigned code;
3852 struct charset *charset = char_charset (c, charset_list, &code);
3853
3854 if (!charset)
3855 {
3856 c = coding->default_char;
3857 charset = char_charset (c, charset_list, &code);
3858 }
3859 if (code == CHARSET_INVALID_CODE (charset))
3860 abort ();
3861 if (charset == charset_kanji)
3862 {
3863 int c1, c2;
3864 JIS_TO_SJIS (code);
3865 c1 = code >> 8, c2 = code & 0xFF;
3866 EMIT_TWO_BYTES (c1, c2);
3867 }
3868 else if (charset == charset_kana)
3869 EMIT_ONE_BYTE (code | 0x80);
3870 else
3871 EMIT_ONE_ASCII_BYTE (code & 0x7F);
3872 }
3873 }
3874 coding->result = CODING_RESULT_SUCCESS;
3875 coding->produced_char += produced_chars;
3876 coding->produced = dst - coding->destination;
3877 return 0;
3878 }
3879
3880 static int
3881 encode_coding_big5 (coding)
3882 struct coding_system *coding;
3883 {
3884 int multibytep = coding->dst_multibyte;
3885 int *charbuf = coding->charbuf;
3886 int *charbuf_end = charbuf + coding->charbuf_used;
3887 unsigned char *dst = coding->destination + coding->produced;
3888 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3889 int safe_room = 4;
3890 int produced_chars = 0;
3891 Lisp_Object attrs, eol_type, charset_list, val;
3892 int ascii_compatible;
3893 struct charset *charset_roman, *charset_big5;
3894 int c;
3895
3896 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3897 val = charset_list;
3898 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3899 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3900 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3901
3902 while (charbuf < charbuf_end)
3903 {
3904 ASSURE_DESTINATION (safe_room);
3905 c = *charbuf++;
3906 /* Now encode the character C. */
3907 if (ASCII_CHAR_P (c) && ascii_compatible)
3908 EMIT_ONE_ASCII_BYTE (c);
3909 else
3910 {
3911 unsigned code;
3912 struct charset *charset = char_charset (c, charset_list, &code);
3913
3914 if (! charset)
3915 {
3916 c = coding->default_char;
3917 charset = char_charset (c, charset_list, &code);
3918 }
3919 if (code == CHARSET_INVALID_CODE (charset))
3920 abort ();
3921 if (charset == charset_big5)
3922 {
3923 int c1, c2;
3924
3925 c1 = code >> 8, c2 = code & 0xFF;
3926 EMIT_TWO_BYTES (c1, c2);
3927 }
3928 else
3929 EMIT_ONE_ASCII_BYTE (code & 0x7F);
3930 }
3931 }
3932 coding->result = CODING_RESULT_SUCCESS;
3933 coding->produced_char += produced_chars;
3934 coding->produced = dst - coding->destination;
3935 return 0;
3936 }
3937
3938 \f
3939 /*** 10. CCL handlers ***/
3940
3941 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3942 Check if a text is encoded in a coding system of which
3943 encoder/decoder are written in CCL program. If it is, return
3944 CATEGORY_MASK_CCL, else return 0. */
3945
3946 static int
3947 detect_coding_ccl (coding, mask)
3948 struct coding_system *coding;
3949 int *mask;
3950 {
3951 unsigned char *src = coding->source, *src_base = src;
3952 unsigned char *src_end = coding->source + coding->src_bytes;
3953 int multibytep = coding->src_multibyte;
3954 int consumed_chars = 0;
3955 int found = 0;
3956 unsigned char *valids = CODING_CCL_VALIDS (coding);
3957 int head_ascii = coding->head_ascii;
3958 Lisp_Object attrs;
3959
3960 coding = &coding_categories[coding_category_ccl];
3961 attrs = CODING_ID_ATTRS (coding->id);
3962 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
3963 src += head_ascii;
3964
3965 while (1)
3966 {
3967 int c;
3968 ONE_MORE_BYTE (c);
3969 if (! valids[c])
3970 break;
3971 if (!found && valids[c] > 1)
3972 found = 1;
3973 }
3974 *mask &= ~CATEGORY_MASK_CCL;
3975 return 0;
3976
3977 no_more_source:
3978 if (!found)
3979 return 0;
3980 *mask &= CATEGORY_MASK_CCL;
3981 return 1;
3982 }
3983
3984 static void
3985 decode_coding_ccl (coding)
3986 struct coding_system *coding;
3987 {
3988 unsigned char *src = coding->source + coding->consumed;
3989 unsigned char *src_end = coding->source + coding->src_bytes;
3990 int *charbuf = coding->charbuf;
3991 int *charbuf_end = charbuf + coding->charbuf_size;
3992 int consumed_chars = 0;
3993 int multibytep = coding->src_multibyte;
3994 struct ccl_program ccl;
3995 int source_charbuf[1024];
3996 int source_byteidx[1024];
3997
3998 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
3999
4000 while (src < src_end)
4001 {
4002 unsigned char *p = src;
4003 int *source, *source_end;
4004 int i = 0;
4005
4006 if (multibytep)
4007 while (i < 1024 && p < src_end)
4008 {
4009 source_byteidx[i] = p - src;
4010 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4011 }
4012 else
4013 while (i < 1024 && p < src_end)
4014 source_charbuf[i++] = *p++;
4015
4016 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4017 ccl.last_block = 1;
4018
4019 source = source_charbuf;
4020 source_end = source + i;
4021 while (source < source_end)
4022 {
4023 ccl_driver (&ccl, source, charbuf,
4024 source_end - source, charbuf_end - charbuf);
4025 source += ccl.consumed;
4026 charbuf += ccl.produced;
4027 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4028 break;
4029 }
4030 if (source < source_end)
4031 src += source_byteidx[source - source_charbuf];
4032 else
4033 src = p;
4034 consumed_chars += source - source_charbuf;
4035
4036 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4037 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4038 break;
4039 }
4040
4041 switch (ccl.status)
4042 {
4043 case CCL_STAT_SUSPEND_BY_SRC:
4044 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4045 break;
4046 case CCL_STAT_SUSPEND_BY_DST:
4047 break;
4048 case CCL_STAT_QUIT:
4049 case CCL_STAT_INVALID_CMD:
4050 coding->result = CODING_RESULT_INTERRUPT;
4051 break;
4052 default:
4053 coding->result = CODING_RESULT_SUCCESS;
4054 break;
4055 }
4056 coding->consumed_char += consumed_chars;
4057 coding->consumed = src - coding->source;
4058 coding->charbuf_used = charbuf - coding->charbuf;
4059 }
4060
4061 static int
4062 encode_coding_ccl (coding)
4063 struct coding_system *coding;
4064 {
4065 struct ccl_program ccl;
4066 int multibytep = coding->dst_multibyte;
4067 int *charbuf = coding->charbuf;
4068 int *charbuf_end = charbuf + coding->charbuf_used;
4069 unsigned char *dst = coding->destination + coding->produced;
4070 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4071 unsigned char *adjusted_dst_end = dst_end - 1;
4072 int destination_charbuf[1024];
4073 int i, produced_chars = 0;
4074
4075 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4076
4077 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4078 ccl.dst_multibyte = coding->dst_multibyte;
4079
4080 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4081 {
4082 int dst_bytes = dst_end - dst;
4083 if (dst_bytes > 1024)
4084 dst_bytes = 1024;
4085
4086 ccl_driver (&ccl, charbuf, destination_charbuf,
4087 charbuf_end - charbuf, dst_bytes);
4088 charbuf += ccl.consumed;
4089 if (multibytep)
4090 for (i = 0; i < ccl.produced; i++)
4091 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4092 else
4093 {
4094 for (i = 0; i < ccl.produced; i++)
4095 *dst++ = destination_charbuf[i] & 0xFF;
4096 produced_chars += ccl.produced;
4097 }
4098 }
4099
4100 switch (ccl.status)
4101 {
4102 case CCL_STAT_SUSPEND_BY_SRC:
4103 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4104 break;
4105 case CCL_STAT_SUSPEND_BY_DST:
4106 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4107 break;
4108 case CCL_STAT_QUIT:
4109 case CCL_STAT_INVALID_CMD:
4110 coding->result = CODING_RESULT_INTERRUPT;
4111 break;
4112 default:
4113 coding->result = CODING_RESULT_SUCCESS;
4114 break;
4115 }
4116
4117 coding->produced_char += produced_chars;
4118 coding->produced = dst - coding->destination;
4119 return 0;
4120 }
4121
4122
4123 \f
4124 /*** 10, 11. no-conversion handlers ***/
4125
4126 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4127
4128 static void
4129 decode_coding_raw_text (coding)
4130 struct coding_system *coding;
4131 {
4132 coding->chars_at_source = 1;
4133 coding->consumed_char = coding->src_chars;
4134 coding->consumed = coding->src_bytes;
4135 coding->result = CODING_RESULT_SUCCESS;
4136 }
4137
4138 static int
4139 encode_coding_raw_text (coding)
4140 struct coding_system *coding;
4141 {
4142 int multibytep = coding->dst_multibyte;
4143 int *charbuf = coding->charbuf;
4144 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4145 unsigned char *dst = coding->destination + coding->produced;
4146 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4147 int produced_chars = 0;
4148 int c;
4149
4150 if (multibytep)
4151 {
4152 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4153
4154 if (coding->src_multibyte)
4155 while (charbuf < charbuf_end)
4156 {
4157 ASSURE_DESTINATION (safe_room);
4158 c = *charbuf++;
4159 if (ASCII_CHAR_P (c))
4160 EMIT_ONE_ASCII_BYTE (c);
4161 else if (CHAR_BYTE8_P (c))
4162 {
4163 c = CHAR_TO_BYTE8 (c);
4164 EMIT_ONE_BYTE (c);
4165 }
4166 else
4167 {
4168 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4169
4170 CHAR_STRING_ADVANCE (c, p1);
4171 while (p0 < p1)
4172 EMIT_ONE_BYTE (*p0);
4173 }
4174 }
4175 else
4176 while (charbuf < charbuf_end)
4177 {
4178 ASSURE_DESTINATION (safe_room);
4179 c = *charbuf++;
4180 EMIT_ONE_BYTE (c);
4181 }
4182 }
4183 else
4184 {
4185 if (coding->src_multibyte)
4186 {
4187 int safe_room = MAX_MULTIBYTE_LENGTH;
4188
4189 while (charbuf < charbuf_end)
4190 {
4191 ASSURE_DESTINATION (safe_room);
4192 c = *charbuf++;
4193 if (ASCII_CHAR_P (c))
4194 *dst++ = c;
4195 else if (CHAR_BYTE8_P (c))
4196 *dst++ = CHAR_TO_BYTE8 (c);
4197 else
4198 CHAR_STRING_ADVANCE (c, dst);
4199 produced_chars++;
4200 }
4201 }
4202 else
4203 {
4204 ASSURE_DESTINATION (charbuf_end - charbuf);
4205 while (charbuf < charbuf_end && dst < dst_end)
4206 *dst++ = *charbuf++;
4207 produced_chars = dst - (coding->destination + coding->dst_bytes);
4208 }
4209 }
4210 coding->result = CODING_RESULT_SUCCESS;
4211 coding->produced_char += produced_chars;
4212 coding->produced = dst - coding->destination;
4213 return 0;
4214 }
4215
4216 static int
4217 detect_coding_charset (coding, mask)
4218 struct coding_system *coding;
4219 int *mask;
4220 {
4221 unsigned char *src = coding->source, *src_base = src;
4222 unsigned char *src_end = coding->source + coding->src_bytes;
4223 int multibytep = coding->src_multibyte;
4224 int consumed_chars = 0;
4225 Lisp_Object attrs, valids;
4226
4227 coding = &coding_categories[coding_category_charset];
4228 attrs = CODING_ID_ATTRS (coding->id);
4229 valids = AREF (attrs, coding_attr_charset_valids);
4230
4231 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4232 src += coding->head_ascii;
4233
4234 while (1)
4235 {
4236 int c;
4237
4238 ONE_MORE_BYTE (c);
4239 if (NILP (AREF (valids, c)))
4240 break;
4241 }
4242 *mask &= ~CATEGORY_MASK_CHARSET;
4243 return 0;
4244
4245 no_more_source:
4246 *mask &= CATEGORY_MASK_CHARSET;
4247 return 1;
4248 }
4249
4250 static void
4251 decode_coding_charset (coding)
4252 struct coding_system *coding;
4253 {
4254 unsigned char *src = coding->source + coding->consumed;
4255 unsigned char *src_end = coding->source + coding->src_bytes;
4256 unsigned char *src_base;
4257 int *charbuf = coding->charbuf;
4258 int *charbuf_end = charbuf + coding->charbuf_size;
4259 int consumed_chars = 0, consumed_chars_base;
4260 int multibytep = coding->src_multibyte;
4261 struct charset *charset;
4262 Lisp_Object attrs, eol_type, charset_list;
4263
4264 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4265 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
4266
4267 while (1)
4268 {
4269 int c, c1;
4270
4271 src_base = src;
4272 consumed_chars_base = consumed_chars;
4273
4274 if (charbuf >= charbuf_end)
4275 break;
4276
4277 ONE_MORE_BYTE (c1);
4278 if (c == '\r')
4279 {
4280 if (EQ (eol_type, Qdos))
4281 {
4282 if (src == src_end)
4283 goto no_more_source;
4284 if (*src == '\n')
4285 ONE_MORE_BYTE (c);
4286 }
4287 else if (EQ (eol_type, Qmac))
4288 c = '\n';
4289 }
4290 else
4291 {
4292 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4293 if (c < 0)
4294 goto invalid_code;
4295 }
4296 *charbuf++ = c;
4297 continue;
4298
4299 invalid_code:
4300 src = src_base;
4301 consumed_chars = consumed_chars_base;
4302 ONE_MORE_BYTE (c);
4303 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4304 coding->errors++;
4305 }
4306
4307 no_more_source:
4308 coding->consumed_char += consumed_chars_base;
4309 coding->consumed = src_base - coding->source;
4310 coding->charbuf_used = charbuf - coding->charbuf;
4311 }
4312
4313 static int
4314 encode_coding_charset (coding)
4315 struct coding_system *coding;
4316 {
4317 int multibytep = coding->dst_multibyte;
4318 int *charbuf = coding->charbuf;
4319 int *charbuf_end = charbuf + coding->charbuf_used;
4320 unsigned char *dst = coding->destination + coding->produced;
4321 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4322 int safe_room = MAX_MULTIBYTE_LENGTH;
4323 int produced_chars = 0;
4324 struct charset *charset;
4325 Lisp_Object attrs, eol_type, charset_list;
4326 int ascii_compatible;
4327 int c;
4328
4329 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4330 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
4331 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4332
4333 while (charbuf < charbuf_end)
4334 {
4335 unsigned code;
4336
4337 ASSURE_DESTINATION (safe_room);
4338 c = *charbuf++;
4339 if (ascii_compatible && ASCII_CHAR_P (c))
4340 EMIT_ONE_ASCII_BYTE (c);
4341 else if ((code = ENCODE_CHAR (charset, c))
4342 != CHARSET_INVALID_CODE (charset))
4343 EMIT_ONE_BYTE (code);
4344 else
4345 EMIT_ONE_BYTE (coding->default_char);
4346 }
4347
4348 coding->result = CODING_RESULT_SUCCESS;
4349 coding->produced_char += produced_chars;
4350 coding->produced = dst - coding->destination;
4351 return 0;
4352 }
4353
4354 \f
4355 /*** 7. C library functions ***/
4356
4357 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4358 has a property `coding-system'. The value of this property is a
4359 vector of length 5 (called as coding-vector). Among elements of
4360 this vector, the first (element[0]) and the fifth (element[4])
4361 carry important information for decoding/encoding. Before
4362 decoding/encoding, this information should be set in fields of a
4363 structure of type `coding_system'.
4364
4365 A value of property `coding-system' can be a symbol of another
4366 subsidiary coding-system. In that case, Emacs gets coding-vector
4367 from that symbol.
4368
4369 `element[0]' contains information to be set in `coding->type'. The
4370 value and its meaning is as follows:
4371
4372 0 -- coding_type_emacs_mule
4373 1 -- coding_type_sjis
4374 2 -- coding_type_iso_2022
4375 3 -- coding_type_big5
4376 4 -- coding_type_ccl encoder/decoder written in CCL
4377 nil -- coding_type_no_conversion
4378 t -- coding_type_undecided (automatic conversion on decoding,
4379 no-conversion on encoding)
4380
4381 `element[4]' contains information to be set in `coding->flags' and
4382 `coding->spec'. The meaning varies by `coding->type'.
4383
4384 If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4385 of length 32 (of which the first 13 sub-elements are used now).
4386 Meanings of these sub-elements are:
4387
4388 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4389 If the value is an integer of valid charset, the charset is
4390 assumed to be designated to graphic register N initially.
4391
4392 If the value is minus, it is a minus value of charset which
4393 reserves graphic register N, which means that the charset is
4394 not designated initially but should be designated to graphic
4395 register N just before encoding a character in that charset.
4396
4397 If the value is nil, graphic register N is never used on
4398 encoding.
4399
4400 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4401 Each value takes t or nil. See the section ISO2022 of
4402 `coding.h' for more information.
4403
4404 If `coding->type' is `coding_type_big5', element[4] is t to denote
4405 BIG5-ETen or nil to denote BIG5-HKU.
4406
4407 If `coding->type' takes the other value, element[4] is ignored.
4408
4409 Emacs Lisp's coding system also carries information about format of
4410 end-of-line in a value of property `eol-type'. If the value is
4411 integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If
4412 it is not integer, it should be a vector of subsidiary coding
4413 systems of which property `eol-type' has one of above values.
4414
4415 */
4416
4417 /* Setup coding context CODING from information about CODING_SYSTEM.
4418 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4419 CODING_SYSTEM is invalid, signal an error. */
4420
4421 void
4422 setup_coding_system (coding_system, coding)
4423 Lisp_Object coding_system;
4424 struct coding_system *coding;
4425 {
4426 Lisp_Object attrs;
4427 Lisp_Object eol_type;
4428 Lisp_Object coding_type;
4429 Lisp_Object val;
4430
4431 if (NILP (coding_system))
4432 coding_system = Qno_conversion;
4433
4434 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4435
4436 attrs = CODING_ID_ATTRS (coding->id);
4437 eol_type = CODING_ID_EOL_TYPE (coding->id);
4438
4439 coding->mode = 0;
4440 coding->head_ascii = -1;
4441 coding->common_flags
4442 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4443
4444 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4445 coding->max_charset_id = XSTRING (val)->size - 1;
4446 coding->safe_charsets = (char *) XSTRING (val)->data;
4447 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4448
4449 coding_type = CODING_ATTR_TYPE (attrs);
4450 if (EQ (coding_type, Qundecided))
4451 {
4452 coding->detector = NULL;
4453 coding->decoder = decode_coding_raw_text;
4454 coding->encoder = encode_coding_raw_text;
4455 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4456 }
4457 else if (EQ (coding_type, Qiso_2022))
4458 {
4459 int i;
4460 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4461
4462 /* Invoke graphic register 0 to plane 0. */
4463 CODING_ISO_INVOCATION (coding, 0) = 0;
4464 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4465 CODING_ISO_INVOCATION (coding, 1)
4466 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4467 /* Setup the initial status of designation. */
4468 for (i = 0; i < 4; i++)
4469 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4470 /* Not single shifting initially. */
4471 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4472 /* Beginning of buffer should also be regarded as bol. */
4473 CODING_ISO_BOL (coding) = 1;
4474 coding->detector = detect_coding_iso_2022;
4475 coding->decoder = decode_coding_iso_2022;
4476 coding->encoder = encode_coding_iso_2022;
4477 if (flags & CODING_ISO_FLAG_SAFE)
4478 coding->mode |= CODING_MODE_SAFE_ENCODING;
4479 coding->common_flags
4480 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4481 | CODING_REQUIRE_FLUSHING_MASK);
4482 if (flags & CODING_ISO_FLAG_COMPOSITION)
4483 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4484 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4485 {
4486 setup_iso_safe_charsets (attrs);
4487 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4488 coding->max_charset_id = XSTRING (val)->size - 1;
4489 coding->safe_charsets = (char *) XSTRING (val)->data;
4490 }
4491 CODING_ISO_FLAGS (coding) = flags;
4492 }
4493 else if (EQ (coding_type, Qcharset))
4494 {
4495 coding->detector = detect_coding_charset;
4496 coding->decoder = decode_coding_charset;
4497 coding->encoder = encode_coding_charset;
4498 coding->common_flags
4499 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4500 }
4501 else if (EQ (coding_type, Qutf_8))
4502 {
4503 coding->detector = detect_coding_utf_8;
4504 coding->decoder = decode_coding_utf_8;
4505 coding->encoder = encode_coding_utf_8;
4506 coding->common_flags
4507 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4508 }
4509 else if (EQ (coding_type, Qutf_16))
4510 {
4511 val = AREF (attrs, coding_attr_utf_16_bom);
4512 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4513 : EQ (val, Qt) ? utf_16_with_bom
4514 : utf_16_without_bom);
4515 val = AREF (attrs, coding_attr_utf_16_endian);
4516 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4517 : utf_16_little_endian);
4518 CODING_UTF_16_SURROGATE (coding) = 0;
4519 coding->detector = detect_coding_utf_16;
4520 coding->decoder = decode_coding_utf_16;
4521 coding->encoder = encode_coding_utf_16;
4522 coding->common_flags
4523 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4524 }
4525 else if (EQ (coding_type, Qccl))
4526 {
4527 coding->detector = detect_coding_ccl;
4528 coding->decoder = decode_coding_ccl;
4529 coding->encoder = encode_coding_ccl;
4530 coding->common_flags
4531 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4532 | CODING_REQUIRE_FLUSHING_MASK);
4533 }
4534 else if (EQ (coding_type, Qemacs_mule))
4535 {
4536 coding->detector = detect_coding_emacs_mule;
4537 coding->decoder = decode_coding_emacs_mule;
4538 coding->encoder = encode_coding_emacs_mule;
4539 coding->common_flags
4540 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4541 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4542 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4543 {
4544 Lisp_Object tail, safe_charsets;
4545 int max_charset_id = 0;
4546
4547 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4548 tail = XCDR (tail))
4549 if (max_charset_id < XFASTINT (XCAR (tail)))
4550 max_charset_id = XFASTINT (XCAR (tail));
4551 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4552 make_number (255));
4553 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4554 tail = XCDR (tail))
4555 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4556 coding->max_charset_id = max_charset_id;
4557 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4558 }
4559 }
4560 else if (EQ (coding_type, Qshift_jis))
4561 {
4562 coding->detector = detect_coding_sjis;
4563 coding->decoder = decode_coding_sjis;
4564 coding->encoder = encode_coding_sjis;
4565 coding->common_flags
4566 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4567 }
4568 else if (EQ (coding_type, Qbig5))
4569 {
4570 coding->detector = detect_coding_big5;
4571 coding->decoder = decode_coding_big5;
4572 coding->encoder = encode_coding_big5;
4573 coding->common_flags
4574 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4575 }
4576 else /* EQ (coding_type, Qraw_text) */
4577 {
4578 coding->detector = NULL;
4579 coding->decoder = decode_coding_raw_text;
4580 coding->encoder = encode_coding_raw_text;
4581 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4582 }
4583
4584 return;
4585 }
4586
4587 /* Return raw-text or one of its subsidiaries that has the same
4588 eol_type as CODING-SYSTEM. */
4589
4590 Lisp_Object
4591 raw_text_coding_system (coding_system)
4592 Lisp_Object coding_system;
4593 {
4594 Lisp_Object spec, attrs;
4595 Lisp_Object eol_type, raw_text_eol_type;
4596
4597 spec = CODING_SYSTEM_SPEC (coding_system);
4598 attrs = AREF (spec, 0);
4599
4600 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4601 return coding_system;
4602
4603 eol_type = AREF (spec, 2);
4604 if (VECTORP (eol_type))
4605 return Qraw_text;
4606 spec = CODING_SYSTEM_SPEC (Qraw_text);
4607 raw_text_eol_type = AREF (spec, 2);
4608 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4609 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4610 : AREF (raw_text_eol_type, 2));
4611 }
4612
4613
4614 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4615 does, return one of the subsidiary that has the same eol-spec as
4616 PARENT. Otherwise, return CODING_SYSTEM. */
4617
4618 Lisp_Object
4619 coding_inherit_eol_type (coding_system, parent)
4620 {
4621 Lisp_Object spec, attrs, eol_type;
4622
4623 spec = CODING_SYSTEM_SPEC (coding_system);
4624 attrs = AREF (spec, 0);
4625 eol_type = AREF (spec, 2);
4626 if (VECTORP (eol_type))
4627 {
4628 Lisp_Object parent_spec;
4629 Lisp_Object parent_eol_type;
4630
4631 parent_spec
4632 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4633 parent_eol_type = AREF (parent_spec, 2);
4634 if (EQ (parent_eol_type, Qunix))
4635 coding_system = AREF (eol_type, 0);
4636 else if (EQ (parent_eol_type, Qdos))
4637 coding_system = AREF (eol_type, 1);
4638 else if (EQ (parent_eol_type, Qmac))
4639 coding_system = AREF (eol_type, 2);
4640 }
4641 return coding_system;
4642 }
4643
4644 /* Emacs has a mechanism to automatically detect a coding system if it
4645 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4646 it's impossible to distinguish some coding systems accurately
4647 because they use the same range of codes. So, at first, coding
4648 systems are categorized into 7, those are:
4649
4650 o coding-category-emacs-mule
4651
4652 The category for a coding system which has the same code range
4653 as Emacs' internal format. Assigned the coding-system (Lisp
4654 symbol) `emacs-mule' by default.
4655
4656 o coding-category-sjis
4657
4658 The category for a coding system which has the same code range
4659 as SJIS. Assigned the coding-system (Lisp
4660 symbol) `japanese-shift-jis' by default.
4661
4662 o coding-category-iso-7
4663
4664 The category for a coding system which has the same code range
4665 as ISO2022 of 7-bit environment. This doesn't use any locking
4666 shift and single shift functions. This can encode/decode all
4667 charsets. Assigned the coding-system (Lisp symbol)
4668 `iso-2022-7bit' by default.
4669
4670 o coding-category-iso-7-tight
4671
4672 Same as coding-category-iso-7 except that this can
4673 encode/decode only the specified charsets.
4674
4675 o coding-category-iso-8-1
4676
4677 The category for a coding system which has the same code range
4678 as ISO2022 of 8-bit environment and graphic plane 1 used only
4679 for DIMENSION1 charset. This doesn't use any locking shift
4680 and single shift functions. Assigned the coding-system (Lisp
4681 symbol) `iso-latin-1' by default.
4682
4683 o coding-category-iso-8-2
4684
4685 The category for a coding system which has the same code range
4686 as ISO2022 of 8-bit environment and graphic plane 1 used only
4687 for DIMENSION2 charset. This doesn't use any locking shift
4688 and single shift functions. Assigned the coding-system (Lisp
4689 symbol) `japanese-iso-8bit' by default.
4690
4691 o coding-category-iso-7-else
4692
4693 The category for a coding system which has the same code range
4694 as ISO2022 of 7-bit environemnt but uses locking shift or
4695 single shift functions. Assigned the coding-system (Lisp
4696 symbol) `iso-2022-7bit-lock' by default.
4697
4698 o coding-category-iso-8-else
4699
4700 The category for a coding system which has the same code range
4701 as ISO2022 of 8-bit environemnt but uses locking shift or
4702 single shift functions. Assigned the coding-system (Lisp
4703 symbol) `iso-2022-8bit-ss2' by default.
4704
4705 o coding-category-big5
4706
4707 The category for a coding system which has the same code range
4708 as BIG5. Assigned the coding-system (Lisp symbol)
4709 `cn-big5' by default.
4710
4711 o coding-category-utf-8
4712
4713 The category for a coding system which has the same code range
4714 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4715 symbol) `utf-8' by default.
4716
4717 o coding-category-utf-16-be
4718
4719 The category for a coding system in which a text has an
4720 Unicode signature (cf. Unicode Standard) in the order of BIG
4721 endian at the head. Assigned the coding-system (Lisp symbol)
4722 `utf-16-be' by default.
4723
4724 o coding-category-utf-16-le
4725
4726 The category for a coding system in which a text has an
4727 Unicode signature (cf. Unicode Standard) in the order of
4728 LITTLE endian at the head. Assigned the coding-system (Lisp
4729 symbol) `utf-16-le' by default.
4730
4731 o coding-category-ccl
4732
4733 The category for a coding system of which encoder/decoder is
4734 written in CCL programs. The default value is nil, i.e., no
4735 coding system is assigned.
4736
4737 o coding-category-binary
4738
4739 The category for a coding system not categorized in any of the
4740 above. Assigned the coding-system (Lisp symbol)
4741 `no-conversion' by default.
4742
4743 Each of them is a Lisp symbol and the value is an actual
4744 `coding-system's (this is also a Lisp symbol) assigned by a user.
4745 What Emacs does actually is to detect a category of coding system.
4746 Then, it uses a `coding-system' assigned to it. If Emacs can't
4747 decide only one possible category, it selects a category of the
4748 highest priority. Priorities of categories are also specified by a
4749 user in a Lisp variable `coding-category-list'.
4750
4751 */
4752
4753 #define EOL_SEEN_NONE 0
4754 #define EOL_SEEN_LF 1
4755 #define EOL_SEEN_CR 2
4756 #define EOL_SEEN_CRLF 4
4757
4758 /* Detect how end-of-line of a text of length CODING->src_bytes
4759 pointed by CODING->source is encoded. Return one of
4760 EOL_SEEN_XXX. */
4761
4762 #define MAX_EOL_CHECK_COUNT 3
4763
4764 static int
4765 detect_eol (coding, source, src_bytes)
4766 struct coding_system *coding;
4767 unsigned char *source;
4768 EMACS_INT src_bytes;
4769 {
4770 Lisp_Object attrs, coding_type;
4771 unsigned char *src = source, *src_end = src + src_bytes;
4772 unsigned char c;
4773 int total = 0;
4774 int eol_seen = EOL_SEEN_NONE;
4775
4776 attrs = CODING_ID_ATTRS (coding->id);
4777 coding_type = CODING_ATTR_TYPE (attrs);
4778
4779 if (EQ (coding_type, Qccl))
4780 {
4781 int msb, lsb;
4782
4783 msb = coding->spec.utf_16.endian == utf_16_little_endian;
4784 lsb = 1 - msb;
4785
4786 while (src + 1 < src_end)
4787 {
4788 c = src[lsb];
4789 if (src[msb] == 0 && (c == '\n' || c == '\r'))
4790 {
4791 int this_eol;
4792
4793 if (c == '\n')
4794 this_eol = EOL_SEEN_LF;
4795 else if (src + 3 >= src_end
4796 || src[msb + 2] != 0
4797 || src[lsb + 2] != '\n')
4798 this_eol = EOL_SEEN_CR;
4799 else
4800 this_eol = EOL_SEEN_CRLF;
4801
4802 if (eol_seen == EOL_SEEN_NONE)
4803 /* This is the first end-of-line. */
4804 eol_seen = this_eol;
4805 else if (eol_seen != this_eol)
4806 {
4807 /* The found type is different from what found before. */
4808 eol_seen = EOL_SEEN_LF;
4809 break;
4810 }
4811 if (++total == MAX_EOL_CHECK_COUNT)
4812 break;
4813 }
4814 src += 2;
4815 }
4816 }
4817 else
4818 {
4819 while (src < src_end)
4820 {
4821 c = *src++;
4822 if (c == '\n' || c == '\r')
4823 {
4824 int this_eol;
4825
4826 if (c == '\n')
4827 this_eol = EOL_SEEN_LF;
4828 else if (src >= src_end || *src != '\n')
4829 this_eol = EOL_SEEN_CR;
4830 else
4831 this_eol = EOL_SEEN_CRLF, src++;
4832
4833 if (eol_seen == EOL_SEEN_NONE)
4834 /* This is the first end-of-line. */
4835 eol_seen = this_eol;
4836 else if (eol_seen != this_eol)
4837 {
4838 /* The found type is different from what found before. */
4839 eol_seen = EOL_SEEN_LF;
4840 break;
4841 }
4842 if (++total == MAX_EOL_CHECK_COUNT)
4843 break;
4844 }
4845 }
4846 }
4847 return eol_seen;
4848 }
4849
4850
4851 static void
4852 adjust_coding_eol_type (coding, eol_seen)
4853 struct coding_system *coding;
4854 int eol_seen;
4855 {
4856 Lisp_Object eol_type;
4857
4858 eol_type = CODING_ID_EOL_TYPE (coding->id);
4859 if (eol_seen & EOL_SEEN_LF)
4860 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
4861 else if (eol_type & EOL_SEEN_CRLF)
4862 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
4863 else if (eol_type & EOL_SEEN_CR)
4864 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
4865 }
4866
4867 /* Detect how a text specified in CODING is encoded. If a coding
4868 system is detected, update fields of CODING by the detected coding
4869 system. */
4870
4871 void
4872 detect_coding (coding)
4873 struct coding_system *coding;
4874 {
4875 unsigned char *src, *src_end;
4876 Lisp_Object attrs, coding_type;
4877
4878 coding->consumed = coding->consumed_char = 0;
4879 coding->produced = coding->produced_char = 0;
4880 coding_set_source (coding);
4881
4882 src_end = coding->source + coding->src_bytes;
4883
4884 /* If we have not yet decided the text encoding type, detect it
4885 now. */
4886 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
4887 {
4888 int mask = CATEGORY_MASK_ANY;
4889 int c, i;
4890
4891 for (src = coding->source; src < src_end; src++)
4892 {
4893 c = *src;
4894 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
4895 || c == ISO_CODE_SI
4896 || c == ISO_CODE_SO)))
4897 break;
4898 }
4899 coding->head_ascii = src - (coding->source + coding->consumed);
4900
4901 if (coding->head_ascii < coding->src_bytes)
4902 {
4903 int detected = 0;
4904
4905 for (i = 0; i < coding_category_raw_text; i++)
4906 {
4907 enum coding_category category = coding_priorities[i];
4908 struct coding_system *this = coding_categories + category;
4909
4910 if (category >= coding_category_raw_text
4911 || detected & (1 << category))
4912 continue;
4913
4914 if (this->id < 0)
4915 {
4916 /* No coding system of this category is defined. */
4917 mask &= ~(1 << category);
4918 }
4919 else
4920 {
4921 detected |= detected_mask[category];
4922 if ((*(this->detector)) (coding, &mask))
4923 break;
4924 }
4925 }
4926 if (! mask)
4927 setup_coding_system (Qraw_text, coding);
4928 else if (mask != CATEGORY_MASK_ANY)
4929 for (i = 0; i < coding_category_raw_text; i++)
4930 {
4931 enum coding_category category = coding_priorities[i];
4932 struct coding_system *this = coding_categories + category;
4933
4934 if (mask & (1 << category))
4935 {
4936 setup_coding_system (CODING_ID_NAME (this->id), coding);
4937 break;
4938 }
4939 }
4940 }
4941 }
4942
4943 attrs = CODING_ID_ATTRS (coding->id);
4944 coding_type = CODING_ATTR_TYPE (attrs);
4945
4946 /* If we have not yet decided the EOL type, detect it now. But, the
4947 detection is impossible for a CCL based coding system, in which
4948 case, we detct the EOL type after decoding. */
4949 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
4950 && ! EQ (coding_type, Qccl))
4951 {
4952 int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
4953
4954 if (eol_seen != EOL_SEEN_NONE)
4955 adjust_coding_eol_type (coding, eol_seen);
4956 }
4957 }
4958
4959
4960 static void
4961 decode_eol (coding)
4962 struct coding_system *coding;
4963 {
4964 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
4965 {
4966 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
4967 unsigned char *pend = p + coding->produced;
4968 int eol_seen = EOL_SEEN_NONE;
4969
4970 for (; p < pend; p++)
4971 {
4972 if (*p == '\n')
4973 eol_seen |= EOL_SEEN_LF;
4974 else if (*p == '\r')
4975 {
4976 if (p + 1 < pend && *(p + 1) == '\n')
4977 {
4978 eol_seen |= EOL_SEEN_CRLF;
4979 p++;
4980 }
4981 else
4982 eol_seen |= EOL_SEEN_CR;
4983 }
4984 }
4985 if (eol_seen != EOL_SEEN_NONE)
4986 adjust_coding_eol_type (coding, eol_seen);
4987 }
4988
4989 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
4990 {
4991 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
4992 unsigned char *pend = p + coding->produced;
4993
4994 for (; p < pend; p++)
4995 if (*p == '\r')
4996 *p = '\n';
4997 }
4998 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
4999 {
5000 unsigned char *p, *pbeg, *pend;
5001 Lisp_Object undo_list;
5002
5003 move_gap_both (coding->dst_pos + coding->produced_char,
5004 coding->dst_pos_byte + coding->produced);
5005 undo_list = current_buffer->undo_list;
5006 current_buffer->undo_list = Qt;
5007 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5008 current_buffer->undo_list = undo_list;
5009 pbeg = GPT_ADDR;
5010 pend = pbeg + coding->produced;
5011
5012 for (p = pend - 1; p >= pbeg; p--)
5013 if (*p == '\r')
5014 {
5015 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5016 pend--;
5017 }
5018 coding->produced_char -= coding->produced - (pend - pbeg);
5019 coding->produced = pend - pbeg;
5020 insert_from_gap (coding->produced_char, coding->produced);
5021 }
5022 }
5023
5024 static void
5025 translate_chars (coding, table)
5026 struct coding_system *coding;
5027 Lisp_Object table;
5028 {
5029 int *charbuf = coding->charbuf;
5030 int *charbuf_end = charbuf + coding->charbuf_used;
5031 int c;
5032
5033 if (coding->chars_at_source)
5034 return;
5035
5036 while (charbuf < charbuf_end)
5037 {
5038 c = *charbuf;
5039 if (c < 0)
5040 charbuf += c;
5041 else
5042 *charbuf++ = translate_char (table, c);
5043 }
5044 }
5045
5046 static int
5047 produce_chars (coding)
5048 struct coding_system *coding;
5049 {
5050 unsigned char *dst = coding->destination + coding->produced;
5051 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5052 int produced;
5053 int produced_chars = 0;
5054
5055 if (! coding->chars_at_source)
5056 {
5057 /* Characters are in coding->charbuf. */
5058 int *buf = coding->charbuf;
5059 int *buf_end = buf + coding->charbuf_used;
5060 unsigned char *adjusted_dst_end;
5061
5062 if (BUFFERP (coding->src_object)
5063 && EQ (coding->src_object, coding->dst_object))
5064 dst_end = coding->source + coding->consumed;
5065 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5066
5067 while (buf < buf_end)
5068 {
5069 int c = *buf++;
5070
5071 if (dst >= adjusted_dst_end)
5072 {
5073 dst = alloc_destination (coding,
5074 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5075 dst);
5076 dst_end = coding->destination + coding->dst_bytes;
5077 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5078 }
5079 if (c >= 0)
5080 {
5081 if (coding->dst_multibyte
5082 || ! CHAR_BYTE8_P (c))
5083 CHAR_STRING_ADVANCE (c, dst);
5084 else
5085 *dst++ = CHAR_TO_BYTE8 (c);
5086 produced_chars++;
5087 }
5088 else
5089 /* This is an annotation data. */
5090 buf -= c + 1;
5091 }
5092 }
5093 else
5094 {
5095 int multibytep = coding->src_multibyte;
5096 unsigned char *src = coding->source;
5097 unsigned char *src_end = src + coding->src_bytes;
5098 Lisp_Object eol_type;
5099
5100 eol_type = CODING_ID_EOL_TYPE (coding->id);
5101
5102 if (coding->src_multibyte != coding->dst_multibyte)
5103 {
5104 if (coding->src_multibyte)
5105 {
5106 int consumed_chars;
5107
5108 while (1)
5109 {
5110 unsigned char *src_base = src;
5111 int c;
5112
5113 ONE_MORE_BYTE (c);
5114 if (c == '\r')
5115 {
5116 if (EQ (eol_type, Qdos))
5117 {
5118 if (src < src_end
5119 && *src == '\n')
5120 c = *src++;
5121 }
5122 else if (EQ (eol_type, Qmac))
5123 c = '\n';
5124 }
5125 if (dst == dst_end)
5126 {
5127 EMACS_INT offset = src - coding->source;
5128
5129 dst = alloc_destination (coding, src_end - src + 1, dst);
5130 dst_end = coding->destination + coding->dst_bytes;
5131 coding_set_source (coding);
5132 src = coding->source + offset;
5133 src_end = coding->source + coding->src_bytes;
5134 }
5135 *dst++ = c;
5136 produced_chars++;
5137 }
5138 no_more_source:
5139 ;
5140 }
5141 else
5142 while (src < src_end)
5143 {
5144 int c = *src++;
5145
5146 if (c == '\r')
5147 {
5148 if (EQ (eol_type, Qdos))
5149 {
5150 if (src < src_end
5151 && *src == '\n')
5152 c = *src++;
5153 }
5154 else if (EQ (eol_type, Qmac))
5155 c = '\n';
5156 }
5157 if (dst >= dst_end - 1)
5158 {
5159 EMACS_INT offset = src - coding->source;
5160
5161 dst = alloc_destination (coding, src_end - src + 2, dst);
5162 dst_end = coding->destination + coding->dst_bytes;
5163 coding_set_source (coding);
5164 src = coding->source + offset;
5165 src_end = coding->source + coding->src_bytes;
5166 }
5167 EMIT_ONE_BYTE (c);
5168 }
5169 }
5170 else
5171 {
5172 if (!EQ (coding->src_object, coding->dst_object))
5173 {
5174 int require = coding->src_bytes - coding->dst_bytes;
5175
5176 if (require > 0)
5177 {
5178 EMACS_INT offset = src - coding->source;
5179
5180 dst = alloc_destination (coding, require, dst);
5181 coding_set_source (coding);
5182 src = coding->source + offset;
5183 src_end = coding->source + coding->src_bytes;
5184 }
5185 }
5186 produced_chars = coding->src_chars;
5187 while (src < src_end)
5188 {
5189 int c = *src++;
5190
5191 if (c == '\r')
5192 {
5193 if (EQ (eol_type, Qdos))
5194 {
5195 if (src < src_end
5196 && *src == '\n')
5197 c = *src++;
5198 produced_chars--;
5199 }
5200 else if (EQ (eol_type, Qmac))
5201 c = '\n';
5202 }
5203 *dst++ = c;
5204 }
5205 }
5206 }
5207
5208 produced = dst - (coding->destination + coding->produced);
5209 if (BUFFERP (coding->dst_object))
5210 insert_from_gap (produced_chars, produced);
5211 coding->produced += produced;
5212 coding->produced_char += produced_chars;
5213 return produced_chars;
5214 }
5215
5216 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5217 or
5218 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5219 */
5220
5221 static INLINE void
5222 produce_composition (coding, charbuf)
5223 struct coding_system *coding;
5224 int *charbuf;
5225 {
5226 Lisp_Object buffer;
5227 int len;
5228 EMACS_INT pos;
5229 enum composition_method method;
5230 int cmp_len;
5231 Lisp_Object components;
5232
5233 buffer = coding->dst_object;
5234 len = -charbuf[0];
5235 pos = coding->dst_pos + charbuf[1];
5236 method = (enum composition_method) (charbuf[3]);
5237 cmp_len = charbuf[4];
5238
5239 if (method == COMPOSITION_RELATIVE)
5240 components = Qnil;
5241 else
5242 {
5243 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5244 int i;
5245
5246 len -= 5;
5247 charbuf += 5;
5248 for (i = 0; i < len; i++)
5249 args[i] = make_number (charbuf[i]);
5250 components = (method == COMPOSITION_WITH_ALTCHARS
5251 ? Fstring (len, args) : Fvector (len, args));
5252 }
5253 compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5254 }
5255
5256 static int *
5257 save_composition_data (buf, buf_end, prop)
5258 int *buf, *buf_end;
5259 Lisp_Object prop;
5260 {
5261 enum composition_method method = COMPOSITION_METHOD (prop);
5262 int cmp_len = COMPOSITION_LENGTH (prop);
5263
5264 if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5265 return NULL;
5266
5267 buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5268 buf[2] = method;
5269 buf[3] = cmp_len;
5270
5271 if (method == COMPOSITION_RELATIVE)
5272 buf[0] = 4;
5273 else
5274 {
5275 Lisp_Object components;
5276 int len, i;
5277
5278 components = COMPOSITION_COMPONENTS (prop);
5279 if (VECTORP (components))
5280 {
5281 len = XVECTOR (components)->size;
5282 for (i = 0; i < len; i++)
5283 buf[4 + i] = XINT (AREF (components, i));
5284 }
5285 else if (STRINGP (components))
5286 {
5287 int i_byte;
5288
5289 len = XSTRING (components)->size;
5290 i = i_byte = 0;
5291 while (i < len)
5292 FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5293 }
5294 else if (INTEGERP (components))
5295 {
5296 len = 1;
5297 buf[4] = XINT (components);
5298 }
5299 else if (CONSP (components))
5300 {
5301 for (len = 0; CONSP (components);
5302 len++, components = XCDR (components))
5303 buf[4 + len] = XINT (XCAR (components));
5304 }
5305 else
5306 abort ();
5307 buf[0] = 4 + len;
5308 }
5309 return (buf + buf[0]);
5310 }
5311
5312 #define CHARBUF_SIZE 0x4000
5313
5314 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5315 do { \
5316 int size = CHARBUF_SIZE;; \
5317 \
5318 coding->charbuf = NULL; \
5319 while (size > 1024) \
5320 { \
5321 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5322 if (coding->charbuf) \
5323 break; \
5324 size >>= 1; \
5325 } \
5326 if (! coding->charbuf) \
5327 { \
5328 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5329 return coding->result; \
5330 } \
5331 coding->charbuf_size = size; \
5332 } while (0)
5333
5334
5335 static void
5336 produce_annotation (coding)
5337 struct coding_system *coding;
5338 {
5339 int *charbuf = coding->charbuf;
5340 int *charbuf_end = charbuf + coding->charbuf_used;
5341
5342 while (charbuf < charbuf_end)
5343 {
5344 if (*charbuf >= 0)
5345 charbuf++;
5346 else
5347 {
5348 int len = -*charbuf;
5349 switch (charbuf[2])
5350 {
5351 case CODING_ANNOTATE_COMPOSITION_MASK:
5352 produce_composition (coding, charbuf);
5353 break;
5354 default:
5355 abort ();
5356 }
5357 charbuf += len;
5358 }
5359 }
5360 }
5361
5362 /* Decode the data at CODING->src_object into CODING->dst_object.
5363 CODING->src_object is a buffer, a string, or nil.
5364 CODING->dst_object is a buffer.
5365
5366 If CODING->src_object is a buffer, it must be the current buffer.
5367 In this case, if CODING->src_pos is positive, it is a position of
5368 the source text in the buffer, otherwise, the source text is in the
5369 gap area of the buffer, and CODING->src_pos specifies the offset of
5370 the text from GPT (which must be the same as PT). If this is the
5371 same buffer as CODING->dst_object, CODING->src_pos must be
5372 negative.
5373
5374 If CODING->src_object is a string, CODING->src_pos in an index to
5375 that string.
5376
5377 If CODING->src_object is nil, CODING->source must already point to
5378 the non-relocatable memory area. In this case, CODING->src_pos is
5379 an offset from CODING->source.
5380
5381 The decoded data is inserted at the current point of the buffer
5382 CODING->dst_object.
5383 */
5384
5385 static int
5386 decode_coding (coding)
5387 struct coding_system *coding;
5388 {
5389 Lisp_Object attrs;
5390
5391 if (BUFFERP (coding->src_object)
5392 && coding->src_pos > 0
5393 && coding->src_pos < GPT
5394 && coding->src_pos + coding->src_chars > GPT)
5395 move_gap_both (coding->src_pos, coding->src_pos_byte);
5396
5397 if (BUFFERP (coding->dst_object))
5398 {
5399 if (current_buffer != XBUFFER (coding->dst_object))
5400 set_buffer_internal (XBUFFER (coding->dst_object));
5401 if (GPT != PT)
5402 move_gap_both (PT, PT_BYTE);
5403 }
5404
5405 coding->consumed = coding->consumed_char = 0;
5406 coding->produced = coding->produced_char = 0;
5407 coding->chars_at_source = 0;
5408 coding->result = CODING_RESULT_SUCCESS;
5409 coding->errors = 0;
5410
5411 ALLOC_CONVERSION_WORK_AREA (coding);
5412
5413 attrs = CODING_ID_ATTRS (coding->id);
5414
5415 do
5416 {
5417 coding_set_source (coding);
5418 coding->annotated = 0;
5419 (*(coding->decoder)) (coding);
5420 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5421 translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5422 coding_set_destination (coding);
5423 produce_chars (coding);
5424 if (coding->annotated)
5425 produce_annotation (coding);
5426 }
5427 while (coding->consumed < coding->src_bytes
5428 && ! coding->result);
5429
5430 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5431 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5432 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5433 decode_eol (coding);
5434
5435 coding->carryover_bytes = 0;
5436 if (coding->consumed < coding->src_bytes)
5437 {
5438 int nbytes = coding->src_bytes - coding->consumed;
5439 unsigned char *src;
5440
5441 coding_set_source (coding);
5442 coding_set_destination (coding);
5443 src = coding->source + coding->consumed;
5444
5445 if (coding->mode & CODING_MODE_LAST_BLOCK)
5446 {
5447 /* Flush out unprocessed data as binary chars. We are sure
5448 that the number of data is less than the size of
5449 coding->charbuf. */
5450 int *charbuf = coding->charbuf;
5451
5452 while (nbytes-- > 0)
5453 {
5454 int c = *src++;
5455 *charbuf++ = (c & 0x80 ? - c : c);
5456 }
5457 produce_chars (coding);
5458 }
5459 else
5460 {
5461 /* Record unprocessed bytes in coding->carryover. We are
5462 sure that the number of data is less than the size of
5463 coding->carryover. */
5464 unsigned char *p = coding->carryover;
5465
5466 coding->carryover_bytes = nbytes;
5467 while (nbytes-- > 0)
5468 *p++ = *src++;
5469 }
5470 coding->consumed = coding->src_bytes;
5471 }
5472
5473 return coding->result;
5474 }
5475
5476 static void
5477 consume_chars (coding)
5478 struct coding_system *coding;
5479 {
5480 int *buf = coding->charbuf;
5481 /* -1 is to compensate for CRLF. */
5482 int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5483 unsigned char *src = coding->source + coding->consumed;
5484 int pos = coding->src_pos + coding->consumed_char;
5485 int end_pos = coding->src_pos + coding->src_chars;
5486 int multibytep = coding->src_multibyte;
5487 Lisp_Object eol_type;
5488 int c;
5489 int start, end, stop;
5490 Lisp_Object object, prop;
5491
5492 eol_type = CODING_ID_EOL_TYPE (coding->id);
5493 if (VECTORP (eol_type))
5494 eol_type = Qunix;
5495
5496 object = coding->src_object;
5497
5498 /* Note: composition handling is not yet implemented. */
5499 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5500
5501 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5502 && find_composition (pos, end_pos, &start, &end, &prop, object)
5503 && end <= end_pos
5504 && (start >= pos
5505 || (find_composition (end, end_pos, &start, &end, &prop, object)
5506 && end <= end_pos)))
5507 stop = start;
5508 else
5509 stop = end_pos;
5510
5511 while (buf < buf_end)
5512 {
5513 if (pos == stop)
5514 {
5515 int *p;
5516
5517 if (pos == end_pos)
5518 break;
5519 p = save_composition_data (buf, buf_end, prop);
5520 if (p == NULL)
5521 break;
5522 buf = p;
5523 if (find_composition (end, end_pos, &start, &end, &prop, object)
5524 && end <= end_pos)
5525 stop = start;
5526 else
5527 stop = end_pos;
5528 }
5529
5530 if (! multibytep)
5531 c = *src++;
5532 else
5533 c = STRING_CHAR_ADVANCE (src);
5534 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5535 c = '\n';
5536 if (! EQ (eol_type, Qunix))
5537 {
5538 if (c == '\n')
5539 {
5540 if (EQ (eol_type, Qdos))
5541 *buf++ = '\r';
5542 else
5543 c = '\r';
5544 }
5545 }
5546 *buf++ = c;
5547 pos++;
5548 }
5549
5550 coding->consumed = src - coding->source;
5551 coding->consumed_char = pos - coding->src_pos;
5552 coding->charbuf_used = buf - coding->charbuf;
5553 coding->chars_at_source = 0;
5554 }
5555
5556
5557 /* Encode the text at CODING->src_object into CODING->dst_object.
5558 CODING->src_object is a buffer or a string.
5559 CODING->dst_object is a buffer or nil.
5560
5561 If CODING->src_object is a buffer, it must be the current buffer.
5562 In this case, if CODING->src_pos is positive, it is a position of
5563 the source text in the buffer, otherwise. the source text is in the
5564 gap area of the buffer, and coding->src_pos specifies the offset of
5565 the text from GPT (which must be the same as PT). If this is the
5566 same buffer as CODING->dst_object, CODING->src_pos must be
5567 negative and CODING should not have `pre-write-conversion'.
5568
5569 If CODING->src_object is a string, CODING should not have
5570 `pre-write-conversion'.
5571
5572 If CODING->dst_object is a buffer, the encoded data is inserted at
5573 the current point of that buffer.
5574
5575 If CODING->dst_object is nil, the encoded data is placed at the
5576 memory area specified by CODING->destination. */
5577
5578 static int
5579 encode_coding (coding)
5580 struct coding_system *coding;
5581 {
5582 Lisp_Object attrs;
5583
5584 attrs = CODING_ID_ATTRS (coding->id);
5585
5586 if (BUFFERP (coding->dst_object))
5587 {
5588 set_buffer_internal (XBUFFER (coding->dst_object));
5589 coding->dst_multibyte
5590 = ! NILP (current_buffer->enable_multibyte_characters);
5591 }
5592
5593 coding->consumed = coding->consumed_char = 0;
5594 coding->produced = coding->produced_char = 0;
5595 coding->result = CODING_RESULT_SUCCESS;
5596 coding->errors = 0;
5597
5598 ALLOC_CONVERSION_WORK_AREA (coding);
5599
5600 do {
5601 coding_set_source (coding);
5602 consume_chars (coding);
5603
5604 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5605 translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5606
5607 coding_set_destination (coding);
5608 (*(coding->encoder)) (coding);
5609 } while (coding->consumed_char < coding->src_chars);
5610
5611 if (BUFFERP (coding->dst_object))
5612 insert_from_gap (coding->produced_char, coding->produced);
5613
5614 return (coding->result);
5615 }
5616
5617 /* Work buffer */
5618
5619 /* List of currently used working buffer. */
5620 Lisp_Object Vcode_conversion_work_buf_list;
5621
5622 /* A working buffer used by the top level conversion. */
5623 Lisp_Object Vcode_conversion_reused_work_buf;
5624
5625
5626 /* Return a working buffer that can be freely used by the following
5627 code conversion. MULTIBYTEP specifies the multibyteness of the
5628 buffer. */
5629
5630 Lisp_Object
5631 make_conversion_work_buffer (multibytep)
5632 int multibytep;
5633 {
5634 struct buffer *current = current_buffer;
5635 Lisp_Object buf;
5636
5637 if (NILP (Vcode_conversion_work_buf_list))
5638 {
5639 if (NILP (Vcode_conversion_reused_work_buf))
5640 Vcode_conversion_reused_work_buf
5641 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5642 Vcode_conversion_work_buf_list
5643 = Fcons (Vcode_conversion_reused_work_buf, Qnil);
5644 }
5645 else
5646 {
5647 int depth = Flength (Vcode_conversion_work_buf_list);
5648 char str[128];
5649
5650 sprintf (str, " *code-conversion-work*<%d>", depth);
5651 Vcode_conversion_work_buf_list
5652 = Fcons (Fget_buffer_create (build_string (str)),
5653 Vcode_conversion_work_buf_list);
5654 }
5655
5656 buf = XCAR (Vcode_conversion_work_buf_list);
5657 set_buffer_internal (XBUFFER (buf));
5658 current_buffer->undo_list = Qt;
5659 Ferase_buffer ();
5660 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5661 set_buffer_internal (current);
5662 return buf;
5663 }
5664
5665 static struct coding_system *saved_coding;
5666
5667 Lisp_Object
5668 code_conversion_restore (info)
5669 Lisp_Object info;
5670 {
5671 int depth = Flength (Vcode_conversion_work_buf_list);
5672 Lisp_Object buf;
5673
5674 if (depth > 0)
5675 {
5676 buf = XCAR (Vcode_conversion_work_buf_list);
5677 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5678 if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5679 Fkill_buffer (buf);
5680 }
5681
5682 if (saved_coding->dst_object == Qt
5683 && saved_coding->destination)
5684 xfree (saved_coding->destination);
5685
5686 return save_excursion_restore (info);
5687 }
5688
5689
5690 int
5691 decode_coding_gap (coding, chars, bytes)
5692 struct coding_system *coding;
5693 EMACS_INT chars, bytes;
5694 {
5695 int count = specpdl_ptr - specpdl;
5696
5697 saved_coding = coding;
5698 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5699
5700 coding->src_object = Fcurrent_buffer ();
5701 coding->src_chars = chars;
5702 coding->src_bytes = bytes;
5703 coding->src_pos = -chars;
5704 coding->src_pos_byte = -bytes;
5705 coding->src_multibyte = chars < bytes;
5706 coding->dst_object = coding->src_object;
5707 coding->dst_pos = PT;
5708 coding->dst_pos_byte = PT_BYTE;
5709
5710 if (CODING_REQUIRE_DETECTION (coding))
5711 detect_coding (coding);
5712
5713 decode_coding (coding);
5714
5715 unbind_to (count, Qnil);
5716 return coding->result;
5717 }
5718
5719 int
5720 encode_coding_gap (coding, chars, bytes)
5721 struct coding_system *coding;
5722 EMACS_INT chars, bytes;
5723 {
5724 int count = specpdl_ptr - specpdl;
5725 Lisp_Object buffer;
5726
5727 saved_coding = coding;
5728 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5729
5730 buffer = Fcurrent_buffer ();
5731 coding->src_object = buffer;
5732 coding->src_chars = chars;
5733 coding->src_bytes = bytes;
5734 coding->src_pos = -chars;
5735 coding->src_pos_byte = -bytes;
5736 coding->src_multibyte = chars < bytes;
5737 coding->dst_object = coding->src_object;
5738 coding->dst_pos = PT;
5739 coding->dst_pos_byte = PT_BYTE;
5740
5741 encode_coding (coding);
5742
5743 unbind_to (count, Qnil);
5744 return coding->result;
5745 }
5746
5747
5748 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5749 SRC_OBJECT into DST_OBJECT by coding context CODING.
5750
5751 SRC_OBJECT is a buffer, a string, or Qnil.
5752
5753 If it is a buffer, the text is at point of the buffer. FROM and TO
5754 are positions in the buffer.
5755
5756 If it is a string, the text is at the beginning of the string.
5757 FROM and TO are indices to the string.
5758
5759 If it is nil, the text is at coding->source. FROM and TO are
5760 indices to coding->source.
5761
5762 DST_OBJECT is a buffer, Qt, or Qnil.
5763
5764 If it is a buffer, the decoded text is inserted at point of the
5765 buffer. If the buffer is the same as SRC_OBJECT, the source text
5766 is deleted.
5767
5768 If it is Qt, a string is made from the decoded text, and
5769 set in CODING->dst_object.
5770
5771 If it is Qnil, the decoded text is stored at CODING->destination.
5772 The called must allocate CODING->dst_bytes bytes at
5773 CODING->destination by xmalloc. If the decoded text is longer than
5774 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5775 */
5776
5777 void
5778 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5779 dst_object)
5780 struct coding_system *coding;
5781 Lisp_Object src_object;
5782 EMACS_INT from, from_byte, to, to_byte;
5783 Lisp_Object dst_object;
5784 {
5785 int count = specpdl_ptr - specpdl;
5786 unsigned char *destination;
5787 EMACS_INT dst_bytes;
5788 EMACS_INT chars = to - from;
5789 EMACS_INT bytes = to_byte - from_byte;
5790 Lisp_Object attrs;
5791
5792 saved_coding = coding;
5793 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5794
5795 if (NILP (dst_object))
5796 {
5797 destination = coding->destination;
5798 dst_bytes = coding->dst_bytes;
5799 }
5800
5801 coding->src_object = src_object;
5802 coding->src_chars = chars;
5803 coding->src_bytes = bytes;
5804 coding->src_multibyte = chars < bytes;
5805
5806 if (STRINGP (src_object))
5807 {
5808 coding->src_pos = from;
5809 coding->src_pos_byte = from_byte;
5810 }
5811 else if (BUFFERP (src_object))
5812 {
5813 set_buffer_internal (XBUFFER (src_object));
5814 if (from != GPT)
5815 move_gap_both (from, from_byte);
5816 if (EQ (src_object, dst_object))
5817 {
5818 TEMP_SET_PT_BOTH (from, from_byte);
5819 del_range_both (from, from_byte, to, to_byte, 1);
5820 coding->src_pos = -chars;
5821 coding->src_pos_byte = -bytes;
5822 }
5823 else
5824 {
5825 coding->src_pos = from;
5826 coding->src_pos_byte = from_byte;
5827 }
5828 }
5829
5830 if (CODING_REQUIRE_DETECTION (coding))
5831 detect_coding (coding);
5832 attrs = CODING_ID_ATTRS (coding->id);
5833
5834 if (! NILP (CODING_ATTR_POST_READ (attrs))
5835 || EQ (dst_object, Qt))
5836 {
5837 coding->dst_object = make_conversion_work_buffer (1);
5838 coding->dst_pos = BEG;
5839 coding->dst_pos_byte = BEG_BYTE;
5840 coding->dst_multibyte = 1;
5841 }
5842 else if (BUFFERP (dst_object))
5843 {
5844 coding->dst_object = dst_object;
5845 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
5846 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
5847 coding->dst_multibyte
5848 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
5849 }
5850 else
5851 {
5852 coding->dst_object = Qnil;
5853 coding->dst_multibyte = 1;
5854 }
5855
5856 decode_coding (coding);
5857
5858 if (BUFFERP (coding->dst_object))
5859 set_buffer_internal (XBUFFER (coding->dst_object));
5860
5861 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5862 {
5863 struct gcpro gcpro1, gcpro2;
5864 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
5865 Lisp_Object val;
5866
5867 GCPRO2 (coding->src_object, coding->dst_object);
5868 val = call1 (CODING_ATTR_POST_READ (attrs),
5869 make_number (coding->produced_char));
5870 UNGCPRO;
5871 CHECK_NATNUM (val);
5872 coding->produced_char += Z - prev_Z;
5873 coding->produced += Z_BYTE - prev_Z_BYTE;
5874 }
5875
5876 if (EQ (dst_object, Qt))
5877 {
5878 coding->dst_object = Fbuffer_string ();
5879 }
5880 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
5881 {
5882 set_buffer_internal (XBUFFER (coding->dst_object));
5883 if (dst_bytes < coding->produced)
5884 {
5885 destination
5886 = (unsigned char *) xrealloc (destination, coding->produced);
5887 if (! destination)
5888 {
5889 coding->result = CODING_RESULT_INSUFFICIENT_DST;
5890 unbind_to (count, Qnil);
5891 return;
5892 }
5893 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
5894 move_gap_both (BEGV, BEGV_BYTE);
5895 bcopy (BEGV_ADDR, destination, coding->produced);
5896 coding->destination = destination;
5897 }
5898 }
5899
5900 unbind_to (count, Qnil);
5901 }
5902
5903
5904 void
5905 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5906 dst_object)
5907 struct coding_system *coding;
5908 Lisp_Object src_object;
5909 EMACS_INT from, from_byte, to, to_byte;
5910 Lisp_Object dst_object;
5911 {
5912 int count = specpdl_ptr - specpdl;
5913 EMACS_INT chars = to - from;
5914 EMACS_INT bytes = to_byte - from_byte;
5915 Lisp_Object attrs;
5916
5917 saved_coding = coding;
5918 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5919
5920 coding->src_object = src_object;
5921 coding->src_chars = chars;
5922 coding->src_bytes = bytes;
5923 coding->src_multibyte = chars < bytes;
5924
5925 attrs = CODING_ID_ATTRS (coding->id);
5926
5927 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5928 {
5929 Lisp_Object val;
5930
5931 coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
5932 set_buffer_internal (XBUFFER (coding->src_object));
5933 if (STRINGP (src_object))
5934 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
5935 else if (BUFFERP (src_object))
5936 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
5937 else
5938 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
5939
5940 if (EQ (src_object, dst_object))
5941 {
5942 set_buffer_internal (XBUFFER (src_object));
5943 del_range_both (from, from_byte, to, to_byte, 1);
5944 set_buffer_internal (XBUFFER (coding->src_object));
5945 }
5946
5947 val = call2 (CODING_ATTR_PRE_WRITE (attrs),
5948 make_number (1), make_number (chars));
5949 CHECK_NATNUM (val);
5950 if (BEG != GPT)
5951 move_gap_both (BEG, BEG_BYTE);
5952 coding->src_chars = Z - BEG;
5953 coding->src_bytes = Z_BYTE - BEG_BYTE;
5954 coding->src_pos = BEG;
5955 coding->src_pos_byte = BEG_BYTE;
5956 coding->src_multibyte = Z < Z_BYTE;
5957 }
5958 else if (STRINGP (src_object))
5959 {
5960 coding->src_pos = from;
5961 coding->src_pos_byte = from_byte;
5962 }
5963 else if (BUFFERP (src_object))
5964 {
5965 set_buffer_internal (XBUFFER (src_object));
5966 if (from != GPT)
5967 move_gap_both (from, from_byte);
5968 if (EQ (src_object, dst_object))
5969 {
5970 del_range_both (from, from_byte, to, to_byte, 1);
5971 coding->src_pos = -chars;
5972 coding->src_pos_byte = -bytes;
5973 }
5974 else
5975 {
5976 coding->src_pos = from;
5977 coding->src_pos_byte = from_byte;
5978 }
5979 }
5980
5981 if (BUFFERP (dst_object))
5982 {
5983 coding->dst_object = dst_object;
5984 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
5985 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
5986 coding->dst_multibyte
5987 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
5988 }
5989 else if (EQ (dst_object, Qt))
5990 {
5991 coding->dst_object = Qnil;
5992 coding->destination = (unsigned char *) xmalloc (coding->src_chars);
5993 coding->dst_bytes = coding->src_chars;
5994 coding->dst_multibyte = 0;
5995 }
5996 else
5997 {
5998 coding->dst_object = Qnil;
5999 coding->dst_multibyte = 0;
6000 }
6001
6002 encode_coding (coding);
6003
6004 if (EQ (dst_object, Qt))
6005 {
6006 if (BUFFERP (coding->dst_object))
6007 coding->dst_object = Fbuffer_string ();
6008 else
6009 {
6010 coding->dst_object
6011 = make_unibyte_string ((char *) coding->destination,
6012 coding->produced);
6013 xfree (coding->destination);
6014 }
6015 }
6016
6017 unbind_to (count, Qnil);
6018 }
6019
6020
6021 Lisp_Object
6022 preferred_coding_system ()
6023 {
6024 int id = coding_categories[coding_priorities[0]].id;
6025
6026 return CODING_ID_NAME (id);
6027 }
6028
6029 \f
6030 #ifdef emacs
6031 /*** 8. Emacs Lisp library functions ***/
6032
6033 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6034 doc: /* Return t if OBJECT is nil or a coding-system.
6035 See the documentation of `define-coding-system' for information
6036 about coding-system objects. */)
6037 (obj)
6038 Lisp_Object obj;
6039 {
6040 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6041 }
6042
6043 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6044 Sread_non_nil_coding_system, 1, 1, 0,
6045 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6046 (prompt)
6047 Lisp_Object prompt;
6048 {
6049 Lisp_Object val;
6050 do
6051 {
6052 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6053 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6054 }
6055 while (XSTRING (val)->size == 0);
6056 return (Fintern (val, Qnil));
6057 }
6058
6059 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6060 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6061 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6062 (prompt, default_coding_system)
6063 Lisp_Object prompt, default_coding_system;
6064 {
6065 Lisp_Object val;
6066 if (SYMBOLP (default_coding_system))
6067 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6068 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6069 Qt, Qnil, Qcoding_system_history,
6070 default_coding_system, Qnil);
6071 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6072 }
6073
6074 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6075 1, 1, 0,
6076 doc: /* Check validity of CODING-SYSTEM.
6077 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6078 It is valid if it is a symbol with a non-nil `coding-system' property.
6079 The value of property should be a vector of length 5. */)
6080 (coding_system)
6081 Lisp_Object coding_system;
6082 {
6083 CHECK_SYMBOL (coding_system);
6084 if (!NILP (Fcoding_system_p (coding_system)))
6085 return coding_system;
6086 while (1)
6087 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6088 }
6089
6090 \f
6091 Lisp_Object
6092 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6093 unsigned char *src;
6094 int src_bytes, highest;
6095 int multibytep;
6096 Lisp_Object coding_system;
6097 {
6098 unsigned char *src_end = src + src_bytes;
6099 int mask = CATEGORY_MASK_ANY;
6100 int detected = 0;
6101 int c, i;
6102 Lisp_Object attrs, eol_type;
6103 Lisp_Object val;
6104 struct coding_system coding;
6105
6106 if (NILP (coding_system))
6107 coding_system = Qundecided;
6108 setup_coding_system (coding_system, &coding);
6109 attrs = CODING_ID_ATTRS (coding.id);
6110 eol_type = CODING_ID_EOL_TYPE (coding.id);
6111
6112 coding.source = src;
6113 coding.src_bytes = src_bytes;
6114 coding.src_multibyte = multibytep;
6115 coding.consumed = 0;
6116
6117 if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
6118 {
6119 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6120 }
6121 else
6122 {
6123 coding_system = Qnil;
6124 for (; src < src_end; src++)
6125 {
6126 c = *src;
6127 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6128 || c == ISO_CODE_SI
6129 || c == ISO_CODE_SO)))
6130 break;
6131 }
6132 coding.head_ascii = src - coding.source;
6133
6134 if (src < src_end)
6135 for (i = 0; i < coding_category_raw_text; i++)
6136 {
6137 enum coding_category category = coding_priorities[i];
6138 struct coding_system *this = coding_categories + category;
6139
6140 if (category >= coding_category_raw_text
6141 || detected & (1 << category))
6142 continue;
6143
6144 if (this->id < 0)
6145 {
6146 /* No coding system of this category is defined. */
6147 mask &= ~(1 << category);
6148 }
6149 else
6150 {
6151 detected |= detected_mask[category];
6152 if ((*(coding_categories[category].detector)) (&coding, &mask)
6153 && highest)
6154 {
6155 mask &= detected_mask[category];
6156 break;
6157 }
6158 }
6159 }
6160 }
6161
6162 if (!mask)
6163 val = Fcons (make_number (coding_category_raw_text), Qnil);
6164 else if (mask == CATEGORY_MASK_ANY)
6165 val = Fcons (make_number (coding_category_undecided), Qnil);
6166 else if (highest)
6167 {
6168 for (i = 0; i < coding_category_raw_text; i++)
6169 if (mask & (1 << coding_priorities[i]))
6170 {
6171 val = Fcons (make_number (coding_priorities[i]), Qnil);
6172 break;
6173 }
6174 }
6175 else
6176 {
6177 val = Qnil;
6178 for (i = coding_category_raw_text - 1; i >= 0; i--)
6179 if (mask & (1 << coding_priorities[i]))
6180 val = Fcons (make_number (coding_priorities[i]), val);
6181 }
6182
6183 {
6184 int one_byte_eol = -1, two_byte_eol = -1;
6185 Lisp_Object tail;
6186
6187 for (tail = val; CONSP (tail); tail = XCDR (tail))
6188 {
6189 struct coding_system *this
6190 = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6191 : &coding);
6192 int this_eol;
6193
6194 attrs = CODING_ID_ATTRS (this->id);
6195 eol_type = CODING_ID_EOL_TYPE (this->id);
6196 XSETCAR (tail, CODING_ID_NAME (this->id));
6197 if (VECTORP (eol_type))
6198 {
6199 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6200 {
6201 if (two_byte_eol < 0)
6202 two_byte_eol = detect_eol (this, coding.source, src_bytes);
6203 this_eol = two_byte_eol;
6204 }
6205 else
6206 {
6207 if (one_byte_eol < 0)
6208 one_byte_eol =detect_eol (this, coding.source, src_bytes);
6209 this_eol = one_byte_eol;
6210 }
6211 if (this_eol == EOL_SEEN_LF)
6212 XSETCAR (tail, AREF (eol_type, 0));
6213 else if (this_eol == EOL_SEEN_CRLF)
6214 XSETCAR (tail, AREF (eol_type, 1));
6215 else if (this_eol == EOL_SEEN_CR)
6216 XSETCAR (tail, AREF (eol_type, 2));
6217 }
6218 }
6219 }
6220
6221 return (highest ? XCAR (val) : val);
6222 }
6223
6224
6225 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6226 2, 3, 0,
6227 doc: /* Detect coding system of the text in the region between START and END.
6228 Return a list of possible coding systems ordered by priority.
6229
6230 If only ASCII characters are found, it returns a list of single element
6231 `undecided' or its subsidiary coding system according to a detected
6232 end-of-line format.
6233
6234 If optional argument HIGHEST is non-nil, return the coding system of
6235 highest priority. */)
6236 (start, end, highest)
6237 Lisp_Object start, end, highest;
6238 {
6239 int from, to;
6240 int from_byte, to_byte;
6241
6242 CHECK_NUMBER_COERCE_MARKER (start);
6243 CHECK_NUMBER_COERCE_MARKER (end);
6244
6245 validate_region (&start, &end);
6246 from = XINT (start), to = XINT (end);
6247 from_byte = CHAR_TO_BYTE (from);
6248 to_byte = CHAR_TO_BYTE (to);
6249
6250 if (from < GPT && to >= GPT)
6251 move_gap_both (to, to_byte);
6252
6253 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6254 to_byte - from_byte,
6255 !NILP (highest),
6256 !NILP (current_buffer
6257 ->enable_multibyte_characters),
6258 Qnil);
6259 }
6260
6261 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6262 1, 2, 0,
6263 doc: /* Detect coding system of the text in STRING.
6264 Return a list of possible coding systems ordered by priority.
6265
6266 If only ASCII characters are found, it returns a list of single element
6267 `undecided' or its subsidiary coding system according to a detected
6268 end-of-line format.
6269
6270 If optional argument HIGHEST is non-nil, return the coding system of
6271 highest priority. */)
6272 (string, highest)
6273 Lisp_Object string, highest;
6274 {
6275 CHECK_STRING (string);
6276
6277 return detect_coding_system (XSTRING (string)->data,
6278 STRING_BYTES (XSTRING (string)),
6279 !NILP (highest),
6280 STRING_MULTIBYTE (string),
6281 Qnil);
6282 }
6283
6284
6285 static INLINE int
6286 char_encodable_p (c, attrs)
6287 int c;
6288 Lisp_Object attrs;
6289 {
6290 Lisp_Object tail;
6291 struct charset *charset;
6292
6293 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6294 CONSP (tail); tail = XCDR (tail))
6295 {
6296 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6297 if (CHAR_CHARSET_P (c, charset))
6298 break;
6299 }
6300 return (! NILP (tail));
6301 }
6302
6303
6304 /* Return a list of coding systems that safely encode the text between
6305 START and END. If EXCLUDE is non-nil, it is a list of coding
6306 systems not to check. The returned list doesn't contain any such
6307 coding systems. In any case, If the text contains only ASCII or is
6308 unibyte, return t. */
6309
6310 DEFUN ("find-coding-systems-region-internal",
6311 Ffind_coding_systems_region_internal,
6312 Sfind_coding_systems_region_internal, 2, 3, 0,
6313 doc: /* Internal use only. */)
6314 (start, end, exclude)
6315 Lisp_Object start, end, exclude;
6316 {
6317 Lisp_Object coding_attrs_list, safe_codings;
6318 EMACS_INT start_byte, end_byte;
6319 unsigned char *p, *pbeg, *pend;
6320 int c;
6321 Lisp_Object tail, elt;
6322
6323 if (STRINGP (start))
6324 {
6325 if (!STRING_MULTIBYTE (start)
6326 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6327 return Qt;
6328 start_byte = 0;
6329 end_byte = STRING_BYTES (XSTRING (start));
6330 }
6331 else
6332 {
6333 CHECK_NUMBER_COERCE_MARKER (start);
6334 CHECK_NUMBER_COERCE_MARKER (end);
6335 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6336 args_out_of_range (start, end);
6337 if (NILP (current_buffer->enable_multibyte_characters))
6338 return Qt;
6339 start_byte = CHAR_TO_BYTE (XINT (start));
6340 end_byte = CHAR_TO_BYTE (XINT (end));
6341 if (XINT (end) - XINT (start) == end_byte - start_byte)
6342 return Qt;
6343
6344 if (start < GPT && end > GPT)
6345 {
6346 if ((GPT - start) < (end - GPT))
6347 move_gap_both (start, start_byte);
6348 else
6349 move_gap_both (end, end_byte);
6350 }
6351 }
6352
6353 coding_attrs_list = Qnil;
6354 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6355 if (NILP (exclude)
6356 || NILP (Fmemq (XCAR (tail), exclude)))
6357 {
6358 Lisp_Object attrs;
6359
6360 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6361 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6362 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6363 coding_attrs_list = Fcons (attrs, coding_attrs_list);
6364 }
6365
6366 if (STRINGP (start))
6367 p = pbeg = XSTRING (start)->data;
6368 else
6369 p = pbeg = BYTE_POS_ADDR (start_byte);
6370 pend = p + (end_byte - start_byte);
6371
6372 while (p < pend && ASCII_BYTE_P (*p)) p++;
6373 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6374
6375 while (p < pend)
6376 {
6377 if (ASCII_BYTE_P (*p))
6378 p++;
6379 else
6380 {
6381 c = STRING_CHAR_ADVANCE (p);
6382
6383 charset_map_loaded = 0;
6384 for (tail = coding_attrs_list; CONSP (tail);)
6385 {
6386 elt = XCAR (tail);
6387 if (NILP (elt))
6388 tail = XCDR (tail);
6389 else if (char_encodable_p (c, elt))
6390 tail = XCDR (tail);
6391 else if (CONSP (XCDR (tail)))
6392 {
6393 XSETCAR (tail, XCAR (XCDR (tail)));
6394 XSETCDR (tail, XCDR (XCDR (tail)));
6395 }
6396 else
6397 {
6398 XSETCAR (tail, Qnil);
6399 tail = XCDR (tail);
6400 }
6401 }
6402 if (charset_map_loaded)
6403 {
6404 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6405
6406 if (STRINGP (start))
6407 pbeg = XSTRING (start)->data;
6408 else
6409 pbeg = BYTE_POS_ADDR (start_byte);
6410 p = pbeg + p_offset;
6411 pend = pbeg + pend_offset;
6412 }
6413 }
6414 }
6415
6416 safe_codings = Qnil;
6417 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6418 if (! NILP (XCAR (tail)))
6419 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6420
6421 return safe_codings;
6422 }
6423
6424
6425 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6426 Scheck_coding_systems_region, 3, 3, 0,
6427 doc: /* Check if the region is encodable by coding systems.
6428
6429 START and END are buffer positions specifying the region.
6430 CODING-SYSTEM-LIST is a list of coding systems to check.
6431
6432 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6433 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6434 whole region, POS0, POS1, ... are buffer positions where non-encodable
6435 characters are found.
6436
6437 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6438 value is nil.
6439
6440 START may be a string. In that case, check if the string is
6441 encodable, and the value contains indices to the string instead of
6442 buffer positions. END is ignored. */)
6443 (start, end, coding_system_list)
6444 Lisp_Object start, end, coding_system_list;
6445 {
6446 Lisp_Object list;
6447 EMACS_INT start_byte, end_byte;
6448 int pos;
6449 unsigned char *p, *pbeg, *pend;
6450 int c;
6451 Lisp_Object tail, elt;
6452
6453 if (STRINGP (start))
6454 {
6455 if (!STRING_MULTIBYTE (start)
6456 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6457 return Qnil;
6458 start_byte = 0;
6459 end_byte = STRING_BYTES (XSTRING (start));
6460 pos = 0;
6461 }
6462 else
6463 {
6464 CHECK_NUMBER_COERCE_MARKER (start);
6465 CHECK_NUMBER_COERCE_MARKER (end);
6466 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6467 args_out_of_range (start, end);
6468 if (NILP (current_buffer->enable_multibyte_characters))
6469 return Qnil;
6470 start_byte = CHAR_TO_BYTE (XINT (start));
6471 end_byte = CHAR_TO_BYTE (XINT (end));
6472 if (XINT (end) - XINT (start) == end_byte - start_byte)
6473 return Qt;
6474
6475 if (start < GPT && end > GPT)
6476 {
6477 if ((GPT - start) < (end - GPT))
6478 move_gap_both (start, start_byte);
6479 else
6480 move_gap_both (end, end_byte);
6481 }
6482 pos = start;
6483 }
6484
6485 list = Qnil;
6486 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6487 {
6488 elt = XCAR (tail);
6489 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6490 Qnil)),
6491 list);
6492 }
6493
6494 if (STRINGP (start))
6495 p = pbeg = XSTRING (start)->data;
6496 else
6497 p = pbeg = BYTE_POS_ADDR (start_byte);
6498 pend = p + (end_byte - start_byte);
6499
6500 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6501 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6502
6503 while (p < pend)
6504 {
6505 if (ASCII_BYTE_P (*p))
6506 p++;
6507 else
6508 {
6509 c = STRING_CHAR_ADVANCE (p);
6510
6511 charset_map_loaded = 0;
6512 for (tail = list; CONSP (tail); tail = XCDR (tail))
6513 {
6514 elt = XCDR (XCAR (tail));
6515 if (! char_encodable_p (c, XCAR (elt)))
6516 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6517 }
6518 if (charset_map_loaded)
6519 {
6520 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6521
6522 if (STRINGP (start))
6523 pbeg = XSTRING (start)->data;
6524 else
6525 pbeg = BYTE_POS_ADDR (start_byte);
6526 p = pbeg + p_offset;
6527 pend = pbeg + pend_offset;
6528 }
6529 }
6530 pos++;
6531 }
6532
6533 tail = list;
6534 list = Qnil;
6535 for (; CONSP (tail); tail = XCDR (tail))
6536 {
6537 elt = XCAR (tail);
6538 if (CONSP (XCDR (XCDR (elt))))
6539 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6540 list);
6541 }
6542
6543 return list;
6544 }
6545
6546
6547
6548 Lisp_Object
6549 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6550 Lisp_Object start, end, coding_system, dst_object;
6551 int encodep, norecord;
6552 {
6553 struct coding_system coding;
6554 EMACS_INT from, from_byte, to, to_byte;
6555 Lisp_Object src_object;
6556
6557 CHECK_NUMBER_COERCE_MARKER (start);
6558 CHECK_NUMBER_COERCE_MARKER (end);
6559 if (NILP (coding_system))
6560 coding_system = Qno_conversion;
6561 else
6562 CHECK_CODING_SYSTEM (coding_system);
6563 src_object = Fcurrent_buffer ();
6564 if (NILP (dst_object))
6565 dst_object = src_object;
6566 else if (! EQ (dst_object, Qt))
6567 CHECK_BUFFER (dst_object);
6568
6569 validate_region (&start, &end);
6570 from = XFASTINT (start);
6571 from_byte = CHAR_TO_BYTE (from);
6572 to = XFASTINT (end);
6573 to_byte = CHAR_TO_BYTE (to);
6574
6575 setup_coding_system (coding_system, &coding);
6576 coding.mode |= CODING_MODE_LAST_BLOCK;
6577
6578 if (encodep)
6579 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6580 dst_object);
6581 else
6582 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6583 dst_object);
6584 if (! norecord)
6585 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6586
6587 if (coding.result != CODING_RESULT_SUCCESS)
6588 error ("Code conversion error: %d", coding.result);
6589
6590 return (BUFFERP (dst_object)
6591 ? make_number (coding.produced_char)
6592 : coding.dst_object);
6593 }
6594
6595
6596 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6597 3, 4, "r\nzCoding system: ",
6598 doc: /* Decode the current region from the specified coding system.
6599 When called from a program, takes four arguments:
6600 START, END, CODING-SYSTEM, and DESTINATION.
6601 START and END are buffer positions.
6602
6603 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6604 If nil, the region between START and END is replace by the decoded text.
6605 If buffer, the decoded text is inserted in the buffer.
6606 If t, the decoded text is returned.
6607
6608 This function sets `last-coding-system-used' to the precise coding system
6609 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6610 not fully specified.)
6611 It returns the length of the decoded text. */)
6612 (start, end, coding_system, destination)
6613 Lisp_Object start, end, coding_system, destination;
6614 {
6615 return code_convert_region (start, end, coding_system, destination, 0, 0);
6616 }
6617
6618 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6619 3, 4, "r\nzCoding system: ",
6620 doc: /* Encode the current region by specified coding system.
6621 When called from a program, takes three arguments:
6622 START, END, and CODING-SYSTEM. START and END are buffer positions.
6623
6624 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6625 If nil, the region between START and END is replace by the encoded text.
6626 If buffer, the encoded text is inserted in the buffer.
6627 If t, the encoded text is returned.
6628
6629 This function sets `last-coding-system-used' to the precise coding system
6630 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6631 not fully specified.)
6632 It returns the length of the encoded text. */)
6633 (start, end, coding_system, destination)
6634 Lisp_Object start, end, coding_system, destination;
6635 {
6636 return code_convert_region (start, end, coding_system, destination, 1, 0);
6637 }
6638
6639 Lisp_Object
6640 code_convert_string (string, coding_system, dst_object,
6641 encodep, nocopy, norecord)
6642 Lisp_Object string, coding_system, dst_object;
6643 int encodep, nocopy, norecord;
6644 {
6645 struct coding_system coding;
6646 EMACS_INT chars, bytes;
6647
6648 CHECK_STRING (string);
6649 if (NILP (coding_system))
6650 {
6651 if (! norecord)
6652 Vlast_coding_system_used = Qno_conversion;
6653 if (NILP (dst_object))
6654 return (nocopy ? Fcopy_sequence (string) : string);
6655 }
6656
6657 if (NILP (coding_system))
6658 coding_system = Qno_conversion;
6659 else
6660 CHECK_CODING_SYSTEM (coding_system);
6661 if (NILP (dst_object))
6662 dst_object = Qt;
6663 else if (! EQ (dst_object, Qt))
6664 CHECK_BUFFER (dst_object);
6665
6666 setup_coding_system (coding_system, &coding);
6667 coding.mode |= CODING_MODE_LAST_BLOCK;
6668 chars = XSTRING (string)->size;
6669 bytes = STRING_BYTES (XSTRING (string));
6670 if (encodep)
6671 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6672 else
6673 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6674 if (! norecord)
6675 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6676
6677 if (coding.result != CODING_RESULT_SUCCESS)
6678 error ("Code conversion error: %d", coding.result);
6679
6680 return (BUFFERP (dst_object)
6681 ? make_number (coding.produced_char)
6682 : coding.dst_object);
6683 }
6684
6685
6686 /* Encode or decode STRING according to CODING_SYSTEM.
6687 Do not set Vlast_coding_system_used.
6688
6689 This function is called only from macros DECODE_FILE and
6690 ENCODE_FILE, thus we ignore character composition. */
6691
6692 Lisp_Object
6693 code_convert_string_norecord (string, coding_system, encodep)
6694 Lisp_Object string, coding_system;
6695 int encodep;
6696 {
6697 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
6698 }
6699
6700
6701 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6702 2, 4, 0,
6703 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6704
6705 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6706 if the decoding operation is trivial.
6707
6708 Optional fourth arg BUFFER non-nil meant that the decoded text is
6709 inserted in BUFFER instead of returned as a astring. In this case,
6710 the return value is BUFFER.
6711
6712 This function sets `last-coding-system-used' to the precise coding system
6713 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6714 not fully specified. */)
6715 (string, coding_system, nocopy, buffer)
6716 Lisp_Object string, coding_system, nocopy, buffer;
6717 {
6718 return code_convert_string (string, coding_system, buffer,
6719 0, ! NILP (nocopy), 0);
6720 }
6721
6722 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6723 2, 4, 0,
6724 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6725
6726 Optional third arg NOCOPY non-nil means it is OK to return STRING
6727 itself if the encoding operation is trivial.
6728
6729 Optional fourth arg BUFFER non-nil meant that the encoded text is
6730 inserted in BUFFER instead of returned as a astring. In this case,
6731 the return value is BUFFER.
6732
6733 This function sets `last-coding-system-used' to the precise coding system
6734 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6735 not fully specified.) */)
6736 (string, coding_system, nocopy, buffer)
6737 Lisp_Object string, coding_system, nocopy, buffer;
6738 {
6739 return code_convert_string (string, coding_system, buffer,
6740 nocopy, ! NILP (nocopy), 1);
6741 }
6742
6743 \f
6744 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6745 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6746 Return the corresponding character. */)
6747 (code)
6748 Lisp_Object code;
6749 {
6750 Lisp_Object spec, attrs, val;
6751 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6752 int c;
6753
6754 CHECK_NATNUM (code);
6755 c = XFASTINT (code);
6756 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6757 attrs = AREF (spec, 0);
6758
6759 if (ASCII_BYTE_P (c)
6760 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6761 return code;
6762
6763 val = CODING_ATTR_CHARSET_LIST (attrs);
6764 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6765 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6766 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
6767
6768 if (c <= 0x7F)
6769 charset = charset_roman;
6770 else if (c >= 0xA0 && c < 0xDF)
6771 {
6772 charset = charset_kana;
6773 c -= 0x80;
6774 }
6775 else
6776 {
6777 int s1 = c >> 8, s2 = c & 0x7F;
6778
6779 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6780 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6781 error ("Invalid code: %d", code);
6782 SJIS_TO_JIS (c);
6783 charset = charset_kanji;
6784 }
6785 c = DECODE_CHAR (charset, c);
6786 if (c < 0)
6787 error ("Invalid code: %d", code);
6788 return make_number (c);
6789 }
6790
6791
6792 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6793 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6794 Return the corresponding code in SJIS. */)
6795 (ch)
6796 Lisp_Object ch;
6797 {
6798 Lisp_Object spec, attrs, charset_list;
6799 int c;
6800 struct charset *charset;
6801 unsigned code;
6802
6803 CHECK_CHARACTER (ch);
6804 c = XFASTINT (ch);
6805 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6806 attrs = AREF (spec, 0);
6807
6808 if (ASCII_CHAR_P (c)
6809 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6810 return ch;
6811
6812 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6813 charset = char_charset (c, charset_list, &code);
6814 if (code == CHARSET_INVALID_CODE (charset))
6815 error ("Can't encode by shift_jis encoding: %d", c);
6816 JIS_TO_SJIS (code);
6817
6818 return make_number (code);
6819 }
6820
6821 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6822 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6823 Return the corresponding character. */)
6824 (code)
6825 Lisp_Object code;
6826 {
6827 Lisp_Object spec, attrs, val;
6828 struct charset *charset_roman, *charset_big5, *charset;
6829 int c;
6830
6831 CHECK_NATNUM (code);
6832 c = XFASTINT (code);
6833 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6834 attrs = AREF (spec, 0);
6835
6836 if (ASCII_BYTE_P (c)
6837 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6838 return code;
6839
6840 val = CODING_ATTR_CHARSET_LIST (attrs);
6841 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6842 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
6843
6844 if (c <= 0x7F)
6845 charset = charset_roman;
6846 else
6847 {
6848 int b1 = c >> 8, b2 = c & 0x7F;
6849 if (b1 < 0xA1 || b1 > 0xFE
6850 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
6851 error ("Invalid code: %d", code);
6852 charset = charset_big5;
6853 }
6854 c = DECODE_CHAR (charset, (unsigned )c);
6855 if (c < 0)
6856 error ("Invalid code: %d", code);
6857 return make_number (c);
6858 }
6859
6860 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6861 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6862 Return the corresponding character code in Big5. */)
6863 (ch)
6864 Lisp_Object ch;
6865 {
6866 Lisp_Object spec, attrs, charset_list;
6867 struct charset *charset;
6868 int c;
6869 unsigned code;
6870
6871 CHECK_CHARACTER (ch);
6872 c = XFASTINT (ch);
6873 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6874 attrs = AREF (spec, 0);
6875 if (ASCII_CHAR_P (c)
6876 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6877 return ch;
6878
6879 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6880 charset = char_charset (c, charset_list, &code);
6881 if (code == CHARSET_INVALID_CODE (charset))
6882 error ("Can't encode by Big5 encoding: %d", c);
6883
6884 return make_number (code);
6885 }
6886
6887 \f
6888 DEFUN ("set-terminal-coding-system-internal",
6889 Fset_terminal_coding_system_internal,
6890 Sset_terminal_coding_system_internal, 1, 1, 0,
6891 doc: /* Internal use only. */)
6892 (coding_system)
6893 {
6894 CHECK_SYMBOL (coding_system);
6895 setup_coding_system (Fcheck_coding_system (coding_system),
6896 &terminal_coding);
6897
6898 /* We had better not send unsafe characters to terminal. */
6899 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
6900 /* Characer composition should be disabled. */
6901 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6902 terminal_coding.src_multibyte = 1;
6903 terminal_coding.dst_multibyte = 0;
6904 return Qnil;
6905 }
6906
6907 DEFUN ("set-safe-terminal-coding-system-internal",
6908 Fset_safe_terminal_coding_system_internal,
6909 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
6910 doc: /* Internal use only. */)
6911 (coding_system)
6912 {
6913 CHECK_SYMBOL (coding_system);
6914 setup_coding_system (Fcheck_coding_system (coding_system),
6915 &safe_terminal_coding);
6916 /* Characer composition should be disabled. */
6917 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6918 safe_terminal_coding.src_multibyte = 1;
6919 safe_terminal_coding.dst_multibyte = 0;
6920 return Qnil;
6921 }
6922
6923 DEFUN ("terminal-coding-system",
6924 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6925 doc: /* Return coding system specified for terminal output. */)
6926 ()
6927 {
6928 return CODING_ID_NAME (terminal_coding.id);
6929 }
6930
6931 DEFUN ("set-keyboard-coding-system-internal",
6932 Fset_keyboard_coding_system_internal,
6933 Sset_keyboard_coding_system_internal, 1, 1, 0,
6934 doc: /* Internal use only. */)
6935 (coding_system)
6936 Lisp_Object coding_system;
6937 {
6938 CHECK_SYMBOL (coding_system);
6939 setup_coding_system (Fcheck_coding_system (coding_system),
6940 &keyboard_coding);
6941 /* Characer composition should be disabled. */
6942 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6943 return Qnil;
6944 }
6945
6946 DEFUN ("keyboard-coding-system",
6947 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6948 doc: /* Return coding system specified for decoding keyboard input. */)
6949 ()
6950 {
6951 return CODING_ID_NAME (keyboard_coding.id);
6952 }
6953
6954 \f
6955 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6956 Sfind_operation_coding_system, 1, MANY, 0,
6957 doc: /* Choose a coding system for an operation based on the target name.
6958 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
6959 DECODING-SYSTEM is the coding system to use for decoding
6960 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
6961 for encoding (in case OPERATION does encoding).
6962
6963 The first argument OPERATION specifies an I/O primitive:
6964 For file I/O, `insert-file-contents' or `write-region'.
6965 For process I/O, `call-process', `call-process-region', or `start-process'.
6966 For network I/O, `open-network-stream'.
6967
6968 The remaining arguments should be the same arguments that were passed
6969 to the primitive. Depending on which primitive, one of those arguments
6970 is selected as the TARGET. For example, if OPERATION does file I/O,
6971 whichever argument specifies the file name is TARGET.
6972
6973 TARGET has a meaning which depends on OPERATION:
6974 For file I/O, TARGET is a file name.
6975 For process I/O, TARGET is a process name.
6976 For network I/O, TARGET is a service name or a port number
6977
6978 This function looks up what specified for TARGET in,
6979 `file-coding-system-alist', `process-coding-system-alist',
6980 or `network-coding-system-alist' depending on OPERATION.
6981 They may specify a coding system, a cons of coding systems,
6982 or a function symbol to call.
6983 In the last case, we call the function with one argument,
6984 which is a list of all the arguments given to this function.
6985
6986 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
6987 (nargs, args)
6988 int nargs;
6989 Lisp_Object *args;
6990 {
6991 Lisp_Object operation, target_idx, target, val;
6992 register Lisp_Object chain;
6993
6994 if (nargs < 2)
6995 error ("Too few arguments");
6996 operation = args[0];
6997 if (!SYMBOLP (operation)
6998 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
6999 error ("Invalid first arguement");
7000 if (nargs < 1 + XINT (target_idx))
7001 error ("Too few arguments for operation: %s",
7002 XSYMBOL (operation)->name->data);
7003 target = args[XINT (target_idx) + 1];
7004 if (!(STRINGP (target)
7005 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7006 error ("Invalid %dth argument", XINT (target_idx) + 1);
7007
7008 chain = ((EQ (operation, Qinsert_file_contents)
7009 || EQ (operation, Qwrite_region))
7010 ? Vfile_coding_system_alist
7011 : (EQ (operation, Qopen_network_stream)
7012 ? Vnetwork_coding_system_alist
7013 : Vprocess_coding_system_alist));
7014 if (NILP (chain))
7015 return Qnil;
7016
7017 for (; CONSP (chain); chain = XCDR (chain))
7018 {
7019 Lisp_Object elt;
7020
7021 elt = XCAR (chain);
7022 if (CONSP (elt)
7023 && ((STRINGP (target)
7024 && STRINGP (XCAR (elt))
7025 && fast_string_match (XCAR (elt), target) >= 0)
7026 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7027 {
7028 val = XCDR (elt);
7029 /* Here, if VAL is both a valid coding system and a valid
7030 function symbol, we return VAL as a coding system. */
7031 if (CONSP (val))
7032 return val;
7033 if (! SYMBOLP (val))
7034 return Qnil;
7035 if (! NILP (Fcoding_system_p (val)))
7036 return Fcons (val, val);
7037 if (! NILP (Ffboundp (val)))
7038 {
7039 val = call1 (val, Flist (nargs, args));
7040 if (CONSP (val))
7041 return val;
7042 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7043 return Fcons (val, val);
7044 }
7045 return Qnil;
7046 }
7047 }
7048 return Qnil;
7049 }
7050
7051 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7052 Sset_coding_system_priority, 1, MANY, 0,
7053 doc: /* Put higher priority to coding systems of the arguments. */)
7054 (nargs, args)
7055 int nargs;
7056 Lisp_Object *args;
7057 {
7058 int i, j;
7059 int changed[coding_category_max];
7060 enum coding_category priorities[coding_category_max];
7061
7062 bzero (changed, sizeof changed);
7063
7064 for (i = j = 0; i < nargs; i++)
7065 {
7066 enum coding_category category;
7067 Lisp_Object spec, attrs;
7068
7069 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7070 attrs = AREF (spec, 0);
7071 category = XINT (CODING_ATTR_CATEGORY (attrs));
7072 if (changed[category])
7073 /* Ignore this coding system because a coding system of the
7074 same category already had a higher priority. */
7075 continue;
7076 changed[category] = 1;
7077 priorities[j++] = category;
7078 if (coding_categories[category].id >= 0
7079 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7080 setup_coding_system (args[i], &coding_categories[category]);
7081 }
7082
7083 /* Now we have decided top J priorities. Reflect the order of the
7084 original priorities to the remaining priorities. */
7085
7086 for (i = j, j = 0; i < coding_category_max; i++, j++)
7087 {
7088 while (j < coding_category_max
7089 && changed[coding_priorities[j]])
7090 j++;
7091 if (j == coding_category_max)
7092 abort ();
7093 priorities[i] = coding_priorities[j];
7094 }
7095
7096 bcopy (priorities, coding_priorities, sizeof priorities);
7097 return Qnil;
7098 }
7099
7100 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7101 Scoding_system_priority_list, 0, 1, 0,
7102 doc: /* Return a list of coding systems ordered by their priorities. */)
7103 (highestp)
7104 Lisp_Object highestp;
7105 {
7106 int i;
7107 Lisp_Object val;
7108
7109 for (i = 0, val = Qnil; i < coding_category_max; i++)
7110 {
7111 enum coding_category category = coding_priorities[i];
7112 int id = coding_categories[category].id;
7113 Lisp_Object attrs;
7114
7115 if (id < 0)
7116 continue;
7117 attrs = CODING_ID_ATTRS (id);
7118 if (! NILP (highestp))
7119 return CODING_ATTR_BASE_NAME (attrs);
7120 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7121 }
7122 return Fnreverse (val);
7123 }
7124
7125 static Lisp_Object
7126 make_subsidiaries (base)
7127 Lisp_Object base;
7128 {
7129 Lisp_Object subsidiaries;
7130 char *suffixes[] = { "-unix", "-dos", "-mac" };
7131 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7132 char *buf = (char *) alloca (base_name_len + 6);
7133 int i;
7134
7135 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7136 subsidiaries = Fmake_vector (make_number (3), Qnil);
7137 for (i = 0; i < 3; i++)
7138 {
7139 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7140 ASET (subsidiaries, i, intern (buf));
7141 }
7142 return subsidiaries;
7143 }
7144
7145
7146 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7147 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7148 doc: /* For internal use only. */)
7149 (nargs, args)
7150 int nargs;
7151 Lisp_Object *args;
7152 {
7153 Lisp_Object name;
7154 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7155 Lisp_Object attrs; /* Vector of attributes. */
7156 Lisp_Object eol_type;
7157 Lisp_Object aliases;
7158 Lisp_Object coding_type, charset_list, safe_charsets;
7159 enum coding_category category;
7160 Lisp_Object tail, val;
7161 int max_charset_id = 0;
7162 int i;
7163
7164 if (nargs < coding_arg_max)
7165 goto short_args;
7166
7167 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7168
7169 name = args[coding_arg_name];
7170 CHECK_SYMBOL (name);
7171 CODING_ATTR_BASE_NAME (attrs) = name;
7172
7173 val = args[coding_arg_mnemonic];
7174 if (! STRINGP (val))
7175 CHECK_CHARACTER (val);
7176 CODING_ATTR_MNEMONIC (attrs) = val;
7177
7178 coding_type = args[coding_arg_coding_type];
7179 CHECK_SYMBOL (coding_type);
7180 CODING_ATTR_TYPE (attrs) = coding_type;
7181
7182 charset_list = args[coding_arg_charset_list];
7183 if (SYMBOLP (charset_list))
7184 {
7185 if (EQ (charset_list, Qiso_2022))
7186 {
7187 if (! EQ (coding_type, Qiso_2022))
7188 error ("Invalid charset-list");
7189 charset_list = Viso_2022_charset_list;
7190 }
7191 else if (EQ (charset_list, Qemacs_mule))
7192 {
7193 if (! EQ (coding_type, Qemacs_mule))
7194 error ("Invalid charset-list");
7195 charset_list = Vemacs_mule_charset_list;
7196 }
7197 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7198 if (max_charset_id < XFASTINT (XCAR (tail)))
7199 max_charset_id = XFASTINT (XCAR (tail));
7200 }
7201 else
7202 {
7203 charset_list = Fcopy_sequence (charset_list);
7204 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7205 {
7206 struct charset *charset;
7207
7208 val = Fcar (tail);
7209 CHECK_CHARSET_GET_CHARSET (val, charset);
7210 if (EQ (coding_type, Qiso_2022)
7211 ? CHARSET_ISO_FINAL (charset) < 0
7212 : EQ (coding_type, Qemacs_mule)
7213 ? CHARSET_EMACS_MULE_ID (charset) < 0
7214 : 0)
7215 error ("Can't handle charset `%s'",
7216 XSYMBOL (CHARSET_NAME (charset))->name->data);
7217
7218 XCAR (tail) = make_number (charset->id);
7219 if (max_charset_id < charset->id)
7220 max_charset_id = charset->id;
7221 }
7222 }
7223 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7224
7225 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7226 make_number (255));
7227 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7228 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7229 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7230
7231 val = args[coding_arg_decode_translation_table];
7232 if (! NILP (val))
7233 CHECK_CHAR_TABLE (val);
7234 CODING_ATTR_DECODE_TBL (attrs) = val;
7235
7236 val = args[coding_arg_encode_translation_table];
7237 if (! NILP (val))
7238 CHECK_CHAR_TABLE (val);
7239 CODING_ATTR_ENCODE_TBL (attrs) = val;
7240
7241 val = args[coding_arg_post_read_conversion];
7242 CHECK_SYMBOL (val);
7243 CODING_ATTR_POST_READ (attrs) = val;
7244
7245 val = args[coding_arg_pre_write_conversion];
7246 CHECK_SYMBOL (val);
7247 CODING_ATTR_PRE_WRITE (attrs) = val;
7248
7249 val = args[coding_arg_default_char];
7250 if (NILP (val))
7251 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7252 else
7253 {
7254 CHECK_CHARACTER (val);
7255 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7256 }
7257
7258 val = args[coding_arg_plist];
7259 CHECK_LIST (val);
7260 CODING_ATTR_PLIST (attrs) = val;
7261
7262 if (EQ (coding_type, Qcharset))
7263 {
7264 val = Fmake_vector (make_number (256), Qnil);
7265
7266 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7267 {
7268 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7269
7270 for (i = charset->code_space[0]; i <= charset->code_space[1]; i++)
7271 if (NILP (AREF (val, i)))
7272 ASET (val, i, XCAR (tail));
7273 }
7274 ASET (attrs, coding_attr_charset_valids, val);
7275 category = coding_category_charset;
7276 }
7277 else if (EQ (coding_type, Qccl))
7278 {
7279 Lisp_Object valids;
7280
7281 if (nargs < coding_arg_ccl_max)
7282 goto short_args;
7283
7284 val = args[coding_arg_ccl_decoder];
7285 CHECK_CCL_PROGRAM (val);
7286 if (VECTORP (val))
7287 val = Fcopy_sequence (val);
7288 ASET (attrs, coding_attr_ccl_decoder, val);
7289
7290 val = args[coding_arg_ccl_encoder];
7291 CHECK_CCL_PROGRAM (val);
7292 if (VECTORP (val))
7293 val = Fcopy_sequence (val);
7294 ASET (attrs, coding_attr_ccl_encoder, val);
7295
7296 val = args[coding_arg_ccl_valids];
7297 valids = Fmake_string (make_number (256), make_number (0));
7298 for (tail = val; !NILP (tail); tail = Fcdr (tail))
7299 {
7300 val = Fcar (tail);
7301 if (INTEGERP (val))
7302 ASET (valids, XINT (val), 1);
7303 else
7304 {
7305 int from, to;
7306
7307 CHECK_CONS (val);
7308 CHECK_NUMBER (XCAR (val));
7309 CHECK_NUMBER (XCDR (val));
7310 from = XINT (XCAR (val));
7311 to = XINT (XCDR (val));
7312 for (i = from; i <= to; i++)
7313 ASET (valids, i, 1);
7314 }
7315 }
7316 ASET (attrs, coding_attr_ccl_valids, valids);
7317
7318 category = coding_category_ccl;
7319 }
7320 else if (EQ (coding_type, Qutf_16))
7321 {
7322 Lisp_Object bom, endian;
7323
7324 if (nargs < coding_arg_utf16_max)
7325 goto short_args;
7326
7327 bom = args[coding_arg_utf16_bom];
7328 if (! NILP (bom) && ! EQ (bom, Qt))
7329 {
7330 CHECK_CONS (bom);
7331 CHECK_CODING_SYSTEM (XCAR (bom));
7332 CHECK_CODING_SYSTEM (XCDR (bom));
7333 }
7334 ASET (attrs, coding_attr_utf_16_bom, bom);
7335
7336 endian = args[coding_arg_utf16_endian];
7337 ASET (attrs, coding_attr_utf_16_endian, endian);
7338
7339 category = (CONSP (bom)
7340 ? coding_category_utf_16_auto
7341 : NILP (bom)
7342 ? (NILP (endian)
7343 ? coding_category_utf_16_be_nosig
7344 : coding_category_utf_16_le_nosig)
7345 : (NILP (endian)
7346 ? coding_category_utf_16_be
7347 : coding_category_utf_16_le));
7348 }
7349 else if (EQ (coding_type, Qiso_2022))
7350 {
7351 Lisp_Object initial, reg_usage, request, flags;
7352 struct charset *charset;
7353 int i, id;
7354
7355 if (nargs < coding_arg_iso2022_max)
7356 goto short_args;
7357
7358 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7359 CHECK_VECTOR (initial);
7360 for (i = 0; i < 4; i++)
7361 {
7362 val = Faref (initial, make_number (i));
7363 if (! NILP (val))
7364 {
7365 CHECK_CHARSET_GET_ID (val, id);
7366 ASET (initial, i, make_number (id));
7367 }
7368 else
7369 ASET (initial, i, make_number (-1));
7370 }
7371
7372 reg_usage = args[coding_arg_iso2022_reg_usage];
7373 CHECK_CONS (reg_usage);
7374 CHECK_NATNUM (XCAR (reg_usage));
7375 CHECK_NATNUM (XCDR (reg_usage));
7376
7377 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7378 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
7379 {
7380 int id;
7381
7382 val = Fcar (tail);
7383 CHECK_CONS (val);
7384 CHECK_CHARSET_GET_ID (XCAR (val), id);
7385 CHECK_NATNUM (XCDR (val));
7386 if (XINT (XCDR (val)) >= 4)
7387 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7388 XCAR (val) = make_number (id);
7389 }
7390
7391 flags = args[coding_arg_iso2022_flags];
7392 CHECK_NATNUM (flags);
7393 i = XINT (flags);
7394 if (EQ (args[coding_arg_charset_list], Qiso_2022))
7395 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7396
7397 ASET (attrs, coding_attr_iso_initial, initial);
7398 ASET (attrs, coding_attr_iso_usage, reg_usage);
7399 ASET (attrs, coding_attr_iso_request, request);
7400 ASET (attrs, coding_attr_iso_flags, flags);
7401 setup_iso_safe_charsets (attrs);
7402
7403 if (i & CODING_ISO_FLAG_SEVEN_BITS)
7404 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7405 | CODING_ISO_FLAG_SINGLE_SHIFT))
7406 ? coding_category_iso_7_else
7407 : EQ (args[coding_arg_charset_list], Qiso_2022)
7408 ? coding_category_iso_7
7409 : coding_category_iso_7_tight);
7410 else
7411 {
7412 int id = XINT (AREF (initial, 1));
7413
7414 category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7415 | CODING_ISO_FLAG_SINGLE_SHIFT))
7416 || EQ (args[coding_arg_charset_list], Qiso_2022)
7417 || id < 0)
7418 ? coding_category_iso_8_else
7419 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7420 ? coding_category_iso_8_1
7421 : coding_category_iso_8_2);
7422 }
7423 }
7424 else if (EQ (coding_type, Qemacs_mule))
7425 {
7426 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7427 ASET (attrs, coding_attr_emacs_mule_full, Qt);
7428
7429 category = coding_category_emacs_mule;
7430 }
7431 else if (EQ (coding_type, Qshift_jis))
7432 {
7433
7434 struct charset *charset;
7435
7436 if (XINT (Flength (charset_list)) != 3)
7437 error ("There should be just three charsets");
7438
7439 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7440 if (CHARSET_DIMENSION (charset) != 1)
7441 error ("Dimension of charset %s is not one",
7442 XSYMBOL (CHARSET_NAME (charset))->name->data);
7443
7444 charset_list = XCDR (charset_list);
7445 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7446 if (CHARSET_DIMENSION (charset) != 1)
7447 error ("Dimension of charset %s is not one",
7448 XSYMBOL (CHARSET_NAME (charset))->name->data);
7449
7450 charset_list = XCDR (charset_list);
7451 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7452 if (CHARSET_DIMENSION (charset) != 2)
7453 error ("Dimension of charset %s is not two",
7454 XSYMBOL (CHARSET_NAME (charset))->name->data);
7455
7456 category = coding_category_sjis;
7457 Vsjis_coding_system = name;
7458 }
7459 else if (EQ (coding_type, Qbig5))
7460 {
7461 struct charset *charset;
7462
7463 if (XINT (Flength (charset_list)) != 2)
7464 error ("There should be just two charsets");
7465
7466 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7467 if (CHARSET_DIMENSION (charset) != 1)
7468 error ("Dimension of charset %s is not one",
7469 XSYMBOL (CHARSET_NAME (charset))->name->data);
7470
7471 charset_list = XCDR (charset_list);
7472 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7473 if (CHARSET_DIMENSION (charset) != 2)
7474 error ("Dimension of charset %s is not two",
7475 XSYMBOL (CHARSET_NAME (charset))->name->data);
7476
7477 category = coding_category_big5;
7478 Vbig5_coding_system = name;
7479 }
7480 else if (EQ (coding_type, Qraw_text))
7481 category = coding_category_raw_text;
7482 else if (EQ (coding_type, Qutf_8))
7483 category = coding_category_utf_8;
7484 else if (EQ (coding_type, Qundecided))
7485 category = coding_category_undecided;
7486 else
7487 error ("Invalid coding system type: %s",
7488 XSYMBOL (coding_type)->name->data);
7489
7490 CODING_ATTR_CATEGORY (attrs) = make_number (category);
7491
7492 eol_type = args[coding_arg_eol_type];
7493 if (! NILP (eol_type)
7494 && ! EQ (eol_type, Qunix)
7495 && ! EQ (eol_type, Qdos)
7496 && ! EQ (eol_type, Qmac))
7497 error ("Invalid eol-type");
7498
7499 aliases = Fcons (name, Qnil);
7500
7501 if (NILP (eol_type))
7502 {
7503 eol_type = make_subsidiaries (name);
7504 for (i = 0; i < 3; i++)
7505 {
7506 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7507
7508 this_name = AREF (eol_type, i);
7509 this_aliases = Fcons (this_name, Qnil);
7510 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7511 this_spec = Fmake_vector (make_number (3), attrs);
7512 ASET (this_spec, 1, this_aliases);
7513 ASET (this_spec, 2, this_eol_type);
7514 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7515 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7516 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7517 Vcoding_system_alist);
7518 }
7519 }
7520
7521 spec_vec = Fmake_vector (make_number (3), attrs);
7522 ASET (spec_vec, 1, aliases);
7523 ASET (spec_vec, 2, eol_type);
7524
7525 Fputhash (name, spec_vec, Vcoding_system_hash_table);
7526 Vcoding_system_list = Fcons (name, Vcoding_system_list);
7527 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7528 Vcoding_system_alist);
7529
7530 {
7531 int id = coding_categories[category].id;
7532
7533 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7534 setup_coding_system (name, &coding_categories[category]);
7535 }
7536
7537 return Qnil;
7538
7539 short_args:
7540 return Fsignal (Qwrong_number_of_arguments,
7541 Fcons (intern ("define-coding-system-internal"),
7542 make_number (nargs)));
7543 }
7544
7545 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7546 Sdefine_coding_system_alias, 2, 2, 0,
7547 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7548 (alias, coding_system)
7549 Lisp_Object alias, coding_system;
7550 {
7551 Lisp_Object spec, aliases, eol_type;
7552
7553 CHECK_SYMBOL (alias);
7554 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7555 aliases = AREF (spec, 1);
7556 while (!NILP (XCDR (aliases)))
7557 aliases = XCDR (aliases);
7558 XCDR (aliases) = Fcons (alias, Qnil);
7559
7560 eol_type = AREF (spec, 2);
7561 if (VECTORP (eol_type))
7562 {
7563 Lisp_Object subsidiaries;
7564 int i;
7565
7566 subsidiaries = make_subsidiaries (alias);
7567 for (i = 0; i < 3; i++)
7568 Fdefine_coding_system_alias (AREF (subsidiaries, i),
7569 AREF (eol_type, i));
7570
7571 ASET (spec, 2, subsidiaries);
7572 }
7573
7574 Fputhash (alias, spec, Vcoding_system_hash_table);
7575
7576 return Qnil;
7577 }
7578
7579 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7580 1, 1, 0,
7581 doc: /* Return the base of CODING-SYSTEM.
7582 Any alias or subsidiary coding systems are not base coding system. */)
7583 (coding_system)
7584 Lisp_Object coding_system;
7585 {
7586 Lisp_Object spec, attrs;
7587
7588 if (NILP (coding_system))
7589 return (Qno_conversion);
7590 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7591 attrs = AREF (spec, 0);
7592 return CODING_ATTR_BASE_NAME (attrs);
7593 }
7594
7595 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7596 1, 1, 0,
7597 doc: "Return the property list of CODING-SYSTEM.")
7598 (coding_system)
7599 Lisp_Object coding_system;
7600 {
7601 Lisp_Object spec, attrs;
7602
7603 if (NILP (coding_system))
7604 coding_system = Qno_conversion;
7605 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7606 attrs = AREF (spec, 0);
7607 return CODING_ATTR_PLIST (attrs);
7608 }
7609
7610
7611 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7612 1, 1, 0,
7613 doc: /* Return the list of aliases of CODING-SYSTEM.
7614 A base coding system is what made by `define-coding-system'.
7615 Any alias nor subsidiary coding systems are not base coding system. */)
7616 (coding_system)
7617 Lisp_Object coding_system;
7618 {
7619 Lisp_Object spec;
7620
7621 if (NILP (coding_system))
7622 coding_system = Qno_conversion;
7623 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7624 return AREF (spec, 2);
7625 }
7626
7627 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7628 Scoding_system_eol_type, 1, 1, 0,
7629 doc: /* Return eol-type of CODING-SYSTEM.
7630 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7631
7632 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7633 and CR respectively.
7634
7635 A vector value indicates that a format of end-of-line should be
7636 detected automatically. Nth element of the vector is the subsidiary
7637 coding system whose eol-type is N. */)
7638 (coding_system)
7639 Lisp_Object coding_system;
7640 {
7641 Lisp_Object spec, eol_type;
7642 int n;
7643
7644 if (NILP (coding_system))
7645 coding_system = Qno_conversion;
7646 if (! CODING_SYSTEM_P (coding_system))
7647 return Qnil;
7648 spec = CODING_SYSTEM_SPEC (coding_system);
7649 eol_type = AREF (spec, 2);
7650 if (VECTORP (eol_type))
7651 return Fcopy_sequence (eol_type);
7652 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7653 return make_number (n);
7654 }
7655
7656 #endif /* emacs */
7657
7658 \f
7659 /*** 9. Post-amble ***/
7660
7661 void
7662 init_coding_once ()
7663 {
7664 int i;
7665
7666 for (i = 0; i < coding_category_max; i++)
7667 {
7668 coding_categories[i].id = -1;
7669 coding_priorities[i] = i;
7670 }
7671
7672 /* ISO2022 specific initialize routine. */
7673 for (i = 0; i < 0x20; i++)
7674 iso_code_class[i] = ISO_control_0;
7675 for (i = 0x21; i < 0x7F; i++)
7676 iso_code_class[i] = ISO_graphic_plane_0;
7677 for (i = 0x80; i < 0xA0; i++)
7678 iso_code_class[i] = ISO_control_1;
7679 for (i = 0xA1; i < 0xFF; i++)
7680 iso_code_class[i] = ISO_graphic_plane_1;
7681 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7682 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7683 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7684 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7685 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7686 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7687 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7688 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7689 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7690 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7691
7692 inhibit_pre_post_conversion = 0;
7693
7694 for (i = 0; i < 256; i++)
7695 {
7696 emacs_mule_bytes[i] = 1;
7697 }
7698 }
7699
7700 #ifdef emacs
7701
7702 void
7703 syms_of_coding ()
7704 {
7705 staticpro (&Vcoding_system_hash_table);
7706 Vcoding_system_hash_table = Fmakehash (Qeq);
7707
7708 staticpro (&Vsjis_coding_system);
7709 Vsjis_coding_system = Qnil;
7710
7711 staticpro (&Vbig5_coding_system);
7712 Vbig5_coding_system = Qnil;
7713
7714 staticpro (&Vcode_conversion_work_buf_list);
7715 Vcode_conversion_work_buf_list = Qnil;
7716
7717 staticpro (&Vcode_conversion_reused_work_buf);
7718 Vcode_conversion_reused_work_buf = Qnil;
7719
7720 DEFSYM (Qcharset, "charset");
7721 DEFSYM (Qtarget_idx, "target-idx");
7722 DEFSYM (Qcoding_system_history, "coding-system-history");
7723 Fset (Qcoding_system_history, Qnil);
7724
7725 /* Target FILENAME is the first argument. */
7726 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7727 /* Target FILENAME is the third argument. */
7728 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7729
7730 DEFSYM (Qcall_process, "call-process");
7731 /* Target PROGRAM is the first argument. */
7732 Fput (Qcall_process, Qtarget_idx, make_number (0));
7733
7734 DEFSYM (Qcall_process_region, "call-process-region");
7735 /* Target PROGRAM is the third argument. */
7736 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7737
7738 DEFSYM (Qstart_process, "start-process");
7739 /* Target PROGRAM is the third argument. */
7740 Fput (Qstart_process, Qtarget_idx, make_number (2));
7741
7742 DEFSYM (Qopen_network_stream, "open-network-stream");
7743 /* Target SERVICE is the fourth argument. */
7744 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7745
7746 DEFSYM (Qcoding_system, "coding-system");
7747 DEFSYM (Qcoding_aliases, "coding-aliases");
7748
7749 DEFSYM (Qeol_type, "eol-type");
7750 DEFSYM (Qunix, "unix");
7751 DEFSYM (Qdos, "dos");
7752 DEFSYM (Qmac, "mac");
7753
7754 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7755 DEFSYM (Qpost_read_conversion, "post-read-conversion");
7756 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7757 DEFSYM (Qdefault_char, "default-char");
7758 DEFSYM (Qundecided, "undecided");
7759 DEFSYM (Qno_conversion, "no-conversion");
7760 DEFSYM (Qraw_text, "raw-text");
7761
7762 DEFSYM (Qiso_2022, "iso-2022");
7763
7764 DEFSYM (Qutf_8, "utf-8");
7765
7766 DEFSYM (Qutf_16, "utf-16");
7767 DEFSYM (Qutf_16_be, "utf-16-be");
7768 DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
7769 DEFSYM (Qutf_16_le, "utf-16-l3");
7770 DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
7771 DEFSYM (Qsignature, "signature");
7772 DEFSYM (Qendian, "endian");
7773 DEFSYM (Qbig, "big");
7774 DEFSYM (Qlittle, "little");
7775
7776 DEFSYM (Qshift_jis, "shift-jis");
7777 DEFSYM (Qbig5, "big5");
7778
7779 DEFSYM (Qcoding_system_p, "coding-system-p");
7780
7781 DEFSYM (Qcoding_system_error, "coding-system-error");
7782 Fput (Qcoding_system_error, Qerror_conditions,
7783 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7784 Fput (Qcoding_system_error, Qerror_message,
7785 build_string ("Invalid coding system"));
7786
7787 /* Intern this now in case it isn't already done.
7788 Setting this variable twice is harmless.
7789 But don't staticpro it here--that is done in alloc.c. */
7790 Qchar_table_extra_slots = intern ("char-table-extra-slots");
7791
7792 DEFSYM (Qtranslation_table, "translation-table");
7793 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7794 DEFSYM (Qtranslation_table_id, "translation-table-id");
7795 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
7796 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
7797
7798 DEFSYM (Qchar_coding_system, "char-coding-system");
7799
7800 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
7801
7802 DEFSYM (Qvalid_codes, "valid-codes");
7803
7804 DEFSYM (Qemacs_mule, "emacs-mule");
7805
7806 Vcoding_category_table
7807 = Fmake_vector (make_number (coding_category_max), Qnil);
7808 staticpro (&Vcoding_category_table);
7809 /* Followings are target of code detection. */
7810 ASET (Vcoding_category_table, coding_category_iso_7,
7811 intern ("coding-category-iso-7"));
7812 ASET (Vcoding_category_table, coding_category_iso_7_tight,
7813 intern ("coding-category-iso-7-tight"));
7814 ASET (Vcoding_category_table, coding_category_iso_8_1,
7815 intern ("coding-category-iso-8-1"));
7816 ASET (Vcoding_category_table, coding_category_iso_8_2,
7817 intern ("coding-category-iso-8-2"));
7818 ASET (Vcoding_category_table, coding_category_iso_7_else,
7819 intern ("coding-category-iso-7-else"));
7820 ASET (Vcoding_category_table, coding_category_iso_8_else,
7821 intern ("coding-category-iso-8-else"));
7822 ASET (Vcoding_category_table, coding_category_utf_8,
7823 intern ("coding-category-utf-8"));
7824 ASET (Vcoding_category_table, coding_category_utf_16_be,
7825 intern ("coding-category-utf-16-be"));
7826 ASET (Vcoding_category_table, coding_category_utf_16_le,
7827 intern ("coding-category-utf-16-le"));
7828 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
7829 intern ("coding-category-utf-16-be-nosig"));
7830 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
7831 intern ("coding-category-utf-16-le-nosig"));
7832 ASET (Vcoding_category_table, coding_category_charset,
7833 intern ("coding-category-charset"));
7834 ASET (Vcoding_category_table, coding_category_sjis,
7835 intern ("coding-category-sjis"));
7836 ASET (Vcoding_category_table, coding_category_big5,
7837 intern ("coding-category-big5"));
7838 ASET (Vcoding_category_table, coding_category_ccl,
7839 intern ("coding-category-ccl"));
7840 ASET (Vcoding_category_table, coding_category_emacs_mule,
7841 intern ("coding-category-emacs-mule"));
7842 /* Followings are NOT target of code detection. */
7843 ASET (Vcoding_category_table, coding_category_raw_text,
7844 intern ("coding-category-raw-text"));
7845 ASET (Vcoding_category_table, coding_category_undecided,
7846 intern ("coding-category-undecided"));
7847
7848 {
7849 Lisp_Object args[coding_arg_max];
7850 Lisp_Object plist[14];
7851 int i;
7852
7853 for (i = 0; i < coding_arg_max; i++)
7854 args[i] = Qnil;
7855
7856 plist[0] = intern (":name");
7857 plist[1] = args[coding_arg_name] = Qno_conversion;
7858 plist[2] = intern (":mnemonic");
7859 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
7860 plist[4] = intern (":coding-type");
7861 plist[5] = args[coding_arg_coding_type] = Qraw_text;
7862 plist[6] = intern (":ascii-compatible-p");
7863 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
7864 plist[8] = intern (":default-char");
7865 plist[9] = args[coding_arg_default_char] = make_number (0);
7866 plist[10] = intern (":docstring");
7867 plist[11] = build_string ("Do no conversion.\n\
7868 \n\
7869 When you visit a file with this coding, the file is read into a\n\
7870 unibyte buffer as is, thus each byte of a file is treated as a\n\
7871 character.");
7872 plist[12] = intern (":eol-type");
7873 plist[13] = args[coding_arg_eol_type] = Qunix;
7874 args[coding_arg_plist] = Flist (14, plist);
7875 Fdefine_coding_system_internal (coding_arg_max, args);
7876 }
7877
7878 setup_coding_system (Qno_conversion, &keyboard_coding);
7879 setup_coding_system (Qno_conversion, &terminal_coding);
7880 setup_coding_system (Qno_conversion, &safe_terminal_coding);
7881
7882 defsubr (&Scoding_system_p);
7883 defsubr (&Sread_coding_system);
7884 defsubr (&Sread_non_nil_coding_system);
7885 defsubr (&Scheck_coding_system);
7886 defsubr (&Sdetect_coding_region);
7887 defsubr (&Sdetect_coding_string);
7888 defsubr (&Sfind_coding_systems_region_internal);
7889 defsubr (&Scheck_coding_systems_region);
7890 defsubr (&Sdecode_coding_region);
7891 defsubr (&Sencode_coding_region);
7892 defsubr (&Sdecode_coding_string);
7893 defsubr (&Sencode_coding_string);
7894 defsubr (&Sdecode_sjis_char);
7895 defsubr (&Sencode_sjis_char);
7896 defsubr (&Sdecode_big5_char);
7897 defsubr (&Sencode_big5_char);
7898 defsubr (&Sset_terminal_coding_system_internal);
7899 defsubr (&Sset_safe_terminal_coding_system_internal);
7900 defsubr (&Sterminal_coding_system);
7901 defsubr (&Sset_keyboard_coding_system_internal);
7902 defsubr (&Skeyboard_coding_system);
7903 defsubr (&Sfind_operation_coding_system);
7904 defsubr (&Sset_coding_system_priority);
7905 defsubr (&Sdefine_coding_system_internal);
7906 defsubr (&Sdefine_coding_system_alias);
7907 defsubr (&Scoding_system_base);
7908 defsubr (&Scoding_system_plist);
7909 defsubr (&Scoding_system_aliases);
7910 defsubr (&Scoding_system_eol_type);
7911 defsubr (&Scoding_system_priority_list);
7912
7913 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7914 doc: /* List of coding systems.
7915
7916 Do not alter the value of this variable manually. This variable should be
7917 updated by the functions `define-coding-system' and
7918 `define-coding-system-alias'. */);
7919 Vcoding_system_list = Qnil;
7920
7921 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7922 doc: /* Alist of coding system names.
7923 Each element is one element list of coding system name.
7924 This variable is given to `completing-read' as TABLE argument.
7925
7926 Do not alter the value of this variable manually. This variable should be
7927 updated by the functions `make-coding-system' and
7928 `define-coding-system-alias'. */);
7929 Vcoding_system_alist = Qnil;
7930
7931 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7932 doc: /* List of coding-categories (symbols) ordered by priority.
7933
7934 On detecting a coding system, Emacs tries code detection algorithms
7935 associated with each coding-category one by one in this order. When
7936 one algorithm agrees with a byte sequence of source text, the coding
7937 system bound to the corresponding coding-category is selected. */);
7938 {
7939 int i;
7940
7941 Vcoding_category_list = Qnil;
7942 for (i = coding_category_max - 1; i >= 0; i--)
7943 Vcoding_category_list
7944 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7945 Vcoding_category_list);
7946 }
7947
7948 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7949 doc: /* Specify the coding system for read operations.
7950 It is useful to bind this variable with `let', but do not set it globally.
7951 If the value is a coding system, it is used for decoding on read operation.
7952 If not, an appropriate element is used from one of the coding system alists:
7953 There are three such tables, `file-coding-system-alist',
7954 `process-coding-system-alist', and `network-coding-system-alist'. */);
7955 Vcoding_system_for_read = Qnil;
7956
7957 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7958 doc: /* Specify the coding system for write operations.
7959 Programs bind this variable with `let', but you should not set it globally.
7960 If the value is a coding system, it is used for encoding of output,
7961 when writing it to a file and when sending it to a file or subprocess.
7962
7963 If this does not specify a coding system, an appropriate element
7964 is used from one of the coding system alists:
7965 There are three such tables, `file-coding-system-alist',
7966 `process-coding-system-alist', and `network-coding-system-alist'.
7967 For output to files, if the above procedure does not specify a coding system,
7968 the value of `buffer-file-coding-system' is used. */);
7969 Vcoding_system_for_write = Qnil;
7970
7971 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7972 doc: /*
7973 Coding system used in the latest file or process I/O. */);
7974 Vlast_coding_system_used = Qnil;
7975
7976 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7977 doc: /*
7978 *Non-nil means always inhibit code conversion of end-of-line format.
7979 See info node `Coding Systems' and info node `Text and Binary' concerning
7980 such conversion. */);
7981 inhibit_eol_conversion = 0;
7982
7983 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7984 doc: /*
7985 Non-nil means process buffer inherits coding system of process output.
7986 Bind it to t if the process output is to be treated as if it were a file
7987 read from some filesystem. */);
7988 inherit_process_coding_system = 0;
7989
7990 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7991 doc: /*
7992 Alist to decide a coding system to use for a file I/O operation.
7993 The format is ((PATTERN . VAL) ...),
7994 where PATTERN is a regular expression matching a file name,
7995 VAL is a coding system, a cons of coding systems, or a function symbol.
7996 If VAL is a coding system, it is used for both decoding and encoding
7997 the file contents.
7998 If VAL is a cons of coding systems, the car part is used for decoding,
7999 and the cdr part is used for encoding.
8000 If VAL is a function symbol, the function must return a coding system
8001 or a cons of coding systems which are used as above. The function gets
8002 the arguments with which `find-operation-coding-systems' was called.
8003
8004 See also the function `find-operation-coding-system'
8005 and the variable `auto-coding-alist'. */);
8006 Vfile_coding_system_alist = Qnil;
8007
8008 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8009 doc: /*
8010 Alist to decide a coding system to use for a process I/O operation.
8011 The format is ((PATTERN . VAL) ...),
8012 where PATTERN is a regular expression matching a program name,
8013 VAL is a coding system, a cons of coding systems, or a function symbol.
8014 If VAL is a coding system, it is used for both decoding what received
8015 from the program and encoding what sent to the program.
8016 If VAL is a cons of coding systems, the car part is used for decoding,
8017 and the cdr part is used for encoding.
8018 If VAL is a function symbol, the function must return a coding system
8019 or a cons of coding systems which are used as above.
8020
8021 See also the function `find-operation-coding-system'. */);
8022 Vprocess_coding_system_alist = Qnil;
8023
8024 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8025 doc: /*
8026 Alist to decide a coding system to use for a network I/O operation.
8027 The format is ((PATTERN . VAL) ...),
8028 where PATTERN is a regular expression matching a network service name
8029 or is a port number to connect to,
8030 VAL is a coding system, a cons of coding systems, or a function symbol.
8031 If VAL is a coding system, it is used for both decoding what received
8032 from the network stream and encoding what sent to the network stream.
8033 If VAL is a cons of coding systems, the car part is used for decoding,
8034 and the cdr part is used for encoding.
8035 If VAL is a function symbol, the function must return a coding system
8036 or a cons of coding systems which are used as above.
8037
8038 See also the function `find-operation-coding-system'. */);
8039 Vnetwork_coding_system_alist = Qnil;
8040
8041 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8042 doc: /* Coding system to use with system messages.
8043 Also used for decoding keyboard input on X Window system. */);
8044 Vlocale_coding_system = Qnil;
8045
8046 /* The eol mnemonics are reset in startup.el system-dependently. */
8047 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8048 doc: /*
8049 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8050 eol_mnemonic_unix = build_string (":");
8051
8052 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8053 doc: /*
8054 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8055 eol_mnemonic_dos = build_string ("\\");
8056
8057 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8058 doc: /*
8059 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8060 eol_mnemonic_mac = build_string ("/");
8061
8062 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8063 doc: /*
8064 *String displayed in mode line when end-of-line format is not yet determined. */);
8065 eol_mnemonic_undecided = build_string (":");
8066
8067 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8068 doc: /*
8069 *Non-nil enables character translation while encoding and decoding. */);
8070 Venable_character_translation = Qt;
8071
8072 DEFVAR_LISP ("standard-translation-table-for-decode",
8073 &Vstandard_translation_table_for_decode,
8074 doc: /* Table for translating characters while decoding. */);
8075 Vstandard_translation_table_for_decode = Qnil;
8076
8077 DEFVAR_LISP ("standard-translation-table-for-encode",
8078 &Vstandard_translation_table_for_encode,
8079 doc: /* Table for translating characters while encoding. */);
8080 Vstandard_translation_table_for_encode = Qnil;
8081
8082 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8083 doc: /* Alist of charsets vs revision numbers.
8084 While encoding, if a charset (car part of an element) is found,
8085 designate it with the escape sequence identifying revision (cdr part
8086 of the element). */);
8087 Vcharset_revision_table = Qnil;
8088
8089 DEFVAR_LISP ("default-process-coding-system",
8090 &Vdefault_process_coding_system,
8091 doc: /* Cons of coding systems used for process I/O by default.
8092 The car part is used for decoding a process output,
8093 the cdr part is used for encoding a text to be sent to a process. */);
8094 Vdefault_process_coding_system = Qnil;
8095
8096 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8097 doc: /*
8098 Table of extra Latin codes in the range 128..159 (inclusive).
8099 This is a vector of length 256.
8100 If Nth element is non-nil, the existence of code N in a file
8101 \(or output of subprocess) doesn't prevent it to be detected as
8102 a coding system of ISO 2022 variant which has a flag
8103 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8104 or reading output of a subprocess.
8105 Only 128th through 159th elements has a meaning. */);
8106 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8107
8108 DEFVAR_LISP ("select-safe-coding-system-function",
8109 &Vselect_safe_coding_system_function,
8110 doc: /*
8111 Function to call to select safe coding system for encoding a text.
8112
8113 If set, this function is called to force a user to select a proper
8114 coding system which can encode the text in the case that a default
8115 coding system used in each operation can't encode the text.
8116
8117 The default value is `select-safe-coding-system' (which see). */);
8118 Vselect_safe_coding_system_function = Qnil;
8119
8120 DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
8121 doc: /*
8122 Char-table containing safe coding systems of each characters.
8123 Each element doesn't include such generic coding systems that can
8124 encode any characters. They are in the first extra slot. */);
8125 Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
8126
8127 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8128 &inhibit_iso_escape_detection,
8129 doc: /*
8130 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8131
8132 By default, on reading a file, Emacs tries to detect how the text is
8133 encoded. This code detection is sensitive to escape sequences. If
8134 the sequence is valid as ISO2022, the code is determined as one of
8135 the ISO2022 encodings, and the file is decoded by the corresponding
8136 coding system (e.g. `iso-2022-7bit').
8137
8138 However, there may be a case that you want to read escape sequences in
8139 a file as is. In such a case, you can set this variable to non-nil.
8140 Then, as the code detection ignores any escape sequences, no file is
8141 detected as encoded in some ISO2022 encoding. The result is that all
8142 escape sequences become visible in a buffer.
8143
8144 The default value is nil, and it is strongly recommended not to change
8145 it. That is because many Emacs Lisp source files that contain
8146 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8147 in Emacs's distribution, and they won't be decoded correctly on
8148 reading if you suppress escape sequence detection.
8149
8150 The other way to read escape sequences in a file without decoding is
8151 to explicitly specify some coding system that doesn't use ISO2022's
8152 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8153 inhibit_iso_escape_detection = 0;
8154 }
8155
8156 char *
8157 emacs_strerror (error_number)
8158 int error_number;
8159 {
8160 char *str;
8161
8162 synchronize_system_messages_locale ();
8163 str = strerror (error_number);
8164
8165 if (! NILP (Vlocale_coding_system))
8166 {
8167 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8168 Vlocale_coding_system,
8169 0);
8170 str = (char *) XSTRING (dec)->data;
8171 }
8172
8173 return str;
8174 }
8175
8176 #endif /* emacs */