(detected_mask): Delete unused variable.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314
315 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
316 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
317 Lisp_Object Qstart_process, Qopen_network_stream;
318 Lisp_Object Qtarget_idx;
319
320 Lisp_Object Vselect_safe_coding_system_function;
321
322 /* Mnemonic string for each format of end-of-line. */
323 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
324 /* Mnemonic string to indicate format of end-of-line is not yet
325 decided. */
326 Lisp_Object eol_mnemonic_undecided;
327
328 #ifdef emacs
329
330 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
331
332 Lisp_Object Qcoding_system_p, Qcoding_system_error;
333
334 /* Coding system emacs-mule and raw-text are for converting only
335 end-of-line format. */
336 Lisp_Object Qemacs_mule, Qraw_text;
337
338 /* Coding-systems are handed between Emacs Lisp programs and C internal
339 routines by the following three variables. */
340 /* Coding-system for reading files and receiving data from process. */
341 Lisp_Object Vcoding_system_for_read;
342 /* Coding-system for writing files and sending data to process. */
343 Lisp_Object Vcoding_system_for_write;
344 /* Coding-system actually used in the latest I/O. */
345 Lisp_Object Vlast_coding_system_used;
346
347 /* A vector of length 256 which contains information about special
348 Latin codes (especially for dealing with Microsoft codes). */
349 Lisp_Object Vlatin_extra_code_table;
350
351 /* Flag to inhibit code conversion of end-of-line format. */
352 int inhibit_eol_conversion;
353
354 /* Flag to inhibit ISO2022 escape sequence detection. */
355 int inhibit_iso_escape_detection;
356
357 /* Flag to make buffer-file-coding-system inherit from process-coding. */
358 int inherit_process_coding_system;
359
360 /* Coding system to be used to encode text for terminal display. */
361 struct coding_system terminal_coding;
362
363 /* Coding system to be used to encode text for terminal display when
364 terminal coding system is nil. */
365 struct coding_system safe_terminal_coding;
366
367 /* Coding system of what is sent from terminal keyboard. */
368 struct coding_system keyboard_coding;
369
370 Lisp_Object Vfile_coding_system_alist;
371 Lisp_Object Vprocess_coding_system_alist;
372 Lisp_Object Vnetwork_coding_system_alist;
373
374 Lisp_Object Vlocale_coding_system;
375
376 #endif /* emacs */
377
378 /* Flag to tell if we look up translation table on character code
379 conversion. */
380 Lisp_Object Venable_character_translation;
381 /* Standard translation table to look up on decoding (reading). */
382 Lisp_Object Vstandard_translation_table_for_decode;
383 /* Standard translation table to look up on encoding (writing). */
384 Lisp_Object Vstandard_translation_table_for_encode;
385
386 Lisp_Object Qtranslation_table;
387 Lisp_Object Qtranslation_table_id;
388 Lisp_Object Qtranslation_table_for_decode;
389 Lisp_Object Qtranslation_table_for_encode;
390
391 /* Alist of charsets vs revision number. */
392 static Lisp_Object Vcharset_revision_table;
393
394 /* Default coding systems used for process I/O. */
395 Lisp_Object Vdefault_process_coding_system;
396
397 /* Global flag to tell that we can't call post-read-conversion and
398 pre-write-conversion functions. Usually the value is zero, but it
399 is set to 1 temporarily while such functions are running. This is
400 to avoid infinite recursive call. */
401 static int inhibit_pre_post_conversion;
402
403 /* Two special coding systems. */
404 Lisp_Object Vsjis_coding_system;
405 Lisp_Object Vbig5_coding_system;
406
407
408 static int detect_coding_utf_8 P_ ((struct coding_system *,
409 struct coding_detection_info *info));
410 static void decode_coding_utf_8 P_ ((struct coding_system *));
411 static int encode_coding_utf_8 P_ ((struct coding_system *));
412
413 static int detect_coding_utf_16 P_ ((struct coding_system *,
414 struct coding_detection_info *info));
415 static void decode_coding_utf_16 P_ ((struct coding_system *));
416 static int encode_coding_utf_16 P_ ((struct coding_system *));
417
418 static int detect_coding_iso_2022 P_ ((struct coding_system *,
419 struct coding_detection_info *info));
420 static void decode_coding_iso_2022 P_ ((struct coding_system *));
421 static int encode_coding_iso_2022 P_ ((struct coding_system *));
422
423 static int detect_coding_emacs_mule P_ ((struct coding_system *,
424 struct coding_detection_info *info));
425 static void decode_coding_emacs_mule P_ ((struct coding_system *));
426 static int encode_coding_emacs_mule P_ ((struct coding_system *));
427
428 static int detect_coding_sjis P_ ((struct coding_system *,
429 struct coding_detection_info *info));
430 static void decode_coding_sjis P_ ((struct coding_system *));
431 static int encode_coding_sjis P_ ((struct coding_system *));
432
433 static int detect_coding_big5 P_ ((struct coding_system *,
434 struct coding_detection_info *info));
435 static void decode_coding_big5 P_ ((struct coding_system *));
436 static int encode_coding_big5 P_ ((struct coding_system *));
437
438 static int detect_coding_ccl P_ ((struct coding_system *,
439 struct coding_detection_info *info));
440 static void decode_coding_ccl P_ ((struct coding_system *));
441 static int encode_coding_ccl P_ ((struct coding_system *));
442
443 static void decode_coding_raw_text P_ ((struct coding_system *));
444 static int encode_coding_raw_text P_ ((struct coding_system *));
445
446
447 /* ISO2022 section */
448
449 #define CODING_ISO_INITIAL(coding, reg) \
450 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
451 coding_attr_iso_initial), \
452 reg)))
453
454
455 #define CODING_ISO_REQUEST(coding, charset_id) \
456 ((charset_id <= (coding)->max_charset_id \
457 ? (coding)->safe_charsets[charset_id] \
458 : -1))
459
460
461 #define CODING_ISO_FLAGS(coding) \
462 ((coding)->spec.iso_2022.flags)
463 #define CODING_ISO_DESIGNATION(coding, reg) \
464 ((coding)->spec.iso_2022.current_designation[reg])
465 #define CODING_ISO_INVOCATION(coding, plane) \
466 ((coding)->spec.iso_2022.current_invocation[plane])
467 #define CODING_ISO_SINGLE_SHIFTING(coding) \
468 ((coding)->spec.iso_2022.single_shifting)
469 #define CODING_ISO_BOL(coding) \
470 ((coding)->spec.iso_2022.bol)
471 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
472 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
473
474 /* Control characters of ISO2022. */
475 /* code */ /* function */
476 #define ISO_CODE_LF 0x0A /* line-feed */
477 #define ISO_CODE_CR 0x0D /* carriage-return */
478 #define ISO_CODE_SO 0x0E /* shift-out */
479 #define ISO_CODE_SI 0x0F /* shift-in */
480 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
481 #define ISO_CODE_ESC 0x1B /* escape */
482 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
483 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
484 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
485
486 /* All code (1-byte) of ISO2022 is classified into one of the
487 followings. */
488 enum iso_code_class_type
489 {
490 ISO_control_0, /* Control codes in the range
491 0x00..0x1F and 0x7F, except for the
492 following 5 codes. */
493 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
494 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
495 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
496 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
497 ISO_escape, /* ISO_CODE_SO (0x1B) */
498 ISO_control_1, /* Control codes in the range
499 0x80..0x9F, except for the
500 following 3 codes. */
501 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
502 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
503 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
504 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
505 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
506 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
507 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
508 };
509
510 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
511 `iso-flags' attribute of an iso2022 coding system. */
512
513 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
514 instead of the correct short-form sequence (e.g. ESC $ A). */
515 #define CODING_ISO_FLAG_LONG_FORM 0x0001
516
517 /* If set, reset graphic planes and registers at end-of-line to the
518 initial state. */
519 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
520
521 /* If set, reset graphic planes and registers before any control
522 characters to the initial state. */
523 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
524
525 /* If set, encode by 7-bit environment. */
526 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
527
528 /* If set, use locking-shift function. */
529 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
530
531 /* If set, use single-shift function. Overwrite
532 CODING_ISO_FLAG_LOCKING_SHIFT. */
533 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
534
535 /* If set, use designation escape sequence. */
536 #define CODING_ISO_FLAG_DESIGNATION 0x0040
537
538 /* If set, produce revision number sequence. */
539 #define CODING_ISO_FLAG_REVISION 0x0080
540
541 /* If set, produce ISO6429's direction specifying sequence. */
542 #define CODING_ISO_FLAG_DIRECTION 0x0100
543
544 /* If set, assume designation states are reset at beginning of line on
545 output. */
546 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
547
548 /* If set, designation sequence should be placed at beginning of line
549 on output. */
550 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
551
552 /* If set, do not encode unsafe charactes on output. */
553 #define CODING_ISO_FLAG_SAFE 0x0800
554
555 /* If set, extra latin codes (128..159) are accepted as a valid code
556 on input. */
557 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
558
559 #define CODING_ISO_FLAG_COMPOSITION 0x2000
560
561 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
562
563 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
564
565 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
566
567 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
568
569 /* A character to be produced on output if encoding of the original
570 character is prohibited by CODING_ISO_FLAG_SAFE. */
571 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
572
573
574 /* UTF-16 section */
575 #define CODING_UTF_16_BOM(coding) \
576 ((coding)->spec.utf_16.bom)
577
578 #define CODING_UTF_16_ENDIAN(coding) \
579 ((coding)->spec.utf_16.endian)
580
581 #define CODING_UTF_16_SURROGATE(coding) \
582 ((coding)->spec.utf_16.surrogate)
583
584
585 /* CCL section */
586 #define CODING_CCL_DECODER(coding) \
587 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
588 #define CODING_CCL_ENCODER(coding) \
589 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
590 #define CODING_CCL_VALIDS(coding) \
591 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
592 ->data)
593
594 /* Index for each coding category in `coding_categories' */
595
596 enum coding_category
597 {
598 coding_category_iso_7,
599 coding_category_iso_7_tight,
600 coding_category_iso_8_1,
601 coding_category_iso_8_2,
602 coding_category_iso_7_else,
603 coding_category_iso_8_else,
604 coding_category_utf_8,
605 coding_category_utf_16_auto,
606 coding_category_utf_16_be,
607 coding_category_utf_16_le,
608 coding_category_utf_16_be_nosig,
609 coding_category_utf_16_le_nosig,
610 coding_category_charset,
611 coding_category_sjis,
612 coding_category_big5,
613 coding_category_ccl,
614 coding_category_emacs_mule,
615 /* All above are targets of code detection. */
616 coding_category_raw_text,
617 coding_category_undecided,
618 coding_category_max
619 };
620
621 /* Definitions of flag bits used in detect_coding_XXXX. */
622 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
623 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
629 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
630 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
631 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
632 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
633 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
634 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
635 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
636 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
637 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
638 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
639 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
640
641 /* This value is returned if detect_coding_mask () find nothing other
642 than ASCII characters. */
643 #define CATEGORY_MASK_ANY \
644 (CATEGORY_MASK_ISO_7 \
645 | CATEGORY_MASK_ISO_7_TIGHT \
646 | CATEGORY_MASK_ISO_8_1 \
647 | CATEGORY_MASK_ISO_8_2 \
648 | CATEGORY_MASK_ISO_7_ELSE \
649 | CATEGORY_MASK_ISO_8_ELSE \
650 | CATEGORY_MASK_UTF_8 \
651 | CATEGORY_MASK_UTF_16_BE \
652 | CATEGORY_MASK_UTF_16_LE \
653 | CATEGORY_MASK_UTF_16_BE_NOSIG \
654 | CATEGORY_MASK_UTF_16_LE_NOSIG \
655 | CATEGORY_MASK_CHARSET \
656 | CATEGORY_MASK_SJIS \
657 | CATEGORY_MASK_BIG5 \
658 | CATEGORY_MASK_CCL \
659 | CATEGORY_MASK_EMACS_MULE)
660
661
662 #define CATEGORY_MASK_ISO_7BIT \
663 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
664
665 #define CATEGORY_MASK_ISO_8BIT \
666 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
667
668 #define CATEGORY_MASK_ISO_ELSE \
669 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
670
671 #define CATEGORY_MASK_ISO_ESCAPE \
672 (CATEGORY_MASK_ISO_7 \
673 | CATEGORY_MASK_ISO_7_TIGHT \
674 | CATEGORY_MASK_ISO_7_ELSE \
675 | CATEGORY_MASK_ISO_8_ELSE)
676
677 #define CATEGORY_MASK_ISO \
678 ( CATEGORY_MASK_ISO_7BIT \
679 | CATEGORY_MASK_ISO_8BIT \
680 | CATEGORY_MASK_ISO_ELSE)
681
682 #define CATEGORY_MASK_UTF_16 \
683 (CATEGORY_MASK_UTF_16_BE \
684 | CATEGORY_MASK_UTF_16_LE \
685 | CATEGORY_MASK_UTF_16_BE_NOSIG \
686 | CATEGORY_MASK_UTF_16_LE_NOSIG)
687
688
689 /* List of symbols `coding-category-xxx' ordered by priority. This
690 variable is exposed to Emacs Lisp. */
691 static Lisp_Object Vcoding_category_list;
692
693 /* Table of coding categories (Lisp symbols). This variable is for
694 internal use oly. */
695 static Lisp_Object Vcoding_category_table;
696
697 /* Table of coding-categories ordered by priority. */
698 static enum coding_category coding_priorities[coding_category_max];
699
700 /* Nth element is a coding context for the coding system bound to the
701 Nth coding category. */
702 static struct coding_system coding_categories[coding_category_max];
703
704 /*** Commonly used macros and functions ***/
705
706 #ifndef min
707 #define min(a, b) ((a) < (b) ? (a) : (b))
708 #endif
709 #ifndef max
710 #define max(a, b) ((a) > (b) ? (a) : (b))
711 #endif
712
713 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
714 do { \
715 attrs = CODING_ID_ATTRS (coding->id); \
716 eol_type = CODING_ID_EOL_TYPE (coding->id); \
717 if (VECTORP (eol_type)) \
718 eol_type = Qunix; \
719 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
720 } while (0)
721
722
723 /* Safely get one byte from the source text pointed by SRC which ends
724 at SRC_END, and set C to that byte. If there are not enough bytes
725 in the source, it jumps to `no_more_source'. The caller
726 should declare and set these variables appropriately in advance:
727 src, src_end, multibytep
728 */
729
730 #define ONE_MORE_BYTE(c) \
731 do { \
732 if (src == src_end) \
733 { \
734 if (src_base < src) \
735 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
736 goto no_more_source; \
737 } \
738 c = *src++; \
739 if (multibytep && (c & 0x80)) \
740 { \
741 if ((c & 0xFE) != 0xC0) \
742 error ("Undecodable char found"); \
743 c = ((c & 1) << 6) | *src++; \
744 } \
745 consumed_chars++; \
746 } while (0)
747
748
749 #define ONE_MORE_BYTE_NO_CHECK(c) \
750 do { \
751 c = *src++; \
752 if (multibytep && (c & 0x80)) \
753 { \
754 if ((c & 0xFE) != 0xC0) \
755 error ("Undecodable char found"); \
756 c = ((c & 1) << 6) | *src++; \
757 } \
758 consumed_chars++; \
759 } while (0)
760
761
762 /* Store a byte C in the place pointed by DST and increment DST to the
763 next free point, and increment PRODUCED_CHARS. The caller should
764 assure that C is 0..127, and declare and set the variable `dst'
765 appropriately in advance.
766 */
767
768
769 #define EMIT_ONE_ASCII_BYTE(c) \
770 do { \
771 produced_chars++; \
772 *dst++ = (c); \
773 } while (0)
774
775
776 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
777
778 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
779 do { \
780 produced_chars += 2; \
781 *dst++ = (c1), *dst++ = (c2); \
782 } while (0)
783
784
785 /* Store a byte C in the place pointed by DST and increment DST to the
786 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
787 nonzero, store in an appropriate multibyte from. The caller should
788 declare and set the variables `dst' and `multibytep' appropriately
789 in advance. */
790
791 #define EMIT_ONE_BYTE(c) \
792 do { \
793 produced_chars++; \
794 if (multibytep) \
795 { \
796 int ch = (c); \
797 if (ch >= 0x80) \
798 ch = BYTE8_TO_CHAR (ch); \
799 CHAR_STRING_ADVANCE (ch, dst); \
800 } \
801 else \
802 *dst++ = (c); \
803 } while (0)
804
805
806 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
807
808 #define EMIT_TWO_BYTES(c1, c2) \
809 do { \
810 produced_chars += 2; \
811 if (multibytep) \
812 { \
813 int ch; \
814 \
815 ch = (c1); \
816 if (ch >= 0x80) \
817 ch = BYTE8_TO_CHAR (ch); \
818 CHAR_STRING_ADVANCE (ch, dst); \
819 ch = (c2); \
820 if (ch >= 0x80) \
821 ch = BYTE8_TO_CHAR (ch); \
822 CHAR_STRING_ADVANCE (ch, dst); \
823 } \
824 else \
825 { \
826 *dst++ = (c1); \
827 *dst++ = (c2); \
828 } \
829 } while (0)
830
831
832 #define EMIT_THREE_BYTES(c1, c2, c3) \
833 do { \
834 EMIT_ONE_BYTE (c1); \
835 EMIT_TWO_BYTES (c2, c3); \
836 } while (0)
837
838
839 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
840 do { \
841 EMIT_TWO_BYTES (c1, c2); \
842 EMIT_TWO_BYTES (c3, c4); \
843 } while (0)
844
845
846 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
847 do { \
848 charset_map_loaded = 0; \
849 c = DECODE_CHAR (charset, code); \
850 if (charset_map_loaded) \
851 { \
852 unsigned char *orig = coding->source; \
853 EMACS_INT offset; \
854 \
855 coding_set_source (coding); \
856 offset = coding->source - orig; \
857 src += offset; \
858 src_base += offset; \
859 src_end += offset; \
860 } \
861 } while (0)
862
863
864 #define ASSURE_DESTINATION(bytes) \
865 do { \
866 if (dst + (bytes) >= dst_end) \
867 { \
868 int more_bytes = charbuf_end - charbuf + (bytes); \
869 \
870 dst = alloc_destination (coding, more_bytes, dst); \
871 dst_end = coding->destination + coding->dst_bytes; \
872 } \
873 } while (0)
874
875
876
877 static void
878 coding_set_source (coding)
879 struct coding_system *coding;
880 {
881 if (BUFFERP (coding->src_object))
882 {
883 struct buffer *buf = XBUFFER (coding->src_object);
884
885 if (coding->src_pos < 0)
886 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
887 else
888 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
889 }
890 else if (STRINGP (coding->src_object))
891 {
892 coding->source = (XSTRING (coding->src_object)->data
893 + coding->src_pos_byte);
894 }
895 else
896 /* Otherwise, the source is C string and is never relocated
897 automatically. Thus we don't have to update anything. */
898 ;
899 }
900
901 static void
902 coding_set_destination (coding)
903 struct coding_system *coding;
904 {
905 if (BUFFERP (coding->dst_object))
906 {
907 if (coding->src_pos < 0)
908 {
909 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
910 coding->dst_bytes = (GAP_END_ADDR
911 - (coding->src_bytes - coding->consumed)
912 - coding->destination);
913 }
914 else
915 {
916 /* We are sure that coding->dst_pos_byte is before the gap
917 of the buffer. */
918 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
919 + coding->dst_pos_byte - 1);
920 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
921 - coding->destination);
922 }
923 }
924 else
925 /* Otherwise, the destination is C string and is never relocated
926 automatically. Thus we don't have to update anything. */
927 ;
928 }
929
930
931 static void
932 coding_alloc_by_realloc (coding, bytes)
933 struct coding_system *coding;
934 EMACS_INT bytes;
935 {
936 coding->destination = (unsigned char *) xrealloc (coding->destination,
937 coding->dst_bytes + bytes);
938 coding->dst_bytes += bytes;
939 }
940
941 static void
942 coding_alloc_by_making_gap (coding, bytes)
943 struct coding_system *coding;
944 EMACS_INT bytes;
945 {
946 if (BUFFERP (coding->dst_object)
947 && EQ (coding->src_object, coding->dst_object))
948 {
949 EMACS_INT add = coding->src_bytes - coding->consumed;
950
951 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
952 make_gap (bytes);
953 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
954 }
955 else
956 {
957 Lisp_Object this_buffer;
958
959 this_buffer = Fcurrent_buffer ();
960 set_buffer_internal (XBUFFER (coding->dst_object));
961 make_gap (bytes);
962 set_buffer_internal (XBUFFER (this_buffer));
963 }
964 }
965
966
967 static unsigned char *
968 alloc_destination (coding, nbytes, dst)
969 struct coding_system *coding;
970 int nbytes;
971 unsigned char *dst;
972 {
973 EMACS_INT offset = dst - coding->destination;
974
975 if (BUFFERP (coding->dst_object))
976 coding_alloc_by_making_gap (coding, nbytes);
977 else
978 coding_alloc_by_realloc (coding, nbytes);
979 coding->result = CODING_RESULT_SUCCESS;
980 coding_set_destination (coding);
981 dst = coding->destination + offset;
982 return dst;
983 }
984
985 /** Macros for annotations. */
986
987 /* Maximum length of annotation data (sum of annotations for
988 composition and charset). */
989 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
990
991 /* An annotation data is stored in the array coding->charbuf in this
992 format:
993 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
994 LENGTH is the number of elements in the annotation.
995 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
996 FROM and TO specify the range of text annotated. They are relative
997 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
998
999 The format of the following elements depend on ANNOTATION_MASK.
1000
1001 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1002 follows:
1003 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1004 METHOD is one of enum composition_method.
1005 Optionnal COMPOSITION-COMPONENTS are characters and composition
1006 rules.
1007
1008 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1009 follows. */
1010
1011 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1012 do { \
1013 *(buf)++ = -(len); \
1014 *(buf)++ = (mask); \
1015 *(buf)++ = (from); \
1016 *(buf)++ = (to); \
1017 coding->annotated = 1; \
1018 } while (0);
1019
1020 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1021 do { \
1022 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1023 *buf++ = method; \
1024 } while (0)
1025
1026
1027 #define ADD_CHARSET_DATA(buf, from, to, id) \
1028 do { \
1029 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1030 *buf++ = id; \
1031 } while (0)
1032
1033 \f
1034 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1035
1036
1037
1038 \f
1039 /*** 3. UTF-8 ***/
1040
1041 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1042 Check if a text is encoded in UTF-8. If it is, return 1, else
1043 return 0. */
1044
1045 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1046 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1047 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1048 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1049 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1050 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1051
1052 static int
1053 detect_coding_utf_8 (coding, detect_info)
1054 struct coding_system *coding;
1055 struct coding_detection_info *detect_info;
1056 {
1057 unsigned char *src = coding->source, *src_base = src;
1058 unsigned char *src_end = coding->source + coding->src_bytes;
1059 int multibytep = coding->src_multibyte;
1060 int consumed_chars = 0;
1061 int found = 0;
1062 int incomplete;
1063
1064 detect_info->checked |= CATEGORY_MASK_UTF_8;
1065 /* A coding system of this category is always ASCII compatible. */
1066 src += coding->head_ascii;
1067
1068 while (1)
1069 {
1070 int c, c1, c2, c3, c4;
1071
1072 incomplete = 0;
1073 ONE_MORE_BYTE (c);
1074 if (UTF_8_1_OCTET_P (c))
1075 continue;
1076 incomplete = 1;
1077 ONE_MORE_BYTE (c1);
1078 if (! UTF_8_EXTRA_OCTET_P (c1))
1079 break;
1080 if (UTF_8_2_OCTET_LEADING_P (c))
1081 {
1082 found = CATEGORY_MASK_UTF_8;
1083 continue;
1084 }
1085 ONE_MORE_BYTE (c2);
1086 if (! UTF_8_EXTRA_OCTET_P (c2))
1087 break;
1088 if (UTF_8_3_OCTET_LEADING_P (c))
1089 {
1090 found = CATEGORY_MASK_UTF_8;
1091 continue;
1092 }
1093 ONE_MORE_BYTE (c3);
1094 if (! UTF_8_EXTRA_OCTET_P (c3))
1095 break;
1096 if (UTF_8_4_OCTET_LEADING_P (c))
1097 {
1098 found = CATEGORY_MASK_UTF_8;
1099 continue;
1100 }
1101 ONE_MORE_BYTE (c4);
1102 if (! UTF_8_EXTRA_OCTET_P (c4))
1103 break;
1104 if (UTF_8_5_OCTET_LEADING_P (c))
1105 {
1106 found = CATEGORY_MASK_UTF_8;
1107 continue;
1108 }
1109 break;
1110 }
1111 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1112 return 0;
1113
1114 no_more_source:
1115 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1116 {
1117 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1118 return 0;
1119 }
1120 detect_info->found |= found;
1121 return 1;
1122 }
1123
1124
1125 static void
1126 decode_coding_utf_8 (coding)
1127 struct coding_system *coding;
1128 {
1129 unsigned char *src = coding->source + coding->consumed;
1130 unsigned char *src_end = coding->source + coding->src_bytes;
1131 unsigned char *src_base;
1132 int *charbuf = coding->charbuf;
1133 int *charbuf_end = charbuf + coding->charbuf_size;
1134 int consumed_chars = 0, consumed_chars_base;
1135 int multibytep = coding->src_multibyte;
1136 Lisp_Object attr, eol_type, charset_list;
1137
1138 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1139
1140 while (1)
1141 {
1142 int c, c1, c2, c3, c4, c5;
1143
1144 src_base = src;
1145 consumed_chars_base = consumed_chars;
1146
1147 if (charbuf >= charbuf_end)
1148 break;
1149
1150 ONE_MORE_BYTE (c1);
1151 if (UTF_8_1_OCTET_P(c1))
1152 {
1153 c = c1;
1154 if (c == '\r')
1155 {
1156 if (EQ (eol_type, Qdos))
1157 {
1158 if (src == src_end)
1159 {
1160 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1161 goto no_more_source;
1162 }
1163 if (*src == '\n')
1164 ONE_MORE_BYTE (c);
1165 }
1166 else if (EQ (eol_type, Qmac))
1167 c = '\n';
1168 }
1169 }
1170 else
1171 {
1172 ONE_MORE_BYTE (c2);
1173 if (! UTF_8_EXTRA_OCTET_P (c2))
1174 goto invalid_code;
1175 if (UTF_8_2_OCTET_LEADING_P (c1))
1176 {
1177 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1178 /* Reject overlong sequences here and below. Encoders
1179 producing them are incorrect, they can be misleading,
1180 and they mess up read/write invariance. */
1181 if (c < 128)
1182 goto invalid_code;
1183 }
1184 else
1185 {
1186 ONE_MORE_BYTE (c3);
1187 if (! UTF_8_EXTRA_OCTET_P (c3))
1188 goto invalid_code;
1189 if (UTF_8_3_OCTET_LEADING_P (c1))
1190 {
1191 c = (((c1 & 0xF) << 12)
1192 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1193 if (c < 0x800
1194 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1195 goto invalid_code;
1196 }
1197 else
1198 {
1199 ONE_MORE_BYTE (c4);
1200 if (! UTF_8_EXTRA_OCTET_P (c4))
1201 goto invalid_code;
1202 if (UTF_8_4_OCTET_LEADING_P (c1))
1203 {
1204 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1205 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1206 if (c < 0x10000)
1207 goto invalid_code;
1208 }
1209 else
1210 {
1211 ONE_MORE_BYTE (c5);
1212 if (! UTF_8_EXTRA_OCTET_P (c5))
1213 goto invalid_code;
1214 if (UTF_8_5_OCTET_LEADING_P (c1))
1215 {
1216 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1217 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1218 | (c5 & 0x3F));
1219 if ((c > MAX_CHAR) || (c < 0x200000))
1220 goto invalid_code;
1221 }
1222 else
1223 goto invalid_code;
1224 }
1225 }
1226 }
1227 }
1228
1229 *charbuf++ = c;
1230 continue;
1231
1232 invalid_code:
1233 src = src_base;
1234 consumed_chars = consumed_chars_base;
1235 ONE_MORE_BYTE (c);
1236 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1237 coding->errors++;
1238 }
1239
1240 no_more_source:
1241 coding->consumed_char += consumed_chars_base;
1242 coding->consumed = src_base - coding->source;
1243 coding->charbuf_used = charbuf - coding->charbuf;
1244 }
1245
1246
1247 static int
1248 encode_coding_utf_8 (coding)
1249 struct coding_system *coding;
1250 {
1251 int multibytep = coding->dst_multibyte;
1252 int *charbuf = coding->charbuf;
1253 int *charbuf_end = charbuf + coding->charbuf_used;
1254 unsigned char *dst = coding->destination + coding->produced;
1255 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1256 int produced_chars = 0;
1257 int c;
1258
1259 if (multibytep)
1260 {
1261 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1262
1263 while (charbuf < charbuf_end)
1264 {
1265 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1266
1267 ASSURE_DESTINATION (safe_room);
1268 c = *charbuf++;
1269 if (CHAR_BYTE8_P (c))
1270 {
1271 c = CHAR_TO_BYTE8 (c);
1272 EMIT_ONE_BYTE (c);
1273 }
1274 else
1275 {
1276 CHAR_STRING_ADVANCE (c, pend);
1277 for (p = str; p < pend; p++)
1278 EMIT_ONE_BYTE (*p);
1279 }
1280 }
1281 }
1282 else
1283 {
1284 int safe_room = MAX_MULTIBYTE_LENGTH;
1285
1286 while (charbuf < charbuf_end)
1287 {
1288 ASSURE_DESTINATION (safe_room);
1289 c = *charbuf++;
1290 dst += CHAR_STRING (c, dst);
1291 produced_chars++;
1292 }
1293 }
1294 coding->result = CODING_RESULT_SUCCESS;
1295 coding->produced_char += produced_chars;
1296 coding->produced = dst - coding->destination;
1297 return 0;
1298 }
1299
1300
1301 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1302 Check if a text is encoded in one of UTF-16 based coding systems.
1303 If it is, return 1, else return 0. */
1304
1305 #define UTF_16_HIGH_SURROGATE_P(val) \
1306 (((val) & 0xFC00) == 0xD800)
1307
1308 #define UTF_16_LOW_SURROGATE_P(val) \
1309 (((val) & 0xFC00) == 0xDC00)
1310
1311 #define UTF_16_INVALID_P(val) \
1312 (((val) == 0xFFFE) \
1313 || ((val) == 0xFFFF) \
1314 || UTF_16_LOW_SURROGATE_P (val))
1315
1316
1317 static int
1318 detect_coding_utf_16 (coding, detect_info)
1319 struct coding_system *coding;
1320 struct coding_detection_info *detect_info;
1321 {
1322 unsigned char *src = coding->source, *src_base = src;
1323 unsigned char *src_end = coding->source + coding->src_bytes;
1324 int multibytep = coding->src_multibyte;
1325 int consumed_chars = 0;
1326 int c1, c2;
1327
1328 detect_info->checked |= CATEGORY_MASK_UTF_16;
1329
1330 if (coding->mode & CODING_MODE_LAST_BLOCK
1331 && (coding->src_bytes & 1))
1332 {
1333 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1334 return 0;
1335 }
1336 ONE_MORE_BYTE (c1);
1337 ONE_MORE_BYTE (c2);
1338
1339 if ((c1 == 0xFF) && (c2 == 0xFE))
1340 {
1341 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1342 | CATEGORY_MASK_UTF_16_AUTO);
1343 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1344 }
1345 else if ((c1 == 0xFE) && (c2 == 0xFF))
1346 {
1347 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1348 | CATEGORY_MASK_UTF_16_AUTO);
1349 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1350 }
1351 no_more_source:
1352 return 1;
1353 }
1354
1355 static void
1356 decode_coding_utf_16 (coding)
1357 struct coding_system *coding;
1358 {
1359 unsigned char *src = coding->source + coding->consumed;
1360 unsigned char *src_end = coding->source + coding->src_bytes;
1361 unsigned char *src_base;
1362 int *charbuf = coding->charbuf;
1363 int *charbuf_end = charbuf + coding->charbuf_size;
1364 int consumed_chars = 0, consumed_chars_base;
1365 int multibytep = coding->src_multibyte;
1366 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1367 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1368 int surrogate = CODING_UTF_16_SURROGATE (coding);
1369 Lisp_Object attr, eol_type, charset_list;
1370
1371 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1372
1373 if (bom == utf_16_with_bom)
1374 {
1375 int c, c1, c2;
1376
1377 src_base = src;
1378 ONE_MORE_BYTE (c1);
1379 ONE_MORE_BYTE (c2);
1380 c = (c1 << 8) | c2;
1381
1382 if (endian == utf_16_big_endian
1383 ? c != 0xFEFF : c != 0xFFFE)
1384 {
1385 /* The first two bytes are not BOM. Treat them as bytes
1386 for a normal character. */
1387 src = src_base;
1388 coding->errors++;
1389 }
1390 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1391 }
1392 else if (bom == utf_16_detect_bom)
1393 {
1394 /* We have already tried to detect BOM and failed in
1395 detect_coding. */
1396 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1397 }
1398
1399 while (1)
1400 {
1401 int c, c1, c2;
1402
1403 src_base = src;
1404 consumed_chars_base = consumed_chars;
1405
1406 if (charbuf + 2 >= charbuf_end)
1407 break;
1408
1409 ONE_MORE_BYTE (c1);
1410 ONE_MORE_BYTE (c2);
1411 c = (endian == utf_16_big_endian
1412 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1413 if (surrogate)
1414 {
1415 if (! UTF_16_LOW_SURROGATE_P (c))
1416 {
1417 if (endian == utf_16_big_endian)
1418 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1419 else
1420 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1421 *charbuf++ = c1;
1422 *charbuf++ = c2;
1423 coding->errors++;
1424 if (UTF_16_HIGH_SURROGATE_P (c))
1425 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1426 else
1427 *charbuf++ = c;
1428 }
1429 else
1430 {
1431 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1432 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1433 *charbuf++ = c;
1434 }
1435 }
1436 else
1437 {
1438 if (UTF_16_HIGH_SURROGATE_P (c))
1439 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1440 else
1441 *charbuf++ = c;
1442 }
1443 }
1444
1445 no_more_source:
1446 coding->consumed_char += consumed_chars_base;
1447 coding->consumed = src_base - coding->source;
1448 coding->charbuf_used = charbuf - coding->charbuf;
1449 }
1450
1451 static int
1452 encode_coding_utf_16 (coding)
1453 struct coding_system *coding;
1454 {
1455 int multibytep = coding->dst_multibyte;
1456 int *charbuf = coding->charbuf;
1457 int *charbuf_end = charbuf + coding->charbuf_used;
1458 unsigned char *dst = coding->destination + coding->produced;
1459 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1460 int safe_room = 8;
1461 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1462 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1463 int produced_chars = 0;
1464 Lisp_Object attrs, eol_type, charset_list;
1465 int c;
1466
1467 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1468
1469 if (bom != utf_16_without_bom)
1470 {
1471 ASSURE_DESTINATION (safe_room);
1472 if (big_endian)
1473 EMIT_TWO_BYTES (0xFE, 0xFF);
1474 else
1475 EMIT_TWO_BYTES (0xFF, 0xFE);
1476 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1477 }
1478
1479 while (charbuf < charbuf_end)
1480 {
1481 ASSURE_DESTINATION (safe_room);
1482 c = *charbuf++;
1483 if (c >= MAX_UNICODE_CHAR)
1484 c = coding->default_char;
1485
1486 if (c < 0x10000)
1487 {
1488 if (big_endian)
1489 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1490 else
1491 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1492 }
1493 else
1494 {
1495 int c1, c2;
1496
1497 c -= 0x10000;
1498 c1 = (c >> 10) + 0xD800;
1499 c2 = (c & 0x3FF) + 0xDC00;
1500 if (big_endian)
1501 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1502 else
1503 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1504 }
1505 }
1506 coding->result = CODING_RESULT_SUCCESS;
1507 coding->produced = dst - coding->destination;
1508 coding->produced_char += produced_chars;
1509 return 0;
1510 }
1511
1512 \f
1513 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1514
1515 /* Emacs' internal format for representation of multiple character
1516 sets is a kind of multi-byte encoding, i.e. characters are
1517 represented by variable-length sequences of one-byte codes.
1518
1519 ASCII characters and control characters (e.g. `tab', `newline') are
1520 represented by one-byte sequences which are their ASCII codes, in
1521 the range 0x00 through 0x7F.
1522
1523 8-bit characters of the range 0x80..0x9F are represented by
1524 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1525 code + 0x20).
1526
1527 8-bit characters of the range 0xA0..0xFF are represented by
1528 one-byte sequences which are their 8-bit code.
1529
1530 The other characters are represented by a sequence of `base
1531 leading-code', optional `extended leading-code', and one or two
1532 `position-code's. The length of the sequence is determined by the
1533 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1534 whereas extended leading-code and position-code take the range 0xA0
1535 through 0xFF. See `charset.h' for more details about leading-code
1536 and position-code.
1537
1538 --- CODE RANGE of Emacs' internal format ---
1539 character set range
1540 ------------- -----
1541 ascii 0x00..0x7F
1542 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1543 eight-bit-graphic 0xA0..0xBF
1544 ELSE 0x81..0x9D + [0xA0..0xFF]+
1545 ---------------------------------------------
1546
1547 As this is the internal character representation, the format is
1548 usually not used externally (i.e. in a file or in a data sent to a
1549 process). But, it is possible to have a text externally in this
1550 format (i.e. by encoding by the coding system `emacs-mule').
1551
1552 In that case, a sequence of one-byte codes has a slightly different
1553 form.
1554
1555 At first, all characters in eight-bit-control are represented by
1556 one-byte sequences which are their 8-bit code.
1557
1558 Next, character composition data are represented by the byte
1559 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1560 where,
1561 METHOD is 0xF0 plus one of composition method (enum
1562 composition_method),
1563
1564 BYTES is 0xA0 plus a byte length of this composition data,
1565
1566 CHARS is 0x20 plus a number of characters composed by this
1567 data,
1568
1569 COMPONENTs are characters of multibye form or composition
1570 rules encoded by two-byte of ASCII codes.
1571
1572 In addition, for backward compatibility, the following formats are
1573 also recognized as composition data on decoding.
1574
1575 0x80 MSEQ ...
1576 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1577
1578 Here,
1579 MSEQ is a multibyte form but in these special format:
1580 ASCII: 0xA0 ASCII_CODE+0x80,
1581 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1582 RULE is a one byte code of the range 0xA0..0xF0 that
1583 represents a composition rule.
1584 */
1585
1586 char emacs_mule_bytes[256];
1587
1588 int
1589 emacs_mule_char (coding, src, nbytes, nchars, id)
1590 struct coding_system *coding;
1591 unsigned char *src;
1592 int *nbytes, *nchars, *id;
1593 {
1594 unsigned char *src_end = coding->source + coding->src_bytes;
1595 int multibytep = coding->src_multibyte;
1596 unsigned char *src_base = src;
1597 struct charset *charset;
1598 unsigned code;
1599 int c;
1600 int consumed_chars = 0;
1601
1602 ONE_MORE_BYTE (c);
1603 switch (emacs_mule_bytes[c])
1604 {
1605 case 2:
1606 if (! (charset = emacs_mule_charset[c]))
1607 goto invalid_code;
1608 ONE_MORE_BYTE (c);
1609 code = c & 0x7F;
1610 break;
1611
1612 case 3:
1613 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1614 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1615 {
1616 ONE_MORE_BYTE (c);
1617 if (! (charset = emacs_mule_charset[c]))
1618 goto invalid_code;
1619 ONE_MORE_BYTE (c);
1620 code = c & 0x7F;
1621 }
1622 else
1623 {
1624 if (! (charset = emacs_mule_charset[c]))
1625 goto invalid_code;
1626 ONE_MORE_BYTE (c);
1627 code = (c & 0x7F) << 8;
1628 ONE_MORE_BYTE (c);
1629 code |= c & 0x7F;
1630 }
1631 break;
1632
1633 case 4:
1634 ONE_MORE_BYTE (c);
1635 if (! (charset = emacs_mule_charset[c]))
1636 goto invalid_code;
1637 ONE_MORE_BYTE (c);
1638 code = (c & 0x7F) << 8;
1639 ONE_MORE_BYTE (c);
1640 code |= c & 0x7F;
1641 break;
1642
1643 case 1:
1644 code = c;
1645 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1646 ? charset_ascii : charset_eight_bit);
1647 break;
1648
1649 default:
1650 abort ();
1651 }
1652 c = DECODE_CHAR (charset, code);
1653 if (c < 0)
1654 goto invalid_code;
1655 *nbytes = src - src_base;
1656 *nchars = consumed_chars;
1657 if (id)
1658 *id = charset->id;
1659 return c;
1660
1661 no_more_source:
1662 return -2;
1663
1664 invalid_code:
1665 return -1;
1666 }
1667
1668
1669 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1670 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1671 else return 0. */
1672
1673 static int
1674 detect_coding_emacs_mule (coding, detect_info)
1675 struct coding_system *coding;
1676 struct coding_detection_info *detect_info;
1677 {
1678 unsigned char *src = coding->source, *src_base = src;
1679 unsigned char *src_end = coding->source + coding->src_bytes;
1680 int multibytep = coding->src_multibyte;
1681 int consumed_chars = 0;
1682 int c;
1683 int found = 0;
1684 int incomplete;
1685
1686 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1687 /* A coding system of this category is always ASCII compatible. */
1688 src += coding->head_ascii;
1689
1690 while (1)
1691 {
1692 incomplete = 0;
1693 ONE_MORE_BYTE (c);
1694 incomplete = 1;
1695
1696 if (c == 0x80)
1697 {
1698 /* Perhaps the start of composite character. We simple skip
1699 it because analyzing it is too heavy for detecting. But,
1700 at least, we check that the composite character
1701 constitues of more than 4 bytes. */
1702 unsigned char *src_base;
1703
1704 repeat:
1705 src_base = src;
1706 do
1707 {
1708 ONE_MORE_BYTE (c);
1709 }
1710 while (c >= 0xA0);
1711
1712 if (src - src_base <= 4)
1713 break;
1714 found = CATEGORY_MASK_EMACS_MULE;
1715 if (c == 0x80)
1716 goto repeat;
1717 }
1718
1719 if (c < 0x80)
1720 {
1721 if (c < 0x20
1722 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1723 break;
1724 }
1725 else
1726 {
1727 unsigned char *src_base = src - 1;
1728
1729 do
1730 {
1731 ONE_MORE_BYTE (c);
1732 }
1733 while (c >= 0xA0);
1734 if (src - src_base != emacs_mule_bytes[*src_base])
1735 break;
1736 found = CATEGORY_MASK_EMACS_MULE;
1737 }
1738 }
1739 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1740 return 0;
1741
1742 no_more_source:
1743 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1744 {
1745 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1746 return 0;
1747 }
1748 detect_info->found |= found;
1749 return 1;
1750 }
1751
1752
1753 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1754
1755 /* Decode a character represented as a component of composition
1756 sequence of Emacs 20/21 style at SRC. Set C to that character and
1757 update SRC to the head of next character (or an encoded composition
1758 rule). If SRC doesn't points a composition component, set C to -1.
1759 If SRC points an invalid byte sequence, global exit by a return
1760 value 0. */
1761
1762 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1763 if (1) \
1764 { \
1765 int c; \
1766 int nbytes, nchars; \
1767 \
1768 if (src == src_end) \
1769 break; \
1770 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1771 if (c < 0) \
1772 { \
1773 if (c == -2) \
1774 break; \
1775 goto invalid_code; \
1776 } \
1777 *buf++ = c; \
1778 src += nbytes; \
1779 consumed_chars += nchars; \
1780 } \
1781 else
1782
1783
1784 /* Decode a composition rule represented as a component of composition
1785 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1786 and increment BUF. If SRC points an invalid byte sequence, set C
1787 to -1. */
1788
1789 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1790 do { \
1791 int c, gref, nref; \
1792 \
1793 if (src >= src_end) \
1794 goto invalid_code; \
1795 ONE_MORE_BYTE_NO_CHECK (c); \
1796 c -= 0x20; \
1797 if (c < 0 || c >= 81) \
1798 goto invalid_code; \
1799 \
1800 gref = c / 9, nref = c % 9; \
1801 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1802 } while (0)
1803
1804
1805 /* Decode a composition rule represented as a component of composition
1806 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1807 and increment BUF. If SRC points an invalid byte sequence, set C
1808 to -1. */
1809
1810 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1811 do { \
1812 int gref, nref; \
1813 \
1814 if (src + 1>= src_end) \
1815 goto invalid_code; \
1816 ONE_MORE_BYTE_NO_CHECK (gref); \
1817 gref -= 0x20; \
1818 ONE_MORE_BYTE_NO_CHECK (nref); \
1819 nref -= 0x20; \
1820 if (gref < 0 || gref >= 81 \
1821 || nref < 0 || nref >= 81) \
1822 goto invalid_code; \
1823 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1824 } while (0)
1825
1826
1827 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1828 do { \
1829 /* Emacs 21 style format. The first three bytes at SRC are \
1830 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1831 the byte length of this composition information, CHARS is the \
1832 number of characters composed by this composition. */ \
1833 enum composition_method method = c - 0xF2; \
1834 int *charbuf_base = charbuf; \
1835 int from, to; \
1836 int consumed_chars_limit; \
1837 int nbytes, nchars; \
1838 \
1839 ONE_MORE_BYTE (c); \
1840 nbytes = c - 0xA0; \
1841 if (nbytes < 3) \
1842 goto invalid_code; \
1843 ONE_MORE_BYTE (c); \
1844 nchars = c - 0xA0; \
1845 from = coding->produced + char_offset; \
1846 to = from + nchars; \
1847 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1848 consumed_chars_limit = consumed_chars_base + nbytes; \
1849 if (method != COMPOSITION_RELATIVE) \
1850 { \
1851 int i = 0; \
1852 while (consumed_chars < consumed_chars_limit) \
1853 { \
1854 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1855 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1856 else \
1857 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1858 i++; \
1859 } \
1860 if (consumed_chars < consumed_chars_limit) \
1861 goto invalid_code; \
1862 charbuf_base[0] -= i; \
1863 } \
1864 } while (0)
1865
1866
1867 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1868 do { \
1869 /* Emacs 20 style format for relative composition. */ \
1870 /* Store multibyte form of characters to be composed. */ \
1871 enum composition_method method = COMPOSITION_RELATIVE; \
1872 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1873 int *buf = components; \
1874 int i, j; \
1875 int from, to; \
1876 \
1877 src = src_base; \
1878 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1879 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1880 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1881 if (i < 2) \
1882 goto invalid_code; \
1883 from = coding->produced_char + char_offset; \
1884 to = from + i; \
1885 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1886 for (j = 0; j < i; j++) \
1887 *charbuf++ = components[j]; \
1888 } while (0)
1889
1890
1891 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1892 do { \
1893 /* Emacs 20 style format for rule-base composition. */ \
1894 /* Store multibyte form of characters to be composed. */ \
1895 enum composition_method method = COMPOSITION_WITH_RULE; \
1896 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1897 int *buf = components; \
1898 int i, j; \
1899 int from, to; \
1900 \
1901 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1902 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1903 { \
1904 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1905 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1906 } \
1907 if (i < 1 || (buf - components) % 2 == 0) \
1908 goto invalid_code; \
1909 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1910 goto no_more_source; \
1911 from = coding->produced_char + char_offset; \
1912 to = from + i; \
1913 ADD_COMPOSITION_DATA (buf, from, to, method); \
1914 for (j = 0; j < i; j++) \
1915 *charbuf++ = components[j]; \
1916 for (j = 0; j < i; j += 2) \
1917 *charbuf++ = components[j]; \
1918 } while (0)
1919
1920
1921 static void
1922 decode_coding_emacs_mule (coding)
1923 struct coding_system *coding;
1924 {
1925 unsigned char *src = coding->source + coding->consumed;
1926 unsigned char *src_end = coding->source + coding->src_bytes;
1927 unsigned char *src_base;
1928 int *charbuf = coding->charbuf;
1929 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1930 int consumed_chars = 0, consumed_chars_base;
1931 int multibytep = coding->src_multibyte;
1932 Lisp_Object attrs, eol_type, charset_list;
1933 int char_offset = coding->produced_char;
1934 int last_offset = char_offset;
1935 int last_id = charset_ascii;
1936
1937 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1938
1939 while (1)
1940 {
1941 int c;
1942
1943 src_base = src;
1944 consumed_chars_base = consumed_chars;
1945
1946 if (charbuf >= charbuf_end)
1947 break;
1948
1949 ONE_MORE_BYTE (c);
1950
1951 if (c < 0x80)
1952 {
1953 if (c == '\r')
1954 {
1955 if (EQ (eol_type, Qdos))
1956 {
1957 if (src == src_end)
1958 {
1959 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1960 goto no_more_source;
1961 }
1962 if (*src == '\n')
1963 ONE_MORE_BYTE (c);
1964 }
1965 else if (EQ (eol_type, Qmac))
1966 c = '\n';
1967 }
1968 *charbuf++ = c;
1969 char_offset++;
1970 }
1971 else if (c == 0x80)
1972 {
1973 ONE_MORE_BYTE (c);
1974 if (c - 0xF2 >= COMPOSITION_RELATIVE
1975 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1976 DECODE_EMACS_MULE_21_COMPOSITION (c);
1977 else if (c < 0xC0)
1978 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1979 else if (c == 0xFF)
1980 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1981 else
1982 goto invalid_code;
1983 }
1984 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1985 {
1986 int nbytes, nchars;
1987 int id;
1988
1989 src = src_base;
1990 consumed_chars = consumed_chars_base;
1991 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1992 if (c < 0)
1993 {
1994 if (c == -2)
1995 break;
1996 goto invalid_code;
1997 }
1998 if (last_id != id)
1999 {
2000 if (last_id != charset_ascii)
2001 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2002 last_id = id;
2003 last_offset = char_offset;
2004 }
2005 *charbuf++ = c;
2006 src += nbytes;
2007 consumed_chars += nchars;
2008 char_offset++;
2009 }
2010 continue;
2011
2012 invalid_code:
2013 src = src_base;
2014 consumed_chars = consumed_chars_base;
2015 ONE_MORE_BYTE (c);
2016 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2017 char_offset++;
2018 coding->errors++;
2019 }
2020
2021 no_more_source:
2022 if (last_id != charset_ascii)
2023 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2024 coding->consumed_char += consumed_chars_base;
2025 coding->consumed = src_base - coding->source;
2026 coding->charbuf_used = charbuf - coding->charbuf;
2027 }
2028
2029
2030 #define EMACS_MULE_LEADING_CODES(id, codes) \
2031 do { \
2032 if (id < 0xA0) \
2033 codes[0] = id, codes[1] = 0; \
2034 else if (id < 0xE0) \
2035 codes[0] = 0x9A, codes[1] = id; \
2036 else if (id < 0xF0) \
2037 codes[0] = 0x9B, codes[1] = id; \
2038 else if (id < 0xF5) \
2039 codes[0] = 0x9C, codes[1] = id; \
2040 else \
2041 codes[0] = 0x9D, codes[1] = id; \
2042 } while (0);
2043
2044
2045 static int
2046 encode_coding_emacs_mule (coding)
2047 struct coding_system *coding;
2048 {
2049 int multibytep = coding->dst_multibyte;
2050 int *charbuf = coding->charbuf;
2051 int *charbuf_end = charbuf + coding->charbuf_used;
2052 unsigned char *dst = coding->destination + coding->produced;
2053 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2054 int safe_room = 8;
2055 int produced_chars = 0;
2056 Lisp_Object attrs, eol_type, charset_list;
2057 int c;
2058 int preferred_charset_id = -1;
2059
2060 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2061
2062 while (charbuf < charbuf_end)
2063 {
2064 ASSURE_DESTINATION (safe_room);
2065 c = *charbuf++;
2066
2067 if (c < 0)
2068 {
2069 /* Handle an annotation. */
2070 switch (*charbuf)
2071 {
2072 case CODING_ANNOTATE_COMPOSITION_MASK:
2073 /* Not yet implemented. */
2074 break;
2075 case CODING_ANNOTATE_CHARSET_MASK:
2076 preferred_charset_id = charbuf[3];
2077 if (preferred_charset_id >= 0
2078 && NILP (Fmemq (make_number (preferred_charset_id),
2079 charset_list)))
2080 preferred_charset_id = -1;
2081 break;
2082 default:
2083 abort ();
2084 }
2085 charbuf += -c - 1;
2086 continue;
2087 }
2088
2089 if (ASCII_CHAR_P (c))
2090 EMIT_ONE_ASCII_BYTE (c);
2091 else if (CHAR_BYTE8_P (c))
2092 {
2093 c = CHAR_TO_BYTE8 (c);
2094 EMIT_ONE_BYTE (c);
2095 }
2096 else
2097 {
2098 struct charset *charset;
2099 unsigned code;
2100 int dimension;
2101 int emacs_mule_id;
2102 unsigned char leading_codes[2];
2103
2104 if (preferred_charset_id >= 0)
2105 {
2106 charset = CHARSET_FROM_ID (preferred_charset_id);
2107 if (! CHAR_CHARSET_P (c, charset))
2108 charset = char_charset (c, charset_list, NULL);
2109 }
2110 else
2111 charset = char_charset (c, charset_list, &code);
2112 if (! charset)
2113 {
2114 c = coding->default_char;
2115 if (ASCII_CHAR_P (c))
2116 {
2117 EMIT_ONE_ASCII_BYTE (c);
2118 continue;
2119 }
2120 charset = char_charset (c, charset_list, &code);
2121 }
2122 dimension = CHARSET_DIMENSION (charset);
2123 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2124 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2125 EMIT_ONE_BYTE (leading_codes[0]);
2126 if (leading_codes[1])
2127 EMIT_ONE_BYTE (leading_codes[1]);
2128 if (dimension == 1)
2129 EMIT_ONE_BYTE (code);
2130 else
2131 {
2132 EMIT_ONE_BYTE (code >> 8);
2133 EMIT_ONE_BYTE (code & 0xFF);
2134 }
2135 }
2136 }
2137 coding->result = CODING_RESULT_SUCCESS;
2138 coding->produced_char += produced_chars;
2139 coding->produced = dst - coding->destination;
2140 return 0;
2141 }
2142
2143 \f
2144 /*** 7. ISO2022 handlers ***/
2145
2146 /* The following note describes the coding system ISO2022 briefly.
2147 Since the intention of this note is to help understand the
2148 functions in this file, some parts are NOT ACCURATE or are OVERLY
2149 SIMPLIFIED. For thorough understanding, please refer to the
2150 original document of ISO2022. This is equivalent to the standard
2151 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2152
2153 ISO2022 provides many mechanisms to encode several character sets
2154 in 7-bit and 8-bit environments. For 7-bit environments, all text
2155 is encoded using bytes less than 128. This may make the encoded
2156 text a little bit longer, but the text passes more easily through
2157 several types of gateway, some of which strip off the MSB (Most
2158 Significant Bit).
2159
2160 There are two kinds of character sets: control character sets and
2161 graphic character sets. The former contain control characters such
2162 as `newline' and `escape' to provide control functions (control
2163 functions are also provided by escape sequences). The latter
2164 contain graphic characters such as 'A' and '-'. Emacs recognizes
2165 two control character sets and many graphic character sets.
2166
2167 Graphic character sets are classified into one of the following
2168 four classes, according to the number of bytes (DIMENSION) and
2169 number of characters in one dimension (CHARS) of the set:
2170 - DIMENSION1_CHARS94
2171 - DIMENSION1_CHARS96
2172 - DIMENSION2_CHARS94
2173 - DIMENSION2_CHARS96
2174
2175 In addition, each character set is assigned an identification tag,
2176 unique for each set, called the "final character" (denoted as <F>
2177 hereafter). The <F> of each character set is decided by ECMA(*)
2178 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2179 (0x30..0x3F are for private use only).
2180
2181 Note (*): ECMA = European Computer Manufacturers Association
2182
2183 Here are examples of graphic character sets [NAME(<F>)]:
2184 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2185 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2186 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2187 o DIMENSION2_CHARS96 -- none for the moment
2188
2189 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2190 C0 [0x00..0x1F] -- control character plane 0
2191 GL [0x20..0x7F] -- graphic character plane 0
2192 C1 [0x80..0x9F] -- control character plane 1
2193 GR [0xA0..0xFF] -- graphic character plane 1
2194
2195 A control character set is directly designated and invoked to C0 or
2196 C1 by an escape sequence. The most common case is that:
2197 - ISO646's control character set is designated/invoked to C0, and
2198 - ISO6429's control character set is designated/invoked to C1,
2199 and usually these designations/invocations are omitted in encoded
2200 text. In a 7-bit environment, only C0 can be used, and a control
2201 character for C1 is encoded by an appropriate escape sequence to
2202 fit into the environment. All control characters for C1 are
2203 defined to have corresponding escape sequences.
2204
2205 A graphic character set is at first designated to one of four
2206 graphic registers (G0 through G3), then these graphic registers are
2207 invoked to GL or GR. These designations and invocations can be
2208 done independently. The most common case is that G0 is invoked to
2209 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2210 these invocations and designations are omitted in encoded text.
2211 In a 7-bit environment, only GL can be used.
2212
2213 When a graphic character set of CHARS94 is invoked to GL, codes
2214 0x20 and 0x7F of the GL area work as control characters SPACE and
2215 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2216 be used.
2217
2218 There are two ways of invocation: locking-shift and single-shift.
2219 With locking-shift, the invocation lasts until the next different
2220 invocation, whereas with single-shift, the invocation affects the
2221 following character only and doesn't affect the locking-shift
2222 state. Invocations are done by the following control characters or
2223 escape sequences:
2224
2225 ----------------------------------------------------------------------
2226 abbrev function cntrl escape seq description
2227 ----------------------------------------------------------------------
2228 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2229 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2230 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2231 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2232 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2233 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2234 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2235 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2236 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2237 ----------------------------------------------------------------------
2238 (*) These are not used by any known coding system.
2239
2240 Control characters for these functions are defined by macros
2241 ISO_CODE_XXX in `coding.h'.
2242
2243 Designations are done by the following escape sequences:
2244 ----------------------------------------------------------------------
2245 escape sequence description
2246 ----------------------------------------------------------------------
2247 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2248 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2249 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2250 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2251 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2252 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2253 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2254 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2255 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2256 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2257 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2258 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2259 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2260 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2261 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2262 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2263 ----------------------------------------------------------------------
2264
2265 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2266 of dimension 1, chars 94, and final character <F>, etc...
2267
2268 Note (*): Although these designations are not allowed in ISO2022,
2269 Emacs accepts them on decoding, and produces them on encoding
2270 CHARS96 character sets in a coding system which is characterized as
2271 7-bit environment, non-locking-shift, and non-single-shift.
2272
2273 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2274 '(' must be omitted. We refer to this as "short-form" hereafter.
2275
2276 Now you may notice that there are a lot of ways of encoding the
2277 same multilingual text in ISO2022. Actually, there exist many
2278 coding systems such as Compound Text (used in X11's inter client
2279 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2280 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2281 localized platforms), and all of these are variants of ISO2022.
2282
2283 In addition to the above, Emacs handles two more kinds of escape
2284 sequences: ISO6429's direction specification and Emacs' private
2285 sequence for specifying character composition.
2286
2287 ISO6429's direction specification takes the following form:
2288 o CSI ']' -- end of the current direction
2289 o CSI '0' ']' -- end of the current direction
2290 o CSI '1' ']' -- start of left-to-right text
2291 o CSI '2' ']' -- start of right-to-left text
2292 The control character CSI (0x9B: control sequence introducer) is
2293 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2294
2295 Character composition specification takes the following form:
2296 o ESC '0' -- start relative composition
2297 o ESC '1' -- end composition
2298 o ESC '2' -- start rule-base composition (*)
2299 o ESC '3' -- start relative composition with alternate chars (**)
2300 o ESC '4' -- start rule-base composition with alternate chars (**)
2301 Since these are not standard escape sequences of any ISO standard,
2302 the use of them with these meanings is restricted to Emacs only.
2303
2304 (*) This form is used only in Emacs 20.7 and older versions,
2305 but newer versions can safely decode it.
2306 (**) This form is used only in Emacs 21.1 and newer versions,
2307 and older versions can't decode it.
2308
2309 Here's a list of example usages of these composition escape
2310 sequences (categorized by `enum composition_method').
2311
2312 COMPOSITION_RELATIVE:
2313 ESC 0 CHAR [ CHAR ] ESC 1
2314 COMPOSITION_WITH_RULE:
2315 ESC 2 CHAR [ RULE CHAR ] ESC 1
2316 COMPOSITION_WITH_ALTCHARS:
2317 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2318 COMPOSITION_WITH_RULE_ALTCHARS:
2319 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2320
2321 enum iso_code_class_type iso_code_class[256];
2322
2323 #define SAFE_CHARSET_P(coding, id) \
2324 ((id) <= (coding)->max_charset_id \
2325 && (coding)->safe_charsets[id] >= 0)
2326
2327
2328 #define SHIFT_OUT_OK(category) \
2329 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2330
2331 static void
2332 setup_iso_safe_charsets (attrs)
2333 Lisp_Object attrs;
2334 {
2335 Lisp_Object charset_list, safe_charsets;
2336 Lisp_Object request;
2337 Lisp_Object reg_usage;
2338 Lisp_Object tail;
2339 int reg94, reg96;
2340 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2341 int max_charset_id;
2342
2343 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2344 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2345 && ! EQ (charset_list, Viso_2022_charset_list))
2346 {
2347 CODING_ATTR_CHARSET_LIST (attrs)
2348 = charset_list = Viso_2022_charset_list;
2349 ASET (attrs, coding_attr_safe_charsets, Qnil);
2350 }
2351
2352 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2353 return;
2354
2355 max_charset_id = 0;
2356 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2357 {
2358 int id = XINT (XCAR (tail));
2359 if (max_charset_id < id)
2360 max_charset_id = id;
2361 }
2362
2363 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2364 make_number (255));
2365 request = AREF (attrs, coding_attr_iso_request);
2366 reg_usage = AREF (attrs, coding_attr_iso_usage);
2367 reg94 = XINT (XCAR (reg_usage));
2368 reg96 = XINT (XCDR (reg_usage));
2369
2370 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2371 {
2372 Lisp_Object id;
2373 Lisp_Object reg;
2374 struct charset *charset;
2375
2376 id = XCAR (tail);
2377 charset = CHARSET_FROM_ID (XINT (id));
2378 reg = Fcdr (Fassq (id, request));
2379 if (! NILP (reg))
2380 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2381 else if (charset->iso_chars_96)
2382 {
2383 if (reg96 < 4)
2384 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2385 }
2386 else
2387 {
2388 if (reg94 < 4)
2389 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2390 }
2391 }
2392 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2393 }
2394
2395
2396 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2397 Check if a text is encoded in one of ISO-2022 based codig systems.
2398 If it is, return 1, else return 0. */
2399
2400 static int
2401 detect_coding_iso_2022 (coding, detect_info)
2402 struct coding_system *coding;
2403 struct coding_detection_info *detect_info;
2404 {
2405 unsigned char *src = coding->source, *src_base = src;
2406 unsigned char *src_end = coding->source + coding->src_bytes;
2407 int multibytep = coding->src_multibyte;
2408 int single_shifting = 0;
2409 int id;
2410 int c, c1;
2411 int consumed_chars = 0;
2412 int i;
2413 int rejected = 0;
2414 int found = 0;
2415
2416 detect_info->checked |= CATEGORY_MASK_ISO;
2417
2418 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2419 {
2420 struct coding_system *this = &(coding_categories[i]);
2421 Lisp_Object attrs, val;
2422
2423 attrs = CODING_ID_ATTRS (this->id);
2424 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2425 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2426 setup_iso_safe_charsets (attrs);
2427 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2428 this->max_charset_id = XSTRING (val)->size - 1;
2429 this->safe_charsets = (char *) XSTRING (val)->data;
2430 }
2431
2432 /* A coding system of this category is always ASCII compatible. */
2433 src += coding->head_ascii;
2434
2435 while (rejected != CATEGORY_MASK_ISO)
2436 {
2437 ONE_MORE_BYTE (c);
2438 switch (c)
2439 {
2440 case ISO_CODE_ESC:
2441 if (inhibit_iso_escape_detection)
2442 break;
2443 single_shifting = 0;
2444 ONE_MORE_BYTE (c);
2445 if (c >= '(' && c <= '/')
2446 {
2447 /* Designation sequence for a charset of dimension 1. */
2448 ONE_MORE_BYTE (c1);
2449 if (c1 < ' ' || c1 >= 0x80
2450 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2451 /* Invalid designation sequence. Just ignore. */
2452 break;
2453 }
2454 else if (c == '$')
2455 {
2456 /* Designation sequence for a charset of dimension 2. */
2457 ONE_MORE_BYTE (c);
2458 if (c >= '@' && c <= 'B')
2459 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2460 id = iso_charset_table[1][0][c];
2461 else if (c >= '(' && c <= '/')
2462 {
2463 ONE_MORE_BYTE (c1);
2464 if (c1 < ' ' || c1 >= 0x80
2465 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2466 /* Invalid designation sequence. Just ignore. */
2467 break;
2468 }
2469 else
2470 /* Invalid designation sequence. Just ignore it. */
2471 break;
2472 }
2473 else if (c == 'N' || c == 'O')
2474 {
2475 /* ESC <Fe> for SS2 or SS3. */
2476 single_shifting = 1;
2477 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2478 break;
2479 }
2480 else if (c >= '0' && c <= '4')
2481 {
2482 /* ESC <Fp> for start/end composition. */
2483 found |= CATEGORY_MASK_ISO;
2484 break;
2485 }
2486 else
2487 {
2488 /* Invalid escape sequence. Just ignore it. */
2489 break;
2490 }
2491
2492 /* We found a valid designation sequence for CHARSET. */
2493 rejected |= CATEGORY_MASK_ISO_8BIT;
2494 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2495 id))
2496 found |= CATEGORY_MASK_ISO_7;
2497 else
2498 rejected |= CATEGORY_MASK_ISO_7;
2499 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2500 id))
2501 found |= CATEGORY_MASK_ISO_7_TIGHT;
2502 else
2503 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2504 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2505 id))
2506 found |= CATEGORY_MASK_ISO_7_ELSE;
2507 else
2508 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2509 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2510 id))
2511 found |= CATEGORY_MASK_ISO_8_ELSE;
2512 else
2513 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2514 break;
2515
2516 case ISO_CODE_SO:
2517 case ISO_CODE_SI:
2518 /* Locking shift out/in. */
2519 if (inhibit_iso_escape_detection)
2520 break;
2521 single_shifting = 0;
2522 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2523 found |= CATEGORY_MASK_ISO_ELSE;
2524 break;
2525
2526 case ISO_CODE_CSI:
2527 /* Control sequence introducer. */
2528 single_shifting = 0;
2529 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2530 found |= CATEGORY_MASK_ISO_8_ELSE;
2531 goto check_extra_latin;
2532
2533
2534 case ISO_CODE_SS2:
2535 case ISO_CODE_SS3:
2536 /* Single shift. */
2537 if (inhibit_iso_escape_detection)
2538 break;
2539 single_shifting = 1;
2540 rejected |= CATEGORY_MASK_ISO_7BIT;
2541 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2542 & CODING_ISO_FLAG_SINGLE_SHIFT)
2543 found |= CATEGORY_MASK_ISO_8_1;
2544 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2545 & CODING_ISO_FLAG_SINGLE_SHIFT)
2546 found |= CATEGORY_MASK_ISO_8_2;
2547 goto check_extra_latin;
2548
2549 default:
2550 if (c < 0x80)
2551 {
2552 single_shifting = 0;
2553 break;
2554 }
2555 if (c >= 0xA0)
2556 {
2557 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2558 found |= CATEGORY_MASK_ISO_8_1;
2559 /* Check the length of succeeding codes of the range
2560 0xA0..0FF. If the byte length is even, we include
2561 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2562 only when we are not single shifting. */
2563 if (! single_shifting
2564 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2565 {
2566 int i = 1;
2567 while (src < src_end)
2568 {
2569 ONE_MORE_BYTE (c);
2570 if (c < 0xA0)
2571 break;
2572 i++;
2573 }
2574
2575 if (i & 1 && src < src_end)
2576 rejected |= CATEGORY_MASK_ISO_8_2;
2577 else
2578 found |= CATEGORY_MASK_ISO_8_2;
2579 }
2580 break;
2581 }
2582 check_extra_latin:
2583 single_shifting = 0;
2584 if (! VECTORP (Vlatin_extra_code_table)
2585 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2586 {
2587 rejected = CATEGORY_MASK_ISO;
2588 break;
2589 }
2590 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2591 & CODING_ISO_FLAG_LATIN_EXTRA)
2592 found |= CATEGORY_MASK_ISO_8_1;
2593 else
2594 rejected |= CATEGORY_MASK_ISO_8_1;
2595 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2596 & CODING_ISO_FLAG_LATIN_EXTRA)
2597 found |= CATEGORY_MASK_ISO_8_2;
2598 else
2599 rejected |= CATEGORY_MASK_ISO_8_2;
2600 }
2601 }
2602 detect_info->rejected |= CATEGORY_MASK_ISO;
2603 return 0;
2604
2605 no_more_source:
2606 detect_info->rejected |= rejected;
2607 detect_info->found |= (found & ~rejected);
2608 return 1;
2609 }
2610
2611
2612 /* Set designation state into CODING. */
2613 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2614 do { \
2615 int id, prev; \
2616 \
2617 if (final < '0' || final >= 128 \
2618 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2619 || !SAFE_CHARSET_P (coding, id)) \
2620 { \
2621 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2622 goto invalid_code; \
2623 } \
2624 prev = CODING_ISO_DESIGNATION (coding, reg); \
2625 if (id == charset_jisx0201_roman) \
2626 { \
2627 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2628 id = charset_ascii; \
2629 } \
2630 else if (id == charset_jisx0208_1978) \
2631 { \
2632 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2633 id = charset_jisx0208; \
2634 } \
2635 CODING_ISO_DESIGNATION (coding, reg) = id; \
2636 /* If there was an invalid designation to REG previously, and this \
2637 designation is ASCII to REG, we should keep this designation \
2638 sequence. */ \
2639 if (prev == -2 && id == charset_ascii) \
2640 goto invalid_code; \
2641 } while (0)
2642
2643
2644 #define MAYBE_FINISH_COMPOSITION() \
2645 do { \
2646 int i; \
2647 if (composition_state == COMPOSING_NO) \
2648 break; \
2649 /* It is assured that we have enough room for producing \
2650 characters stored in the table `components'. */ \
2651 if (charbuf + component_idx > charbuf_end) \
2652 goto no_more_source; \
2653 composition_state = COMPOSING_NO; \
2654 if (method == COMPOSITION_RELATIVE \
2655 || method == COMPOSITION_WITH_ALTCHARS) \
2656 { \
2657 for (i = 0; i < component_idx; i++) \
2658 *charbuf++ = components[i]; \
2659 char_offset += component_idx; \
2660 } \
2661 else \
2662 { \
2663 for (i = 0; i < component_idx; i += 2) \
2664 *charbuf++ = components[i]; \
2665 char_offset += (component_idx / 2) + 1; \
2666 } \
2667 } while (0)
2668
2669
2670 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2671 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2672 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2673 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2674 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2675 */
2676
2677 #define DECODE_COMPOSITION_START(c1) \
2678 do { \
2679 if (c1 == '0' \
2680 && composition_state == COMPOSING_COMPONENT_RULE) \
2681 { \
2682 component_len = component_idx; \
2683 composition_state = COMPOSING_CHAR; \
2684 } \
2685 else \
2686 { \
2687 unsigned char *p; \
2688 \
2689 MAYBE_FINISH_COMPOSITION (); \
2690 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2691 goto no_more_source; \
2692 for (p = src; p < src_end - 1; p++) \
2693 if (*p == ISO_CODE_ESC && p[1] == '1') \
2694 break; \
2695 if (p == src_end - 1) \
2696 { \
2697 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2698 goto invalid_code; \
2699 goto no_more_source; \
2700 } \
2701 \
2702 /* This is surely the start of a composition. */ \
2703 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2704 : c1 == '2' ? COMPOSITION_WITH_RULE \
2705 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2706 : COMPOSITION_WITH_RULE_ALTCHARS); \
2707 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2708 : COMPOSING_COMPONENT_CHAR); \
2709 component_idx = component_len = 0; \
2710 } \
2711 } while (0)
2712
2713
2714 /* Handle compositoin end sequence ESC 1. */
2715
2716 #define DECODE_COMPOSITION_END() \
2717 do { \
2718 int nchars = (component_len > 0 ? component_idx - component_len \
2719 : method == COMPOSITION_RELATIVE ? component_idx \
2720 : (component_idx + 1) / 2); \
2721 int i; \
2722 int *saved_charbuf = charbuf; \
2723 int from = coding->produced_char + char_offset; \
2724 int to = from + nchars; \
2725 \
2726 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2727 if (method != COMPOSITION_RELATIVE) \
2728 { \
2729 if (component_len == 0) \
2730 for (i = 0; i < component_idx; i++) \
2731 *charbuf++ = components[i]; \
2732 else \
2733 for (i = 0; i < component_len; i++) \
2734 *charbuf++ = components[i]; \
2735 *saved_charbuf = saved_charbuf - charbuf; \
2736 } \
2737 if (method == COMPOSITION_WITH_RULE) \
2738 for (i = 0; i < component_idx; i += 2, char_offset++) \
2739 *charbuf++ = components[i]; \
2740 else \
2741 for (i = component_len; i < component_idx; i++, char_offset++) \
2742 *charbuf++ = components[i]; \
2743 coding->annotated = 1; \
2744 composition_state = COMPOSING_NO; \
2745 } while (0)
2746
2747
2748 /* Decode a composition rule from the byte C1 (and maybe one more byte
2749 from SRC) and store one encoded composition rule in
2750 coding->cmp_data. */
2751
2752 #define DECODE_COMPOSITION_RULE(c1) \
2753 do { \
2754 (c1) -= 32; \
2755 if (c1 < 81) /* old format (before ver.21) */ \
2756 { \
2757 int gref = (c1) / 9; \
2758 int nref = (c1) % 9; \
2759 if (gref == 4) gref = 10; \
2760 if (nref == 4) nref = 10; \
2761 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2762 } \
2763 else if (c1 < 93) /* new format (after ver.21) */ \
2764 { \
2765 ONE_MORE_BYTE (c2); \
2766 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2767 } \
2768 else \
2769 c1 = 0; \
2770 } while (0)
2771
2772
2773 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2774
2775 static void
2776 decode_coding_iso_2022 (coding)
2777 struct coding_system *coding;
2778 {
2779 unsigned char *src = coding->source + coding->consumed;
2780 unsigned char *src_end = coding->source + coding->src_bytes;
2781 unsigned char *src_base;
2782 int *charbuf = coding->charbuf;
2783 int *charbuf_end
2784 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2785 int consumed_chars = 0, consumed_chars_base;
2786 int multibytep = coding->src_multibyte;
2787 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2788 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2789 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2790 struct charset *charset;
2791 int c;
2792 /* For handling composition sequence. */
2793 #define COMPOSING_NO 0
2794 #define COMPOSING_CHAR 1
2795 #define COMPOSING_RULE 2
2796 #define COMPOSING_COMPONENT_CHAR 3
2797 #define COMPOSING_COMPONENT_RULE 4
2798
2799 int composition_state = COMPOSING_NO;
2800 enum composition_method method;
2801 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2802 int component_idx;
2803 int component_len;
2804 Lisp_Object attrs, eol_type, charset_list;
2805 int char_offset = coding->produced_char;
2806 int last_offset = char_offset;
2807 int last_id = charset_ascii;
2808
2809 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2810 setup_iso_safe_charsets (attrs);
2811
2812 while (1)
2813 {
2814 int c1, c2;
2815
2816 src_base = src;
2817 consumed_chars_base = consumed_chars;
2818
2819 if (charbuf >= charbuf_end)
2820 break;
2821
2822 ONE_MORE_BYTE (c1);
2823
2824 /* We produce at most one character. */
2825 switch (iso_code_class [c1])
2826 {
2827 case ISO_0x20_or_0x7F:
2828 if (composition_state != COMPOSING_NO)
2829 {
2830 if (composition_state == COMPOSING_RULE
2831 || composition_state == COMPOSING_COMPONENT_RULE)
2832 {
2833 DECODE_COMPOSITION_RULE (c1);
2834 components[component_idx++] = c1;
2835 composition_state--;
2836 continue;
2837 }
2838 }
2839 if (charset_id_0 < 0
2840 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2841 /* This is SPACE or DEL. */
2842 charset = CHARSET_FROM_ID (charset_ascii);
2843 else
2844 charset = CHARSET_FROM_ID (charset_id_0);
2845 break;
2846
2847 case ISO_graphic_plane_0:
2848 if (composition_state != COMPOSING_NO)
2849 {
2850 if (composition_state == COMPOSING_RULE
2851 || composition_state == COMPOSING_COMPONENT_RULE)
2852 {
2853 DECODE_COMPOSITION_RULE (c1);
2854 components[component_idx++] = c1;
2855 composition_state--;
2856 continue;
2857 }
2858 }
2859 charset = CHARSET_FROM_ID (charset_id_0);
2860 break;
2861
2862 case ISO_0xA0_or_0xFF:
2863 if (charset_id_1 < 0
2864 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2865 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2866 goto invalid_code;
2867 /* This is a graphic character, we fall down ... */
2868
2869 case ISO_graphic_plane_1:
2870 if (charset_id_1 < 0)
2871 goto invalid_code;
2872 charset = CHARSET_FROM_ID (charset_id_1);
2873 break;
2874
2875 case ISO_carriage_return:
2876 if (c1 == '\r')
2877 {
2878 if (EQ (eol_type, Qdos))
2879 {
2880 if (src == src_end)
2881 {
2882 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2883 goto no_more_source;
2884 }
2885 if (*src == '\n')
2886 ONE_MORE_BYTE (c1);
2887 }
2888 else if (EQ (eol_type, Qmac))
2889 c1 = '\n';
2890 }
2891 /* fall through */
2892
2893 case ISO_control_0:
2894 MAYBE_FINISH_COMPOSITION ();
2895 charset = CHARSET_FROM_ID (charset_ascii);
2896 break;
2897
2898 case ISO_control_1:
2899 MAYBE_FINISH_COMPOSITION ();
2900 goto invalid_code;
2901
2902 case ISO_shift_out:
2903 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2904 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2905 goto invalid_code;
2906 CODING_ISO_INVOCATION (coding, 0) = 1;
2907 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2908 continue;
2909
2910 case ISO_shift_in:
2911 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2912 goto invalid_code;
2913 CODING_ISO_INVOCATION (coding, 0) = 0;
2914 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2915 continue;
2916
2917 case ISO_single_shift_2_7:
2918 case ISO_single_shift_2:
2919 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2920 goto invalid_code;
2921 /* SS2 is handled as an escape sequence of ESC 'N' */
2922 c1 = 'N';
2923 goto label_escape_sequence;
2924
2925 case ISO_single_shift_3:
2926 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2927 goto invalid_code;
2928 /* SS2 is handled as an escape sequence of ESC 'O' */
2929 c1 = 'O';
2930 goto label_escape_sequence;
2931
2932 case ISO_control_sequence_introducer:
2933 /* CSI is handled as an escape sequence of ESC '[' ... */
2934 c1 = '[';
2935 goto label_escape_sequence;
2936
2937 case ISO_escape:
2938 ONE_MORE_BYTE (c1);
2939 label_escape_sequence:
2940 /* Escape sequences handled here are invocation,
2941 designation, direction specification, and character
2942 composition specification. */
2943 switch (c1)
2944 {
2945 case '&': /* revision of following character set */
2946 ONE_MORE_BYTE (c1);
2947 if (!(c1 >= '@' && c1 <= '~'))
2948 goto invalid_code;
2949 ONE_MORE_BYTE (c1);
2950 if (c1 != ISO_CODE_ESC)
2951 goto invalid_code;
2952 ONE_MORE_BYTE (c1);
2953 goto label_escape_sequence;
2954
2955 case '$': /* designation of 2-byte character set */
2956 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2957 goto invalid_code;
2958 ONE_MORE_BYTE (c1);
2959 if (c1 >= '@' && c1 <= 'B')
2960 { /* designation of JISX0208.1978, GB2312.1980,
2961 or JISX0208.1980 */
2962 DECODE_DESIGNATION (0, 2, 0, c1);
2963 }
2964 else if (c1 >= 0x28 && c1 <= 0x2B)
2965 { /* designation of DIMENSION2_CHARS94 character set */
2966 ONE_MORE_BYTE (c2);
2967 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2968 }
2969 else if (c1 >= 0x2C && c1 <= 0x2F)
2970 { /* designation of DIMENSION2_CHARS96 character set */
2971 ONE_MORE_BYTE (c2);
2972 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2973 }
2974 else
2975 goto invalid_code;
2976 /* We must update these variables now. */
2977 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2978 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2979 continue;
2980
2981 case 'n': /* invocation of locking-shift-2 */
2982 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2983 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2984 goto invalid_code;
2985 CODING_ISO_INVOCATION (coding, 0) = 2;
2986 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2987 continue;
2988
2989 case 'o': /* invocation of locking-shift-3 */
2990 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2991 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2992 goto invalid_code;
2993 CODING_ISO_INVOCATION (coding, 0) = 3;
2994 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2995 continue;
2996
2997 case 'N': /* invocation of single-shift-2 */
2998 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2999 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3000 goto invalid_code;
3001 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3002 ONE_MORE_BYTE (c1);
3003 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3004 goto invalid_code;
3005 break;
3006
3007 case 'O': /* invocation of single-shift-3 */
3008 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3009 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3010 goto invalid_code;
3011 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3012 ONE_MORE_BYTE (c1);
3013 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3014 goto invalid_code;
3015 break;
3016
3017 case '0': case '2': case '3': case '4': /* start composition */
3018 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3019 goto invalid_code;
3020 DECODE_COMPOSITION_START (c1);
3021 continue;
3022
3023 case '1': /* end composition */
3024 if (composition_state == COMPOSING_NO)
3025 goto invalid_code;
3026 DECODE_COMPOSITION_END ();
3027 continue;
3028
3029 case '[': /* specification of direction */
3030 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3031 goto invalid_code;
3032 /* For the moment, nested direction is not supported.
3033 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3034 left-to-right, and nozero means right-to-left. */
3035 ONE_MORE_BYTE (c1);
3036 switch (c1)
3037 {
3038 case ']': /* end of the current direction */
3039 coding->mode &= ~CODING_MODE_DIRECTION;
3040
3041 case '0': /* end of the current direction */
3042 case '1': /* start of left-to-right direction */
3043 ONE_MORE_BYTE (c1);
3044 if (c1 == ']')
3045 coding->mode &= ~CODING_MODE_DIRECTION;
3046 else
3047 goto invalid_code;
3048 break;
3049
3050 case '2': /* start of right-to-left direction */
3051 ONE_MORE_BYTE (c1);
3052 if (c1 == ']')
3053 coding->mode |= CODING_MODE_DIRECTION;
3054 else
3055 goto invalid_code;
3056 break;
3057
3058 default:
3059 goto invalid_code;
3060 }
3061 continue;
3062
3063 case '%':
3064 ONE_MORE_BYTE (c1);
3065 if (c1 == '/')
3066 {
3067 /* CTEXT extended segment:
3068 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3069 We keep these bytes as is for the moment.
3070 They may be decoded by post-read-conversion. */
3071 int dim, M, L;
3072 int size;
3073
3074 ONE_MORE_BYTE (dim);
3075 ONE_MORE_BYTE (M);
3076 ONE_MORE_BYTE (L);
3077 size = ((M - 128) * 128) + (L - 128);
3078 if (charbuf + 8 + size > charbuf_end)
3079 goto break_loop;
3080 *charbuf++ = ISO_CODE_ESC;
3081 *charbuf++ = '%';
3082 *charbuf++ = '/';
3083 *charbuf++ = dim;
3084 *charbuf++ = BYTE8_TO_CHAR (M);
3085 *charbuf++ = BYTE8_TO_CHAR (L);
3086 while (size-- > 0)
3087 {
3088 ONE_MORE_BYTE (c1);
3089 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3090 }
3091 }
3092 else if (c1 == 'G')
3093 {
3094 /* XFree86 extension for embedding UTF-8 in CTEXT:
3095 ESC % G --UTF-8-BYTES-- ESC % @
3096 We keep these bytes as is for the moment.
3097 They may be decoded by post-read-conversion. */
3098 int *p = charbuf;
3099
3100 if (p + 6 > charbuf_end)
3101 goto break_loop;
3102 *p++ = ISO_CODE_ESC;
3103 *p++ = '%';
3104 *p++ = 'G';
3105 while (p < charbuf_end)
3106 {
3107 ONE_MORE_BYTE (c1);
3108 if (c1 == ISO_CODE_ESC
3109 && src + 1 < src_end
3110 && src[0] == '%'
3111 && src[1] == '@')
3112 break;
3113 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3114 }
3115 if (p + 3 > charbuf_end)
3116 goto break_loop;
3117 *p++ = ISO_CODE_ESC;
3118 *p++ = '%';
3119 *p++ = '@';
3120 charbuf = p;
3121 }
3122 else
3123 goto invalid_code;
3124 continue;
3125 break;
3126
3127 default:
3128 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3129 goto invalid_code;
3130 if (c1 >= 0x28 && c1 <= 0x2B)
3131 { /* designation of DIMENSION1_CHARS94 character set */
3132 ONE_MORE_BYTE (c2);
3133 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3134 }
3135 else if (c1 >= 0x2C && c1 <= 0x2F)
3136 { /* designation of DIMENSION1_CHARS96 character set */
3137 ONE_MORE_BYTE (c2);
3138 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3139 }
3140 else
3141 goto invalid_code;
3142 /* We must update these variables now. */
3143 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3144 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3145 continue;
3146 }
3147 }
3148
3149 if (charset->id != charset_ascii
3150 && last_id != charset->id)
3151 {
3152 if (last_id != charset_ascii)
3153 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3154 last_id = charset->id;
3155 last_offset = char_offset;
3156 }
3157
3158 /* Now we know CHARSET and 1st position code C1 of a character.
3159 Produce a decoded character while getting 2nd position code
3160 C2 if necessary. */
3161 c1 &= 0x7F;
3162 if (CHARSET_DIMENSION (charset) > 1)
3163 {
3164 ONE_MORE_BYTE (c2);
3165 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3166 /* C2 is not in a valid range. */
3167 goto invalid_code;
3168 c1 = (c1 << 8) | (c2 & 0x7F);
3169 if (CHARSET_DIMENSION (charset) > 2)
3170 {
3171 ONE_MORE_BYTE (c2);
3172 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3173 /* C2 is not in a valid range. */
3174 goto invalid_code;
3175 c1 = (c1 << 8) | (c2 & 0x7F);
3176 }
3177 }
3178
3179 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3180 if (c < 0)
3181 {
3182 MAYBE_FINISH_COMPOSITION ();
3183 for (; src_base < src; src_base++, char_offset++)
3184 {
3185 if (ASCII_BYTE_P (*src_base))
3186 *charbuf++ = *src_base;
3187 else
3188 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3189 char_offset++;
3190 }
3191 }
3192 else if (composition_state == COMPOSING_NO)
3193 {
3194 *charbuf++ = c;
3195 char_offset++;
3196 }
3197 else
3198 {
3199 components[component_idx++] = c;
3200 if (method == COMPOSITION_WITH_RULE
3201 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3202 && composition_state == COMPOSING_COMPONENT_CHAR))
3203 composition_state++;
3204 }
3205 continue;
3206
3207 invalid_code:
3208 MAYBE_FINISH_COMPOSITION ();
3209 src = src_base;
3210 consumed_chars = consumed_chars_base;
3211 ONE_MORE_BYTE (c);
3212 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3213 char_offset++;
3214 coding->errors++;
3215 continue;
3216
3217 break_loop:
3218 break;
3219 }
3220
3221 no_more_source:
3222 if (last_id != charset_ascii)
3223 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3224 coding->consumed_char += consumed_chars_base;
3225 coding->consumed = src_base - coding->source;
3226 coding->charbuf_used = charbuf - coding->charbuf;
3227 }
3228
3229
3230 /* ISO2022 encoding stuff. */
3231
3232 /*
3233 It is not enough to say just "ISO2022" on encoding, we have to
3234 specify more details. In Emacs, each coding system of ISO2022
3235 variant has the following specifications:
3236 1. Initial designation to G0 thru G3.
3237 2. Allows short-form designation?
3238 3. ASCII should be designated to G0 before control characters?
3239 4. ASCII should be designated to G0 at end of line?
3240 5. 7-bit environment or 8-bit environment?
3241 6. Use locking-shift?
3242 7. Use Single-shift?
3243 And the following two are only for Japanese:
3244 8. Use ASCII in place of JIS0201-1976-Roman?
3245 9. Use JISX0208-1983 in place of JISX0208-1978?
3246 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3247 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3248 details.
3249 */
3250
3251 /* Produce codes (escape sequence) for designating CHARSET to graphic
3252 register REG at DST, and increment DST. If <final-char> of CHARSET is
3253 '@', 'A', or 'B' and the coding system CODING allows, produce
3254 designation sequence of short-form. */
3255
3256 #define ENCODE_DESIGNATION(charset, reg, coding) \
3257 do { \
3258 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3259 char *intermediate_char_94 = "()*+"; \
3260 char *intermediate_char_96 = ",-./"; \
3261 int revision = -1; \
3262 int c; \
3263 \
3264 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3265 revision = CHARSET_ISO_REVISION (charset); \
3266 \
3267 if (revision >= 0) \
3268 { \
3269 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3270 EMIT_ONE_BYTE ('@' + revision); \
3271 } \
3272 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3273 if (CHARSET_DIMENSION (charset) == 1) \
3274 { \
3275 if (! CHARSET_ISO_CHARS_96 (charset)) \
3276 c = intermediate_char_94[reg]; \
3277 else \
3278 c = intermediate_char_96[reg]; \
3279 EMIT_ONE_ASCII_BYTE (c); \
3280 } \
3281 else \
3282 { \
3283 EMIT_ONE_ASCII_BYTE ('$'); \
3284 if (! CHARSET_ISO_CHARS_96 (charset)) \
3285 { \
3286 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3287 || reg != 0 \
3288 || final_char < '@' || final_char > 'B') \
3289 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3290 } \
3291 else \
3292 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3293 } \
3294 EMIT_ONE_ASCII_BYTE (final_char); \
3295 \
3296 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3297 } while (0)
3298
3299
3300 /* The following two macros produce codes (control character or escape
3301 sequence) for ISO2022 single-shift functions (single-shift-2 and
3302 single-shift-3). */
3303
3304 #define ENCODE_SINGLE_SHIFT_2 \
3305 do { \
3306 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3307 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3308 else \
3309 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3310 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3311 } while (0)
3312
3313
3314 #define ENCODE_SINGLE_SHIFT_3 \
3315 do { \
3316 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3317 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3318 else \
3319 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3320 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3321 } while (0)
3322
3323
3324 /* The following four macros produce codes (control character or
3325 escape sequence) for ISO2022 locking-shift functions (shift-in,
3326 shift-out, locking-shift-2, and locking-shift-3). */
3327
3328 #define ENCODE_SHIFT_IN \
3329 do { \
3330 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3331 CODING_ISO_INVOCATION (coding, 0) = 0; \
3332 } while (0)
3333
3334
3335 #define ENCODE_SHIFT_OUT \
3336 do { \
3337 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3338 CODING_ISO_INVOCATION (coding, 0) = 1; \
3339 } while (0)
3340
3341
3342 #define ENCODE_LOCKING_SHIFT_2 \
3343 do { \
3344 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3345 CODING_ISO_INVOCATION (coding, 0) = 2; \
3346 } while (0)
3347
3348
3349 #define ENCODE_LOCKING_SHIFT_3 \
3350 do { \
3351 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3352 CODING_ISO_INVOCATION (coding, 0) = 3; \
3353 } while (0)
3354
3355
3356 /* Produce codes for a DIMENSION1 character whose character set is
3357 CHARSET and whose position-code is C1. Designation and invocation
3358 sequences are also produced in advance if necessary. */
3359
3360 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3361 do { \
3362 int id = CHARSET_ID (charset); \
3363 \
3364 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3365 && id == charset_ascii) \
3366 { \
3367 id = charset_jisx0201_roman; \
3368 charset = CHARSET_FROM_ID (id); \
3369 } \
3370 \
3371 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3372 { \
3373 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3374 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3375 else \
3376 EMIT_ONE_BYTE (c1 | 0x80); \
3377 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3378 break; \
3379 } \
3380 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3381 { \
3382 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3383 break; \
3384 } \
3385 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3386 { \
3387 EMIT_ONE_BYTE (c1 | 0x80); \
3388 break; \
3389 } \
3390 else \
3391 /* Since CHARSET is not yet invoked to any graphic planes, we \
3392 must invoke it, or, at first, designate it to some graphic \
3393 register. Then repeat the loop to actually produce the \
3394 character. */ \
3395 dst = encode_invocation_designation (charset, coding, dst, \
3396 &produced_chars); \
3397 } while (1)
3398
3399
3400 /* Produce codes for a DIMENSION2 character whose character set is
3401 CHARSET and whose position-codes are C1 and C2. Designation and
3402 invocation codes are also produced in advance if necessary. */
3403
3404 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3405 do { \
3406 int id = CHARSET_ID (charset); \
3407 \
3408 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3409 && id == charset_jisx0208) \
3410 { \
3411 id = charset_jisx0208_1978; \
3412 charset = CHARSET_FROM_ID (id); \
3413 } \
3414 \
3415 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3416 { \
3417 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3418 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3419 else \
3420 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3421 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3422 break; \
3423 } \
3424 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3425 { \
3426 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3427 break; \
3428 } \
3429 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3430 { \
3431 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3432 break; \
3433 } \
3434 else \
3435 /* Since CHARSET is not yet invoked to any graphic planes, we \
3436 must invoke it, or, at first, designate it to some graphic \
3437 register. Then repeat the loop to actually produce the \
3438 character. */ \
3439 dst = encode_invocation_designation (charset, coding, dst, \
3440 &produced_chars); \
3441 } while (1)
3442
3443
3444 #define ENCODE_ISO_CHARACTER(charset, c) \
3445 do { \
3446 int code = ENCODE_CHAR ((charset),(c)); \
3447 \
3448 if (CHARSET_DIMENSION (charset) == 1) \
3449 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3450 else \
3451 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3452 } while (0)
3453
3454
3455 /* Produce designation and invocation codes at a place pointed by DST
3456 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3457 Return new DST. */
3458
3459 unsigned char *
3460 encode_invocation_designation (charset, coding, dst, p_nchars)
3461 struct charset *charset;
3462 struct coding_system *coding;
3463 unsigned char *dst;
3464 int *p_nchars;
3465 {
3466 int multibytep = coding->dst_multibyte;
3467 int produced_chars = *p_nchars;
3468 int reg; /* graphic register number */
3469 int id = CHARSET_ID (charset);
3470
3471 /* At first, check designations. */
3472 for (reg = 0; reg < 4; reg++)
3473 if (id == CODING_ISO_DESIGNATION (coding, reg))
3474 break;
3475
3476 if (reg >= 4)
3477 {
3478 /* CHARSET is not yet designated to any graphic registers. */
3479 /* At first check the requested designation. */
3480 reg = CODING_ISO_REQUEST (coding, id);
3481 if (reg < 0)
3482 /* Since CHARSET requests no special designation, designate it
3483 to graphic register 0. */
3484 reg = 0;
3485
3486 ENCODE_DESIGNATION (charset, reg, coding);
3487 }
3488
3489 if (CODING_ISO_INVOCATION (coding, 0) != reg
3490 && CODING_ISO_INVOCATION (coding, 1) != reg)
3491 {
3492 /* Since the graphic register REG is not invoked to any graphic
3493 planes, invoke it to graphic plane 0. */
3494 switch (reg)
3495 {
3496 case 0: /* graphic register 0 */
3497 ENCODE_SHIFT_IN;
3498 break;
3499
3500 case 1: /* graphic register 1 */
3501 ENCODE_SHIFT_OUT;
3502 break;
3503
3504 case 2: /* graphic register 2 */
3505 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3506 ENCODE_SINGLE_SHIFT_2;
3507 else
3508 ENCODE_LOCKING_SHIFT_2;
3509 break;
3510
3511 case 3: /* graphic register 3 */
3512 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3513 ENCODE_SINGLE_SHIFT_3;
3514 else
3515 ENCODE_LOCKING_SHIFT_3;
3516 break;
3517 }
3518 }
3519
3520 *p_nchars = produced_chars;
3521 return dst;
3522 }
3523
3524 /* The following three macros produce codes for indicating direction
3525 of text. */
3526 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3527 do { \
3528 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3529 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3530 else \
3531 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3532 } while (0)
3533
3534
3535 #define ENCODE_DIRECTION_R2L() \
3536 do { \
3537 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3538 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3539 } while (0)
3540
3541
3542 #define ENCODE_DIRECTION_L2R() \
3543 do { \
3544 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3545 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3546 } while (0)
3547
3548
3549 /* Produce codes for designation and invocation to reset the graphic
3550 planes and registers to initial state. */
3551 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3552 do { \
3553 int reg; \
3554 struct charset *charset; \
3555 \
3556 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3557 ENCODE_SHIFT_IN; \
3558 for (reg = 0; reg < 4; reg++) \
3559 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3560 && (CODING_ISO_DESIGNATION (coding, reg) \
3561 != CODING_ISO_INITIAL (coding, reg))) \
3562 { \
3563 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3564 ENCODE_DESIGNATION (charset, reg, coding); \
3565 } \
3566 } while (0)
3567
3568
3569 /* Produce designation sequences of charsets in the line started from
3570 SRC to a place pointed by DST, and return updated DST.
3571
3572 If the current block ends before any end-of-line, we may fail to
3573 find all the necessary designations. */
3574
3575 static unsigned char *
3576 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3577 struct coding_system *coding;
3578 int *charbuf, *charbuf_end;
3579 unsigned char *dst;
3580 {
3581 struct charset *charset;
3582 /* Table of charsets to be designated to each graphic register. */
3583 int r[4];
3584 int c, found = 0, reg;
3585 int produced_chars = 0;
3586 int multibytep = coding->dst_multibyte;
3587 Lisp_Object attrs;
3588 Lisp_Object charset_list;
3589
3590 attrs = CODING_ID_ATTRS (coding->id);
3591 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3592 if (EQ (charset_list, Qiso_2022))
3593 charset_list = Viso_2022_charset_list;
3594
3595 for (reg = 0; reg < 4; reg++)
3596 r[reg] = -1;
3597
3598 while (found < 4)
3599 {
3600 int id;
3601
3602 c = *charbuf++;
3603 if (c == '\n')
3604 break;
3605 charset = char_charset (c, charset_list, NULL);
3606 id = CHARSET_ID (charset);
3607 reg = CODING_ISO_REQUEST (coding, id);
3608 if (reg >= 0 && r[reg] < 0)
3609 {
3610 found++;
3611 r[reg] = id;
3612 }
3613 }
3614
3615 if (found)
3616 {
3617 for (reg = 0; reg < 4; reg++)
3618 if (r[reg] >= 0
3619 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3620 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3621 }
3622
3623 return dst;
3624 }
3625
3626 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3627
3628 static int
3629 encode_coding_iso_2022 (coding)
3630 struct coding_system *coding;
3631 {
3632 int multibytep = coding->dst_multibyte;
3633 int *charbuf = coding->charbuf;
3634 int *charbuf_end = charbuf + coding->charbuf_used;
3635 unsigned char *dst = coding->destination + coding->produced;
3636 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3637 int safe_room = 16;
3638 int bol_designation
3639 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3640 && CODING_ISO_BOL (coding));
3641 int produced_chars = 0;
3642 Lisp_Object attrs, eol_type, charset_list;
3643 int ascii_compatible;
3644 int c;
3645 int preferred_charset_id = -1;
3646
3647 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3648 setup_iso_safe_charsets (attrs);
3649 /* Charset list may have been changed. */
3650 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3651 coding->safe_charsets
3652 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3653
3654 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3655
3656 while (charbuf < charbuf_end)
3657 {
3658 ASSURE_DESTINATION (safe_room);
3659
3660 if (bol_designation)
3661 {
3662 unsigned char *dst_prev = dst;
3663
3664 /* We have to produce designation sequences if any now. */
3665 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3666 bol_designation = 0;
3667 /* We are sure that designation sequences are all ASCII bytes. */
3668 produced_chars += dst - dst_prev;
3669 }
3670
3671 c = *charbuf++;
3672
3673 if (c < 0)
3674 {
3675 /* Handle an annotation. */
3676 switch (*charbuf)
3677 {
3678 case CODING_ANNOTATE_COMPOSITION_MASK:
3679 /* Not yet implemented. */
3680 break;
3681 case CODING_ANNOTATE_CHARSET_MASK:
3682 preferred_charset_id = charbuf[3];
3683 if (preferred_charset_id >= 0
3684 && NILP (Fmemq (make_number (preferred_charset_id),
3685 charset_list)))
3686 preferred_charset_id = -1;
3687 break;
3688 default:
3689 abort ();
3690 }
3691 charbuf += -c - 1;
3692 continue;
3693 }
3694
3695 /* Now encode the character C. */
3696 if (c < 0x20 || c == 0x7F)
3697 {
3698 if (c == '\n'
3699 || (c == '\r' && EQ (eol_type, Qmac)))
3700 {
3701 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3702 ENCODE_RESET_PLANE_AND_REGISTER ();
3703 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3704 {
3705 int i;
3706
3707 for (i = 0; i < 4; i++)
3708 CODING_ISO_DESIGNATION (coding, i)
3709 = CODING_ISO_INITIAL (coding, i);
3710 }
3711 bol_designation
3712 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3713 }
3714 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3715 ENCODE_RESET_PLANE_AND_REGISTER ();
3716 EMIT_ONE_ASCII_BYTE (c);
3717 }
3718 else if (ASCII_CHAR_P (c))
3719 {
3720 if (ascii_compatible)
3721 EMIT_ONE_ASCII_BYTE (c);
3722 else
3723 {
3724 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3725 ENCODE_ISO_CHARACTER (charset, c);
3726 }
3727 }
3728 else if (CHAR_BYTE8_P (c))
3729 {
3730 c = CHAR_TO_BYTE8 (c);
3731 EMIT_ONE_BYTE (c);
3732 }
3733 else
3734 {
3735 struct charset *charset;
3736
3737 if (preferred_charset_id >= 0)
3738 {
3739 charset = CHARSET_FROM_ID (preferred_charset_id);
3740 if (! CHAR_CHARSET_P (c, charset))
3741 charset = char_charset (c, charset_list, NULL);
3742 }
3743 else
3744 charset = char_charset (c, charset_list, NULL);
3745 if (!charset)
3746 {
3747 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3748 {
3749 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3750 charset = CHARSET_FROM_ID (charset_ascii);
3751 }
3752 else
3753 {
3754 c = coding->default_char;
3755 charset = char_charset (c, charset_list, NULL);
3756 }
3757 }
3758 ENCODE_ISO_CHARACTER (charset, c);
3759 }
3760 }
3761
3762 if (coding->mode & CODING_MODE_LAST_BLOCK
3763 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3764 {
3765 ASSURE_DESTINATION (safe_room);
3766 ENCODE_RESET_PLANE_AND_REGISTER ();
3767 }
3768 coding->result = CODING_RESULT_SUCCESS;
3769 CODING_ISO_BOL (coding) = bol_designation;
3770 coding->produced_char += produced_chars;
3771 coding->produced = dst - coding->destination;
3772 return 0;
3773 }
3774
3775 \f
3776 /*** 8,9. SJIS and BIG5 handlers ***/
3777
3778 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3779 quite widely. So, for the moment, Emacs supports them in the bare
3780 C code. But, in the future, they may be supported only by CCL. */
3781
3782 /* SJIS is a coding system encoding three character sets: ASCII, right
3783 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3784 as is. A character of charset katakana-jisx0201 is encoded by
3785 "position-code + 0x80". A character of charset japanese-jisx0208
3786 is encoded in 2-byte but two position-codes are divided and shifted
3787 so that it fit in the range below.
3788
3789 --- CODE RANGE of SJIS ---
3790 (character set) (range)
3791 ASCII 0x00 .. 0x7F
3792 KATAKANA-JISX0201 0xA0 .. 0xDF
3793 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3794 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3795 -------------------------------
3796
3797 */
3798
3799 /* BIG5 is a coding system encoding two character sets: ASCII and
3800 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3801 character set and is encoded in two-byte.
3802
3803 --- CODE RANGE of BIG5 ---
3804 (character set) (range)
3805 ASCII 0x00 .. 0x7F
3806 Big5 (1st byte) 0xA1 .. 0xFE
3807 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3808 --------------------------
3809
3810 */
3811
3812 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3813 Check if a text is encoded in SJIS. If it is, return
3814 CATEGORY_MASK_SJIS, else return 0. */
3815
3816 static int
3817 detect_coding_sjis (coding, detect_info)
3818 struct coding_system *coding;
3819 struct coding_detection_info *detect_info;
3820 {
3821 unsigned char *src = coding->source, *src_base = src;
3822 unsigned char *src_end = coding->source + coding->src_bytes;
3823 int multibytep = coding->src_multibyte;
3824 int consumed_chars = 0;
3825 int found = 0;
3826 int c;
3827 int incomplete;
3828
3829 detect_info->checked |= CATEGORY_MASK_SJIS;
3830 /* A coding system of this category is always ASCII compatible. */
3831 src += coding->head_ascii;
3832
3833 while (1)
3834 {
3835 incomplete = 0;
3836 ONE_MORE_BYTE (c);
3837 incomplete = 1;
3838 if (c < 0x80)
3839 continue;
3840 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3841 {
3842 ONE_MORE_BYTE (c);
3843 if (c < 0x40 || c == 0x7F || c > 0xFC)
3844 break;
3845 found = CATEGORY_MASK_SJIS;
3846 }
3847 else if (c >= 0xA0 && c < 0xE0)
3848 found = CATEGORY_MASK_SJIS;
3849 else
3850 break;
3851 }
3852 detect_info->rejected |= CATEGORY_MASK_SJIS;
3853 return 0;
3854
3855 no_more_source:
3856 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3857 {
3858 detect_info->rejected |= CATEGORY_MASK_SJIS;
3859 return 0;
3860 }
3861 detect_info->found |= found;
3862 return 1;
3863 }
3864
3865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3866 Check if a text is encoded in BIG5. If it is, return
3867 CATEGORY_MASK_BIG5, else return 0. */
3868
3869 static int
3870 detect_coding_big5 (coding, detect_info)
3871 struct coding_system *coding;
3872 struct coding_detection_info *detect_info;
3873 {
3874 unsigned char *src = coding->source, *src_base = src;
3875 unsigned char *src_end = coding->source + coding->src_bytes;
3876 int multibytep = coding->src_multibyte;
3877 int consumed_chars = 0;
3878 int found = 0;
3879 int c;
3880 int incomplete;
3881
3882 detect_info->checked |= CATEGORY_MASK_BIG5;
3883 /* A coding system of this category is always ASCII compatible. */
3884 src += coding->head_ascii;
3885
3886 while (1)
3887 {
3888 incomplete = 0;
3889 ONE_MORE_BYTE (c);
3890 incomplete = 1;
3891 if (c < 0x80)
3892 continue;
3893 if (c >= 0xA1)
3894 {
3895 ONE_MORE_BYTE (c);
3896 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3897 return 0;
3898 found = CATEGORY_MASK_BIG5;
3899 }
3900 else
3901 break;
3902 }
3903 detect_info->rejected |= CATEGORY_MASK_BIG5;
3904 return 0;
3905
3906 no_more_source:
3907 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3908 {
3909 detect_info->rejected |= CATEGORY_MASK_BIG5;
3910 return 0;
3911 }
3912 detect_info->found |= found;
3913 return 1;
3914 }
3915
3916 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3917 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3918
3919 static void
3920 decode_coding_sjis (coding)
3921 struct coding_system *coding;
3922 {
3923 unsigned char *src = coding->source + coding->consumed;
3924 unsigned char *src_end = coding->source + coding->src_bytes;
3925 unsigned char *src_base;
3926 int *charbuf = coding->charbuf;
3927 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3928 int consumed_chars = 0, consumed_chars_base;
3929 int multibytep = coding->src_multibyte;
3930 struct charset *charset_roman, *charset_kanji, *charset_kana;
3931 Lisp_Object attrs, eol_type, charset_list, val;
3932 int char_offset = coding->produced_char;
3933 int last_offset = char_offset;
3934 int last_id = charset_ascii;
3935
3936 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3937
3938 val = charset_list;
3939 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3940 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3941 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3942
3943 while (1)
3944 {
3945 int c, c1;
3946
3947 src_base = src;
3948 consumed_chars_base = consumed_chars;
3949
3950 if (charbuf >= charbuf_end)
3951 break;
3952
3953 ONE_MORE_BYTE (c);
3954
3955 if (c == '\r')
3956 {
3957 if (EQ (eol_type, Qdos))
3958 {
3959 if (src == src_end)
3960 {
3961 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3962 goto no_more_source;
3963 }
3964 if (*src == '\n')
3965 ONE_MORE_BYTE (c);
3966 }
3967 else if (EQ (eol_type, Qmac))
3968 c = '\n';
3969 }
3970 else
3971 {
3972 struct charset *charset;
3973
3974 if (c < 0x80)
3975 charset = charset_roman;
3976 else
3977 {
3978 if (c >= 0xF0)
3979 goto invalid_code;
3980 if (c < 0xA0 || c >= 0xE0)
3981 {
3982 /* SJIS -> JISX0208 */
3983 ONE_MORE_BYTE (c1);
3984 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3985 goto invalid_code;
3986 c = (c << 8) | c1;
3987 SJIS_TO_JIS (c);
3988 charset = charset_kanji;
3989 }
3990 else
3991 {
3992 /* SJIS -> JISX0201-Kana */
3993 c &= 0x7F;
3994 charset = charset_kana;
3995 }
3996 }
3997 if (charset->id != charset_ascii
3998 && last_id != charset->id)
3999 {
4000 if (last_id != charset_ascii)
4001 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4002 last_id = charset->id;
4003 last_offset = char_offset;
4004 }
4005 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4006 }
4007 *charbuf++ = c;
4008 char_offset++;
4009 continue;
4010
4011 invalid_code:
4012 src = src_base;
4013 consumed_chars = consumed_chars_base;
4014 ONE_MORE_BYTE (c);
4015 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4016 char_offset++;
4017 coding->errors++;
4018 }
4019
4020 no_more_source:
4021 if (last_id != charset_ascii)
4022 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4023 coding->consumed_char += consumed_chars_base;
4024 coding->consumed = src_base - coding->source;
4025 coding->charbuf_used = charbuf - coding->charbuf;
4026 }
4027
4028 static void
4029 decode_coding_big5 (coding)
4030 struct coding_system *coding;
4031 {
4032 unsigned char *src = coding->source + coding->consumed;
4033 unsigned char *src_end = coding->source + coding->src_bytes;
4034 unsigned char *src_base;
4035 int *charbuf = coding->charbuf;
4036 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4037 int consumed_chars = 0, consumed_chars_base;
4038 int multibytep = coding->src_multibyte;
4039 struct charset *charset_roman, *charset_big5;
4040 Lisp_Object attrs, eol_type, charset_list, val;
4041 int char_offset = coding->produced_char;
4042 int last_offset = char_offset;
4043 int last_id = charset_ascii;
4044
4045 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4046 val = charset_list;
4047 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4048 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4049
4050 while (1)
4051 {
4052 int c, c1;
4053
4054 src_base = src;
4055 consumed_chars_base = consumed_chars;
4056
4057 if (charbuf >= charbuf_end)
4058 break;
4059
4060 ONE_MORE_BYTE (c);
4061
4062 if (c == '\r')
4063 {
4064 if (EQ (eol_type, Qdos))
4065 {
4066 if (src == src_end)
4067 {
4068 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4069 goto no_more_source;
4070 }
4071 if (*src == '\n')
4072 ONE_MORE_BYTE (c);
4073 }
4074 else if (EQ (eol_type, Qmac))
4075 c = '\n';
4076 }
4077 else
4078 {
4079 struct charset *charset;
4080 if (c < 0x80)
4081 charset = charset_roman;
4082 else
4083 {
4084 /* BIG5 -> Big5 */
4085 if (c < 0xA1 || c > 0xFE)
4086 goto invalid_code;
4087 ONE_MORE_BYTE (c1);
4088 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4089 goto invalid_code;
4090 c = c << 8 | c1;
4091 charset = charset_big5;
4092 }
4093 if (charset->id != charset_ascii
4094 && last_id != charset->id)
4095 {
4096 if (last_id != charset_ascii)
4097 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4098 last_id = charset->id;
4099 last_offset = char_offset;
4100 }
4101 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4102 }
4103
4104 *charbuf++ = c;
4105 char_offset++;
4106 continue;
4107
4108 invalid_code:
4109 src = src_base;
4110 consumed_chars = consumed_chars_base;
4111 ONE_MORE_BYTE (c);
4112 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4113 char_offset++;
4114 coding->errors++;
4115 }
4116
4117 no_more_source:
4118 if (last_id != charset_ascii)
4119 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4120 coding->consumed_char += consumed_chars_base;
4121 coding->consumed = src_base - coding->source;
4122 coding->charbuf_used = charbuf - coding->charbuf;
4123 }
4124
4125 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4126 This function can encode charsets `ascii', `katakana-jisx0201',
4127 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4128 are sure that all these charsets are registered as official charset
4129 (i.e. do not have extended leading-codes). Characters of other
4130 charsets are produced without any encoding. If SJIS_P is 1, encode
4131 SJIS text, else encode BIG5 text. */
4132
4133 static int
4134 encode_coding_sjis (coding)
4135 struct coding_system *coding;
4136 {
4137 int multibytep = coding->dst_multibyte;
4138 int *charbuf = coding->charbuf;
4139 int *charbuf_end = charbuf + coding->charbuf_used;
4140 unsigned char *dst = coding->destination + coding->produced;
4141 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4142 int safe_room = 4;
4143 int produced_chars = 0;
4144 Lisp_Object attrs, eol_type, charset_list, val;
4145 int ascii_compatible;
4146 struct charset *charset_roman, *charset_kanji, *charset_kana;
4147 int c;
4148
4149 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4150 val = charset_list;
4151 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4152 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4153 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4154
4155 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4156
4157 while (charbuf < charbuf_end)
4158 {
4159 ASSURE_DESTINATION (safe_room);
4160 c = *charbuf++;
4161 /* Now encode the character C. */
4162 if (ASCII_CHAR_P (c) && ascii_compatible)
4163 EMIT_ONE_ASCII_BYTE (c);
4164 else if (CHAR_BYTE8_P (c))
4165 {
4166 c = CHAR_TO_BYTE8 (c);
4167 EMIT_ONE_BYTE (c);
4168 }
4169 else
4170 {
4171 unsigned code;
4172 struct charset *charset = char_charset (c, charset_list, &code);
4173
4174 if (!charset)
4175 {
4176 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4177 {
4178 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4179 charset = CHARSET_FROM_ID (charset_ascii);
4180 }
4181 else
4182 {
4183 c = coding->default_char;
4184 charset = char_charset (c, charset_list, &code);
4185 }
4186 }
4187 if (code == CHARSET_INVALID_CODE (charset))
4188 abort ();
4189 if (charset == charset_kanji)
4190 {
4191 int c1, c2;
4192 JIS_TO_SJIS (code);
4193 c1 = code >> 8, c2 = code & 0xFF;
4194 EMIT_TWO_BYTES (c1, c2);
4195 }
4196 else if (charset == charset_kana)
4197 EMIT_ONE_BYTE (code | 0x80);
4198 else
4199 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4200 }
4201 }
4202 coding->result = CODING_RESULT_SUCCESS;
4203 coding->produced_char += produced_chars;
4204 coding->produced = dst - coding->destination;
4205 return 0;
4206 }
4207
4208 static int
4209 encode_coding_big5 (coding)
4210 struct coding_system *coding;
4211 {
4212 int multibytep = coding->dst_multibyte;
4213 int *charbuf = coding->charbuf;
4214 int *charbuf_end = charbuf + coding->charbuf_used;
4215 unsigned char *dst = coding->destination + coding->produced;
4216 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4217 int safe_room = 4;
4218 int produced_chars = 0;
4219 Lisp_Object attrs, eol_type, charset_list, val;
4220 int ascii_compatible;
4221 struct charset *charset_roman, *charset_big5;
4222 int c;
4223
4224 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4225 val = charset_list;
4226 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4227 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4228 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4229
4230 while (charbuf < charbuf_end)
4231 {
4232 ASSURE_DESTINATION (safe_room);
4233 c = *charbuf++;
4234 /* Now encode the character C. */
4235 if (ASCII_CHAR_P (c) && ascii_compatible)
4236 EMIT_ONE_ASCII_BYTE (c);
4237 else if (CHAR_BYTE8_P (c))
4238 {
4239 c = CHAR_TO_BYTE8 (c);
4240 EMIT_ONE_BYTE (c);
4241 }
4242 else
4243 {
4244 unsigned code;
4245 struct charset *charset = char_charset (c, charset_list, &code);
4246
4247 if (! charset)
4248 {
4249 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4250 {
4251 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4252 charset = CHARSET_FROM_ID (charset_ascii);
4253 }
4254 else
4255 {
4256 c = coding->default_char;
4257 charset = char_charset (c, charset_list, &code);
4258 }
4259 }
4260 if (code == CHARSET_INVALID_CODE (charset))
4261 abort ();
4262 if (charset == charset_big5)
4263 {
4264 int c1, c2;
4265
4266 c1 = code >> 8, c2 = code & 0xFF;
4267 EMIT_TWO_BYTES (c1, c2);
4268 }
4269 else
4270 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4271 }
4272 }
4273 coding->result = CODING_RESULT_SUCCESS;
4274 coding->produced_char += produced_chars;
4275 coding->produced = dst - coding->destination;
4276 return 0;
4277 }
4278
4279 \f
4280 /*** 10. CCL handlers ***/
4281
4282 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4283 Check if a text is encoded in a coding system of which
4284 encoder/decoder are written in CCL program. If it is, return
4285 CATEGORY_MASK_CCL, else return 0. */
4286
4287 static int
4288 detect_coding_ccl (coding, detect_info)
4289 struct coding_system *coding;
4290 struct coding_detection_info *detect_info;
4291 {
4292 unsigned char *src = coding->source, *src_base = src;
4293 unsigned char *src_end = coding->source + coding->src_bytes;
4294 int multibytep = coding->src_multibyte;
4295 int consumed_chars = 0;
4296 int found = 0;
4297 unsigned char *valids = CODING_CCL_VALIDS (coding);
4298 int head_ascii = coding->head_ascii;
4299 Lisp_Object attrs;
4300
4301 detect_info->checked |= CATEGORY_MASK_CCL;
4302
4303 coding = &coding_categories[coding_category_ccl];
4304 attrs = CODING_ID_ATTRS (coding->id);
4305 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4306 src += head_ascii;
4307
4308 while (1)
4309 {
4310 int c;
4311 ONE_MORE_BYTE (c);
4312 if (! valids[c])
4313 break;
4314 if ((valids[c] > 1))
4315 found = CATEGORY_MASK_CCL;
4316 }
4317 detect_info->rejected |= CATEGORY_MASK_CCL;
4318 return 0;
4319
4320 no_more_source:
4321 detect_info->found |= found;
4322 return 1;
4323 }
4324
4325 static void
4326 decode_coding_ccl (coding)
4327 struct coding_system *coding;
4328 {
4329 const unsigned char *src = coding->source + coding->consumed;
4330 unsigned char *src_end = coding->source + coding->src_bytes;
4331 int *charbuf = coding->charbuf;
4332 int *charbuf_end = charbuf + coding->charbuf_size;
4333 int consumed_chars = 0;
4334 int multibytep = coding->src_multibyte;
4335 struct ccl_program ccl;
4336 int source_charbuf[1024];
4337 int source_byteidx[1024];
4338 Lisp_Object attrs, eol_type, charset_list;
4339
4340 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4341 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4342
4343 while (src < src_end)
4344 {
4345 const unsigned char *p = src;
4346 int *source, *source_end;
4347 int i = 0;
4348
4349 if (multibytep)
4350 while (i < 1024 && p < src_end)
4351 {
4352 source_byteidx[i] = p - src;
4353 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4354 }
4355 else
4356 while (i < 1024 && p < src_end)
4357 source_charbuf[i++] = *p++;
4358
4359 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4360 ccl.last_block = 1;
4361
4362 source = source_charbuf;
4363 source_end = source + i;
4364 while (source < source_end)
4365 {
4366 ccl_driver (&ccl, source, charbuf,
4367 source_end - source, charbuf_end - charbuf,
4368 charset_list);
4369 source += ccl.consumed;
4370 charbuf += ccl.produced;
4371 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4372 break;
4373 }
4374 if (source < source_end)
4375 src += source_byteidx[source - source_charbuf];
4376 else
4377 src = p;
4378 consumed_chars += source - source_charbuf;
4379
4380 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4381 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4382 break;
4383 }
4384
4385 switch (ccl.status)
4386 {
4387 case CCL_STAT_SUSPEND_BY_SRC:
4388 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4389 break;
4390 case CCL_STAT_SUSPEND_BY_DST:
4391 break;
4392 case CCL_STAT_QUIT:
4393 case CCL_STAT_INVALID_CMD:
4394 coding->result = CODING_RESULT_INTERRUPT;
4395 break;
4396 default:
4397 coding->result = CODING_RESULT_SUCCESS;
4398 break;
4399 }
4400 coding->consumed_char += consumed_chars;
4401 coding->consumed = src - coding->source;
4402 coding->charbuf_used = charbuf - coding->charbuf;
4403 }
4404
4405 static int
4406 encode_coding_ccl (coding)
4407 struct coding_system *coding;
4408 {
4409 struct ccl_program ccl;
4410 int multibytep = coding->dst_multibyte;
4411 int *charbuf = coding->charbuf;
4412 int *charbuf_end = charbuf + coding->charbuf_used;
4413 unsigned char *dst = coding->destination + coding->produced;
4414 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4415 unsigned char *adjusted_dst_end = dst_end - 1;
4416 int destination_charbuf[1024];
4417 int i, produced_chars = 0;
4418 Lisp_Object attrs, eol_type, charset_list;
4419
4420 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4421 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4422
4423 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4424 ccl.dst_multibyte = coding->dst_multibyte;
4425
4426 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4427 {
4428 int dst_bytes = dst_end - dst;
4429 if (dst_bytes > 1024)
4430 dst_bytes = 1024;
4431
4432 ccl_driver (&ccl, charbuf, destination_charbuf,
4433 charbuf_end - charbuf, dst_bytes, charset_list);
4434 charbuf += ccl.consumed;
4435 if (multibytep)
4436 for (i = 0; i < ccl.produced; i++)
4437 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4438 else
4439 {
4440 for (i = 0; i < ccl.produced; i++)
4441 *dst++ = destination_charbuf[i] & 0xFF;
4442 produced_chars += ccl.produced;
4443 }
4444 }
4445
4446 switch (ccl.status)
4447 {
4448 case CCL_STAT_SUSPEND_BY_SRC:
4449 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4450 break;
4451 case CCL_STAT_SUSPEND_BY_DST:
4452 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4453 break;
4454 case CCL_STAT_QUIT:
4455 case CCL_STAT_INVALID_CMD:
4456 coding->result = CODING_RESULT_INTERRUPT;
4457 break;
4458 default:
4459 coding->result = CODING_RESULT_SUCCESS;
4460 break;
4461 }
4462
4463 coding->produced_char += produced_chars;
4464 coding->produced = dst - coding->destination;
4465 return 0;
4466 }
4467
4468
4469 \f
4470 /*** 10, 11. no-conversion handlers ***/
4471
4472 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4473
4474 static void
4475 decode_coding_raw_text (coding)
4476 struct coding_system *coding;
4477 {
4478 coding->chars_at_source = 1;
4479 coding->consumed_char = 0;
4480 coding->consumed = 0;
4481 coding->result = CODING_RESULT_SUCCESS;
4482 }
4483
4484 static int
4485 encode_coding_raw_text (coding)
4486 struct coding_system *coding;
4487 {
4488 int multibytep = coding->dst_multibyte;
4489 int *charbuf = coding->charbuf;
4490 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4491 unsigned char *dst = coding->destination + coding->produced;
4492 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4493 int produced_chars = 0;
4494 int c;
4495
4496 if (multibytep)
4497 {
4498 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4499
4500 if (coding->src_multibyte)
4501 while (charbuf < charbuf_end)
4502 {
4503 ASSURE_DESTINATION (safe_room);
4504 c = *charbuf++;
4505 if (ASCII_CHAR_P (c))
4506 EMIT_ONE_ASCII_BYTE (c);
4507 else if (CHAR_BYTE8_P (c))
4508 {
4509 c = CHAR_TO_BYTE8 (c);
4510 EMIT_ONE_BYTE (c);
4511 }
4512 else
4513 {
4514 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4515
4516 CHAR_STRING_ADVANCE (c, p1);
4517 while (p0 < p1)
4518 {
4519 EMIT_ONE_BYTE (*p0);
4520 p0++;
4521 }
4522 }
4523 }
4524 else
4525 while (charbuf < charbuf_end)
4526 {
4527 ASSURE_DESTINATION (safe_room);
4528 c = *charbuf++;
4529 EMIT_ONE_BYTE (c);
4530 }
4531 }
4532 else
4533 {
4534 if (coding->src_multibyte)
4535 {
4536 int safe_room = MAX_MULTIBYTE_LENGTH;
4537
4538 while (charbuf < charbuf_end)
4539 {
4540 ASSURE_DESTINATION (safe_room);
4541 c = *charbuf++;
4542 if (ASCII_CHAR_P (c))
4543 *dst++ = c;
4544 else if (CHAR_BYTE8_P (c))
4545 *dst++ = CHAR_TO_BYTE8 (c);
4546 else
4547 CHAR_STRING_ADVANCE (c, dst);
4548 produced_chars++;
4549 }
4550 }
4551 else
4552 {
4553 ASSURE_DESTINATION (charbuf_end - charbuf);
4554 while (charbuf < charbuf_end && dst < dst_end)
4555 *dst++ = *charbuf++;
4556 produced_chars = dst - (coding->destination + coding->dst_bytes);
4557 }
4558 }
4559 coding->result = CODING_RESULT_SUCCESS;
4560 coding->produced_char += produced_chars;
4561 coding->produced = dst - coding->destination;
4562 return 0;
4563 }
4564
4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4566 Check if a text is encoded in a charset-based coding system. If it
4567 is, return 1, else return 0. */
4568
4569 static int
4570 detect_coding_charset (coding, detect_info)
4571 struct coding_system *coding;
4572 struct coding_detection_info *detect_info;
4573 {
4574 unsigned char *src = coding->source, *src_base = src;
4575 unsigned char *src_end = coding->source + coding->src_bytes;
4576 int multibytep = coding->src_multibyte;
4577 int consumed_chars = 0;
4578 Lisp_Object attrs, valids;
4579 int found = 0;
4580
4581 detect_info->checked |= CATEGORY_MASK_CHARSET;
4582
4583 coding = &coding_categories[coding_category_charset];
4584 attrs = CODING_ID_ATTRS (coding->id);
4585 valids = AREF (attrs, coding_attr_charset_valids);
4586
4587 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4588 src += coding->head_ascii;
4589
4590 while (1)
4591 {
4592 int c;
4593
4594 ONE_MORE_BYTE (c);
4595 if (NILP (AREF (valids, c)))
4596 break;
4597 if (c >= 0x80)
4598 found = CATEGORY_MASK_CHARSET;
4599 }
4600 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4601 return 0;
4602
4603 no_more_source:
4604 detect_info->found |= found;
4605 return 1;
4606 }
4607
4608 static void
4609 decode_coding_charset (coding)
4610 struct coding_system *coding;
4611 {
4612 unsigned char *src = coding->source + coding->consumed;
4613 unsigned char *src_end = coding->source + coding->src_bytes;
4614 unsigned char *src_base;
4615 int *charbuf = coding->charbuf;
4616 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4617 int consumed_chars = 0, consumed_chars_base;
4618 int multibytep = coding->src_multibyte;
4619 Lisp_Object attrs, eol_type, charset_list, valids;
4620 int char_offset = coding->produced_char;
4621 int last_offset = char_offset;
4622 int last_id = charset_ascii;
4623
4624 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4625 valids = AREF (attrs, coding_attr_charset_valids);
4626
4627 while (1)
4628 {
4629 int c;
4630
4631 src_base = src;
4632 consumed_chars_base = consumed_chars;
4633
4634 if (charbuf >= charbuf_end)
4635 break;
4636
4637 ONE_MORE_BYTE (c);
4638 if (c == '\r')
4639 {
4640 /* Here we assume that no charset maps '\r' to something
4641 else. */
4642 if (EQ (eol_type, Qdos))
4643 {
4644 if (src == src_end)
4645 {
4646 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4647 goto no_more_source;
4648 }
4649 if (*src == '\n')
4650 ONE_MORE_BYTE (c);
4651 }
4652 else if (EQ (eol_type, Qmac))
4653 c = '\n';
4654 }
4655 else
4656 {
4657 Lisp_Object val;
4658 struct charset *charset;
4659 int dim;
4660 int len = 1;
4661 unsigned code = c;
4662
4663 val = AREF (valids, c);
4664 if (NILP (val))
4665 goto invalid_code;
4666 if (INTEGERP (val))
4667 {
4668 charset = CHARSET_FROM_ID (XFASTINT (val));
4669 dim = CHARSET_DIMENSION (charset);
4670 while (len < dim)
4671 {
4672 ONE_MORE_BYTE (c);
4673 code = (code << 8) | c;
4674 len++;
4675 }
4676 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4677 charset, code, c);
4678 }
4679 else
4680 {
4681 /* VAL is a list of charset IDs. It is assured that the
4682 list is sorted by charset dimensions (smaller one
4683 comes first). */
4684 while (CONSP (val))
4685 {
4686 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4687 dim = CHARSET_DIMENSION (charset);
4688 while (len < dim)
4689 {
4690 ONE_MORE_BYTE (c);
4691 code = (code << 8) | c;
4692 len++;
4693 }
4694 CODING_DECODE_CHAR (coding, src, src_base,
4695 src_end, charset, code, c);
4696 if (c >= 0)
4697 break;
4698 val = XCDR (val);
4699 }
4700 }
4701 if (c < 0)
4702 goto invalid_code;
4703 if (charset->id != charset_ascii
4704 && last_id != charset->id)
4705 {
4706 if (last_id != charset_ascii)
4707 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4708 last_id = charset->id;
4709 last_offset = char_offset;
4710 }
4711 }
4712 *charbuf++ = c;
4713 char_offset++;
4714 continue;
4715
4716 invalid_code:
4717 src = src_base;
4718 consumed_chars = consumed_chars_base;
4719 ONE_MORE_BYTE (c);
4720 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4721 char_offset++;
4722 coding->errors++;
4723 }
4724
4725 no_more_source:
4726 if (last_id != charset_ascii)
4727 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4728 coding->consumed_char += consumed_chars_base;
4729 coding->consumed = src_base - coding->source;
4730 coding->charbuf_used = charbuf - coding->charbuf;
4731 }
4732
4733 static int
4734 encode_coding_charset (coding)
4735 struct coding_system *coding;
4736 {
4737 int multibytep = coding->dst_multibyte;
4738 int *charbuf = coding->charbuf;
4739 int *charbuf_end = charbuf + coding->charbuf_used;
4740 unsigned char *dst = coding->destination + coding->produced;
4741 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4742 int safe_room = MAX_MULTIBYTE_LENGTH;
4743 int produced_chars = 0;
4744 Lisp_Object attrs, eol_type, charset_list;
4745 int ascii_compatible;
4746 int c;
4747
4748 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4749 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4750
4751 while (charbuf < charbuf_end)
4752 {
4753 struct charset *charset;
4754 unsigned code;
4755
4756 ASSURE_DESTINATION (safe_room);
4757 c = *charbuf++;
4758 if (ascii_compatible && ASCII_CHAR_P (c))
4759 EMIT_ONE_ASCII_BYTE (c);
4760 else if (CHAR_BYTE8_P (c))
4761 {
4762 c = CHAR_TO_BYTE8 (c);
4763 EMIT_ONE_BYTE (c);
4764 }
4765 else
4766 {
4767 charset = char_charset (c, charset_list, &code);
4768 if (charset)
4769 {
4770 if (CHARSET_DIMENSION (charset) == 1)
4771 EMIT_ONE_BYTE (code);
4772 else if (CHARSET_DIMENSION (charset) == 2)
4773 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4774 else if (CHARSET_DIMENSION (charset) == 3)
4775 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4776 else
4777 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4778 (code >> 8) & 0xFF, code & 0xFF);
4779 }
4780 else
4781 {
4782 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4783 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4784 else
4785 c = coding->default_char;
4786 EMIT_ONE_BYTE (c);
4787 }
4788 }
4789 }
4790
4791 coding->result = CODING_RESULT_SUCCESS;
4792 coding->produced_char += produced_chars;
4793 coding->produced = dst - coding->destination;
4794 return 0;
4795 }
4796
4797 \f
4798 /*** 7. C library functions ***/
4799
4800 /* Setup coding context CODING from information about CODING_SYSTEM.
4801 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4802 CODING_SYSTEM is invalid, signal an error. */
4803
4804 void
4805 setup_coding_system (coding_system, coding)
4806 Lisp_Object coding_system;
4807 struct coding_system *coding;
4808 {
4809 Lisp_Object attrs;
4810 Lisp_Object eol_type;
4811 Lisp_Object coding_type;
4812 Lisp_Object val;
4813
4814 if (NILP (coding_system))
4815 coding_system = Qno_conversion;
4816
4817 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4818
4819 attrs = CODING_ID_ATTRS (coding->id);
4820 eol_type = CODING_ID_EOL_TYPE (coding->id);
4821
4822 coding->mode = 0;
4823 coding->head_ascii = -1;
4824 coding->common_flags
4825 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4826
4827 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4828 coding->max_charset_id = XSTRING (val)->size - 1;
4829 coding->safe_charsets = (char *) XSTRING (val)->data;
4830 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4831
4832 coding_type = CODING_ATTR_TYPE (attrs);
4833 if (EQ (coding_type, Qundecided))
4834 {
4835 coding->detector = NULL;
4836 coding->decoder = decode_coding_raw_text;
4837 coding->encoder = encode_coding_raw_text;
4838 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4839 }
4840 else if (EQ (coding_type, Qiso_2022))
4841 {
4842 int i;
4843 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4844
4845 /* Invoke graphic register 0 to plane 0. */
4846 CODING_ISO_INVOCATION (coding, 0) = 0;
4847 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4848 CODING_ISO_INVOCATION (coding, 1)
4849 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4850 /* Setup the initial status of designation. */
4851 for (i = 0; i < 4; i++)
4852 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4853 /* Not single shifting initially. */
4854 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4855 /* Beginning of buffer should also be regarded as bol. */
4856 CODING_ISO_BOL (coding) = 1;
4857 coding->detector = detect_coding_iso_2022;
4858 coding->decoder = decode_coding_iso_2022;
4859 coding->encoder = encode_coding_iso_2022;
4860 if (flags & CODING_ISO_FLAG_SAFE)
4861 coding->mode |= CODING_MODE_SAFE_ENCODING;
4862 coding->common_flags
4863 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4864 | CODING_REQUIRE_FLUSHING_MASK);
4865 if (flags & CODING_ISO_FLAG_COMPOSITION)
4866 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4867 if (flags & CODING_ISO_FLAG_DESIGNATION)
4868 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4869 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4870 {
4871 setup_iso_safe_charsets (attrs);
4872 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4873 coding->max_charset_id = XSTRING (val)->size - 1;
4874 coding->safe_charsets = (char *) XSTRING (val)->data;
4875 }
4876 CODING_ISO_FLAGS (coding) = flags;
4877 }
4878 else if (EQ (coding_type, Qcharset))
4879 {
4880 coding->detector = detect_coding_charset;
4881 coding->decoder = decode_coding_charset;
4882 coding->encoder = encode_coding_charset;
4883 coding->common_flags
4884 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4885 }
4886 else if (EQ (coding_type, Qutf_8))
4887 {
4888 coding->detector = detect_coding_utf_8;
4889 coding->decoder = decode_coding_utf_8;
4890 coding->encoder = encode_coding_utf_8;
4891 coding->common_flags
4892 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4893 }
4894 else if (EQ (coding_type, Qutf_16))
4895 {
4896 val = AREF (attrs, coding_attr_utf_16_bom);
4897 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4898 : EQ (val, Qt) ? utf_16_with_bom
4899 : utf_16_without_bom);
4900 val = AREF (attrs, coding_attr_utf_16_endian);
4901 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4902 : utf_16_little_endian);
4903 CODING_UTF_16_SURROGATE (coding) = 0;
4904 coding->detector = detect_coding_utf_16;
4905 coding->decoder = decode_coding_utf_16;
4906 coding->encoder = encode_coding_utf_16;
4907 coding->common_flags
4908 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4909 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4910 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4911 }
4912 else if (EQ (coding_type, Qccl))
4913 {
4914 coding->detector = detect_coding_ccl;
4915 coding->decoder = decode_coding_ccl;
4916 coding->encoder = encode_coding_ccl;
4917 coding->common_flags
4918 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4919 | CODING_REQUIRE_FLUSHING_MASK);
4920 }
4921 else if (EQ (coding_type, Qemacs_mule))
4922 {
4923 coding->detector = detect_coding_emacs_mule;
4924 coding->decoder = decode_coding_emacs_mule;
4925 coding->encoder = encode_coding_emacs_mule;
4926 coding->common_flags
4927 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4928 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4929 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4930 {
4931 Lisp_Object tail, safe_charsets;
4932 int max_charset_id = 0;
4933
4934 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4935 tail = XCDR (tail))
4936 if (max_charset_id < XFASTINT (XCAR (tail)))
4937 max_charset_id = XFASTINT (XCAR (tail));
4938 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4939 make_number (255));
4940 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4941 tail = XCDR (tail))
4942 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4943 coding->max_charset_id = max_charset_id;
4944 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4945 }
4946 }
4947 else if (EQ (coding_type, Qshift_jis))
4948 {
4949 coding->detector = detect_coding_sjis;
4950 coding->decoder = decode_coding_sjis;
4951 coding->encoder = encode_coding_sjis;
4952 coding->common_flags
4953 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4954 }
4955 else if (EQ (coding_type, Qbig5))
4956 {
4957 coding->detector = detect_coding_big5;
4958 coding->decoder = decode_coding_big5;
4959 coding->encoder = encode_coding_big5;
4960 coding->common_flags
4961 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4962 }
4963 else /* EQ (coding_type, Qraw_text) */
4964 {
4965 coding->detector = NULL;
4966 coding->decoder = decode_coding_raw_text;
4967 coding->encoder = encode_coding_raw_text;
4968 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4969 }
4970
4971 return;
4972 }
4973
4974 /* Return raw-text or one of its subsidiaries that has the same
4975 eol_type as CODING-SYSTEM. */
4976
4977 Lisp_Object
4978 raw_text_coding_system (coding_system)
4979 Lisp_Object coding_system;
4980 {
4981 Lisp_Object spec, attrs;
4982 Lisp_Object eol_type, raw_text_eol_type;
4983
4984 spec = CODING_SYSTEM_SPEC (coding_system);
4985 attrs = AREF (spec, 0);
4986
4987 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4988 return coding_system;
4989
4990 eol_type = AREF (spec, 2);
4991 if (VECTORP (eol_type))
4992 return Qraw_text;
4993 spec = CODING_SYSTEM_SPEC (Qraw_text);
4994 raw_text_eol_type = AREF (spec, 2);
4995 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4996 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4997 : AREF (raw_text_eol_type, 2));
4998 }
4999
5000
5001 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5002 does, return one of the subsidiary that has the same eol-spec as
5003 PARENT. Otherwise, return CODING_SYSTEM. */
5004
5005 Lisp_Object
5006 coding_inherit_eol_type (coding_system, parent)
5007 Lisp_Object coding_system, parent;
5008 {
5009 Lisp_Object spec, attrs, eol_type;
5010
5011 spec = CODING_SYSTEM_SPEC (coding_system);
5012 attrs = AREF (spec, 0);
5013 eol_type = AREF (spec, 2);
5014 if (VECTORP (eol_type))
5015 {
5016 Lisp_Object parent_spec;
5017 Lisp_Object parent_eol_type;
5018
5019 parent_spec
5020 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5021 parent_eol_type = AREF (parent_spec, 2);
5022 if (EQ (parent_eol_type, Qunix))
5023 coding_system = AREF (eol_type, 0);
5024 else if (EQ (parent_eol_type, Qdos))
5025 coding_system = AREF (eol_type, 1);
5026 else if (EQ (parent_eol_type, Qmac))
5027 coding_system = AREF (eol_type, 2);
5028 }
5029 return coding_system;
5030 }
5031
5032 /* Emacs has a mechanism to automatically detect a coding system if it
5033 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5034 it's impossible to distinguish some coding systems accurately
5035 because they use the same range of codes. So, at first, coding
5036 systems are categorized into 7, those are:
5037
5038 o coding-category-emacs-mule
5039
5040 The category for a coding system which has the same code range
5041 as Emacs' internal format. Assigned the coding-system (Lisp
5042 symbol) `emacs-mule' by default.
5043
5044 o coding-category-sjis
5045
5046 The category for a coding system which has the same code range
5047 as SJIS. Assigned the coding-system (Lisp
5048 symbol) `japanese-shift-jis' by default.
5049
5050 o coding-category-iso-7
5051
5052 The category for a coding system which has the same code range
5053 as ISO2022 of 7-bit environment. This doesn't use any locking
5054 shift and single shift functions. This can encode/decode all
5055 charsets. Assigned the coding-system (Lisp symbol)
5056 `iso-2022-7bit' by default.
5057
5058 o coding-category-iso-7-tight
5059
5060 Same as coding-category-iso-7 except that this can
5061 encode/decode only the specified charsets.
5062
5063 o coding-category-iso-8-1
5064
5065 The category for a coding system which has the same code range
5066 as ISO2022 of 8-bit environment and graphic plane 1 used only
5067 for DIMENSION1 charset. This doesn't use any locking shift
5068 and single shift functions. Assigned the coding-system (Lisp
5069 symbol) `iso-latin-1' by default.
5070
5071 o coding-category-iso-8-2
5072
5073 The category for a coding system which has the same code range
5074 as ISO2022 of 8-bit environment and graphic plane 1 used only
5075 for DIMENSION2 charset. This doesn't use any locking shift
5076 and single shift functions. Assigned the coding-system (Lisp
5077 symbol) `japanese-iso-8bit' by default.
5078
5079 o coding-category-iso-7-else
5080
5081 The category for a coding system which has the same code range
5082 as ISO2022 of 7-bit environemnt but uses locking shift or
5083 single shift functions. Assigned the coding-system (Lisp
5084 symbol) `iso-2022-7bit-lock' by default.
5085
5086 o coding-category-iso-8-else
5087
5088 The category for a coding system which has the same code range
5089 as ISO2022 of 8-bit environemnt but uses locking shift or
5090 single shift functions. Assigned the coding-system (Lisp
5091 symbol) `iso-2022-8bit-ss2' by default.
5092
5093 o coding-category-big5
5094
5095 The category for a coding system which has the same code range
5096 as BIG5. Assigned the coding-system (Lisp symbol)
5097 `cn-big5' by default.
5098
5099 o coding-category-utf-8
5100
5101 The category for a coding system which has the same code range
5102 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5103 symbol) `utf-8' by default.
5104
5105 o coding-category-utf-16-be
5106
5107 The category for a coding system in which a text has an
5108 Unicode signature (cf. Unicode Standard) in the order of BIG
5109 endian at the head. Assigned the coding-system (Lisp symbol)
5110 `utf-16-be' by default.
5111
5112 o coding-category-utf-16-le
5113
5114 The category for a coding system in which a text has an
5115 Unicode signature (cf. Unicode Standard) in the order of
5116 LITTLE endian at the head. Assigned the coding-system (Lisp
5117 symbol) `utf-16-le' by default.
5118
5119 o coding-category-ccl
5120
5121 The category for a coding system of which encoder/decoder is
5122 written in CCL programs. The default value is nil, i.e., no
5123 coding system is assigned.
5124
5125 o coding-category-binary
5126
5127 The category for a coding system not categorized in any of the
5128 above. Assigned the coding-system (Lisp symbol)
5129 `no-conversion' by default.
5130
5131 Each of them is a Lisp symbol and the value is an actual
5132 `coding-system's (this is also a Lisp symbol) assigned by a user.
5133 What Emacs does actually is to detect a category of coding system.
5134 Then, it uses a `coding-system' assigned to it. If Emacs can't
5135 decide only one possible category, it selects a category of the
5136 highest priority. Priorities of categories are also specified by a
5137 user in a Lisp variable `coding-category-list'.
5138
5139 */
5140
5141 #define EOL_SEEN_NONE 0
5142 #define EOL_SEEN_LF 1
5143 #define EOL_SEEN_CR 2
5144 #define EOL_SEEN_CRLF 4
5145
5146 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5147 SOURCE is encoded. If CATEGORY is one of
5148 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5149 two-byte, else they are encoded by one-byte.
5150
5151 Return one of EOL_SEEN_XXX. */
5152
5153 #define MAX_EOL_CHECK_COUNT 3
5154
5155 static int
5156 detect_eol (source, src_bytes, category)
5157 unsigned char *source;
5158 EMACS_INT src_bytes;
5159 enum coding_category category;
5160 {
5161 unsigned char *src = source, *src_end = src + src_bytes;
5162 unsigned char c;
5163 int total = 0;
5164 int eol_seen = EOL_SEEN_NONE;
5165
5166 if ((1 << category) & CATEGORY_MASK_UTF_16)
5167 {
5168 int msb, lsb;
5169
5170 msb = category == (coding_category_utf_16_le
5171 | coding_category_utf_16_le_nosig);
5172 lsb = 1 - msb;
5173
5174 while (src + 1 < src_end)
5175 {
5176 c = src[lsb];
5177 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5178 {
5179 int this_eol;
5180
5181 if (c == '\n')
5182 this_eol = EOL_SEEN_LF;
5183 else if (src + 3 >= src_end
5184 || src[msb + 2] != 0
5185 || src[lsb + 2] != '\n')
5186 this_eol = EOL_SEEN_CR;
5187 else
5188 this_eol = EOL_SEEN_CRLF;
5189
5190 if (eol_seen == EOL_SEEN_NONE)
5191 /* This is the first end-of-line. */
5192 eol_seen = this_eol;
5193 else if (eol_seen != this_eol)
5194 {
5195 /* The found type is different from what found before. */
5196 eol_seen = EOL_SEEN_LF;
5197 break;
5198 }
5199 if (++total == MAX_EOL_CHECK_COUNT)
5200 break;
5201 }
5202 src += 2;
5203 }
5204 }
5205 else
5206 {
5207 while (src < src_end)
5208 {
5209 c = *src++;
5210 if (c == '\n' || c == '\r')
5211 {
5212 int this_eol;
5213
5214 if (c == '\n')
5215 this_eol = EOL_SEEN_LF;
5216 else if (src >= src_end || *src != '\n')
5217 this_eol = EOL_SEEN_CR;
5218 else
5219 this_eol = EOL_SEEN_CRLF, src++;
5220
5221 if (eol_seen == EOL_SEEN_NONE)
5222 /* This is the first end-of-line. */
5223 eol_seen = this_eol;
5224 else if (eol_seen != this_eol)
5225 {
5226 /* The found type is different from what found before. */
5227 eol_seen = EOL_SEEN_LF;
5228 break;
5229 }
5230 if (++total == MAX_EOL_CHECK_COUNT)
5231 break;
5232 }
5233 }
5234 }
5235 return eol_seen;
5236 }
5237
5238
5239 static void
5240 adjust_coding_eol_type (coding, eol_seen)
5241 struct coding_system *coding;
5242 int eol_seen;
5243 {
5244 Lisp_Object eol_type;
5245
5246 eol_type = CODING_ID_EOL_TYPE (coding->id);
5247 if (eol_seen & EOL_SEEN_LF)
5248 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5249 else if (eol_seen & EOL_SEEN_CRLF)
5250 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5251 else if (eol_seen & EOL_SEEN_CR)
5252 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5253 }
5254
5255 /* Detect how a text specified in CODING is encoded. If a coding
5256 system is detected, update fields of CODING by the detected coding
5257 system. */
5258
5259 void
5260 detect_coding (coding)
5261 struct coding_system *coding;
5262 {
5263 unsigned char *src, *src_end;
5264 Lisp_Object attrs, coding_type;
5265
5266 coding->consumed = coding->consumed_char = 0;
5267 coding->produced = coding->produced_char = 0;
5268 coding_set_source (coding);
5269
5270 src_end = coding->source + coding->src_bytes;
5271
5272 /* If we have not yet decided the text encoding type, detect it
5273 now. */
5274 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5275 {
5276 int c, i;
5277
5278 for (src = coding->source; src < src_end; src++)
5279 {
5280 c = *src;
5281 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5282 || c == ISO_CODE_SI
5283 || c == ISO_CODE_SO)))
5284 break;
5285 }
5286 coding->head_ascii = src - (coding->source + coding->consumed);
5287
5288 if (coding->head_ascii < coding->src_bytes)
5289 {
5290 struct coding_detection_info detect_info;
5291 enum coding_category category;
5292 struct coding_system *this;
5293
5294 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5295 for (i = 0; i < coding_category_raw_text; i++)
5296 {
5297 category = coding_priorities[i];
5298 this = coding_categories + category;
5299 if (this->id < 0)
5300 {
5301 /* No coding system of this category is defined. */
5302 detect_info.rejected |= (1 << category);
5303 }
5304 else if (category >= coding_category_raw_text)
5305 continue;
5306 else if (detect_info.checked & (1 << category))
5307 {
5308 if (detect_info.found & (1 << category))
5309 break;
5310 }
5311 else if ((*(this->detector)) (coding, &detect_info)
5312 && detect_info.found & (1 << category))
5313 break;
5314 }
5315 if (i < coding_category_raw_text)
5316 setup_coding_system (CODING_ID_NAME (this->id), coding);
5317 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5318 setup_coding_system (Qraw_text, coding);
5319 else if (detect_info.rejected)
5320 for (i = 0; i < coding_category_raw_text; i++)
5321 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5322 {
5323 this = coding_categories + coding_priorities[i];
5324 setup_coding_system (CODING_ID_NAME (this->id), coding);
5325 break;
5326 }
5327 }
5328 }
5329 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5330 {
5331 Lisp_Object coding_systems;
5332 struct coding_detection_info detect_info;
5333
5334 coding_systems
5335 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5336 detect_info.found = detect_info.rejected = 0;
5337 if (CONSP (coding_systems)
5338 && detect_coding_utf_16 (coding, &detect_info)
5339 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5340 | CATEGORY_MASK_UTF_16_BE)))
5341 {
5342 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5343 setup_coding_system (XCAR (coding_systems), coding);
5344 else
5345 setup_coding_system (XCDR (coding_systems), coding);
5346 }
5347 }
5348
5349 attrs = CODING_ID_ATTRS (coding->id);
5350 coding_type = CODING_ATTR_TYPE (attrs);
5351
5352 /* If we have not yet decided the EOL type, detect it now. But, the
5353 detection is impossible for a CCL based coding system, in which
5354 case, we detct the EOL type after decoding. */
5355 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5356 && ! EQ (coding_type, Qccl))
5357 {
5358 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5359 XINT (CODING_ATTR_CATEGORY (attrs)));
5360
5361 if (eol_seen != EOL_SEEN_NONE)
5362 adjust_coding_eol_type (coding, eol_seen);
5363 }
5364 }
5365
5366
5367 static void
5368 decode_eol (coding)
5369 struct coding_system *coding;
5370 {
5371 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5372 {
5373 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5374 unsigned char *pend = p + coding->produced;
5375 int eol_seen = EOL_SEEN_NONE;
5376
5377 for (; p < pend; p++)
5378 {
5379 if (*p == '\n')
5380 eol_seen |= EOL_SEEN_LF;
5381 else if (*p == '\r')
5382 {
5383 if (p + 1 < pend && *(p + 1) == '\n')
5384 {
5385 eol_seen |= EOL_SEEN_CRLF;
5386 p++;
5387 }
5388 else
5389 eol_seen |= EOL_SEEN_CR;
5390 }
5391 }
5392 if (eol_seen != EOL_SEEN_NONE)
5393 adjust_coding_eol_type (coding, eol_seen);
5394 }
5395
5396 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5397 {
5398 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5399 unsigned char *pend = p + coding->produced;
5400
5401 for (; p < pend; p++)
5402 if (*p == '\r')
5403 *p = '\n';
5404 }
5405 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5406 {
5407 unsigned char *p, *pbeg, *pend;
5408 Lisp_Object undo_list;
5409
5410 move_gap_both (coding->dst_pos + coding->produced_char,
5411 coding->dst_pos_byte + coding->produced);
5412 undo_list = current_buffer->undo_list;
5413 current_buffer->undo_list = Qt;
5414 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5415 current_buffer->undo_list = undo_list;
5416 pbeg = GPT_ADDR;
5417 pend = pbeg + coding->produced;
5418
5419 for (p = pend - 1; p >= pbeg; p--)
5420 if (*p == '\r')
5421 {
5422 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5423 pend--;
5424 }
5425 coding->produced_char -= coding->produced - (pend - pbeg);
5426 coding->produced = pend - pbeg;
5427 insert_from_gap (coding->produced_char, coding->produced);
5428 }
5429 }
5430
5431 static void
5432 translate_chars (coding, table)
5433 struct coding_system *coding;
5434 Lisp_Object table;
5435 {
5436 int *charbuf = coding->charbuf;
5437 int *charbuf_end = charbuf + coding->charbuf_used;
5438 int c;
5439
5440 if (coding->chars_at_source)
5441 return;
5442
5443 while (charbuf < charbuf_end)
5444 {
5445 c = *charbuf;
5446 if (c < 0)
5447 charbuf += c;
5448 else
5449 *charbuf++ = translate_char (table, c);
5450 }
5451 }
5452
5453 static int
5454 produce_chars (coding)
5455 struct coding_system *coding;
5456 {
5457 unsigned char *dst = coding->destination + coding->produced;
5458 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5459 int produced;
5460 int produced_chars = 0;
5461
5462 if (! coding->chars_at_source)
5463 {
5464 /* Characters are in coding->charbuf. */
5465 int *buf = coding->charbuf;
5466 int *buf_end = buf + coding->charbuf_used;
5467 unsigned char *adjusted_dst_end;
5468
5469 if (BUFFERP (coding->src_object)
5470 && EQ (coding->src_object, coding->dst_object))
5471 dst_end = coding->source + coding->consumed;
5472 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5473
5474 while (buf < buf_end)
5475 {
5476 int c = *buf++;
5477
5478 if (dst >= adjusted_dst_end)
5479 {
5480 dst = alloc_destination (coding,
5481 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5482 dst);
5483 dst_end = coding->destination + coding->dst_bytes;
5484 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5485 }
5486 if (c >= 0)
5487 {
5488 if (coding->dst_multibyte
5489 || ! CHAR_BYTE8_P (c))
5490 CHAR_STRING_ADVANCE (c, dst);
5491 else
5492 *dst++ = CHAR_TO_BYTE8 (c);
5493 produced_chars++;
5494 }
5495 else
5496 /* This is an annotation datum. */
5497 buf -= c + 1;
5498 }
5499 }
5500 else
5501 {
5502 unsigned char *src = coding->source;
5503 unsigned char *src_end = src + coding->src_bytes;
5504 Lisp_Object eol_type;
5505
5506 eol_type = CODING_ID_EOL_TYPE (coding->id);
5507
5508 if (coding->src_multibyte != coding->dst_multibyte)
5509 {
5510 if (coding->src_multibyte)
5511 {
5512 int multibytep = 1;
5513 int consumed_chars;
5514
5515 while (1)
5516 {
5517 unsigned char *src_base = src;
5518 int c;
5519
5520 ONE_MORE_BYTE (c);
5521 if (c == '\r')
5522 {
5523 if (EQ (eol_type, Qdos))
5524 {
5525 if (src == src_end)
5526 {
5527 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5528 goto no_more_source;
5529 }
5530 if (*src == '\n')
5531 c = *src++;
5532 }
5533 else if (EQ (eol_type, Qmac))
5534 c = '\n';
5535 }
5536 if (dst == dst_end)
5537 {
5538 coding->consumed = src - coding->source;
5539
5540 if (EQ (coding->src_object, coding->dst_object))
5541 dst_end = src;
5542 if (dst == dst_end)
5543 {
5544 dst = alloc_destination (coding, src_end - src + 1,
5545 dst);
5546 dst_end = coding->destination + coding->dst_bytes;
5547 coding_set_source (coding);
5548 src = coding->source + coding->consumed;
5549 src_end = coding->source + coding->src_bytes;
5550 }
5551 }
5552 *dst++ = c;
5553 produced_chars++;
5554 }
5555 no_more_source:
5556 ;
5557 }
5558 else
5559 while (src < src_end)
5560 {
5561 int multibytep = 1;
5562 int c = *src++;
5563
5564 if (c == '\r')
5565 {
5566 if (EQ (eol_type, Qdos))
5567 {
5568 if (src < src_end
5569 && *src == '\n')
5570 c = *src++;
5571 }
5572 else if (EQ (eol_type, Qmac))
5573 c = '\n';
5574 }
5575 if (dst >= dst_end - 1)
5576 {
5577 coding->consumed = src - coding->source;
5578
5579 if (EQ (coding->src_object, coding->dst_object))
5580 dst_end = src;
5581 if (dst >= dst_end - 1)
5582 {
5583 dst = alloc_destination (coding, src_end - src + 2,
5584 dst);
5585 dst_end = coding->destination + coding->dst_bytes;
5586 coding_set_source (coding);
5587 src = coding->source + coding->consumed;
5588 src_end = coding->source + coding->src_bytes;
5589 }
5590 }
5591 EMIT_ONE_BYTE (c);
5592 }
5593 }
5594 else
5595 {
5596 if (!EQ (coding->src_object, coding->dst_object))
5597 {
5598 int require = coding->src_bytes - coding->dst_bytes;
5599
5600 if (require > 0)
5601 {
5602 EMACS_INT offset = src - coding->source;
5603
5604 dst = alloc_destination (coding, require, dst);
5605 coding_set_source (coding);
5606 src = coding->source + offset;
5607 src_end = coding->source + coding->src_bytes;
5608 }
5609 }
5610 produced_chars = coding->src_chars;
5611 while (src < src_end)
5612 {
5613 int c = *src++;
5614
5615 if (c == '\r')
5616 {
5617 if (EQ (eol_type, Qdos))
5618 {
5619 if (src < src_end
5620 && *src == '\n')
5621 c = *src++;
5622 produced_chars--;
5623 }
5624 else if (EQ (eol_type, Qmac))
5625 c = '\n';
5626 }
5627 *dst++ = c;
5628 }
5629 }
5630 coding->consumed = coding->src_bytes;
5631 coding->consumed_char = coding->src_chars;
5632 }
5633
5634 produced = dst - (coding->destination + coding->produced);
5635 if (BUFFERP (coding->dst_object))
5636 insert_from_gap (produced_chars, produced);
5637 coding->produced += produced;
5638 coding->produced_char += produced_chars;
5639 return produced_chars;
5640 }
5641
5642 /* Compose text in CODING->object according to the annotation data at
5643 CHARBUF. CHARBUF is an array:
5644 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5645 */
5646
5647 static INLINE void
5648 produce_composition (coding, charbuf)
5649 struct coding_system *coding;
5650 int *charbuf;
5651 {
5652 int len;
5653 EMACS_INT from, to;
5654 enum composition_method method;
5655 Lisp_Object components;
5656
5657 len = -charbuf[0];
5658 from = coding->dst_pos + charbuf[2];
5659 to = coding->dst_pos + charbuf[3];
5660 method = (enum composition_method) (charbuf[4]);
5661
5662 if (method == COMPOSITION_RELATIVE)
5663 components = Qnil;
5664 else
5665 {
5666 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5667 int i;
5668
5669 len -= 5;
5670 charbuf += 5;
5671 for (i = 0; i < len; i++)
5672 args[i] = make_number (charbuf[i]);
5673 components = (method == COMPOSITION_WITH_ALTCHARS
5674 ? Fstring (len, args) : Fvector (len, args));
5675 }
5676 compose_text (from, to, components, Qnil, coding->dst_object);
5677 }
5678
5679
5680 /* Put `charset' property on text in CODING->object according to
5681 the annotation data at CHARBUF. CHARBUF is an array:
5682 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5683 */
5684
5685 static INLINE void
5686 produce_charset (coding, charbuf)
5687 struct coding_system *coding;
5688 int *charbuf;
5689 {
5690 EMACS_INT from = coding->dst_pos + charbuf[2];
5691 EMACS_INT to = coding->dst_pos + charbuf[3];
5692 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5693
5694 Fput_text_property (make_number (from), make_number (to),
5695 Qcharset, CHARSET_NAME (charset),
5696 coding->dst_object);
5697 }
5698
5699
5700 #define CHARBUF_SIZE 0x4000
5701
5702 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5703 do { \
5704 int size = CHARBUF_SIZE;; \
5705 \
5706 coding->charbuf = NULL; \
5707 while (size > 1024) \
5708 { \
5709 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5710 if (coding->charbuf) \
5711 break; \
5712 size >>= 1; \
5713 } \
5714 if (! coding->charbuf) \
5715 { \
5716 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5717 return coding->result; \
5718 } \
5719 coding->charbuf_size = size; \
5720 } while (0)
5721
5722
5723 static void
5724 produce_annotation (coding)
5725 struct coding_system *coding;
5726 {
5727 int *charbuf = coding->charbuf;
5728 int *charbuf_end = charbuf + coding->charbuf_used;
5729
5730 if (NILP (coding->dst_object))
5731 return;
5732
5733 while (charbuf < charbuf_end)
5734 {
5735 if (*charbuf >= 0)
5736 charbuf++;
5737 else
5738 {
5739 int len = -*charbuf;
5740 switch (charbuf[1])
5741 {
5742 case CODING_ANNOTATE_COMPOSITION_MASK:
5743 produce_composition (coding, charbuf);
5744 break;
5745 case CODING_ANNOTATE_CHARSET_MASK:
5746 produce_charset (coding, charbuf);
5747 break;
5748 default:
5749 abort ();
5750 }
5751 charbuf += len;
5752 }
5753 }
5754 }
5755
5756 /* Decode the data at CODING->src_object into CODING->dst_object.
5757 CODING->src_object is a buffer, a string, or nil.
5758 CODING->dst_object is a buffer.
5759
5760 If CODING->src_object is a buffer, it must be the current buffer.
5761 In this case, if CODING->src_pos is positive, it is a position of
5762 the source text in the buffer, otherwise, the source text is in the
5763 gap area of the buffer, and CODING->src_pos specifies the offset of
5764 the text from GPT (which must be the same as PT). If this is the
5765 same buffer as CODING->dst_object, CODING->src_pos must be
5766 negative.
5767
5768 If CODING->src_object is a string, CODING->src_pos in an index to
5769 that string.
5770
5771 If CODING->src_object is nil, CODING->source must already point to
5772 the non-relocatable memory area. In this case, CODING->src_pos is
5773 an offset from CODING->source.
5774
5775 The decoded data is inserted at the current point of the buffer
5776 CODING->dst_object.
5777 */
5778
5779 static int
5780 decode_coding (coding)
5781 struct coding_system *coding;
5782 {
5783 Lisp_Object attrs;
5784
5785 if (BUFFERP (coding->src_object)
5786 && coding->src_pos > 0
5787 && coding->src_pos < GPT
5788 && coding->src_pos + coding->src_chars > GPT)
5789 move_gap_both (coding->src_pos, coding->src_pos_byte);
5790
5791 if (BUFFERP (coding->dst_object))
5792 {
5793 if (current_buffer != XBUFFER (coding->dst_object))
5794 set_buffer_internal (XBUFFER (coding->dst_object));
5795 if (GPT != PT)
5796 move_gap_both (PT, PT_BYTE);
5797 }
5798
5799 coding->consumed = coding->consumed_char = 0;
5800 coding->produced = coding->produced_char = 0;
5801 coding->chars_at_source = 0;
5802 coding->result = CODING_RESULT_SUCCESS;
5803 coding->errors = 0;
5804
5805 ALLOC_CONVERSION_WORK_AREA (coding);
5806
5807 attrs = CODING_ID_ATTRS (coding->id);
5808
5809 do
5810 {
5811 coding_set_source (coding);
5812 coding->annotated = 0;
5813 (*(coding->decoder)) (coding);
5814 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5815 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5816 else if (!NILP (Vstandard_translation_table_for_decode))
5817 translate_chars (coding, Vstandard_translation_table_for_decode);
5818 coding_set_destination (coding);
5819 produce_chars (coding);
5820 if (coding->annotated)
5821 produce_annotation (coding);
5822 }
5823 while (coding->consumed < coding->src_bytes
5824 && ! coding->result);
5825
5826 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5827 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5828 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5829 decode_eol (coding);
5830
5831 coding->carryover_bytes = 0;
5832 if (coding->consumed < coding->src_bytes)
5833 {
5834 int nbytes = coding->src_bytes - coding->consumed;
5835 unsigned char *src;
5836
5837 coding_set_source (coding);
5838 coding_set_destination (coding);
5839 src = coding->source + coding->consumed;
5840
5841 if (coding->mode & CODING_MODE_LAST_BLOCK)
5842 {
5843 /* Flush out unprocessed data as binary chars. We are sure
5844 that the number of data is less than the size of
5845 coding->charbuf. */
5846 while (nbytes-- > 0)
5847 {
5848 int c = *src++;
5849
5850 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5851 }
5852 produce_chars (coding);
5853 }
5854 else
5855 {
5856 /* Record unprocessed bytes in coding->carryover. We are
5857 sure that the number of data is less than the size of
5858 coding->carryover. */
5859 unsigned char *p = coding->carryover;
5860
5861 coding->carryover_bytes = nbytes;
5862 while (nbytes-- > 0)
5863 *p++ = *src++;
5864 }
5865 coding->consumed = coding->src_bytes;
5866 }
5867
5868 return coding->result;
5869 }
5870
5871
5872 /* Extract an annotation datum from a composition starting at POS and
5873 ending before LIMIT of CODING->src_object (buffer or string), store
5874 the data in BUF, set *STOP to a starting position of the next
5875 composition (if any) or to LIMIT, and return the address of the
5876 next element of BUF.
5877
5878 If such an annotation is not found, set *STOP to a starting
5879 position of a composition after POS (if any) or to LIMIT, and
5880 return BUF. */
5881
5882 static INLINE int *
5883 handle_composition_annotation (pos, limit, coding, buf, stop)
5884 EMACS_INT pos, limit;
5885 struct coding_system *coding;
5886 int *buf;
5887 EMACS_INT *stop;
5888 {
5889 EMACS_INT start, end;
5890 Lisp_Object prop;
5891
5892 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5893 || end > limit)
5894 *stop = limit;
5895 else if (start > pos)
5896 *stop = start;
5897 else
5898 {
5899 if (start == pos)
5900 {
5901 /* We found a composition. Store the corresponding
5902 annotation data in BUF. */
5903 int *head = buf;
5904 enum composition_method method = COMPOSITION_METHOD (prop);
5905 int nchars = COMPOSITION_LENGTH (prop);
5906
5907 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5908 if (method != COMPOSITION_RELATIVE)
5909 {
5910 Lisp_Object components;
5911 int len, i, i_byte;
5912
5913 components = COMPOSITION_COMPONENTS (prop);
5914 if (VECTORP (components))
5915 {
5916 len = XVECTOR (components)->size;
5917 for (i = 0; i < len; i++)
5918 *buf++ = XINT (AREF (components, i));
5919 }
5920 else if (STRINGP (components))
5921 {
5922 len = XSTRING (components)->size;
5923 i = i_byte = 0;
5924 while (i < len)
5925 {
5926 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5927 buf++;
5928 }
5929 }
5930 else if (INTEGERP (components))
5931 {
5932 len = 1;
5933 *buf++ = XINT (components);
5934 }
5935 else if (CONSP (components))
5936 {
5937 for (len = 0; CONSP (components);
5938 len++, components = XCDR (components))
5939 *buf++ = XINT (XCAR (components));
5940 }
5941 else
5942 abort ();
5943 *head -= len;
5944 }
5945 }
5946
5947 if (find_composition (end, limit, &start, &end, &prop,
5948 coding->src_object)
5949 && end <= limit)
5950 *stop = start;
5951 else
5952 *stop = limit;
5953 }
5954 return buf;
5955 }
5956
5957
5958 /* Extract an annotation datum from a text property `charset' at POS of
5959 CODING->src_object (buffer of string), store the data in BUF, set
5960 *STOP to the position where the value of `charset' property changes
5961 (limiting by LIMIT), and return the address of the next element of
5962 BUF.
5963
5964 If the property value is nil, set *STOP to the position where the
5965 property value is non-nil (limiting by LIMIT), and return BUF. */
5966
5967 static INLINE int *
5968 handle_charset_annotation (pos, limit, coding, buf, stop)
5969 EMACS_INT pos, limit;
5970 struct coding_system *coding;
5971 int *buf;
5972 EMACS_INT *stop;
5973 {
5974 Lisp_Object val, next;
5975 int id;
5976
5977 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5978 if (! NILP (val) && CHARSETP (val))
5979 id = XINT (CHARSET_SYMBOL_ID (val));
5980 else
5981 id = -1;
5982 ADD_CHARSET_DATA (buf, 0, 0, id);
5983 next = Fnext_single_property_change (make_number (pos), Qcharset,
5984 coding->src_object,
5985 make_number (limit));
5986 *stop = XINT (next);
5987 return buf;
5988 }
5989
5990
5991 static void
5992 consume_chars (coding)
5993 struct coding_system *coding;
5994 {
5995 int *buf = coding->charbuf;
5996 int *buf_end = coding->charbuf + coding->charbuf_size;
5997 const unsigned char *src = coding->source + coding->consumed;
5998 const unsigned char *src_end = coding->source + coding->src_bytes;
5999 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6000 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6001 int multibytep = coding->src_multibyte;
6002 Lisp_Object eol_type;
6003 int c;
6004 EMACS_INT stop, stop_composition, stop_charset;
6005
6006 eol_type = CODING_ID_EOL_TYPE (coding->id);
6007 if (VECTORP (eol_type))
6008 eol_type = Qunix;
6009
6010 /* Note: composition handling is not yet implemented. */
6011 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6012
6013 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6014 stop = stop_composition = pos;
6015 else
6016 stop = stop_composition = end_pos;
6017 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6018 stop = stop_charset = pos;
6019 else
6020 stop_charset = end_pos;
6021
6022 /* Compensate for CRLF and annotation. */
6023 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6024 while (buf < buf_end)
6025 {
6026 if (pos == stop)
6027 {
6028 if (pos == end_pos)
6029 break;
6030 if (pos == stop_composition)
6031 buf = handle_composition_annotation (pos, end_pos, coding,
6032 buf, &stop_composition);
6033 if (pos == stop_charset)
6034 buf = handle_charset_annotation (pos, end_pos, coding,
6035 buf, &stop_charset);
6036 stop = (stop_composition < stop_charset
6037 ? stop_composition : stop_charset);
6038 }
6039
6040 if (! multibytep)
6041 {
6042 EMACS_INT bytes = MULTIBYTE_LENGTH (src, src_end);
6043
6044 if (bytes > 0)
6045 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6046 else
6047 c = *src++, pos++;
6048 }
6049 else
6050 c = STRING_CHAR_ADVANCE (src), pos++;
6051 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6052 c = '\n';
6053 if (! EQ (eol_type, Qunix))
6054 {
6055 if (c == '\n')
6056 {
6057 if (EQ (eol_type, Qdos))
6058 *buf++ = '\r';
6059 else
6060 c = '\r';
6061 }
6062 }
6063 *buf++ = c;
6064 }
6065
6066 coding->consumed = src - coding->source;
6067 coding->consumed_char = pos - coding->src_pos;
6068 coding->charbuf_used = buf - coding->charbuf;
6069 coding->chars_at_source = 0;
6070 }
6071
6072
6073 /* Encode the text at CODING->src_object into CODING->dst_object.
6074 CODING->src_object is a buffer or a string.
6075 CODING->dst_object is a buffer or nil.
6076
6077 If CODING->src_object is a buffer, it must be the current buffer.
6078 In this case, if CODING->src_pos is positive, it is a position of
6079 the source text in the buffer, otherwise. the source text is in the
6080 gap area of the buffer, and coding->src_pos specifies the offset of
6081 the text from GPT (which must be the same as PT). If this is the
6082 same buffer as CODING->dst_object, CODING->src_pos must be
6083 negative and CODING should not have `pre-write-conversion'.
6084
6085 If CODING->src_object is a string, CODING should not have
6086 `pre-write-conversion'.
6087
6088 If CODING->dst_object is a buffer, the encoded data is inserted at
6089 the current point of that buffer.
6090
6091 If CODING->dst_object is nil, the encoded data is placed at the
6092 memory area specified by CODING->destination. */
6093
6094 static int
6095 encode_coding (coding)
6096 struct coding_system *coding;
6097 {
6098 Lisp_Object attrs;
6099
6100 attrs = CODING_ID_ATTRS (coding->id);
6101
6102 if (BUFFERP (coding->dst_object))
6103 {
6104 set_buffer_internal (XBUFFER (coding->dst_object));
6105 coding->dst_multibyte
6106 = ! NILP (current_buffer->enable_multibyte_characters);
6107 }
6108
6109 coding->consumed = coding->consumed_char = 0;
6110 coding->produced = coding->produced_char = 0;
6111 coding->result = CODING_RESULT_SUCCESS;
6112 coding->errors = 0;
6113
6114 ALLOC_CONVERSION_WORK_AREA (coding);
6115
6116 do {
6117 coding_set_source (coding);
6118 consume_chars (coding);
6119
6120 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6121 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6122 else if (!NILP (Vstandard_translation_table_for_encode))
6123 translate_chars (coding, Vstandard_translation_table_for_encode);
6124
6125 coding_set_destination (coding);
6126 (*(coding->encoder)) (coding);
6127 } while (coding->consumed_char < coding->src_chars);
6128
6129 if (BUFFERP (coding->dst_object))
6130 insert_from_gap (coding->produced_char, coding->produced);
6131
6132 return (coding->result);
6133 }
6134
6135
6136 /* Stack of working buffers used in code conversion. An nil element
6137 means that the code conversion of that level is not using a working
6138 buffer. */
6139 Lisp_Object Vcode_conversion_work_buf_list;
6140
6141 /* A working buffer used by the top level conversion. */
6142 Lisp_Object Vcode_conversion_reused_work_buf;
6143
6144
6145 /* Return a working buffer that can be freely used by the following
6146 code conversion. MULTIBYTEP specifies the multibyteness of the
6147 buffer. */
6148
6149 Lisp_Object
6150 make_conversion_work_buffer (multibytep, depth)
6151 int multibytep, depth;
6152 {
6153 struct buffer *current = current_buffer;
6154 Lisp_Object buf, name;
6155
6156 if (depth == 0)
6157 {
6158 if (NILP (Vcode_conversion_reused_work_buf))
6159 Vcode_conversion_reused_work_buf
6160 = Fget_buffer_create (build_string (" *code-conversion-work<0>*"));
6161 buf = Vcode_conversion_reused_work_buf;
6162 }
6163 else
6164 {
6165 if (depth < 0)
6166 {
6167 name = build_string (" *code-conversion-work*");
6168 name = Fgenerate_new_buffer_name (name, Qnil);
6169 }
6170 else
6171 {
6172 char str[128];
6173
6174 sprintf (str, " *code-conversion-work*<%d>", depth);
6175 name = build_string (str);
6176 }
6177 buf = Fget_buffer_create (name);
6178 }
6179 set_buffer_internal (XBUFFER (buf));
6180 current_buffer->undo_list = Qt;
6181 Ferase_buffer ();
6182 Fset_buffer_multibyte (multibytep ? Qt : Qnil, Qnil);
6183 set_buffer_internal (current);
6184 return buf;
6185 }
6186
6187 static Lisp_Object
6188 code_conversion_restore (buffer)
6189 Lisp_Object buffer;
6190 {
6191 Lisp_Object workbuf;
6192
6193 workbuf = XCAR (Vcode_conversion_work_buf_list);
6194 if (! NILP (workbuf)
6195 && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6196 && ! NILP (Fbuffer_live_p (workbuf)))
6197 Fkill_buffer (workbuf);
6198 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6199 set_buffer_internal (XBUFFER (buffer));
6200 return Qnil;
6201 }
6202
6203 static Lisp_Object
6204 code_conversion_save (buffer, with_work_buf, multibyte)
6205 Lisp_Object buffer;
6206 int with_work_buf, multibyte;
6207 {
6208 Lisp_Object workbuf;
6209
6210 if (with_work_buf)
6211 {
6212 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6213
6214 workbuf = make_conversion_work_buffer (multibyte, depth);
6215 }
6216 else
6217 workbuf = Qnil;
6218 Vcode_conversion_work_buf_list
6219 = Fcons (workbuf, Vcode_conversion_work_buf_list);
6220 record_unwind_protect (code_conversion_restore, buffer);
6221 return workbuf;
6222 }
6223
6224 int
6225 decode_coding_gap (coding, chars, bytes)
6226 struct coding_system *coding;
6227 EMACS_INT chars, bytes;
6228 {
6229 int count = specpdl_ptr - specpdl;
6230 Lisp_Object buffer;
6231
6232 buffer = Fcurrent_buffer ();
6233 code_conversion_save (buffer, 0, 0);
6234
6235 coding->src_object = buffer;
6236 coding->src_chars = chars;
6237 coding->src_bytes = bytes;
6238 coding->src_pos = -chars;
6239 coding->src_pos_byte = -bytes;
6240 coding->src_multibyte = chars < bytes;
6241 coding->dst_object = coding->src_object;
6242 coding->dst_pos = PT;
6243 coding->dst_pos_byte = PT_BYTE;
6244 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6245 coding->mode |= CODING_MODE_LAST_BLOCK;
6246
6247 if (CODING_REQUIRE_DETECTION (coding))
6248 detect_coding (coding);
6249
6250 decode_coding (coding);
6251
6252 unbind_to (count, Qnil);
6253 return coding->result;
6254 }
6255
6256 int
6257 encode_coding_gap (coding, chars, bytes)
6258 struct coding_system *coding;
6259 EMACS_INT chars, bytes;
6260 {
6261 int count = specpdl_ptr - specpdl;
6262 Lisp_Object buffer;
6263
6264 buffer = Fcurrent_buffer ();
6265 code_conversion_save (buffer, 0, 0);
6266
6267 coding->src_object = buffer;
6268 coding->src_chars = chars;
6269 coding->src_bytes = bytes;
6270 coding->src_pos = -chars;
6271 coding->src_pos_byte = -bytes;
6272 coding->src_multibyte = chars < bytes;
6273 coding->dst_object = coding->src_object;
6274 coding->dst_pos = PT;
6275 coding->dst_pos_byte = PT_BYTE;
6276
6277 encode_coding (coding);
6278
6279 unbind_to (count, Qnil);
6280 return coding->result;
6281 }
6282
6283
6284 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6285 SRC_OBJECT into DST_OBJECT by coding context CODING.
6286
6287 SRC_OBJECT is a buffer, a string, or Qnil.
6288
6289 If it is a buffer, the text is at point of the buffer. FROM and TO
6290 are positions in the buffer.
6291
6292 If it is a string, the text is at the beginning of the string.
6293 FROM and TO are indices to the string.
6294
6295 If it is nil, the text is at coding->source. FROM and TO are
6296 indices to coding->source.
6297
6298 DST_OBJECT is a buffer, Qt, or Qnil.
6299
6300 If it is a buffer, the decoded text is inserted at point of the
6301 buffer. If the buffer is the same as SRC_OBJECT, the source text
6302 is deleted.
6303
6304 If it is Qt, a string is made from the decoded text, and
6305 set in CODING->dst_object.
6306
6307 If it is Qnil, the decoded text is stored at CODING->destination.
6308 The caller must allocate CODING->dst_bytes bytes at
6309 CODING->destination by xmalloc. If the decoded text is longer than
6310 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6311 */
6312
6313 void
6314 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6315 dst_object)
6316 struct coding_system *coding;
6317 Lisp_Object src_object;
6318 EMACS_INT from, from_byte, to, to_byte;
6319 Lisp_Object dst_object;
6320 {
6321 int count = specpdl_ptr - specpdl;
6322 unsigned char *destination;
6323 EMACS_INT dst_bytes;
6324 EMACS_INT chars = to - from;
6325 EMACS_INT bytes = to_byte - from_byte;
6326 Lisp_Object attrs;
6327 Lisp_Object buffer;
6328 int saved_pt = -1, saved_pt_byte;
6329
6330 buffer = Fcurrent_buffer ();
6331
6332 if (NILP (dst_object))
6333 {
6334 destination = coding->destination;
6335 dst_bytes = coding->dst_bytes;
6336 }
6337
6338 coding->src_object = src_object;
6339 coding->src_chars = chars;
6340 coding->src_bytes = bytes;
6341 coding->src_multibyte = chars < bytes;
6342
6343 if (STRINGP (src_object))
6344 {
6345 coding->src_pos = from;
6346 coding->src_pos_byte = from_byte;
6347 }
6348 else if (BUFFERP (src_object))
6349 {
6350 set_buffer_internal (XBUFFER (src_object));
6351 if (from != GPT)
6352 move_gap_both (from, from_byte);
6353 if (EQ (src_object, dst_object))
6354 {
6355 saved_pt = PT, saved_pt_byte = PT_BYTE;
6356 TEMP_SET_PT_BOTH (from, from_byte);
6357 del_range_both (from, from_byte, to, to_byte, 1);
6358 coding->src_pos = -chars;
6359 coding->src_pos_byte = -bytes;
6360 }
6361 else
6362 {
6363 coding->src_pos = from;
6364 coding->src_pos_byte = from_byte;
6365 }
6366 }
6367
6368 if (CODING_REQUIRE_DETECTION (coding))
6369 detect_coding (coding);
6370 attrs = CODING_ID_ATTRS (coding->id);
6371
6372 if (EQ (dst_object, Qt)
6373 || (! NILP (CODING_ATTR_POST_READ (attrs))
6374 && NILP (dst_object)))
6375 {
6376 coding->dst_object = code_conversion_save (buffer, 1, 1);
6377 coding->dst_pos = BEG;
6378 coding->dst_pos_byte = BEG_BYTE;
6379 coding->dst_multibyte = 1;
6380 }
6381 else if (BUFFERP (dst_object))
6382 {
6383 code_conversion_save (buffer, 0, 0);
6384 coding->dst_object = dst_object;
6385 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6386 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6387 coding->dst_multibyte
6388 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6389 }
6390 else
6391 {
6392 code_conversion_save (buffer, 0, 0);
6393 coding->dst_object = Qnil;
6394 coding->dst_multibyte = 1;
6395 }
6396
6397 decode_coding (coding);
6398
6399 if (BUFFERP (coding->dst_object))
6400 set_buffer_internal (XBUFFER (coding->dst_object));
6401
6402 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6403 {
6404 struct gcpro gcpro1, gcpro2;
6405 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6406 Lisp_Object val;
6407
6408 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6409 GCPRO2 (coding->src_object, coding->dst_object);
6410 val = call1 (CODING_ATTR_POST_READ (attrs),
6411 make_number (coding->produced_char));
6412 UNGCPRO;
6413 CHECK_NATNUM (val);
6414 coding->produced_char += Z - prev_Z;
6415 coding->produced += Z_BYTE - prev_Z_BYTE;
6416 }
6417
6418 if (EQ (dst_object, Qt))
6419 {
6420 coding->dst_object = Fbuffer_string ();
6421 }
6422 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6423 {
6424 set_buffer_internal (XBUFFER (coding->dst_object));
6425 if (dst_bytes < coding->produced)
6426 {
6427 destination
6428 = (unsigned char *) xrealloc (destination, coding->produced);
6429 if (! destination)
6430 {
6431 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6432 unbind_to (count, Qnil);
6433 return;
6434 }
6435 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6436 move_gap_both (BEGV, BEGV_BYTE);
6437 bcopy (BEGV_ADDR, destination, coding->produced);
6438 coding->destination = destination;
6439 }
6440 }
6441
6442 if (saved_pt >= 0)
6443 {
6444 /* This is the case of:
6445 (BUFFERP (src_object) && EQ (src_object, dst_object))
6446 As we have moved PT while replacing the original buffer
6447 contents, we must recover it now. */
6448 set_buffer_internal (XBUFFER (src_object));
6449 if (saved_pt < from)
6450 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6451 else if (saved_pt < from + chars)
6452 TEMP_SET_PT_BOTH (from, from_byte);
6453 else if (! NILP (current_buffer->enable_multibyte_characters))
6454 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6455 saved_pt_byte + (coding->produced - bytes));
6456 else
6457 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6458 saved_pt_byte + (coding->produced - bytes));
6459 }
6460
6461 unbind_to (count, Qnil);
6462 }
6463
6464
6465 void
6466 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6467 dst_object)
6468 struct coding_system *coding;
6469 Lisp_Object src_object;
6470 EMACS_INT from, from_byte, to, to_byte;
6471 Lisp_Object dst_object;
6472 {
6473 int count = specpdl_ptr - specpdl;
6474 EMACS_INT chars = to - from;
6475 EMACS_INT bytes = to_byte - from_byte;
6476 Lisp_Object attrs;
6477 Lisp_Object buffer;
6478 int saved_pt = -1, saved_pt_byte;
6479
6480 buffer = Fcurrent_buffer ();
6481
6482 coding->src_object = src_object;
6483 coding->src_chars = chars;
6484 coding->src_bytes = bytes;
6485 coding->src_multibyte = chars < bytes;
6486
6487 attrs = CODING_ID_ATTRS (coding->id);
6488
6489 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6490 {
6491 coding->src_object = code_conversion_save (buffer, 1,
6492 coding->src_multibyte);
6493 set_buffer_internal (XBUFFER (coding->src_object));
6494 if (STRINGP (src_object))
6495 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6496 else if (BUFFERP (src_object))
6497 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6498 else
6499 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6500
6501 if (EQ (src_object, dst_object))
6502 {
6503 set_buffer_internal (XBUFFER (src_object));
6504 saved_pt = PT, saved_pt_byte = PT_BYTE;
6505 del_range_both (from, from_byte, to, to_byte, 1);
6506 set_buffer_internal (XBUFFER (coding->src_object));
6507 }
6508
6509 call2 (CODING_ATTR_PRE_WRITE (attrs),
6510 make_number (BEG), make_number (Z));
6511 coding->src_object = Fcurrent_buffer ();
6512 if (BEG != GPT)
6513 move_gap_both (BEG, BEG_BYTE);
6514 coding->src_chars = Z - BEG;
6515 coding->src_bytes = Z_BYTE - BEG_BYTE;
6516 coding->src_pos = BEG;
6517 coding->src_pos_byte = BEG_BYTE;
6518 coding->src_multibyte = Z < Z_BYTE;
6519 }
6520 else if (STRINGP (src_object))
6521 {
6522 code_conversion_save (buffer, 0, 0);
6523 coding->src_pos = from;
6524 coding->src_pos_byte = from_byte;
6525 }
6526 else if (BUFFERP (src_object))
6527 {
6528 code_conversion_save (buffer, 0, 0);
6529 set_buffer_internal (XBUFFER (src_object));
6530 if (EQ (src_object, dst_object))
6531 {
6532 saved_pt = PT, saved_pt_byte = PT_BYTE;
6533 coding->src_object = del_range_1 (from, to, 1, 1);
6534 coding->src_pos = 0;
6535 coding->src_pos_byte = 0;
6536 }
6537 else
6538 {
6539 if (from < GPT && to >= GPT)
6540 move_gap_both (from, from_byte);
6541 coding->src_pos = from;
6542 coding->src_pos_byte = from_byte;
6543 }
6544 }
6545 else
6546 code_conversion_save (buffer, 0, 0);
6547
6548 if (BUFFERP (dst_object))
6549 {
6550 coding->dst_object = dst_object;
6551 if (EQ (src_object, dst_object))
6552 {
6553 coding->dst_pos = from;
6554 coding->dst_pos_byte = from_byte;
6555 }
6556 else
6557 {
6558 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6559 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6560 }
6561 coding->dst_multibyte
6562 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6563 }
6564 else if (EQ (dst_object, Qt))
6565 {
6566 coding->dst_object = Qnil;
6567 coding->dst_bytes = coding->src_chars;
6568 if (coding->dst_bytes == 0)
6569 coding->dst_bytes = 1;
6570 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6571 coding->dst_multibyte = 0;
6572 }
6573 else
6574 {
6575 coding->dst_object = Qnil;
6576 coding->dst_multibyte = 0;
6577 }
6578
6579 encode_coding (coding);
6580
6581 if (EQ (dst_object, Qt))
6582 {
6583 if (BUFFERP (coding->dst_object))
6584 coding->dst_object = Fbuffer_string ();
6585 else
6586 {
6587 coding->dst_object
6588 = make_unibyte_string ((char *) coding->destination,
6589 coding->produced);
6590 xfree (coding->destination);
6591 }
6592 }
6593
6594 if (saved_pt >= 0)
6595 {
6596 /* This is the case of:
6597 (BUFFERP (src_object) && EQ (src_object, dst_object))
6598 As we have moved PT while replacing the original buffer
6599 contents, we must recover it now. */
6600 set_buffer_internal (XBUFFER (src_object));
6601 if (saved_pt < from)
6602 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6603 else if (saved_pt < from + chars)
6604 TEMP_SET_PT_BOTH (from, from_byte);
6605 else if (! NILP (current_buffer->enable_multibyte_characters))
6606 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6607 saved_pt_byte + (coding->produced - bytes));
6608 else
6609 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6610 saved_pt_byte + (coding->produced - bytes));
6611 }
6612
6613 unbind_to (count, Qnil);
6614 }
6615
6616
6617 Lisp_Object
6618 preferred_coding_system ()
6619 {
6620 int id = coding_categories[coding_priorities[0]].id;
6621
6622 return CODING_ID_NAME (id);
6623 }
6624
6625 \f
6626 #ifdef emacs
6627 /*** 8. Emacs Lisp library functions ***/
6628
6629 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6630 doc: /* Return t if OBJECT is nil or a coding-system.
6631 See the documentation of `define-coding-system' for information
6632 about coding-system objects. */)
6633 (obj)
6634 Lisp_Object obj;
6635 {
6636 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6637 }
6638
6639 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6640 Sread_non_nil_coding_system, 1, 1, 0,
6641 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6642 (prompt)
6643 Lisp_Object prompt;
6644 {
6645 Lisp_Object val;
6646 do
6647 {
6648 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6649 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6650 }
6651 while (XSTRING (val)->size == 0);
6652 return (Fintern (val, Qnil));
6653 }
6654
6655 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6656 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6657 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6658 (prompt, default_coding_system)
6659 Lisp_Object prompt, default_coding_system;
6660 {
6661 Lisp_Object val;
6662 if (SYMBOLP (default_coding_system))
6663 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6664 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6665 Qt, Qnil, Qcoding_system_history,
6666 default_coding_system, Qnil);
6667 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6668 }
6669
6670 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6671 1, 1, 0,
6672 doc: /* Check validity of CODING-SYSTEM.
6673 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6674 (coding_system)
6675 Lisp_Object coding_system;
6676 {
6677 CHECK_SYMBOL (coding_system);
6678 if (!NILP (Fcoding_system_p (coding_system)))
6679 return coding_system;
6680 while (1)
6681 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6682 }
6683
6684 \f
6685 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6686 HIGHEST is nonzero, return the coding system of the highest
6687 priority among the detected coding systems. Otherwize return a
6688 list of detected coding systems sorted by their priorities. If
6689 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6690 multibyte form but contains only ASCII and eight-bit chars.
6691 Otherwise, the bytes are raw bytes.
6692
6693 CODING-SYSTEM controls the detection as below:
6694
6695 If it is nil, detect both text-format and eol-format. If the
6696 text-format part of CODING-SYSTEM is already specified
6697 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6698 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6699 detect only text-format. */
6700
6701 Lisp_Object
6702 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6703 unsigned char *src;
6704 int src_bytes, highest;
6705 int multibytep;
6706 Lisp_Object coding_system;
6707 {
6708 unsigned char *src_end = src + src_bytes;
6709 Lisp_Object attrs, eol_type;
6710 Lisp_Object val;
6711 struct coding_system coding;
6712 int id;
6713 struct coding_detection_info detect_info;
6714
6715 if (NILP (coding_system))
6716 coding_system = Qundecided;
6717 setup_coding_system (coding_system, &coding);
6718 attrs = CODING_ID_ATTRS (coding.id);
6719 eol_type = CODING_ID_EOL_TYPE (coding.id);
6720 coding_system = CODING_ATTR_BASE_NAME (attrs);
6721
6722 coding.source = src;
6723 coding.src_bytes = src_bytes;
6724 coding.src_multibyte = multibytep;
6725 coding.consumed = 0;
6726 coding.mode |= CODING_MODE_LAST_BLOCK;
6727
6728 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6729
6730 /* At first, detect text-format if necessary. */
6731 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6732 {
6733 enum coding_category category;
6734 struct coding_system *this;
6735 int c, i;
6736
6737 for (; src < src_end; src++)
6738 {
6739 c = *src;
6740 if (c & 0x80
6741 || (c < 0x20 && (c == ISO_CODE_ESC
6742 || c == ISO_CODE_SI
6743 || c == ISO_CODE_SO)))
6744 break;
6745 }
6746 coding.head_ascii = src - coding.source;
6747
6748 if (src < src_end)
6749 for (i = 0; i < coding_category_raw_text; i++)
6750 {
6751 category = coding_priorities[i];
6752 this = coding_categories + category;
6753
6754 if (this->id < 0)
6755 {
6756 /* No coding system of this category is defined. */
6757 detect_info.rejected |= (1 << category);
6758 }
6759 else if (category >= coding_category_raw_text)
6760 continue;
6761 else if (detect_info.checked & (1 << category))
6762 {
6763 if (highest
6764 && (detect_info.found & (1 << category)))
6765 break;
6766 }
6767 else
6768 {
6769 if ((*(this->detector)) (&coding, &detect_info)
6770 && highest
6771 && (detect_info.found & (1 << category)))
6772 break;
6773 }
6774 }
6775
6776
6777 if (detect_info.rejected == CATEGORY_MASK_ANY)
6778 {
6779 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6780 id = coding_categories[coding_category_raw_text].id;
6781 val = Fcons (make_number (id), Qnil);
6782 }
6783 else if (! detect_info.rejected && ! detect_info.found)
6784 {
6785 detect_info.found = CATEGORY_MASK_ANY;
6786 id = coding_categories[coding_category_undecided].id;
6787 val = Fcons (make_number (id), Qnil);
6788 }
6789 else if (highest)
6790 {
6791 if (detect_info.found)
6792 {
6793 detect_info.found = 1 << category;
6794 val = Fcons (make_number (this->id), Qnil);
6795 }
6796 else
6797 for (i = 0; i < coding_category_raw_text; i++)
6798 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6799 {
6800 detect_info.found = 1 << coding_priorities[i];
6801 id = coding_categories[coding_priorities[i]].id;
6802 val = Fcons (make_number (id), Qnil);
6803 break;
6804 }
6805 }
6806 else
6807 {
6808 int mask = detect_info.rejected | detect_info.found;
6809 int found = 0;
6810 val = Qnil;
6811
6812 for (i = coding_category_raw_text - 1; i >= 0; i--)
6813 {
6814 category = coding_priorities[i];
6815 if (! (mask & (1 << category)))
6816 {
6817 found |= 1 << category;
6818 id = coding_categories[category].id;
6819 val = Fcons (make_number (id), val);
6820 }
6821 }
6822 for (i = coding_category_raw_text - 1; i >= 0; i--)
6823 {
6824 category = coding_priorities[i];
6825 if (detect_info.found & (1 << category))
6826 {
6827 id = coding_categories[category].id;
6828 val = Fcons (make_number (id), val);
6829 }
6830 }
6831 detect_info.found |= found;
6832 }
6833 }
6834 else
6835 {
6836 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6837 val = Fcons (make_number (coding.id), Qnil);
6838 }
6839
6840 /* Then, detect eol-format if necessary. */
6841 {
6842 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6843 Lisp_Object tail;
6844
6845 if (VECTORP (eol_type))
6846 {
6847 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6848 normal_eol = detect_eol (coding.source, src_bytes,
6849 coding_category_raw_text);
6850 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6851 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6852 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6853 coding_category_utf_16_be);
6854 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6855 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6856 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6857 coding_category_utf_16_le);
6858 }
6859 else
6860 {
6861 if (EQ (eol_type, Qunix))
6862 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6863 else if (EQ (eol_type, Qdos))
6864 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6865 else
6866 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6867 }
6868
6869 for (tail = val; CONSP (tail); tail = XCDR (tail))
6870 {
6871 enum coding_category category;
6872 int this_eol;
6873
6874 id = XINT (XCAR (tail));
6875 attrs = CODING_ID_ATTRS (id);
6876 category = XINT (CODING_ATTR_CATEGORY (attrs));
6877 eol_type = CODING_ID_EOL_TYPE (id);
6878 if (VECTORP (eol_type))
6879 {
6880 if (category == coding_category_utf_16_be
6881 || category == coding_category_utf_16_be_nosig)
6882 this_eol = utf_16_be_eol;
6883 else if (category == coding_category_utf_16_le
6884 || category == coding_category_utf_16_le_nosig)
6885 this_eol = utf_16_le_eol;
6886 else
6887 this_eol = normal_eol;
6888
6889 if (this_eol == EOL_SEEN_LF)
6890 XSETCAR (tail, AREF (eol_type, 0));
6891 else if (this_eol == EOL_SEEN_CRLF)
6892 XSETCAR (tail, AREF (eol_type, 1));
6893 else if (this_eol == EOL_SEEN_CR)
6894 XSETCAR (tail, AREF (eol_type, 2));
6895 else
6896 XSETCAR (tail, CODING_ID_NAME (id));
6897 }
6898 else
6899 XSETCAR (tail, CODING_ID_NAME (id));
6900 }
6901 }
6902
6903 return (highest ? XCAR (val) : val);
6904 }
6905
6906
6907 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6908 2, 3, 0,
6909 doc: /* Detect coding system of the text in the region between START and END.
6910 Return a list of possible coding systems ordered by priority.
6911
6912 If only ASCII characters are found, it returns a list of single element
6913 `undecided' or its subsidiary coding system according to a detected
6914 end-of-line format.
6915
6916 If optional argument HIGHEST is non-nil, return the coding system of
6917 highest priority. */)
6918 (start, end, highest)
6919 Lisp_Object start, end, highest;
6920 {
6921 int from, to;
6922 int from_byte, to_byte;
6923
6924 CHECK_NUMBER_COERCE_MARKER (start);
6925 CHECK_NUMBER_COERCE_MARKER (end);
6926
6927 validate_region (&start, &end);
6928 from = XINT (start), to = XINT (end);
6929 from_byte = CHAR_TO_BYTE (from);
6930 to_byte = CHAR_TO_BYTE (to);
6931
6932 if (from < GPT && to >= GPT)
6933 move_gap_both (to, to_byte);
6934
6935 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6936 to_byte - from_byte,
6937 !NILP (highest),
6938 !NILP (current_buffer
6939 ->enable_multibyte_characters),
6940 Qnil);
6941 }
6942
6943 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6944 1, 2, 0,
6945 doc: /* Detect coding system of the text in STRING.
6946 Return a list of possible coding systems ordered by priority.
6947
6948 If only ASCII characters are found, it returns a list of single element
6949 `undecided' or its subsidiary coding system according to a detected
6950 end-of-line format.
6951
6952 If optional argument HIGHEST is non-nil, return the coding system of
6953 highest priority. */)
6954 (string, highest)
6955 Lisp_Object string, highest;
6956 {
6957 CHECK_STRING (string);
6958
6959 return detect_coding_system (XSTRING (string)->data,
6960 STRING_BYTES (XSTRING (string)),
6961 !NILP (highest),
6962 STRING_MULTIBYTE (string),
6963 Qnil);
6964 }
6965
6966
6967 static INLINE int
6968 char_encodable_p (c, attrs)
6969 int c;
6970 Lisp_Object attrs;
6971 {
6972 Lisp_Object tail;
6973 struct charset *charset;
6974
6975 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6976 CONSP (tail); tail = XCDR (tail))
6977 {
6978 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6979 if (CHAR_CHARSET_P (c, charset))
6980 break;
6981 }
6982 return (! NILP (tail));
6983 }
6984
6985
6986 /* Return a list of coding systems that safely encode the text between
6987 START and END. If EXCLUDE is non-nil, it is a list of coding
6988 systems not to check. The returned list doesn't contain any such
6989 coding systems. In any case, if the text contains only ASCII or is
6990 unibyte, return t. */
6991
6992 DEFUN ("find-coding-systems-region-internal",
6993 Ffind_coding_systems_region_internal,
6994 Sfind_coding_systems_region_internal, 2, 3, 0,
6995 doc: /* Internal use only. */)
6996 (start, end, exclude)
6997 Lisp_Object start, end, exclude;
6998 {
6999 Lisp_Object coding_attrs_list, safe_codings;
7000 EMACS_INT start_byte, end_byte;
7001 const unsigned char *p, *pbeg, *pend;
7002 int c;
7003 Lisp_Object tail, elt;
7004
7005 if (STRINGP (start))
7006 {
7007 if (!STRING_MULTIBYTE (start)
7008 || XSTRING (start)->size == STRING_BYTES (XSTRING (start)))
7009 return Qt;
7010 start_byte = 0;
7011 end_byte = STRING_BYTES (XSTRING (start));
7012 }
7013 else
7014 {
7015 CHECK_NUMBER_COERCE_MARKER (start);
7016 CHECK_NUMBER_COERCE_MARKER (end);
7017 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7018 args_out_of_range (start, end);
7019 if (NILP (current_buffer->enable_multibyte_characters))
7020 return Qt;
7021 start_byte = CHAR_TO_BYTE (XINT (start));
7022 end_byte = CHAR_TO_BYTE (XINT (end));
7023 if (XINT (end) - XINT (start) == end_byte - start_byte)
7024 return Qt;
7025
7026 if (XINT (start) < GPT && XINT (end) > GPT)
7027 {
7028 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7029 move_gap_both (XINT (start), start_byte);
7030 else
7031 move_gap_both (XINT (end), end_byte);
7032 }
7033 }
7034
7035 coding_attrs_list = Qnil;
7036 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7037 if (NILP (exclude)
7038 || NILP (Fmemq (XCAR (tail), exclude)))
7039 {
7040 Lisp_Object attrs;
7041
7042 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7043 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7044 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7045 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7046 }
7047
7048 if (STRINGP (start))
7049 p = pbeg = XSTRING (start)->data;
7050 else
7051 p = pbeg = BYTE_POS_ADDR (start_byte);
7052 pend = p + (end_byte - start_byte);
7053
7054 while (p < pend && ASCII_BYTE_P (*p)) p++;
7055 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7056
7057 while (p < pend)
7058 {
7059 if (ASCII_BYTE_P (*p))
7060 p++;
7061 else
7062 {
7063 c = STRING_CHAR_ADVANCE (p);
7064
7065 charset_map_loaded = 0;
7066 for (tail = coding_attrs_list; CONSP (tail);)
7067 {
7068 elt = XCAR (tail);
7069 if (NILP (elt))
7070 tail = XCDR (tail);
7071 else if (char_encodable_p (c, elt))
7072 tail = XCDR (tail);
7073 else if (CONSP (XCDR (tail)))
7074 {
7075 XSETCAR (tail, XCAR (XCDR (tail)));
7076 XSETCDR (tail, XCDR (XCDR (tail)));
7077 }
7078 else
7079 {
7080 XSETCAR (tail, Qnil);
7081 tail = XCDR (tail);
7082 }
7083 }
7084 if (charset_map_loaded)
7085 {
7086 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7087
7088 if (STRINGP (start))
7089 pbeg = XSTRING (start)->data;
7090 else
7091 pbeg = BYTE_POS_ADDR (start_byte);
7092 p = pbeg + p_offset;
7093 pend = pbeg + pend_offset;
7094 }
7095 }
7096 }
7097
7098 safe_codings = Qnil;
7099 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7100 if (! NILP (XCAR (tail)))
7101 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7102
7103 return safe_codings;
7104 }
7105
7106
7107 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7108 Scheck_coding_systems_region, 3, 3, 0,
7109 doc: /* Check if the region is encodable by coding systems.
7110
7111 START and END are buffer positions specifying the region.
7112 CODING-SYSTEM-LIST is a list of coding systems to check.
7113
7114 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7115 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7116 whole region, POS0, POS1, ... are buffer positions where non-encodable
7117 characters are found.
7118
7119 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7120 value is nil.
7121
7122 START may be a string. In that case, check if the string is
7123 encodable, and the value contains indices to the string instead of
7124 buffer positions. END is ignored. */)
7125 (start, end, coding_system_list)
7126 Lisp_Object start, end, coding_system_list;
7127 {
7128 Lisp_Object list;
7129 EMACS_INT start_byte, end_byte;
7130 int pos;
7131 const unsigned char *p, *pbeg, *pend;
7132 int c;
7133 Lisp_Object tail, elt;
7134
7135 if (STRINGP (start))
7136 {
7137 if (!STRING_MULTIBYTE (start)
7138 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
7139 return Qnil;
7140 start_byte = 0;
7141 end_byte = STRING_BYTES (XSTRING (start));
7142 pos = 0;
7143 }
7144 else
7145 {
7146 CHECK_NUMBER_COERCE_MARKER (start);
7147 CHECK_NUMBER_COERCE_MARKER (end);
7148 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7149 args_out_of_range (start, end);
7150 if (NILP (current_buffer->enable_multibyte_characters))
7151 return Qnil;
7152 start_byte = CHAR_TO_BYTE (XINT (start));
7153 end_byte = CHAR_TO_BYTE (XINT (end));
7154 if (XINT (end) - XINT (start) == end_byte - start_byte)
7155 return Qt;
7156
7157 if (XINT (start) < GPT && XINT (end) > GPT)
7158 {
7159 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7160 move_gap_both (XINT (start), start_byte);
7161 else
7162 move_gap_both (XINT (end), end_byte);
7163 }
7164 pos = XINT (start);
7165 }
7166
7167 list = Qnil;
7168 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7169 {
7170 elt = XCAR (tail);
7171 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7172 Qnil)),
7173 list);
7174 }
7175
7176 if (STRINGP (start))
7177 p = pbeg = XSTRING (start)->data;
7178 else
7179 p = pbeg = BYTE_POS_ADDR (start_byte);
7180 pend = p + (end_byte - start_byte);
7181
7182 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7183 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7184
7185 while (p < pend)
7186 {
7187 if (ASCII_BYTE_P (*p))
7188 p++;
7189 else
7190 {
7191 c = STRING_CHAR_ADVANCE (p);
7192
7193 charset_map_loaded = 0;
7194 for (tail = list; CONSP (tail); tail = XCDR (tail))
7195 {
7196 elt = XCDR (XCAR (tail));
7197 if (! char_encodable_p (c, XCAR (elt)))
7198 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7199 }
7200 if (charset_map_loaded)
7201 {
7202 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7203
7204 if (STRINGP (start))
7205 pbeg = XSTRING (start)->data;
7206 else
7207 pbeg = BYTE_POS_ADDR (start_byte);
7208 p = pbeg + p_offset;
7209 pend = pbeg + pend_offset;
7210 }
7211 }
7212 pos++;
7213 }
7214
7215 tail = list;
7216 list = Qnil;
7217 for (; CONSP (tail); tail = XCDR (tail))
7218 {
7219 elt = XCAR (tail);
7220 if (CONSP (XCDR (XCDR (elt))))
7221 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7222 list);
7223 }
7224
7225 return list;
7226 }
7227
7228
7229
7230 Lisp_Object
7231 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7232 Lisp_Object start, end, coding_system, dst_object;
7233 int encodep, norecord;
7234 {
7235 struct coding_system coding;
7236 EMACS_INT from, from_byte, to, to_byte;
7237 Lisp_Object src_object;
7238
7239 CHECK_NUMBER_COERCE_MARKER (start);
7240 CHECK_NUMBER_COERCE_MARKER (end);
7241 if (NILP (coding_system))
7242 coding_system = Qno_conversion;
7243 else
7244 CHECK_CODING_SYSTEM (coding_system);
7245 src_object = Fcurrent_buffer ();
7246 if (NILP (dst_object))
7247 dst_object = src_object;
7248 else if (! EQ (dst_object, Qt))
7249 CHECK_BUFFER (dst_object);
7250
7251 validate_region (&start, &end);
7252 from = XFASTINT (start);
7253 from_byte = CHAR_TO_BYTE (from);
7254 to = XFASTINT (end);
7255 to_byte = CHAR_TO_BYTE (to);
7256
7257 setup_coding_system (coding_system, &coding);
7258 coding.mode |= CODING_MODE_LAST_BLOCK;
7259
7260 if (encodep)
7261 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7262 dst_object);
7263 else
7264 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7265 dst_object);
7266 if (! norecord)
7267 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7268
7269 if (coding.result != CODING_RESULT_SUCCESS)
7270 error ("Code conversion error: %d", coding.result);
7271
7272 return (BUFFERP (dst_object)
7273 ? make_number (coding.produced_char)
7274 : coding.dst_object);
7275 }
7276
7277
7278 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7279 3, 4, "r\nzCoding system: ",
7280 doc: /* Decode the current region from the specified coding system.
7281 When called from a program, takes four arguments:
7282 START, END, CODING-SYSTEM, and DESTINATION.
7283 START and END are buffer positions.
7284
7285 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7286 If nil, the region between START and END is replace by the decoded text.
7287 If buffer, the decoded text is inserted in the buffer.
7288 If t, the decoded text is returned.
7289
7290 This function sets `last-coding-system-used' to the precise coding system
7291 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7292 not fully specified.)
7293 It returns the length of the decoded text. */)
7294 (start, end, coding_system, destination)
7295 Lisp_Object start, end, coding_system, destination;
7296 {
7297 return code_convert_region (start, end, coding_system, destination, 0, 0);
7298 }
7299
7300 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7301 3, 4, "r\nzCoding system: ",
7302 doc: /* Encode the current region by specified coding system.
7303 When called from a program, takes three arguments:
7304 START, END, and CODING-SYSTEM. START and END are buffer positions.
7305
7306 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7307 If nil, the region between START and END is replace by the encoded text.
7308 If buffer, the encoded text is inserted in the buffer.
7309 If t, the encoded text is returned.
7310
7311 This function sets `last-coding-system-used' to the precise coding system
7312 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7313 not fully specified.)
7314 It returns the length of the encoded text. */)
7315 (start, end, coding_system, destination)
7316 Lisp_Object start, end, coding_system, destination;
7317 {
7318 return code_convert_region (start, end, coding_system, destination, 1, 0);
7319 }
7320
7321 Lisp_Object
7322 code_convert_string (string, coding_system, dst_object,
7323 encodep, nocopy, norecord)
7324 Lisp_Object string, coding_system, dst_object;
7325 int encodep, nocopy, norecord;
7326 {
7327 struct coding_system coding;
7328 EMACS_INT chars, bytes;
7329
7330 CHECK_STRING (string);
7331 if (NILP (coding_system))
7332 {
7333 if (! norecord)
7334 Vlast_coding_system_used = Qno_conversion;
7335 if (NILP (dst_object))
7336 return (nocopy ? Fcopy_sequence (string) : string);
7337 }
7338
7339 if (NILP (coding_system))
7340 coding_system = Qno_conversion;
7341 else
7342 CHECK_CODING_SYSTEM (coding_system);
7343 if (NILP (dst_object))
7344 dst_object = Qt;
7345 else if (! EQ (dst_object, Qt))
7346 CHECK_BUFFER (dst_object);
7347
7348 setup_coding_system (coding_system, &coding);
7349 coding.mode |= CODING_MODE_LAST_BLOCK;
7350 chars = XSTRING (string)->size;
7351 bytes = STRING_BYTES (XSTRING (string));
7352 if (encodep)
7353 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7354 else
7355 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7356 if (! norecord)
7357 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7358
7359 if (coding.result != CODING_RESULT_SUCCESS)
7360 error ("Code conversion error: %d", coding.result);
7361
7362 return (BUFFERP (dst_object)
7363 ? make_number (coding.produced_char)
7364 : coding.dst_object);
7365 }
7366
7367
7368 /* Encode or decode STRING according to CODING_SYSTEM.
7369 Do not set Vlast_coding_system_used.
7370
7371 This function is called only from macros DECODE_FILE and
7372 ENCODE_FILE, thus we ignore character composition. */
7373
7374 Lisp_Object
7375 code_convert_string_norecord (string, coding_system, encodep)
7376 Lisp_Object string, coding_system;
7377 int encodep;
7378 {
7379 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7380 }
7381
7382
7383 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7384 2, 4, 0,
7385 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7386
7387 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7388 if the decoding operation is trivial.
7389
7390 Optional fourth arg BUFFER non-nil meant that the decoded text is
7391 inserted in BUFFER instead of returned as a string. In this case,
7392 the return value is BUFFER.
7393
7394 This function sets `last-coding-system-used' to the precise coding system
7395 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7396 not fully specified. */)
7397 (string, coding_system, nocopy, buffer)
7398 Lisp_Object string, coding_system, nocopy, buffer;
7399 {
7400 return code_convert_string (string, coding_system, buffer,
7401 0, ! NILP (nocopy), 0);
7402 }
7403
7404 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7405 2, 4, 0,
7406 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7407
7408 Optional third arg NOCOPY non-nil means it is OK to return STRING
7409 itself if the encoding operation is trivial.
7410
7411 Optional fourth arg BUFFER non-nil meant that the encoded text is
7412 inserted in BUFFER instead of returned as a string. In this case,
7413 the return value is BUFFER.
7414
7415 This function sets `last-coding-system-used' to the precise coding system
7416 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7417 not fully specified.) */)
7418 (string, coding_system, nocopy, buffer)
7419 Lisp_Object string, coding_system, nocopy, buffer;
7420 {
7421 return code_convert_string (string, coding_system, buffer,
7422 1, ! NILP (nocopy), 1);
7423 }
7424
7425 \f
7426 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7427 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7428 Return the corresponding character. */)
7429 (code)
7430 Lisp_Object code;
7431 {
7432 Lisp_Object spec, attrs, val;
7433 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7434 int c;
7435
7436 CHECK_NATNUM (code);
7437 c = XFASTINT (code);
7438 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7439 attrs = AREF (spec, 0);
7440
7441 if (ASCII_BYTE_P (c)
7442 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7443 return code;
7444
7445 val = CODING_ATTR_CHARSET_LIST (attrs);
7446 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7447 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7448 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7449
7450 if (c <= 0x7F)
7451 charset = charset_roman;
7452 else if (c >= 0xA0 && c < 0xDF)
7453 {
7454 charset = charset_kana;
7455 c -= 0x80;
7456 }
7457 else
7458 {
7459 int s1 = c >> 8, s2 = c & 0xFF;
7460
7461 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7462 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7463 error ("Invalid code: %d", code);
7464 SJIS_TO_JIS (c);
7465 charset = charset_kanji;
7466 }
7467 c = DECODE_CHAR (charset, c);
7468 if (c < 0)
7469 error ("Invalid code: %d", code);
7470 return make_number (c);
7471 }
7472
7473
7474 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7475 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7476 Return the corresponding code in SJIS. */)
7477 (ch)
7478 Lisp_Object ch;
7479 {
7480 Lisp_Object spec, attrs, charset_list;
7481 int c;
7482 struct charset *charset;
7483 unsigned code;
7484
7485 CHECK_CHARACTER (ch);
7486 c = XFASTINT (ch);
7487 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7488 attrs = AREF (spec, 0);
7489
7490 if (ASCII_CHAR_P (c)
7491 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7492 return ch;
7493
7494 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7495 charset = char_charset (c, charset_list, &code);
7496 if (code == CHARSET_INVALID_CODE (charset))
7497 error ("Can't encode by shift_jis encoding: %d", c);
7498 JIS_TO_SJIS (code);
7499
7500 return make_number (code);
7501 }
7502
7503 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7504 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7505 Return the corresponding character. */)
7506 (code)
7507 Lisp_Object code;
7508 {
7509 Lisp_Object spec, attrs, val;
7510 struct charset *charset_roman, *charset_big5, *charset;
7511 int c;
7512
7513 CHECK_NATNUM (code);
7514 c = XFASTINT (code);
7515 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7516 attrs = AREF (spec, 0);
7517
7518 if (ASCII_BYTE_P (c)
7519 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7520 return code;
7521
7522 val = CODING_ATTR_CHARSET_LIST (attrs);
7523 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7524 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7525
7526 if (c <= 0x7F)
7527 charset = charset_roman;
7528 else
7529 {
7530 int b1 = c >> 8, b2 = c & 0x7F;
7531 if (b1 < 0xA1 || b1 > 0xFE
7532 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7533 error ("Invalid code: %d", code);
7534 charset = charset_big5;
7535 }
7536 c = DECODE_CHAR (charset, (unsigned )c);
7537 if (c < 0)
7538 error ("Invalid code: %d", code);
7539 return make_number (c);
7540 }
7541
7542 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7543 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7544 Return the corresponding character code in Big5. */)
7545 (ch)
7546 Lisp_Object ch;
7547 {
7548 Lisp_Object spec, attrs, charset_list;
7549 struct charset *charset;
7550 int c;
7551 unsigned code;
7552
7553 CHECK_CHARACTER (ch);
7554 c = XFASTINT (ch);
7555 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7556 attrs = AREF (spec, 0);
7557 if (ASCII_CHAR_P (c)
7558 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7559 return ch;
7560
7561 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7562 charset = char_charset (c, charset_list, &code);
7563 if (code == CHARSET_INVALID_CODE (charset))
7564 error ("Can't encode by Big5 encoding: %d", c);
7565
7566 return make_number (code);
7567 }
7568
7569 \f
7570 DEFUN ("set-terminal-coding-system-internal",
7571 Fset_terminal_coding_system_internal,
7572 Sset_terminal_coding_system_internal, 1, 1, 0,
7573 doc: /* Internal use only. */)
7574 (coding_system)
7575 Lisp_Object coding_system;
7576 {
7577 CHECK_SYMBOL (coding_system);
7578 setup_coding_system (Fcheck_coding_system (coding_system),
7579 &terminal_coding);
7580
7581 /* We had better not send unsafe characters to terminal. */
7582 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7583 /* Characer composition should be disabled. */
7584 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7585 terminal_coding.src_multibyte = 1;
7586 terminal_coding.dst_multibyte = 0;
7587 return Qnil;
7588 }
7589
7590 DEFUN ("set-safe-terminal-coding-system-internal",
7591 Fset_safe_terminal_coding_system_internal,
7592 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7593 doc: /* Internal use only. */)
7594 (coding_system)
7595 Lisp_Object coding_system;
7596 {
7597 CHECK_SYMBOL (coding_system);
7598 setup_coding_system (Fcheck_coding_system (coding_system),
7599 &safe_terminal_coding);
7600 /* Characer composition should be disabled. */
7601 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7602 safe_terminal_coding.src_multibyte = 1;
7603 safe_terminal_coding.dst_multibyte = 0;
7604 return Qnil;
7605 }
7606
7607 DEFUN ("terminal-coding-system",
7608 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7609 doc: /* Return coding system specified for terminal output. */)
7610 ()
7611 {
7612 return CODING_ID_NAME (terminal_coding.id);
7613 }
7614
7615 DEFUN ("set-keyboard-coding-system-internal",
7616 Fset_keyboard_coding_system_internal,
7617 Sset_keyboard_coding_system_internal, 1, 1, 0,
7618 doc: /* Internal use only. */)
7619 (coding_system)
7620 Lisp_Object coding_system;
7621 {
7622 CHECK_SYMBOL (coding_system);
7623 setup_coding_system (Fcheck_coding_system (coding_system),
7624 &keyboard_coding);
7625 /* Characer composition should be disabled. */
7626 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7627 return Qnil;
7628 }
7629
7630 DEFUN ("keyboard-coding-system",
7631 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7632 doc: /* Return coding system specified for decoding keyboard input. */)
7633 ()
7634 {
7635 return CODING_ID_NAME (keyboard_coding.id);
7636 }
7637
7638 \f
7639 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7640 Sfind_operation_coding_system, 1, MANY, 0,
7641 doc: /* Choose a coding system for an operation based on the target name.
7642 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7643 DECODING-SYSTEM is the coding system to use for decoding
7644 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7645 for encoding (in case OPERATION does encoding).
7646
7647 The first argument OPERATION specifies an I/O primitive:
7648 For file I/O, `insert-file-contents' or `write-region'.
7649 For process I/O, `call-process', `call-process-region', or `start-process'.
7650 For network I/O, `open-network-stream'.
7651
7652 The remaining arguments should be the same arguments that were passed
7653 to the primitive. Depending on which primitive, one of those arguments
7654 is selected as the TARGET. For example, if OPERATION does file I/O,
7655 whichever argument specifies the file name is TARGET.
7656
7657 TARGET has a meaning which depends on OPERATION:
7658 For file I/O, TARGET is a file name.
7659 For process I/O, TARGET is a process name.
7660 For network I/O, TARGET is a service name or a port number
7661
7662 This function looks up what specified for TARGET in,
7663 `file-coding-system-alist', `process-coding-system-alist',
7664 or `network-coding-system-alist' depending on OPERATION.
7665 They may specify a coding system, a cons of coding systems,
7666 or a function symbol to call.
7667 In the last case, we call the function with one argument,
7668 which is a list of all the arguments given to this function.
7669
7670 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7671 (nargs, args)
7672 int nargs;
7673 Lisp_Object *args;
7674 {
7675 Lisp_Object operation, target_idx, target, val;
7676 register Lisp_Object chain;
7677
7678 if (nargs < 2)
7679 error ("Too few arguments");
7680 operation = args[0];
7681 if (!SYMBOLP (operation)
7682 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7683 error ("Invalid first arguement");
7684 if (nargs < 1 + XINT (target_idx))
7685 error ("Too few arguments for operation: %s",
7686 XSYMBOL (operation)->name->data);
7687 target = args[XINT (target_idx) + 1];
7688 if (!(STRINGP (target)
7689 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7690 error ("Invalid %dth argument", XINT (target_idx) + 1);
7691
7692 chain = ((EQ (operation, Qinsert_file_contents)
7693 || EQ (operation, Qwrite_region))
7694 ? Vfile_coding_system_alist
7695 : (EQ (operation, Qopen_network_stream)
7696 ? Vnetwork_coding_system_alist
7697 : Vprocess_coding_system_alist));
7698 if (NILP (chain))
7699 return Qnil;
7700
7701 for (; CONSP (chain); chain = XCDR (chain))
7702 {
7703 Lisp_Object elt;
7704
7705 elt = XCAR (chain);
7706 if (CONSP (elt)
7707 && ((STRINGP (target)
7708 && STRINGP (XCAR (elt))
7709 && fast_string_match (XCAR (elt), target) >= 0)
7710 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7711 {
7712 val = XCDR (elt);
7713 /* Here, if VAL is both a valid coding system and a valid
7714 function symbol, we return VAL as a coding system. */
7715 if (CONSP (val))
7716 return val;
7717 if (! SYMBOLP (val))
7718 return Qnil;
7719 if (! NILP (Fcoding_system_p (val)))
7720 return Fcons (val, val);
7721 if (! NILP (Ffboundp (val)))
7722 {
7723 val = call1 (val, Flist (nargs, args));
7724 if (CONSP (val))
7725 return val;
7726 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7727 return Fcons (val, val);
7728 }
7729 return Qnil;
7730 }
7731 }
7732 return Qnil;
7733 }
7734
7735 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7736 Sset_coding_system_priority, 0, MANY, 0,
7737 doc: /* Assign higher priority to the coding systems given as arguments.
7738 usage: (set-coding-system-priority CODING-SYSTEM ...) */)
7739 (nargs, args)
7740 int nargs;
7741 Lisp_Object *args;
7742 {
7743 int i, j;
7744 int changed[coding_category_max];
7745 enum coding_category priorities[coding_category_max];
7746
7747 bzero (changed, sizeof changed);
7748
7749 for (i = j = 0; i < nargs; i++)
7750 {
7751 enum coding_category category;
7752 Lisp_Object spec, attrs;
7753
7754 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7755 attrs = AREF (spec, 0);
7756 category = XINT (CODING_ATTR_CATEGORY (attrs));
7757 if (changed[category])
7758 /* Ignore this coding system because a coding system of the
7759 same category already had a higher priority. */
7760 continue;
7761 changed[category] = 1;
7762 priorities[j++] = category;
7763 if (coding_categories[category].id >= 0
7764 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7765 setup_coding_system (args[i], &coding_categories[category]);
7766 }
7767
7768 /* Now we have decided top J priorities. Reflect the order of the
7769 original priorities to the remaining priorities. */
7770
7771 for (i = j, j = 0; i < coding_category_max; i++, j++)
7772 {
7773 while (j < coding_category_max
7774 && changed[coding_priorities[j]])
7775 j++;
7776 if (j == coding_category_max)
7777 abort ();
7778 priorities[i] = coding_priorities[j];
7779 }
7780
7781 bcopy (priorities, coding_priorities, sizeof priorities);
7782 return Qnil;
7783 }
7784
7785 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7786 Scoding_system_priority_list, 0, 1, 0,
7787 doc: /* Return a list of coding systems ordered by their priorities.
7788 HIGHESTP non-nil means just return the highest priority one. */)
7789 (highestp)
7790 Lisp_Object highestp;
7791 {
7792 int i;
7793 Lisp_Object val;
7794
7795 for (i = 0, val = Qnil; i < coding_category_max; i++)
7796 {
7797 enum coding_category category = coding_priorities[i];
7798 int id = coding_categories[category].id;
7799 Lisp_Object attrs;
7800
7801 if (id < 0)
7802 continue;
7803 attrs = CODING_ID_ATTRS (id);
7804 if (! NILP (highestp))
7805 return CODING_ATTR_BASE_NAME (attrs);
7806 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7807 }
7808 return Fnreverse (val);
7809 }
7810
7811 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7812
7813 static Lisp_Object
7814 make_subsidiaries (base)
7815 Lisp_Object base;
7816 {
7817 Lisp_Object subsidiaries;
7818 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7819 char *buf = (char *) alloca (base_name_len + 6);
7820 int i;
7821
7822 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7823 subsidiaries = Fmake_vector (make_number (3), Qnil);
7824 for (i = 0; i < 3; i++)
7825 {
7826 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7827 ASET (subsidiaries, i, intern (buf));
7828 }
7829 return subsidiaries;
7830 }
7831
7832
7833 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7834 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7835 doc: /* For internal use only.
7836 usage: (define-coding-system-internal ...) */)
7837 (nargs, args)
7838 int nargs;
7839 Lisp_Object *args;
7840 {
7841 Lisp_Object name;
7842 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7843 Lisp_Object attrs; /* Vector of attributes. */
7844 Lisp_Object eol_type;
7845 Lisp_Object aliases;
7846 Lisp_Object coding_type, charset_list, safe_charsets;
7847 enum coding_category category;
7848 Lisp_Object tail, val;
7849 int max_charset_id = 0;
7850 int i;
7851
7852 if (nargs < coding_arg_max)
7853 goto short_args;
7854
7855 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7856
7857 name = args[coding_arg_name];
7858 CHECK_SYMBOL (name);
7859 CODING_ATTR_BASE_NAME (attrs) = name;
7860
7861 val = args[coding_arg_mnemonic];
7862 if (! STRINGP (val))
7863 CHECK_CHARACTER (val);
7864 CODING_ATTR_MNEMONIC (attrs) = val;
7865
7866 coding_type = args[coding_arg_coding_type];
7867 CHECK_SYMBOL (coding_type);
7868 CODING_ATTR_TYPE (attrs) = coding_type;
7869
7870 charset_list = args[coding_arg_charset_list];
7871 if (SYMBOLP (charset_list))
7872 {
7873 if (EQ (charset_list, Qiso_2022))
7874 {
7875 if (! EQ (coding_type, Qiso_2022))
7876 error ("Invalid charset-list");
7877 charset_list = Viso_2022_charset_list;
7878 }
7879 else if (EQ (charset_list, Qemacs_mule))
7880 {
7881 if (! EQ (coding_type, Qemacs_mule))
7882 error ("Invalid charset-list");
7883 charset_list = Vemacs_mule_charset_list;
7884 }
7885 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7886 if (max_charset_id < XFASTINT (XCAR (tail)))
7887 max_charset_id = XFASTINT (XCAR (tail));
7888 }
7889 else
7890 {
7891 charset_list = Fcopy_sequence (charset_list);
7892 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7893 {
7894 struct charset *charset;
7895
7896 val = Fcar (tail);
7897 CHECK_CHARSET_GET_CHARSET (val, charset);
7898 if (EQ (coding_type, Qiso_2022)
7899 ? CHARSET_ISO_FINAL (charset) < 0
7900 : EQ (coding_type, Qemacs_mule)
7901 ? CHARSET_EMACS_MULE_ID (charset) < 0
7902 : 0)
7903 error ("Can't handle charset `%s'",
7904 XSYMBOL (CHARSET_NAME (charset))->name->data);
7905
7906 XCAR (tail) = make_number (charset->id);
7907 if (max_charset_id < charset->id)
7908 max_charset_id = charset->id;
7909 }
7910 }
7911 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7912
7913 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7914 make_number (255));
7915 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7916 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7917 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7918
7919 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
7920
7921 val = args[coding_arg_decode_translation_table];
7922 if (! NILP (val))
7923 CHECK_CHAR_TABLE (val);
7924 CODING_ATTR_DECODE_TBL (attrs) = val;
7925
7926 val = args[coding_arg_encode_translation_table];
7927 if (! NILP (val))
7928 CHECK_CHAR_TABLE (val);
7929 CODING_ATTR_ENCODE_TBL (attrs) = val;
7930
7931 val = args[coding_arg_post_read_conversion];
7932 CHECK_SYMBOL (val);
7933 CODING_ATTR_POST_READ (attrs) = val;
7934
7935 val = args[coding_arg_pre_write_conversion];
7936 CHECK_SYMBOL (val);
7937 CODING_ATTR_PRE_WRITE (attrs) = val;
7938
7939 val = args[coding_arg_default_char];
7940 if (NILP (val))
7941 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7942 else
7943 {
7944 CHECK_CHARACTER (val);
7945 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7946 }
7947
7948 val = args[coding_arg_plist];
7949 CHECK_LIST (val);
7950 CODING_ATTR_PLIST (attrs) = val;
7951
7952 if (EQ (coding_type, Qcharset))
7953 {
7954 Lisp_Object list;
7955 /* Generate a lisp vector of 256 elements. Each element is nil,
7956 integer, or a list of charset IDs.
7957
7958 If Nth element is nil, the byte code N is invalid in this
7959 coding system.
7960
7961 If Nth element is a number NUM, N is the first byte of a
7962 charset whose ID is NUM.
7963
7964 If Nth element is a list of charset IDs, N is the first byte
7965 of one of them. The list is sorted by dimensions of the
7966 charsets. A charset of smaller dimension comes firtst.
7967 */
7968 for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
7969 {
7970 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7971
7972 if (charset->method == CHARSET_METHOD_SUPERSET)
7973 {
7974 val = CHARSET_SUPERSET (charset);
7975 for (; CONSP (val); val = XCDR (val))
7976 list = Fcons (XCAR (XCAR (val)), list);
7977 }
7978 else
7979 list = Fcons (XCAR (tail), list);
7980 }
7981
7982 val = Fmake_vector (make_number (256), Qnil);
7983
7984 for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
7985 {
7986 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7987 int dim = CHARSET_DIMENSION (charset);
7988 int idx = (dim - 1) * 4;
7989
7990 if (CHARSET_ASCII_COMPATIBLE_P (charset))
7991 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
7992
7993 for (i = charset->code_space[idx];
7994 i <= charset->code_space[idx + 1]; i++)
7995 {
7996 Lisp_Object tmp, tmp2;
7997 int dim2;
7998
7999 tmp = AREF (val, i);
8000 if (NILP (tmp))
8001 tmp = XCAR (tail);
8002 else if (NUMBERP (tmp))
8003 {
8004 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8005 if (dim < dim2)
8006 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8007 else
8008 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8009 }
8010 else
8011 {
8012 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8013 {
8014 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8015 if (dim < dim2)
8016 break;
8017 }
8018 if (NILP (tmp2))
8019 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8020 else
8021 {
8022 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8023 XSETCAR (tmp2, XCAR (tail));
8024 }
8025 }
8026 ASET (val, i, tmp);
8027 }
8028 }
8029 ASET (attrs, coding_attr_charset_valids, val);
8030 category = coding_category_charset;
8031 }
8032 else if (EQ (coding_type, Qccl))
8033 {
8034 Lisp_Object valids;
8035
8036 if (nargs < coding_arg_ccl_max)
8037 goto short_args;
8038
8039 val = args[coding_arg_ccl_decoder];
8040 CHECK_CCL_PROGRAM (val);
8041 if (VECTORP (val))
8042 val = Fcopy_sequence (val);
8043 ASET (attrs, coding_attr_ccl_decoder, val);
8044
8045 val = args[coding_arg_ccl_encoder];
8046 CHECK_CCL_PROGRAM (val);
8047 if (VECTORP (val))
8048 val = Fcopy_sequence (val);
8049 ASET (attrs, coding_attr_ccl_encoder, val);
8050
8051 val = args[coding_arg_ccl_valids];
8052 valids = Fmake_string (make_number (256), make_number (0));
8053 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8054 {
8055 int from, to;
8056
8057 val = Fcar (tail);
8058 if (INTEGERP (val))
8059 {
8060 from = to = XINT (val);
8061 if (from < 0 || from > 255)
8062 args_out_of_range_3 (val, make_number (0), make_number (255));
8063 }
8064 else
8065 {
8066 CHECK_CONS (val);
8067 CHECK_NUMBER (XCAR (val));
8068 CHECK_NUMBER (XCDR (val));
8069 from = XINT (XCAR (val));
8070 if (from < 0 || from > 255)
8071 args_out_of_range_3 (XCAR (val),
8072 make_number (0), make_number (255));
8073 to = XINT (XCDR (val));
8074 if (to < from || to > 255)
8075 args_out_of_range_3 (XCDR (val),
8076 XCAR (val), make_number (255));
8077 }
8078 for (i = from; i <= to; i++)
8079 XSTRING (valids)->data[i] = 1;
8080 }
8081 ASET (attrs, coding_attr_ccl_valids, valids);
8082
8083 category = coding_category_ccl;
8084 }
8085 else if (EQ (coding_type, Qutf_16))
8086 {
8087 Lisp_Object bom, endian;
8088
8089 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8090
8091 if (nargs < coding_arg_utf16_max)
8092 goto short_args;
8093
8094 bom = args[coding_arg_utf16_bom];
8095 if (! NILP (bom) && ! EQ (bom, Qt))
8096 {
8097 CHECK_CONS (bom);
8098 CHECK_CODING_SYSTEM (XCAR (bom));
8099 CHECK_CODING_SYSTEM (XCDR (bom));
8100 }
8101 ASET (attrs, coding_attr_utf_16_bom, bom);
8102
8103 endian = args[coding_arg_utf16_endian];
8104 CHECK_SYMBOL (endian);
8105 if (NILP (endian))
8106 endian = Qbig;
8107 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8108 error ("Invalid endian: %s", XSYMBOL (endian)->name->data);
8109 ASET (attrs, coding_attr_utf_16_endian, endian);
8110
8111 category = (CONSP (bom)
8112 ? coding_category_utf_16_auto
8113 : NILP (bom)
8114 ? (EQ (endian, Qbig)
8115 ? coding_category_utf_16_be_nosig
8116 : coding_category_utf_16_le_nosig)
8117 : (EQ (endian, Qbig)
8118 ? coding_category_utf_16_be
8119 : coding_category_utf_16_le));
8120 }
8121 else if (EQ (coding_type, Qiso_2022))
8122 {
8123 Lisp_Object initial, reg_usage, request, flags;
8124 int i;
8125
8126 if (nargs < coding_arg_iso2022_max)
8127 goto short_args;
8128
8129 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8130 CHECK_VECTOR (initial);
8131 for (i = 0; i < 4; i++)
8132 {
8133 val = Faref (initial, make_number (i));
8134 if (! NILP (val))
8135 {
8136 struct charset *charset;
8137
8138 CHECK_CHARSET_GET_CHARSET (val, charset);
8139 ASET (initial, i, make_number (CHARSET_ID (charset)));
8140 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8141 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8142 }
8143 else
8144 ASET (initial, i, make_number (-1));
8145 }
8146
8147 reg_usage = args[coding_arg_iso2022_reg_usage];
8148 CHECK_CONS (reg_usage);
8149 CHECK_NATNUM (XCAR (reg_usage));
8150 CHECK_NATNUM (XCDR (reg_usage));
8151
8152 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8153 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8154 {
8155 int id;
8156
8157 val = Fcar (tail);
8158 CHECK_CONS (val);
8159 CHECK_CHARSET_GET_ID (XCAR (val), id);
8160 CHECK_NATNUM (XCDR (val));
8161 if (XINT (XCDR (val)) >= 4)
8162 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8163 XCAR (val) = make_number (id);
8164 }
8165
8166 flags = args[coding_arg_iso2022_flags];
8167 CHECK_NATNUM (flags);
8168 i = XINT (flags);
8169 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8170 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8171
8172 ASET (attrs, coding_attr_iso_initial, initial);
8173 ASET (attrs, coding_attr_iso_usage, reg_usage);
8174 ASET (attrs, coding_attr_iso_request, request);
8175 ASET (attrs, coding_attr_iso_flags, flags);
8176 setup_iso_safe_charsets (attrs);
8177
8178 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8179 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8180 | CODING_ISO_FLAG_SINGLE_SHIFT))
8181 ? coding_category_iso_7_else
8182 : EQ (args[coding_arg_charset_list], Qiso_2022)
8183 ? coding_category_iso_7
8184 : coding_category_iso_7_tight);
8185 else
8186 {
8187 int id = XINT (AREF (initial, 1));
8188
8189 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8190 || EQ (args[coding_arg_charset_list], Qiso_2022)
8191 || id < 0)
8192 ? coding_category_iso_8_else
8193 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8194 ? coding_category_iso_8_1
8195 : coding_category_iso_8_2);
8196 }
8197 if (category != coding_category_iso_8_1
8198 && category != coding_category_iso_8_2)
8199 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8200 }
8201 else if (EQ (coding_type, Qemacs_mule))
8202 {
8203 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8204 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8205 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8206 category = coding_category_emacs_mule;
8207 }
8208 else if (EQ (coding_type, Qshift_jis))
8209 {
8210
8211 struct charset *charset;
8212
8213 if (XINT (Flength (charset_list)) != 3)
8214 error ("There should be just three charsets");
8215
8216 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8217 if (CHARSET_DIMENSION (charset) != 1)
8218 error ("Dimension of charset %s is not one",
8219 XSYMBOL (CHARSET_NAME (charset))->name->data);
8220 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8221 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8222
8223 charset_list = XCDR (charset_list);
8224 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8225 if (CHARSET_DIMENSION (charset) != 1)
8226 error ("Dimension of charset %s is not one",
8227 XSYMBOL (CHARSET_NAME (charset))->name->data);
8228
8229 charset_list = XCDR (charset_list);
8230 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8231 if (CHARSET_DIMENSION (charset) != 2)
8232 error ("Dimension of charset %s is not two",
8233 XSYMBOL (CHARSET_NAME (charset))->name->data);
8234
8235 category = coding_category_sjis;
8236 Vsjis_coding_system = name;
8237 }
8238 else if (EQ (coding_type, Qbig5))
8239 {
8240 struct charset *charset;
8241
8242 if (XINT (Flength (charset_list)) != 2)
8243 error ("There should be just two charsets");
8244
8245 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8246 if (CHARSET_DIMENSION (charset) != 1)
8247 error ("Dimension of charset %s is not one",
8248 XSYMBOL (CHARSET_NAME (charset))->name->data);
8249 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8250 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8251
8252 charset_list = XCDR (charset_list);
8253 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8254 if (CHARSET_DIMENSION (charset) != 2)
8255 error ("Dimension of charset %s is not two",
8256 XSYMBOL (CHARSET_NAME (charset))->name->data);
8257
8258 category = coding_category_big5;
8259 Vbig5_coding_system = name;
8260 }
8261 else if (EQ (coding_type, Qraw_text))
8262 {
8263 category = coding_category_raw_text;
8264 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8265 }
8266 else if (EQ (coding_type, Qutf_8))
8267 {
8268 category = coding_category_utf_8;
8269 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8270 }
8271 else if (EQ (coding_type, Qundecided))
8272 category = coding_category_undecided;
8273 else
8274 error ("Invalid coding system type: %s",
8275 XSYMBOL (coding_type)->name->data);
8276
8277 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8278
8279 eol_type = args[coding_arg_eol_type];
8280 if (! NILP (eol_type)
8281 && ! EQ (eol_type, Qunix)
8282 && ! EQ (eol_type, Qdos)
8283 && ! EQ (eol_type, Qmac))
8284 error ("Invalid eol-type");
8285
8286 aliases = Fcons (name, Qnil);
8287
8288 if (NILP (eol_type))
8289 {
8290 eol_type = make_subsidiaries (name);
8291 for (i = 0; i < 3; i++)
8292 {
8293 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8294
8295 this_name = AREF (eol_type, i);
8296 this_aliases = Fcons (this_name, Qnil);
8297 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8298 this_spec = Fmake_vector (make_number (3), attrs);
8299 ASET (this_spec, 1, this_aliases);
8300 ASET (this_spec, 2, this_eol_type);
8301 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8302 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8303 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8304 Vcoding_system_alist);
8305 }
8306 }
8307
8308 spec_vec = Fmake_vector (make_number (3), attrs);
8309 ASET (spec_vec, 1, aliases);
8310 ASET (spec_vec, 2, eol_type);
8311
8312 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8313 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8314 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8315 Vcoding_system_alist);
8316
8317 {
8318 int id = coding_categories[category].id;
8319
8320 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8321 setup_coding_system (name, &coding_categories[category]);
8322 }
8323
8324 return Qnil;
8325
8326 short_args:
8327 return Fsignal (Qwrong_number_of_arguments,
8328 Fcons (intern ("define-coding-system-internal"),
8329 make_number (nargs)));
8330 }
8331
8332 /* Fixme: should this record the alias relationships for
8333 diagnostics? Should it update coding-system-list? */
8334 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8335 Sdefine_coding_system_alias, 2, 2, 0,
8336 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8337 (alias, coding_system)
8338 Lisp_Object alias, coding_system;
8339 {
8340 Lisp_Object spec, aliases, eol_type;
8341
8342 CHECK_SYMBOL (alias);
8343 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8344 aliases = AREF (spec, 1);
8345 while (!NILP (XCDR (aliases)))
8346 aliases = XCDR (aliases);
8347 XCDR (aliases) = Fcons (alias, Qnil);
8348
8349 eol_type = AREF (spec, 2);
8350 if (VECTORP (eol_type))
8351 {
8352 Lisp_Object subsidiaries;
8353 int i;
8354
8355 subsidiaries = make_subsidiaries (alias);
8356 for (i = 0; i < 3; i++)
8357 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8358 AREF (eol_type, i));
8359
8360 ASET (spec, 2, subsidiaries);
8361 }
8362
8363 Fputhash (alias, spec, Vcoding_system_hash_table);
8364 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8365 Vcoding_system_alist);
8366
8367 return Qnil;
8368 }
8369
8370 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8371 1, 1, 0,
8372 doc: /* Return the base of CODING-SYSTEM.
8373 Any alias or subsidiary coding system is not a base coding system. */)
8374 (coding_system)
8375 Lisp_Object coding_system;
8376 {
8377 Lisp_Object spec, attrs;
8378
8379 if (NILP (coding_system))
8380 return (Qno_conversion);
8381 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8382 attrs = AREF (spec, 0);
8383 return CODING_ATTR_BASE_NAME (attrs);
8384 }
8385
8386 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8387 1, 1, 0,
8388 doc: "Return the property list of CODING-SYSTEM.")
8389 (coding_system)
8390 Lisp_Object coding_system;
8391 {
8392 Lisp_Object spec, attrs;
8393
8394 if (NILP (coding_system))
8395 coding_system = Qno_conversion;
8396 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8397 attrs = AREF (spec, 0);
8398 return CODING_ATTR_PLIST (attrs);
8399 }
8400
8401
8402 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8403 1, 1, 0,
8404 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8405 (coding_system)
8406 Lisp_Object coding_system;
8407 {
8408 Lisp_Object spec;
8409
8410 if (NILP (coding_system))
8411 coding_system = Qno_conversion;
8412 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8413 return AREF (spec, 1);
8414 }
8415
8416 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8417 Scoding_system_eol_type, 1, 1, 0,
8418 doc: /* Return eol-type of CODING-SYSTEM.
8419 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8420
8421 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8422 and CR respectively.
8423
8424 A vector value indicates that a format of end-of-line should be
8425 detected automatically. Nth element of the vector is the subsidiary
8426 coding system whose eol-type is N. */)
8427 (coding_system)
8428 Lisp_Object coding_system;
8429 {
8430 Lisp_Object spec, eol_type;
8431 int n;
8432
8433 if (NILP (coding_system))
8434 coding_system = Qno_conversion;
8435 if (! CODING_SYSTEM_P (coding_system))
8436 return Qnil;
8437 spec = CODING_SYSTEM_SPEC (coding_system);
8438 eol_type = AREF (spec, 2);
8439 if (VECTORP (eol_type))
8440 return Fcopy_sequence (eol_type);
8441 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8442 return make_number (n);
8443 }
8444
8445 #endif /* emacs */
8446
8447 \f
8448 /*** 9. Post-amble ***/
8449
8450 void
8451 init_coding_once ()
8452 {
8453 int i;
8454
8455 for (i = 0; i < coding_category_max; i++)
8456 {
8457 coding_categories[i].id = -1;
8458 coding_priorities[i] = i;
8459 }
8460
8461 /* ISO2022 specific initialize routine. */
8462 for (i = 0; i < 0x20; i++)
8463 iso_code_class[i] = ISO_control_0;
8464 for (i = 0x21; i < 0x7F; i++)
8465 iso_code_class[i] = ISO_graphic_plane_0;
8466 for (i = 0x80; i < 0xA0; i++)
8467 iso_code_class[i] = ISO_control_1;
8468 for (i = 0xA1; i < 0xFF; i++)
8469 iso_code_class[i] = ISO_graphic_plane_1;
8470 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8471 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8472 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8473 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8474 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8475 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8476 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8477 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8478 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8479 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8480
8481 inhibit_pre_post_conversion = 0;
8482
8483 for (i = 0; i < 256; i++)
8484 {
8485 emacs_mule_bytes[i] = 1;
8486 }
8487 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8488 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8489 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8490 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8491 }
8492
8493 #ifdef emacs
8494
8495 void
8496 syms_of_coding ()
8497 {
8498 staticpro (&Vcoding_system_hash_table);
8499 Vcoding_system_hash_table = Fmakehash (Qeq);
8500
8501 staticpro (&Vsjis_coding_system);
8502 Vsjis_coding_system = Qnil;
8503
8504 staticpro (&Vbig5_coding_system);
8505 Vbig5_coding_system = Qnil;
8506
8507 staticpro (&Vcode_conversion_work_buf_list);
8508 Vcode_conversion_work_buf_list = Qnil;
8509
8510 staticpro (&Vcode_conversion_reused_work_buf);
8511 Vcode_conversion_reused_work_buf = Qnil;
8512
8513 DEFSYM (Qcharset, "charset");
8514 DEFSYM (Qtarget_idx, "target-idx");
8515 DEFSYM (Qcoding_system_history, "coding-system-history");
8516 Fset (Qcoding_system_history, Qnil);
8517
8518 /* Target FILENAME is the first argument. */
8519 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8520 /* Target FILENAME is the third argument. */
8521 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8522
8523 DEFSYM (Qcall_process, "call-process");
8524 /* Target PROGRAM is the first argument. */
8525 Fput (Qcall_process, Qtarget_idx, make_number (0));
8526
8527 DEFSYM (Qcall_process_region, "call-process-region");
8528 /* Target PROGRAM is the third argument. */
8529 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8530
8531 DEFSYM (Qstart_process, "start-process");
8532 /* Target PROGRAM is the third argument. */
8533 Fput (Qstart_process, Qtarget_idx, make_number (2));
8534
8535 DEFSYM (Qopen_network_stream, "open-network-stream");
8536 /* Target SERVICE is the fourth argument. */
8537 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8538
8539 DEFSYM (Qcoding_system, "coding-system");
8540 DEFSYM (Qcoding_aliases, "coding-aliases");
8541
8542 DEFSYM (Qeol_type, "eol-type");
8543 DEFSYM (Qunix, "unix");
8544 DEFSYM (Qdos, "dos");
8545
8546 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8547 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8548 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8549 DEFSYM (Qdefault_char, "default-char");
8550 DEFSYM (Qundecided, "undecided");
8551 DEFSYM (Qno_conversion, "no-conversion");
8552 DEFSYM (Qraw_text, "raw-text");
8553
8554 DEFSYM (Qiso_2022, "iso-2022");
8555
8556 DEFSYM (Qutf_8, "utf-8");
8557
8558 DEFSYM (Qutf_16, "utf-16");
8559 DEFSYM (Qbig, "big");
8560 DEFSYM (Qlittle, "little");
8561
8562 DEFSYM (Qshift_jis, "shift-jis");
8563 DEFSYM (Qbig5, "big5");
8564
8565 DEFSYM (Qcoding_system_p, "coding-system-p");
8566
8567 DEFSYM (Qcoding_system_error, "coding-system-error");
8568 Fput (Qcoding_system_error, Qerror_conditions,
8569 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8570 Fput (Qcoding_system_error, Qerror_message,
8571 build_string ("Invalid coding system"));
8572
8573 /* Intern this now in case it isn't already done.
8574 Setting this variable twice is harmless.
8575 But don't staticpro it here--that is done in alloc.c. */
8576 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8577
8578 DEFSYM (Qtranslation_table, "translation-table");
8579 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8580 DEFSYM (Qtranslation_table_id, "translation-table-id");
8581 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8582 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8583
8584 DEFSYM (Qvalid_codes, "valid-codes");
8585
8586 DEFSYM (Qemacs_mule, "emacs-mule");
8587
8588 Vcoding_category_table
8589 = Fmake_vector (make_number (coding_category_max), Qnil);
8590 staticpro (&Vcoding_category_table);
8591 /* Followings are target of code detection. */
8592 ASET (Vcoding_category_table, coding_category_iso_7,
8593 intern ("coding-category-iso-7"));
8594 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8595 intern ("coding-category-iso-7-tight"));
8596 ASET (Vcoding_category_table, coding_category_iso_8_1,
8597 intern ("coding-category-iso-8-1"));
8598 ASET (Vcoding_category_table, coding_category_iso_8_2,
8599 intern ("coding-category-iso-8-2"));
8600 ASET (Vcoding_category_table, coding_category_iso_7_else,
8601 intern ("coding-category-iso-7-else"));
8602 ASET (Vcoding_category_table, coding_category_iso_8_else,
8603 intern ("coding-category-iso-8-else"));
8604 ASET (Vcoding_category_table, coding_category_utf_8,
8605 intern ("coding-category-utf-8"));
8606 ASET (Vcoding_category_table, coding_category_utf_16_be,
8607 intern ("coding-category-utf-16-be"));
8608 ASET (Vcoding_category_table, coding_category_utf_16_le,
8609 intern ("coding-category-utf-16-le"));
8610 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8611 intern ("coding-category-utf-16-be-nosig"));
8612 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8613 intern ("coding-category-utf-16-le-nosig"));
8614 ASET (Vcoding_category_table, coding_category_charset,
8615 intern ("coding-category-charset"));
8616 ASET (Vcoding_category_table, coding_category_sjis,
8617 intern ("coding-category-sjis"));
8618 ASET (Vcoding_category_table, coding_category_big5,
8619 intern ("coding-category-big5"));
8620 ASET (Vcoding_category_table, coding_category_ccl,
8621 intern ("coding-category-ccl"));
8622 ASET (Vcoding_category_table, coding_category_emacs_mule,
8623 intern ("coding-category-emacs-mule"));
8624 /* Followings are NOT target of code detection. */
8625 ASET (Vcoding_category_table, coding_category_raw_text,
8626 intern ("coding-category-raw-text"));
8627 ASET (Vcoding_category_table, coding_category_undecided,
8628 intern ("coding-category-undecided"));
8629
8630 defsubr (&Scoding_system_p);
8631 defsubr (&Sread_coding_system);
8632 defsubr (&Sread_non_nil_coding_system);
8633 defsubr (&Scheck_coding_system);
8634 defsubr (&Sdetect_coding_region);
8635 defsubr (&Sdetect_coding_string);
8636 defsubr (&Sfind_coding_systems_region_internal);
8637 defsubr (&Scheck_coding_systems_region);
8638 defsubr (&Sdecode_coding_region);
8639 defsubr (&Sencode_coding_region);
8640 defsubr (&Sdecode_coding_string);
8641 defsubr (&Sencode_coding_string);
8642 defsubr (&Sdecode_sjis_char);
8643 defsubr (&Sencode_sjis_char);
8644 defsubr (&Sdecode_big5_char);
8645 defsubr (&Sencode_big5_char);
8646 defsubr (&Sset_terminal_coding_system_internal);
8647 defsubr (&Sset_safe_terminal_coding_system_internal);
8648 defsubr (&Sterminal_coding_system);
8649 defsubr (&Sset_keyboard_coding_system_internal);
8650 defsubr (&Skeyboard_coding_system);
8651 defsubr (&Sfind_operation_coding_system);
8652 defsubr (&Sset_coding_system_priority);
8653 defsubr (&Sdefine_coding_system_internal);
8654 defsubr (&Sdefine_coding_system_alias);
8655 defsubr (&Scoding_system_base);
8656 defsubr (&Scoding_system_plist);
8657 defsubr (&Scoding_system_aliases);
8658 defsubr (&Scoding_system_eol_type);
8659 defsubr (&Scoding_system_priority_list);
8660
8661 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8662 doc: /* List of coding systems.
8663
8664 Do not alter the value of this variable manually. This variable should be
8665 updated by the functions `define-coding-system' and
8666 `define-coding-system-alias'. */);
8667 Vcoding_system_list = Qnil;
8668
8669 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8670 doc: /* Alist of coding system names.
8671 Each element is one element list of coding system name.
8672 This variable is given to `completing-read' as TABLE argument.
8673
8674 Do not alter the value of this variable manually. This variable should be
8675 updated by the functions `make-coding-system' and
8676 `define-coding-system-alias'. */);
8677 Vcoding_system_alist = Qnil;
8678
8679 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8680 doc: /* List of coding-categories (symbols) ordered by priority.
8681
8682 On detecting a coding system, Emacs tries code detection algorithms
8683 associated with each coding-category one by one in this order. When
8684 one algorithm agrees with a byte sequence of source text, the coding
8685 system bound to the corresponding coding-category is selected. */);
8686 {
8687 int i;
8688
8689 Vcoding_category_list = Qnil;
8690 for (i = coding_category_max - 1; i >= 0; i--)
8691 Vcoding_category_list
8692 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8693 Vcoding_category_list);
8694 }
8695
8696 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8697 doc: /* Specify the coding system for read operations.
8698 It is useful to bind this variable with `let', but do not set it globally.
8699 If the value is a coding system, it is used for decoding on read operation.
8700 If not, an appropriate element is used from one of the coding system alists:
8701 There are three such tables, `file-coding-system-alist',
8702 `process-coding-system-alist', and `network-coding-system-alist'. */);
8703 Vcoding_system_for_read = Qnil;
8704
8705 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8706 doc: /* Specify the coding system for write operations.
8707 Programs bind this variable with `let', but you should not set it globally.
8708 If the value is a coding system, it is used for encoding of output,
8709 when writing it to a file and when sending it to a file or subprocess.
8710
8711 If this does not specify a coding system, an appropriate element
8712 is used from one of the coding system alists:
8713 There are three such tables, `file-coding-system-alist',
8714 `process-coding-system-alist', and `network-coding-system-alist'.
8715 For output to files, if the above procedure does not specify a coding system,
8716 the value of `buffer-file-coding-system' is used. */);
8717 Vcoding_system_for_write = Qnil;
8718
8719 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8720 doc: /*
8721 Coding system used in the latest file or process I/O. */);
8722 Vlast_coding_system_used = Qnil;
8723
8724 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8725 doc: /*
8726 *Non-nil means always inhibit code conversion of end-of-line format.
8727 See info node `Coding Systems' and info node `Text and Binary' concerning
8728 such conversion. */);
8729 inhibit_eol_conversion = 0;
8730
8731 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8732 doc: /*
8733 Non-nil means process buffer inherits coding system of process output.
8734 Bind it to t if the process output is to be treated as if it were a file
8735 read from some filesystem. */);
8736 inherit_process_coding_system = 0;
8737
8738 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8739 doc: /*
8740 Alist to decide a coding system to use for a file I/O operation.
8741 The format is ((PATTERN . VAL) ...),
8742 where PATTERN is a regular expression matching a file name,
8743 VAL is a coding system, a cons of coding systems, or a function symbol.
8744 If VAL is a coding system, it is used for both decoding and encoding
8745 the file contents.
8746 If VAL is a cons of coding systems, the car part is used for decoding,
8747 and the cdr part is used for encoding.
8748 If VAL is a function symbol, the function must return a coding system
8749 or a cons of coding systems which are used as above. The function gets
8750 the arguments with which `find-operation-coding-systems' was called.
8751
8752 See also the function `find-operation-coding-system'
8753 and the variable `auto-coding-alist'. */);
8754 Vfile_coding_system_alist = Qnil;
8755
8756 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8757 doc: /*
8758 Alist to decide a coding system to use for a process I/O operation.
8759 The format is ((PATTERN . VAL) ...),
8760 where PATTERN is a regular expression matching a program name,
8761 VAL is a coding system, a cons of coding systems, or a function symbol.
8762 If VAL is a coding system, it is used for both decoding what received
8763 from the program and encoding what sent to the program.
8764 If VAL is a cons of coding systems, the car part is used for decoding,
8765 and the cdr part is used for encoding.
8766 If VAL is a function symbol, the function must return a coding system
8767 or a cons of coding systems which are used as above.
8768
8769 See also the function `find-operation-coding-system'. */);
8770 Vprocess_coding_system_alist = Qnil;
8771
8772 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8773 doc: /*
8774 Alist to decide a coding system to use for a network I/O operation.
8775 The format is ((PATTERN . VAL) ...),
8776 where PATTERN is a regular expression matching a network service name
8777 or is a port number to connect to,
8778 VAL is a coding system, a cons of coding systems, or a function symbol.
8779 If VAL is a coding system, it is used for both decoding what received
8780 from the network stream and encoding what sent to the network stream.
8781 If VAL is a cons of coding systems, the car part is used for decoding,
8782 and the cdr part is used for encoding.
8783 If VAL is a function symbol, the function must return a coding system
8784 or a cons of coding systems which are used as above.
8785
8786 See also the function `find-operation-coding-system'. */);
8787 Vnetwork_coding_system_alist = Qnil;
8788
8789 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8790 doc: /* Coding system to use with system messages.
8791 Also used for decoding keyboard input on X Window system. */);
8792 Vlocale_coding_system = Qnil;
8793
8794 /* The eol mnemonics are reset in startup.el system-dependently. */
8795 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8796 doc: /*
8797 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8798 eol_mnemonic_unix = build_string (":");
8799
8800 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8801 doc: /*
8802 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8803 eol_mnemonic_dos = build_string ("\\");
8804
8805 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8806 doc: /*
8807 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8808 eol_mnemonic_mac = build_string ("/");
8809
8810 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8811 doc: /*
8812 *String displayed in mode line when end-of-line format is not yet determined. */);
8813 eol_mnemonic_undecided = build_string (":");
8814
8815 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8816 doc: /*
8817 *Non-nil enables character translation while encoding and decoding. */);
8818 Venable_character_translation = Qt;
8819
8820 DEFVAR_LISP ("standard-translation-table-for-decode",
8821 &Vstandard_translation_table_for_decode,
8822 doc: /* Table for translating characters while decoding. */);
8823 Vstandard_translation_table_for_decode = Qnil;
8824
8825 DEFVAR_LISP ("standard-translation-table-for-encode",
8826 &Vstandard_translation_table_for_encode,
8827 doc: /* Table for translating characters while encoding. */);
8828 Vstandard_translation_table_for_encode = Qnil;
8829
8830 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8831 doc: /* Alist of charsets vs revision numbers.
8832 While encoding, if a charset (car part of an element) is found,
8833 designate it with the escape sequence identifying revision (cdr part
8834 of the element). */);
8835 Vcharset_revision_table = Qnil;
8836
8837 DEFVAR_LISP ("default-process-coding-system",
8838 &Vdefault_process_coding_system,
8839 doc: /* Cons of coding systems used for process I/O by default.
8840 The car part is used for decoding a process output,
8841 the cdr part is used for encoding a text to be sent to a process. */);
8842 Vdefault_process_coding_system = Qnil;
8843
8844 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8845 doc: /*
8846 Table of extra Latin codes in the range 128..159 (inclusive).
8847 This is a vector of length 256.
8848 If Nth element is non-nil, the existence of code N in a file
8849 \(or output of subprocess) doesn't prevent it to be detected as
8850 a coding system of ISO 2022 variant which has a flag
8851 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8852 or reading output of a subprocess.
8853 Only 128th through 159th elements has a meaning. */);
8854 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8855
8856 DEFVAR_LISP ("select-safe-coding-system-function",
8857 &Vselect_safe_coding_system_function,
8858 doc: /*
8859 Function to call to select safe coding system for encoding a text.
8860
8861 If set, this function is called to force a user to select a proper
8862 coding system which can encode the text in the case that a default
8863 coding system used in each operation can't encode the text.
8864
8865 The default value is `select-safe-coding-system' (which see). */);
8866 Vselect_safe_coding_system_function = Qnil;
8867
8868 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8869 &inhibit_iso_escape_detection,
8870 doc: /*
8871 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8872
8873 By default, on reading a file, Emacs tries to detect how the text is
8874 encoded. This code detection is sensitive to escape sequences. If
8875 the sequence is valid as ISO2022, the code is determined as one of
8876 the ISO2022 encodings, and the file is decoded by the corresponding
8877 coding system (e.g. `iso-2022-7bit').
8878
8879 However, there may be a case that you want to read escape sequences in
8880 a file as is. In such a case, you can set this variable to non-nil.
8881 Then, as the code detection ignores any escape sequences, no file is
8882 detected as encoded in some ISO2022 encoding. The result is that all
8883 escape sequences become visible in a buffer.
8884
8885 The default value is nil, and it is strongly recommended not to change
8886 it. That is because many Emacs Lisp source files that contain
8887 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8888 in Emacs's distribution, and they won't be decoded correctly on
8889 reading if you suppress escape sequence detection.
8890
8891 The other way to read escape sequences in a file without decoding is
8892 to explicitly specify some coding system that doesn't use ISO2022's
8893 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8894 inhibit_iso_escape_detection = 0;
8895
8896 {
8897 Lisp_Object args[coding_arg_max];
8898 Lisp_Object plist[14];
8899 int i;
8900
8901 for (i = 0; i < coding_arg_max; i++)
8902 args[i] = Qnil;
8903
8904 plist[0] = intern (":name");
8905 plist[1] = args[coding_arg_name] = Qno_conversion;
8906 plist[2] = intern (":mnemonic");
8907 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8908 plist[4] = intern (":coding-type");
8909 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8910 plist[6] = intern (":ascii-compatible-p");
8911 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8912 plist[8] = intern (":default-char");
8913 plist[9] = args[coding_arg_default_char] = make_number (0);
8914 plist[10] = intern (":docstring");
8915 plist[11] = build_string ("Do no conversion.\n\
8916 \n\
8917 When you visit a file with this coding, the file is read into a\n\
8918 unibyte buffer as is, thus each byte of a file is treated as a\n\
8919 character.");
8920 plist[12] = intern (":eol-type");
8921 plist[13] = args[coding_arg_eol_type] = Qunix;
8922 args[coding_arg_plist] = Flist (14, plist);
8923 Fdefine_coding_system_internal (coding_arg_max, args);
8924 }
8925
8926 setup_coding_system (Qno_conversion, &keyboard_coding);
8927 setup_coding_system (Qno_conversion, &terminal_coding);
8928 setup_coding_system (Qno_conversion, &safe_terminal_coding);
8929 }
8930
8931 char *
8932 emacs_strerror (error_number)
8933 int error_number;
8934 {
8935 char *str;
8936
8937 synchronize_system_messages_locale ();
8938 str = strerror (error_number);
8939
8940 if (! NILP (Vlocale_coding_system))
8941 {
8942 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8943 Vlocale_coding_system,
8944 0);
8945 str = (char *) XSTRING (dec)->data;
8946 }
8947
8948 return str;
8949 }
8950
8951 #endif /* emacs */