(Fread_coding_system): Fix arg of XSETSTRING.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory;
315
316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318 Lisp_Object Qstart_process, Qopen_network_stream;
319 Lisp_Object Qtarget_idx;
320
321 int coding_system_require_warning;
322
323 Lisp_Object Vselect_safe_coding_system_function;
324
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
327 /* Mnemonic string to indicate format of end-of-line is not yet
328 decided. */
329 Lisp_Object eol_mnemonic_undecided;
330
331 #ifdef emacs
332
333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
334
335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
336
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule, Qraw_text;
340 Lisp_Object Qutf_8_emacs;
341
342 /* Coding-systems are handed between Emacs Lisp programs and C internal
343 routines by the following three variables. */
344 /* Coding-system for reading files and receiving data from process. */
345 Lisp_Object Vcoding_system_for_read;
346 /* Coding-system for writing files and sending data to process. */
347 Lisp_Object Vcoding_system_for_write;
348 /* Coding-system actually used in the latest I/O. */
349 Lisp_Object Vlast_coding_system_used;
350
351 /* A vector of length 256 which contains information about special
352 Latin codes (especially for dealing with Microsoft codes). */
353 Lisp_Object Vlatin_extra_code_table;
354
355 /* Flag to inhibit code conversion of end-of-line format. */
356 int inhibit_eol_conversion;
357
358 /* Flag to inhibit ISO2022 escape sequence detection. */
359 int inhibit_iso_escape_detection;
360
361 /* Flag to make buffer-file-coding-system inherit from process-coding. */
362 int inherit_process_coding_system;
363
364 /* Coding system to be used to encode text for terminal display. */
365 struct coding_system terminal_coding;
366
367 /* Coding system to be used to encode text for terminal display when
368 terminal coding system is nil. */
369 struct coding_system safe_terminal_coding;
370
371 /* Coding system of what is sent from terminal keyboard. */
372 struct coding_system keyboard_coding;
373
374 Lisp_Object Vfile_coding_system_alist;
375 Lisp_Object Vprocess_coding_system_alist;
376 Lisp_Object Vnetwork_coding_system_alist;
377
378 Lisp_Object Vlocale_coding_system;
379
380 #endif /* emacs */
381
382 /* Flag to tell if we look up translation table on character code
383 conversion. */
384 Lisp_Object Venable_character_translation;
385 /* Standard translation table to look up on decoding (reading). */
386 Lisp_Object Vstandard_translation_table_for_decode;
387 /* Standard translation table to look up on encoding (writing). */
388 Lisp_Object Vstandard_translation_table_for_encode;
389
390 Lisp_Object Qtranslation_table;
391 Lisp_Object Qtranslation_table_id;
392 Lisp_Object Qtranslation_table_for_decode;
393 Lisp_Object Qtranslation_table_for_encode;
394
395 /* Alist of charsets vs revision number. */
396 static Lisp_Object Vcharset_revision_table;
397
398 /* Default coding systems used for process I/O. */
399 Lisp_Object Vdefault_process_coding_system;
400
401 /* Char table for translating Quail and self-inserting input. */
402 Lisp_Object Vtranslation_table_for_input;
403
404 /* Global flag to tell that we can't call post-read-conversion and
405 pre-write-conversion functions. Usually the value is zero, but it
406 is set to 1 temporarily while such functions are running. This is
407 to avoid infinite recursive call. */
408 static int inhibit_pre_post_conversion;
409
410 /* Two special coding systems. */
411 Lisp_Object Vsjis_coding_system;
412 Lisp_Object Vbig5_coding_system;
413
414
415 static int detect_coding_utf_8 P_ ((struct coding_system *,
416 struct coding_detection_info *info));
417 static void decode_coding_utf_8 P_ ((struct coding_system *));
418 static int encode_coding_utf_8 P_ ((struct coding_system *));
419
420 static int detect_coding_utf_16 P_ ((struct coding_system *,
421 struct coding_detection_info *info));
422 static void decode_coding_utf_16 P_ ((struct coding_system *));
423 static int encode_coding_utf_16 P_ ((struct coding_system *));
424
425 static int detect_coding_iso_2022 P_ ((struct coding_system *,
426 struct coding_detection_info *info));
427 static void decode_coding_iso_2022 P_ ((struct coding_system *));
428 static int encode_coding_iso_2022 P_ ((struct coding_system *));
429
430 static int detect_coding_emacs_mule P_ ((struct coding_system *,
431 struct coding_detection_info *info));
432 static void decode_coding_emacs_mule P_ ((struct coding_system *));
433 static int encode_coding_emacs_mule P_ ((struct coding_system *));
434
435 static int detect_coding_sjis P_ ((struct coding_system *,
436 struct coding_detection_info *info));
437 static void decode_coding_sjis P_ ((struct coding_system *));
438 static int encode_coding_sjis P_ ((struct coding_system *));
439
440 static int detect_coding_big5 P_ ((struct coding_system *,
441 struct coding_detection_info *info));
442 static void decode_coding_big5 P_ ((struct coding_system *));
443 static int encode_coding_big5 P_ ((struct coding_system *));
444
445 static int detect_coding_ccl P_ ((struct coding_system *,
446 struct coding_detection_info *info));
447 static void decode_coding_ccl P_ ((struct coding_system *));
448 static int encode_coding_ccl P_ ((struct coding_system *));
449
450 static void decode_coding_raw_text P_ ((struct coding_system *));
451 static int encode_coding_raw_text P_ ((struct coding_system *));
452
453
454 /* ISO2022 section */
455
456 #define CODING_ISO_INITIAL(coding, reg) \
457 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
458 coding_attr_iso_initial), \
459 reg)))
460
461
462 #define CODING_ISO_REQUEST(coding, charset_id) \
463 ((charset_id <= (coding)->max_charset_id \
464 ? (coding)->safe_charsets[charset_id] \
465 : -1))
466
467
468 #define CODING_ISO_FLAGS(coding) \
469 ((coding)->spec.iso_2022.flags)
470 #define CODING_ISO_DESIGNATION(coding, reg) \
471 ((coding)->spec.iso_2022.current_designation[reg])
472 #define CODING_ISO_INVOCATION(coding, plane) \
473 ((coding)->spec.iso_2022.current_invocation[plane])
474 #define CODING_ISO_SINGLE_SHIFTING(coding) \
475 ((coding)->spec.iso_2022.single_shifting)
476 #define CODING_ISO_BOL(coding) \
477 ((coding)->spec.iso_2022.bol)
478 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
479 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
480
481 /* Control characters of ISO2022. */
482 /* code */ /* function */
483 #define ISO_CODE_LF 0x0A /* line-feed */
484 #define ISO_CODE_CR 0x0D /* carriage-return */
485 #define ISO_CODE_SO 0x0E /* shift-out */
486 #define ISO_CODE_SI 0x0F /* shift-in */
487 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
488 #define ISO_CODE_ESC 0x1B /* escape */
489 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
490 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
491 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
492
493 /* All code (1-byte) of ISO2022 is classified into one of the
494 followings. */
495 enum iso_code_class_type
496 {
497 ISO_control_0, /* Control codes in the range
498 0x00..0x1F and 0x7F, except for the
499 following 5 codes. */
500 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
501 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
502 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
503 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
504 ISO_escape, /* ISO_CODE_SO (0x1B) */
505 ISO_control_1, /* Control codes in the range
506 0x80..0x9F, except for the
507 following 3 codes. */
508 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
509 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
510 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
511 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
512 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
513 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
514 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
515 };
516
517 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
518 `iso-flags' attribute of an iso2022 coding system. */
519
520 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
521 instead of the correct short-form sequence (e.g. ESC $ A). */
522 #define CODING_ISO_FLAG_LONG_FORM 0x0001
523
524 /* If set, reset graphic planes and registers at end-of-line to the
525 initial state. */
526 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
527
528 /* If set, reset graphic planes and registers before any control
529 characters to the initial state. */
530 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
531
532 /* If set, encode by 7-bit environment. */
533 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
534
535 /* If set, use locking-shift function. */
536 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
537
538 /* If set, use single-shift function. Overwrite
539 CODING_ISO_FLAG_LOCKING_SHIFT. */
540 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
541
542 /* If set, use designation escape sequence. */
543 #define CODING_ISO_FLAG_DESIGNATION 0x0040
544
545 /* If set, produce revision number sequence. */
546 #define CODING_ISO_FLAG_REVISION 0x0080
547
548 /* If set, produce ISO6429's direction specifying sequence. */
549 #define CODING_ISO_FLAG_DIRECTION 0x0100
550
551 /* If set, assume designation states are reset at beginning of line on
552 output. */
553 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
554
555 /* If set, designation sequence should be placed at beginning of line
556 on output. */
557 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
558
559 /* If set, do not encode unsafe charactes on output. */
560 #define CODING_ISO_FLAG_SAFE 0x0800
561
562 /* If set, extra latin codes (128..159) are accepted as a valid code
563 on input. */
564 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
565
566 #define CODING_ISO_FLAG_COMPOSITION 0x2000
567
568 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
569
570 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
571
572 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
573
574 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
575
576 /* A character to be produced on output if encoding of the original
577 character is prohibited by CODING_ISO_FLAG_SAFE. */
578 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
579
580
581 /* UTF-16 section */
582 #define CODING_UTF_16_BOM(coding) \
583 ((coding)->spec.utf_16.bom)
584
585 #define CODING_UTF_16_ENDIAN(coding) \
586 ((coding)->spec.utf_16.endian)
587
588 #define CODING_UTF_16_SURROGATE(coding) \
589 ((coding)->spec.utf_16.surrogate)
590
591
592 /* CCL section */
593 #define CODING_CCL_DECODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
595 #define CODING_CCL_ENCODER(coding) \
596 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
597 #define CODING_CCL_VALIDS(coding) \
598 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
599
600 /* Index for each coding category in `coding_categories' */
601
602 enum coding_category
603 {
604 coding_category_iso_7,
605 coding_category_iso_7_tight,
606 coding_category_iso_8_1,
607 coding_category_iso_8_2,
608 coding_category_iso_7_else,
609 coding_category_iso_8_else,
610 coding_category_utf_8,
611 coding_category_utf_16_auto,
612 coding_category_utf_16_be,
613 coding_category_utf_16_le,
614 coding_category_utf_16_be_nosig,
615 coding_category_utf_16_le_nosig,
616 coding_category_charset,
617 coding_category_sjis,
618 coding_category_big5,
619 coding_category_ccl,
620 coding_category_emacs_mule,
621 /* All above are targets of code detection. */
622 coding_category_raw_text,
623 coding_category_undecided,
624 coding_category_max
625 };
626
627 /* Definitions of flag bits used in detect_coding_XXXX. */
628 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
629 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
630 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
631 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
632 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
633 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
634 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
635 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
636 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
637 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
638 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
639 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
640 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
641 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
642 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
643 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
644 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
645 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
646
647 /* This value is returned if detect_coding_mask () find nothing other
648 than ASCII characters. */
649 #define CATEGORY_MASK_ANY \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_8_1 \
653 | CATEGORY_MASK_ISO_8_2 \
654 | CATEGORY_MASK_ISO_7_ELSE \
655 | CATEGORY_MASK_ISO_8_ELSE \
656 | CATEGORY_MASK_UTF_8 \
657 | CATEGORY_MASK_UTF_16_BE \
658 | CATEGORY_MASK_UTF_16_LE \
659 | CATEGORY_MASK_UTF_16_BE_NOSIG \
660 | CATEGORY_MASK_UTF_16_LE_NOSIG \
661 | CATEGORY_MASK_CHARSET \
662 | CATEGORY_MASK_SJIS \
663 | CATEGORY_MASK_BIG5 \
664 | CATEGORY_MASK_CCL \
665 | CATEGORY_MASK_EMACS_MULE)
666
667
668 #define CATEGORY_MASK_ISO_7BIT \
669 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
670
671 #define CATEGORY_MASK_ISO_8BIT \
672 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
673
674 #define CATEGORY_MASK_ISO_ELSE \
675 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
676
677 #define CATEGORY_MASK_ISO_ESCAPE \
678 (CATEGORY_MASK_ISO_7 \
679 | CATEGORY_MASK_ISO_7_TIGHT \
680 | CATEGORY_MASK_ISO_7_ELSE \
681 | CATEGORY_MASK_ISO_8_ELSE)
682
683 #define CATEGORY_MASK_ISO \
684 ( CATEGORY_MASK_ISO_7BIT \
685 | CATEGORY_MASK_ISO_8BIT \
686 | CATEGORY_MASK_ISO_ELSE)
687
688 #define CATEGORY_MASK_UTF_16 \
689 (CATEGORY_MASK_UTF_16_BE \
690 | CATEGORY_MASK_UTF_16_LE \
691 | CATEGORY_MASK_UTF_16_BE_NOSIG \
692 | CATEGORY_MASK_UTF_16_LE_NOSIG)
693
694
695 /* List of symbols `coding-category-xxx' ordered by priority. This
696 variable is exposed to Emacs Lisp. */
697 static Lisp_Object Vcoding_category_list;
698
699 /* Table of coding categories (Lisp symbols). This variable is for
700 internal use oly. */
701 static Lisp_Object Vcoding_category_table;
702
703 /* Table of coding-categories ordered by priority. */
704 static enum coding_category coding_priorities[coding_category_max];
705
706 /* Nth element is a coding context for the coding system bound to the
707 Nth coding category. */
708 static struct coding_system coding_categories[coding_category_max];
709
710 /*** Commonly used macros and functions ***/
711
712 #ifndef min
713 #define min(a, b) ((a) < (b) ? (a) : (b))
714 #endif
715 #ifndef max
716 #define max(a, b) ((a) > (b) ? (a) : (b))
717 #endif
718
719 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
720 do { \
721 attrs = CODING_ID_ATTRS (coding->id); \
722 eol_type = CODING_ID_EOL_TYPE (coding->id); \
723 if (VECTORP (eol_type)) \
724 eol_type = Qunix; \
725 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
726 } while (0)
727
728
729 /* Safely get one byte from the source text pointed by SRC which ends
730 at SRC_END, and set C to that byte. If there are not enough bytes
731 in the source, it jumps to `no_more_source'. The caller
732 should declare and set these variables appropriately in advance:
733 src, src_end, multibytep
734 */
735
736 #define ONE_MORE_BYTE(c) \
737 do { \
738 if (src == src_end) \
739 { \
740 if (src_base < src) \
741 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
742 goto no_more_source; \
743 } \
744 c = *src++; \
745 if (multibytep && (c & 0x80)) \
746 { \
747 if ((c & 0xFE) != 0xC0) \
748 error ("Undecodable char found"); \
749 c = ((c & 1) << 6) | *src++; \
750 } \
751 consumed_chars++; \
752 } while (0)
753
754
755 #define ONE_MORE_BYTE_NO_CHECK(c) \
756 do { \
757 c = *src++; \
758 if (multibytep && (c & 0x80)) \
759 { \
760 if ((c & 0xFE) != 0xC0) \
761 error ("Undecodable char found"); \
762 c = ((c & 1) << 6) | *src++; \
763 } \
764 consumed_chars++; \
765 } while (0)
766
767
768 /* Store a byte C in the place pointed by DST and increment DST to the
769 next free point, and increment PRODUCED_CHARS. The caller should
770 assure that C is 0..127, and declare and set the variable `dst'
771 appropriately in advance.
772 */
773
774
775 #define EMIT_ONE_ASCII_BYTE(c) \
776 do { \
777 produced_chars++; \
778 *dst++ = (c); \
779 } while (0)
780
781
782 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
783
784 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
785 do { \
786 produced_chars += 2; \
787 *dst++ = (c1), *dst++ = (c2); \
788 } while (0)
789
790
791 /* Store a byte C in the place pointed by DST and increment DST to the
792 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
793 nonzero, store in an appropriate multibyte from. The caller should
794 declare and set the variables `dst' and `multibytep' appropriately
795 in advance. */
796
797 #define EMIT_ONE_BYTE(c) \
798 do { \
799 produced_chars++; \
800 if (multibytep) \
801 { \
802 int ch = (c); \
803 if (ch >= 0x80) \
804 ch = BYTE8_TO_CHAR (ch); \
805 CHAR_STRING_ADVANCE (ch, dst); \
806 } \
807 else \
808 *dst++ = (c); \
809 } while (0)
810
811
812 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
813
814 #define EMIT_TWO_BYTES(c1, c2) \
815 do { \
816 produced_chars += 2; \
817 if (multibytep) \
818 { \
819 int ch; \
820 \
821 ch = (c1); \
822 if (ch >= 0x80) \
823 ch = BYTE8_TO_CHAR (ch); \
824 CHAR_STRING_ADVANCE (ch, dst); \
825 ch = (c2); \
826 if (ch >= 0x80) \
827 ch = BYTE8_TO_CHAR (ch); \
828 CHAR_STRING_ADVANCE (ch, dst); \
829 } \
830 else \
831 { \
832 *dst++ = (c1); \
833 *dst++ = (c2); \
834 } \
835 } while (0)
836
837
838 #define EMIT_THREE_BYTES(c1, c2, c3) \
839 do { \
840 EMIT_ONE_BYTE (c1); \
841 EMIT_TWO_BYTES (c2, c3); \
842 } while (0)
843
844
845 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
846 do { \
847 EMIT_TWO_BYTES (c1, c2); \
848 EMIT_TWO_BYTES (c3, c4); \
849 } while (0)
850
851
852 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
853 do { \
854 charset_map_loaded = 0; \
855 c = DECODE_CHAR (charset, code); \
856 if (charset_map_loaded) \
857 { \
858 const unsigned char *orig = coding->source; \
859 EMACS_INT offset; \
860 \
861 coding_set_source (coding); \
862 offset = coding->source - orig; \
863 src += offset; \
864 src_base += offset; \
865 src_end += offset; \
866 } \
867 } while (0)
868
869
870 #define ASSURE_DESTINATION(bytes) \
871 do { \
872 if (dst + (bytes) >= dst_end) \
873 { \
874 int more_bytes = charbuf_end - charbuf + (bytes); \
875 \
876 dst = alloc_destination (coding, more_bytes, dst); \
877 dst_end = coding->destination + coding->dst_bytes; \
878 } \
879 } while (0)
880
881
882
883 static void
884 coding_set_source (coding)
885 struct coding_system *coding;
886 {
887 if (BUFFERP (coding->src_object))
888 {
889 struct buffer *buf = XBUFFER (coding->src_object);
890
891 if (coding->src_pos < 0)
892 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
893 else
894 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
895 }
896 else if (STRINGP (coding->src_object))
897 {
898 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
899 }
900 else
901 /* Otherwise, the source is C string and is never relocated
902 automatically. Thus we don't have to update anything. */
903 ;
904 }
905
906 static void
907 coding_set_destination (coding)
908 struct coding_system *coding;
909 {
910 if (BUFFERP (coding->dst_object))
911 {
912 if (coding->src_pos < 0)
913 {
914 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
915 coding->dst_bytes = (GAP_END_ADDR
916 - (coding->src_bytes - coding->consumed)
917 - coding->destination);
918 }
919 else
920 {
921 /* We are sure that coding->dst_pos_byte is before the gap
922 of the buffer. */
923 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
924 + coding->dst_pos_byte - 1);
925 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
926 - coding->destination);
927 }
928 }
929 else
930 /* Otherwise, the destination is C string and is never relocated
931 automatically. Thus we don't have to update anything. */
932 ;
933 }
934
935
936 static void
937 coding_alloc_by_realloc (coding, bytes)
938 struct coding_system *coding;
939 EMACS_INT bytes;
940 {
941 coding->destination = (unsigned char *) xrealloc (coding->destination,
942 coding->dst_bytes + bytes);
943 coding->dst_bytes += bytes;
944 }
945
946 static void
947 coding_alloc_by_making_gap (coding, bytes)
948 struct coding_system *coding;
949 EMACS_INT bytes;
950 {
951 if (BUFFERP (coding->dst_object)
952 && EQ (coding->src_object, coding->dst_object))
953 {
954 EMACS_INT add = coding->src_bytes - coding->consumed;
955
956 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
957 make_gap (bytes);
958 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
959 }
960 else
961 {
962 Lisp_Object this_buffer;
963
964 this_buffer = Fcurrent_buffer ();
965 set_buffer_internal (XBUFFER (coding->dst_object));
966 make_gap (bytes);
967 set_buffer_internal (XBUFFER (this_buffer));
968 }
969 }
970
971
972 static unsigned char *
973 alloc_destination (coding, nbytes, dst)
974 struct coding_system *coding;
975 int nbytes;
976 unsigned char *dst;
977 {
978 EMACS_INT offset = dst - coding->destination;
979
980 if (BUFFERP (coding->dst_object))
981 coding_alloc_by_making_gap (coding, nbytes);
982 else
983 coding_alloc_by_realloc (coding, nbytes);
984 coding->result = CODING_RESULT_SUCCESS;
985 coding_set_destination (coding);
986 dst = coding->destination + offset;
987 return dst;
988 }
989
990 /** Macros for annotations. */
991
992 /* Maximum length of annotation data (sum of annotations for
993 composition and charset). */
994 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
995
996 /* An annotation data is stored in the array coding->charbuf in this
997 format:
998 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
999 LENGTH is the number of elements in the annotation.
1000 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1001 FROM and TO specify the range of text annotated. They are relative
1002 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1003
1004 The format of the following elements depend on ANNOTATION_MASK.
1005
1006 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1007 follows:
1008 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1009 METHOD is one of enum composition_method.
1010 Optionnal COMPOSITION-COMPONENTS are characters and composition
1011 rules.
1012
1013 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1014 follows. */
1015
1016 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1017 do { \
1018 *(buf)++ = -(len); \
1019 *(buf)++ = (mask); \
1020 *(buf)++ = (from); \
1021 *(buf)++ = (to); \
1022 coding->annotated = 1; \
1023 } while (0);
1024
1025 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1026 do { \
1027 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1028 *buf++ = method; \
1029 } while (0)
1030
1031
1032 #define ADD_CHARSET_DATA(buf, from, to, id) \
1033 do { \
1034 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1035 *buf++ = id; \
1036 } while (0)
1037
1038 \f
1039 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1040
1041
1042
1043 \f
1044 /*** 3. UTF-8 ***/
1045
1046 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1047 Check if a text is encoded in UTF-8. If it is, return 1, else
1048 return 0. */
1049
1050 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1051 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1052 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1053 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1054 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1055 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1056
1057 static int
1058 detect_coding_utf_8 (coding, detect_info)
1059 struct coding_system *coding;
1060 struct coding_detection_info *detect_info;
1061 {
1062 const unsigned char *src = coding->source, *src_base = src;
1063 const unsigned char *src_end = coding->source + coding->src_bytes;
1064 int multibytep = coding->src_multibyte;
1065 int consumed_chars = 0;
1066 int found = 0;
1067 int incomplete;
1068
1069 detect_info->checked |= CATEGORY_MASK_UTF_8;
1070 /* A coding system of this category is always ASCII compatible. */
1071 src += coding->head_ascii;
1072
1073 while (1)
1074 {
1075 int c, c1, c2, c3, c4;
1076
1077 incomplete = 0;
1078 ONE_MORE_BYTE (c);
1079 if (UTF_8_1_OCTET_P (c))
1080 continue;
1081 incomplete = 1;
1082 ONE_MORE_BYTE (c1);
1083 if (! UTF_8_EXTRA_OCTET_P (c1))
1084 break;
1085 if (UTF_8_2_OCTET_LEADING_P (c))
1086 {
1087 found = CATEGORY_MASK_UTF_8;
1088 continue;
1089 }
1090 ONE_MORE_BYTE (c2);
1091 if (! UTF_8_EXTRA_OCTET_P (c2))
1092 break;
1093 if (UTF_8_3_OCTET_LEADING_P (c))
1094 {
1095 found = CATEGORY_MASK_UTF_8;
1096 continue;
1097 }
1098 ONE_MORE_BYTE (c3);
1099 if (! UTF_8_EXTRA_OCTET_P (c3))
1100 break;
1101 if (UTF_8_4_OCTET_LEADING_P (c))
1102 {
1103 found = CATEGORY_MASK_UTF_8;
1104 continue;
1105 }
1106 ONE_MORE_BYTE (c4);
1107 if (! UTF_8_EXTRA_OCTET_P (c4))
1108 break;
1109 if (UTF_8_5_OCTET_LEADING_P (c))
1110 {
1111 found = CATEGORY_MASK_UTF_8;
1112 continue;
1113 }
1114 break;
1115 }
1116 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1117 return 0;
1118
1119 no_more_source:
1120 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1121 {
1122 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1123 return 0;
1124 }
1125 detect_info->found |= found;
1126 return 1;
1127 }
1128
1129
1130 static void
1131 decode_coding_utf_8 (coding)
1132 struct coding_system *coding;
1133 {
1134 const unsigned char *src = coding->source + coding->consumed;
1135 const unsigned char *src_end = coding->source + coding->src_bytes;
1136 const unsigned char *src_base;
1137 int *charbuf = coding->charbuf;
1138 int *charbuf_end = charbuf + coding->charbuf_size;
1139 int consumed_chars = 0, consumed_chars_base;
1140 int multibytep = coding->src_multibyte;
1141 Lisp_Object attr, eol_type, charset_list;
1142
1143 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1144
1145 while (1)
1146 {
1147 int c, c1, c2, c3, c4, c5;
1148
1149 src_base = src;
1150 consumed_chars_base = consumed_chars;
1151
1152 if (charbuf >= charbuf_end)
1153 break;
1154
1155 ONE_MORE_BYTE (c1);
1156 if (UTF_8_1_OCTET_P(c1))
1157 {
1158 c = c1;
1159 if (c == '\r')
1160 {
1161 if (EQ (eol_type, Qdos))
1162 {
1163 if (src == src_end)
1164 {
1165 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1166 goto no_more_source;
1167 }
1168 if (*src == '\n')
1169 ONE_MORE_BYTE (c);
1170 }
1171 else if (EQ (eol_type, Qmac))
1172 c = '\n';
1173 }
1174 }
1175 else
1176 {
1177 ONE_MORE_BYTE (c2);
1178 if (! UTF_8_EXTRA_OCTET_P (c2))
1179 goto invalid_code;
1180 if (UTF_8_2_OCTET_LEADING_P (c1))
1181 {
1182 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1183 /* Reject overlong sequences here and below. Encoders
1184 producing them are incorrect, they can be misleading,
1185 and they mess up read/write invariance. */
1186 if (c < 128)
1187 goto invalid_code;
1188 }
1189 else
1190 {
1191 ONE_MORE_BYTE (c3);
1192 if (! UTF_8_EXTRA_OCTET_P (c3))
1193 goto invalid_code;
1194 if (UTF_8_3_OCTET_LEADING_P (c1))
1195 {
1196 c = (((c1 & 0xF) << 12)
1197 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1198 if (c < 0x800
1199 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1200 goto invalid_code;
1201 }
1202 else
1203 {
1204 ONE_MORE_BYTE (c4);
1205 if (! UTF_8_EXTRA_OCTET_P (c4))
1206 goto invalid_code;
1207 if (UTF_8_4_OCTET_LEADING_P (c1))
1208 {
1209 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1210 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1211 if (c < 0x10000)
1212 goto invalid_code;
1213 }
1214 else
1215 {
1216 ONE_MORE_BYTE (c5);
1217 if (! UTF_8_EXTRA_OCTET_P (c5))
1218 goto invalid_code;
1219 if (UTF_8_5_OCTET_LEADING_P (c1))
1220 {
1221 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1222 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1223 | (c5 & 0x3F));
1224 if ((c > MAX_CHAR) || (c < 0x200000))
1225 goto invalid_code;
1226 }
1227 else
1228 goto invalid_code;
1229 }
1230 }
1231 }
1232 }
1233
1234 *charbuf++ = c;
1235 continue;
1236
1237 invalid_code:
1238 src = src_base;
1239 consumed_chars = consumed_chars_base;
1240 ONE_MORE_BYTE (c);
1241 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1242 coding->errors++;
1243 }
1244
1245 no_more_source:
1246 coding->consumed_char += consumed_chars_base;
1247 coding->consumed = src_base - coding->source;
1248 coding->charbuf_used = charbuf - coding->charbuf;
1249 }
1250
1251
1252 static int
1253 encode_coding_utf_8 (coding)
1254 struct coding_system *coding;
1255 {
1256 int multibytep = coding->dst_multibyte;
1257 int *charbuf = coding->charbuf;
1258 int *charbuf_end = charbuf + coding->charbuf_used;
1259 unsigned char *dst = coding->destination + coding->produced;
1260 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1261 int produced_chars = 0;
1262 int c;
1263
1264 if (multibytep)
1265 {
1266 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1267
1268 while (charbuf < charbuf_end)
1269 {
1270 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1271
1272 ASSURE_DESTINATION (safe_room);
1273 c = *charbuf++;
1274 if (CHAR_BYTE8_P (c))
1275 {
1276 c = CHAR_TO_BYTE8 (c);
1277 EMIT_ONE_BYTE (c);
1278 }
1279 else
1280 {
1281 CHAR_STRING_ADVANCE (c, pend);
1282 for (p = str; p < pend; p++)
1283 EMIT_ONE_BYTE (*p);
1284 }
1285 }
1286 }
1287 else
1288 {
1289 int safe_room = MAX_MULTIBYTE_LENGTH;
1290
1291 while (charbuf < charbuf_end)
1292 {
1293 ASSURE_DESTINATION (safe_room);
1294 c = *charbuf++;
1295 dst += CHAR_STRING (c, dst);
1296 produced_chars++;
1297 }
1298 }
1299 coding->result = CODING_RESULT_SUCCESS;
1300 coding->produced_char += produced_chars;
1301 coding->produced = dst - coding->destination;
1302 return 0;
1303 }
1304
1305
1306 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1307 Check if a text is encoded in one of UTF-16 based coding systems.
1308 If it is, return 1, else return 0. */
1309
1310 #define UTF_16_HIGH_SURROGATE_P(val) \
1311 (((val) & 0xFC00) == 0xD800)
1312
1313 #define UTF_16_LOW_SURROGATE_P(val) \
1314 (((val) & 0xFC00) == 0xDC00)
1315
1316 #define UTF_16_INVALID_P(val) \
1317 (((val) == 0xFFFE) \
1318 || ((val) == 0xFFFF) \
1319 || UTF_16_LOW_SURROGATE_P (val))
1320
1321
1322 static int
1323 detect_coding_utf_16 (coding, detect_info)
1324 struct coding_system *coding;
1325 struct coding_detection_info *detect_info;
1326 {
1327 const unsigned char *src = coding->source, *src_base = src;
1328 const unsigned char *src_end = coding->source + coding->src_bytes;
1329 int multibytep = coding->src_multibyte;
1330 int consumed_chars = 0;
1331 int c1, c2;
1332
1333 detect_info->checked |= CATEGORY_MASK_UTF_16;
1334
1335 if (coding->mode & CODING_MODE_LAST_BLOCK
1336 && (coding->src_bytes & 1))
1337 {
1338 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1339 return 0;
1340 }
1341 ONE_MORE_BYTE (c1);
1342 ONE_MORE_BYTE (c2);
1343
1344 if ((c1 == 0xFF) && (c2 == 0xFE))
1345 {
1346 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1347 | CATEGORY_MASK_UTF_16_AUTO);
1348 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1349 }
1350 else if ((c1 == 0xFE) && (c2 == 0xFF))
1351 {
1352 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1353 | CATEGORY_MASK_UTF_16_AUTO);
1354 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1355 }
1356 no_more_source:
1357 return 1;
1358 }
1359
1360 static void
1361 decode_coding_utf_16 (coding)
1362 struct coding_system *coding;
1363 {
1364 const unsigned char *src = coding->source + coding->consumed;
1365 const unsigned char *src_end = coding->source + coding->src_bytes;
1366 const unsigned char *src_base;
1367 int *charbuf = coding->charbuf;
1368 int *charbuf_end = charbuf + coding->charbuf_size;
1369 int consumed_chars = 0, consumed_chars_base;
1370 int multibytep = coding->src_multibyte;
1371 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1372 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1373 int surrogate = CODING_UTF_16_SURROGATE (coding);
1374 Lisp_Object attr, eol_type, charset_list;
1375
1376 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1377
1378 if (bom == utf_16_with_bom)
1379 {
1380 int c, c1, c2;
1381
1382 src_base = src;
1383 ONE_MORE_BYTE (c1);
1384 ONE_MORE_BYTE (c2);
1385 c = (c1 << 8) | c2;
1386
1387 if (endian == utf_16_big_endian
1388 ? c != 0xFEFF : c != 0xFFFE)
1389 {
1390 /* The first two bytes are not BOM. Treat them as bytes
1391 for a normal character. */
1392 src = src_base;
1393 coding->errors++;
1394 }
1395 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1396 }
1397 else if (bom == utf_16_detect_bom)
1398 {
1399 /* We have already tried to detect BOM and failed in
1400 detect_coding. */
1401 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1402 }
1403
1404 while (1)
1405 {
1406 int c, c1, c2;
1407
1408 src_base = src;
1409 consumed_chars_base = consumed_chars;
1410
1411 if (charbuf + 2 >= charbuf_end)
1412 break;
1413
1414 ONE_MORE_BYTE (c1);
1415 ONE_MORE_BYTE (c2);
1416 c = (endian == utf_16_big_endian
1417 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1418 if (surrogate)
1419 {
1420 if (! UTF_16_LOW_SURROGATE_P (c))
1421 {
1422 if (endian == utf_16_big_endian)
1423 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1424 else
1425 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1426 *charbuf++ = c1;
1427 *charbuf++ = c2;
1428 coding->errors++;
1429 if (UTF_16_HIGH_SURROGATE_P (c))
1430 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1431 else
1432 *charbuf++ = c;
1433 }
1434 else
1435 {
1436 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1437 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1438 *charbuf++ = c;
1439 }
1440 }
1441 else
1442 {
1443 if (UTF_16_HIGH_SURROGATE_P (c))
1444 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1445 else
1446 *charbuf++ = c;
1447 }
1448 }
1449
1450 no_more_source:
1451 coding->consumed_char += consumed_chars_base;
1452 coding->consumed = src_base - coding->source;
1453 coding->charbuf_used = charbuf - coding->charbuf;
1454 }
1455
1456 static int
1457 encode_coding_utf_16 (coding)
1458 struct coding_system *coding;
1459 {
1460 int multibytep = coding->dst_multibyte;
1461 int *charbuf = coding->charbuf;
1462 int *charbuf_end = charbuf + coding->charbuf_used;
1463 unsigned char *dst = coding->destination + coding->produced;
1464 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1465 int safe_room = 8;
1466 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1467 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1468 int produced_chars = 0;
1469 Lisp_Object attrs, eol_type, charset_list;
1470 int c;
1471
1472 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1473
1474 if (bom != utf_16_without_bom)
1475 {
1476 ASSURE_DESTINATION (safe_room);
1477 if (big_endian)
1478 EMIT_TWO_BYTES (0xFE, 0xFF);
1479 else
1480 EMIT_TWO_BYTES (0xFF, 0xFE);
1481 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1482 }
1483
1484 while (charbuf < charbuf_end)
1485 {
1486 ASSURE_DESTINATION (safe_room);
1487 c = *charbuf++;
1488 if (c >= MAX_UNICODE_CHAR)
1489 c = coding->default_char;
1490
1491 if (c < 0x10000)
1492 {
1493 if (big_endian)
1494 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1495 else
1496 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1497 }
1498 else
1499 {
1500 int c1, c2;
1501
1502 c -= 0x10000;
1503 c1 = (c >> 10) + 0xD800;
1504 c2 = (c & 0x3FF) + 0xDC00;
1505 if (big_endian)
1506 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1507 else
1508 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1509 }
1510 }
1511 coding->result = CODING_RESULT_SUCCESS;
1512 coding->produced = dst - coding->destination;
1513 coding->produced_char += produced_chars;
1514 return 0;
1515 }
1516
1517 \f
1518 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1519
1520 /* Emacs' internal format for representation of multiple character
1521 sets is a kind of multi-byte encoding, i.e. characters are
1522 represented by variable-length sequences of one-byte codes.
1523
1524 ASCII characters and control characters (e.g. `tab', `newline') are
1525 represented by one-byte sequences which are their ASCII codes, in
1526 the range 0x00 through 0x7F.
1527
1528 8-bit characters of the range 0x80..0x9F are represented by
1529 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1530 code + 0x20).
1531
1532 8-bit characters of the range 0xA0..0xFF are represented by
1533 one-byte sequences which are their 8-bit code.
1534
1535 The other characters are represented by a sequence of `base
1536 leading-code', optional `extended leading-code', and one or two
1537 `position-code's. The length of the sequence is determined by the
1538 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1539 whereas extended leading-code and position-code take the range 0xA0
1540 through 0xFF. See `charset.h' for more details about leading-code
1541 and position-code.
1542
1543 --- CODE RANGE of Emacs' internal format ---
1544 character set range
1545 ------------- -----
1546 ascii 0x00..0x7F
1547 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1548 eight-bit-graphic 0xA0..0xBF
1549 ELSE 0x81..0x9D + [0xA0..0xFF]+
1550 ---------------------------------------------
1551
1552 As this is the internal character representation, the format is
1553 usually not used externally (i.e. in a file or in a data sent to a
1554 process). But, it is possible to have a text externally in this
1555 format (i.e. by encoding by the coding system `emacs-mule').
1556
1557 In that case, a sequence of one-byte codes has a slightly different
1558 form.
1559
1560 At first, all characters in eight-bit-control are represented by
1561 one-byte sequences which are their 8-bit code.
1562
1563 Next, character composition data are represented by the byte
1564 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1565 where,
1566 METHOD is 0xF0 plus one of composition method (enum
1567 composition_method),
1568
1569 BYTES is 0xA0 plus a byte length of this composition data,
1570
1571 CHARS is 0x20 plus a number of characters composed by this
1572 data,
1573
1574 COMPONENTs are characters of multibye form or composition
1575 rules encoded by two-byte of ASCII codes.
1576
1577 In addition, for backward compatibility, the following formats are
1578 also recognized as composition data on decoding.
1579
1580 0x80 MSEQ ...
1581 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1582
1583 Here,
1584 MSEQ is a multibyte form but in these special format:
1585 ASCII: 0xA0 ASCII_CODE+0x80,
1586 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1587 RULE is a one byte code of the range 0xA0..0xF0 that
1588 represents a composition rule.
1589 */
1590
1591 char emacs_mule_bytes[256];
1592
1593 int
1594 emacs_mule_char (coding, src, nbytes, nchars, id)
1595 struct coding_system *coding;
1596 unsigned char *src;
1597 int *nbytes, *nchars, *id;
1598 {
1599 const unsigned char *src_end = coding->source + coding->src_bytes;
1600 const unsigned char *src_base = src;
1601 int multibytep = coding->src_multibyte;
1602 struct charset *charset;
1603 unsigned code;
1604 int c;
1605 int consumed_chars = 0;
1606
1607 ONE_MORE_BYTE (c);
1608 switch (emacs_mule_bytes[c])
1609 {
1610 case 2:
1611 if (! (charset = emacs_mule_charset[c]))
1612 goto invalid_code;
1613 ONE_MORE_BYTE (c);
1614 code = c & 0x7F;
1615 break;
1616
1617 case 3:
1618 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1619 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1620 {
1621 ONE_MORE_BYTE (c);
1622 if (! (charset = emacs_mule_charset[c]))
1623 goto invalid_code;
1624 ONE_MORE_BYTE (c);
1625 code = c & 0x7F;
1626 }
1627 else
1628 {
1629 if (! (charset = emacs_mule_charset[c]))
1630 goto invalid_code;
1631 ONE_MORE_BYTE (c);
1632 code = (c & 0x7F) << 8;
1633 ONE_MORE_BYTE (c);
1634 code |= c & 0x7F;
1635 }
1636 break;
1637
1638 case 4:
1639 ONE_MORE_BYTE (c);
1640 if (! (charset = emacs_mule_charset[c]))
1641 goto invalid_code;
1642 ONE_MORE_BYTE (c);
1643 code = (c & 0x7F) << 8;
1644 ONE_MORE_BYTE (c);
1645 code |= c & 0x7F;
1646 break;
1647
1648 case 1:
1649 code = c;
1650 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1651 ? charset_ascii : charset_eight_bit);
1652 break;
1653
1654 default:
1655 abort ();
1656 }
1657 c = DECODE_CHAR (charset, code);
1658 if (c < 0)
1659 goto invalid_code;
1660 *nbytes = src - src_base;
1661 *nchars = consumed_chars;
1662 if (id)
1663 *id = charset->id;
1664 return c;
1665
1666 no_more_source:
1667 return -2;
1668
1669 invalid_code:
1670 return -1;
1671 }
1672
1673
1674 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1675 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1676 else return 0. */
1677
1678 static int
1679 detect_coding_emacs_mule (coding, detect_info)
1680 struct coding_system *coding;
1681 struct coding_detection_info *detect_info;
1682 {
1683 const unsigned char *src = coding->source, *src_base = src;
1684 const unsigned char *src_end = coding->source + coding->src_bytes;
1685 int multibytep = coding->src_multibyte;
1686 int consumed_chars = 0;
1687 int c;
1688 int found = 0;
1689 int incomplete;
1690
1691 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1692 /* A coding system of this category is always ASCII compatible. */
1693 src += coding->head_ascii;
1694
1695 while (1)
1696 {
1697 incomplete = 0;
1698 ONE_MORE_BYTE (c);
1699 incomplete = 1;
1700
1701 if (c == 0x80)
1702 {
1703 /* Perhaps the start of composite character. We simple skip
1704 it because analyzing it is too heavy for detecting. But,
1705 at least, we check that the composite character
1706 constitues of more than 4 bytes. */
1707 const unsigned char *src_base;
1708
1709 repeat:
1710 src_base = src;
1711 do
1712 {
1713 ONE_MORE_BYTE (c);
1714 }
1715 while (c >= 0xA0);
1716
1717 if (src - src_base <= 4)
1718 break;
1719 found = CATEGORY_MASK_EMACS_MULE;
1720 if (c == 0x80)
1721 goto repeat;
1722 }
1723
1724 if (c < 0x80)
1725 {
1726 if (c < 0x20
1727 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1728 break;
1729 }
1730 else
1731 {
1732 const unsigned char *src_base = src - 1;
1733
1734 do
1735 {
1736 ONE_MORE_BYTE (c);
1737 }
1738 while (c >= 0xA0);
1739 if (src - src_base != emacs_mule_bytes[*src_base])
1740 break;
1741 found = CATEGORY_MASK_EMACS_MULE;
1742 }
1743 }
1744 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1745 return 0;
1746
1747 no_more_source:
1748 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1749 {
1750 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1751 return 0;
1752 }
1753 detect_info->found |= found;
1754 return 1;
1755 }
1756
1757
1758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1759
1760 /* Decode a character represented as a component of composition
1761 sequence of Emacs 20/21 style at SRC. Set C to that character and
1762 update SRC to the head of next character (or an encoded composition
1763 rule). If SRC doesn't points a composition component, set C to -1.
1764 If SRC points an invalid byte sequence, global exit by a return
1765 value 0. */
1766
1767 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1768 if (1) \
1769 { \
1770 int c; \
1771 int nbytes, nchars; \
1772 \
1773 if (src == src_end) \
1774 break; \
1775 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1776 if (c < 0) \
1777 { \
1778 if (c == -2) \
1779 break; \
1780 goto invalid_code; \
1781 } \
1782 *buf++ = c; \
1783 src += nbytes; \
1784 consumed_chars += nchars; \
1785 } \
1786 else
1787
1788
1789 /* Decode a composition rule represented as a component of composition
1790 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1791 and increment BUF. If SRC points an invalid byte sequence, set C
1792 to -1. */
1793
1794 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1795 do { \
1796 int c, gref, nref; \
1797 \
1798 if (src >= src_end) \
1799 goto invalid_code; \
1800 ONE_MORE_BYTE_NO_CHECK (c); \
1801 c -= 0x20; \
1802 if (c < 0 || c >= 81) \
1803 goto invalid_code; \
1804 \
1805 gref = c / 9, nref = c % 9; \
1806 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1807 } while (0)
1808
1809
1810 /* Decode a composition rule represented as a component of composition
1811 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1812 and increment BUF. If SRC points an invalid byte sequence, set C
1813 to -1. */
1814
1815 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1816 do { \
1817 int gref, nref; \
1818 \
1819 if (src + 1>= src_end) \
1820 goto invalid_code; \
1821 ONE_MORE_BYTE_NO_CHECK (gref); \
1822 gref -= 0x20; \
1823 ONE_MORE_BYTE_NO_CHECK (nref); \
1824 nref -= 0x20; \
1825 if (gref < 0 || gref >= 81 \
1826 || nref < 0 || nref >= 81) \
1827 goto invalid_code; \
1828 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1829 } while (0)
1830
1831
1832 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1833 do { \
1834 /* Emacs 21 style format. The first three bytes at SRC are \
1835 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1836 the byte length of this composition information, CHARS is the \
1837 number of characters composed by this composition. */ \
1838 enum composition_method method = c - 0xF2; \
1839 int *charbuf_base = charbuf; \
1840 int from, to; \
1841 int consumed_chars_limit; \
1842 int nbytes, nchars; \
1843 \
1844 ONE_MORE_BYTE (c); \
1845 nbytes = c - 0xA0; \
1846 if (nbytes < 3) \
1847 goto invalid_code; \
1848 ONE_MORE_BYTE (c); \
1849 nchars = c - 0xA0; \
1850 from = coding->produced + char_offset; \
1851 to = from + nchars; \
1852 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1853 consumed_chars_limit = consumed_chars_base + nbytes; \
1854 if (method != COMPOSITION_RELATIVE) \
1855 { \
1856 int i = 0; \
1857 while (consumed_chars < consumed_chars_limit) \
1858 { \
1859 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1860 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1861 else \
1862 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1863 i++; \
1864 } \
1865 if (consumed_chars < consumed_chars_limit) \
1866 goto invalid_code; \
1867 charbuf_base[0] -= i; \
1868 } \
1869 } while (0)
1870
1871
1872 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1873 do { \
1874 /* Emacs 20 style format for relative composition. */ \
1875 /* Store multibyte form of characters to be composed. */ \
1876 enum composition_method method = COMPOSITION_RELATIVE; \
1877 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1878 int *buf = components; \
1879 int i, j; \
1880 int from, to; \
1881 \
1882 src = src_base; \
1883 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1884 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1885 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1886 if (i < 2) \
1887 goto invalid_code; \
1888 from = coding->produced_char + char_offset; \
1889 to = from + i; \
1890 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1891 for (j = 0; j < i; j++) \
1892 *charbuf++ = components[j]; \
1893 } while (0)
1894
1895
1896 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1897 do { \
1898 /* Emacs 20 style format for rule-base composition. */ \
1899 /* Store multibyte form of characters to be composed. */ \
1900 enum composition_method method = COMPOSITION_WITH_RULE; \
1901 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1902 int *buf = components; \
1903 int i, j; \
1904 int from, to; \
1905 \
1906 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1907 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1908 { \
1909 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1910 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1911 } \
1912 if (i < 1 || (buf - components) % 2 == 0) \
1913 goto invalid_code; \
1914 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1915 goto no_more_source; \
1916 from = coding->produced_char + char_offset; \
1917 to = from + i; \
1918 ADD_COMPOSITION_DATA (buf, from, to, method); \
1919 for (j = 0; j < i; j++) \
1920 *charbuf++ = components[j]; \
1921 for (j = 0; j < i; j += 2) \
1922 *charbuf++ = components[j]; \
1923 } while (0)
1924
1925
1926 static void
1927 decode_coding_emacs_mule (coding)
1928 struct coding_system *coding;
1929 {
1930 const unsigned char *src = coding->source + coding->consumed;
1931 const unsigned char *src_end = coding->source + coding->src_bytes;
1932 const unsigned char *src_base;
1933 int *charbuf = coding->charbuf;
1934 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1935 int consumed_chars = 0, consumed_chars_base;
1936 int multibytep = coding->src_multibyte;
1937 Lisp_Object attrs, eol_type, charset_list;
1938 int char_offset = coding->produced_char;
1939 int last_offset = char_offset;
1940 int last_id = charset_ascii;
1941
1942 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1943
1944 while (1)
1945 {
1946 int c;
1947
1948 src_base = src;
1949 consumed_chars_base = consumed_chars;
1950
1951 if (charbuf >= charbuf_end)
1952 break;
1953
1954 ONE_MORE_BYTE (c);
1955
1956 if (c < 0x80)
1957 {
1958 if (c == '\r')
1959 {
1960 if (EQ (eol_type, Qdos))
1961 {
1962 if (src == src_end)
1963 {
1964 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1965 goto no_more_source;
1966 }
1967 if (*src == '\n')
1968 ONE_MORE_BYTE (c);
1969 }
1970 else if (EQ (eol_type, Qmac))
1971 c = '\n';
1972 }
1973 *charbuf++ = c;
1974 char_offset++;
1975 }
1976 else if (c == 0x80)
1977 {
1978 ONE_MORE_BYTE (c);
1979 if (c - 0xF2 >= COMPOSITION_RELATIVE
1980 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1981 DECODE_EMACS_MULE_21_COMPOSITION (c);
1982 else if (c < 0xC0)
1983 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1984 else if (c == 0xFF)
1985 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1986 else
1987 goto invalid_code;
1988 }
1989 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1990 {
1991 int nbytes, nchars;
1992 int id;
1993
1994 src = src_base;
1995 consumed_chars = consumed_chars_base;
1996 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1997 if (c < 0)
1998 {
1999 if (c == -2)
2000 break;
2001 goto invalid_code;
2002 }
2003 if (last_id != id)
2004 {
2005 if (last_id != charset_ascii)
2006 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2007 last_id = id;
2008 last_offset = char_offset;
2009 }
2010 *charbuf++ = c;
2011 src += nbytes;
2012 consumed_chars += nchars;
2013 char_offset++;
2014 }
2015 continue;
2016
2017 invalid_code:
2018 src = src_base;
2019 consumed_chars = consumed_chars_base;
2020 ONE_MORE_BYTE (c);
2021 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2022 char_offset++;
2023 coding->errors++;
2024 }
2025
2026 no_more_source:
2027 if (last_id != charset_ascii)
2028 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2029 coding->consumed_char += consumed_chars_base;
2030 coding->consumed = src_base - coding->source;
2031 coding->charbuf_used = charbuf - coding->charbuf;
2032 }
2033
2034
2035 #define EMACS_MULE_LEADING_CODES(id, codes) \
2036 do { \
2037 if (id < 0xA0) \
2038 codes[0] = id, codes[1] = 0; \
2039 else if (id < 0xE0) \
2040 codes[0] = 0x9A, codes[1] = id; \
2041 else if (id < 0xF0) \
2042 codes[0] = 0x9B, codes[1] = id; \
2043 else if (id < 0xF5) \
2044 codes[0] = 0x9C, codes[1] = id; \
2045 else \
2046 codes[0] = 0x9D, codes[1] = id; \
2047 } while (0);
2048
2049
2050 static int
2051 encode_coding_emacs_mule (coding)
2052 struct coding_system *coding;
2053 {
2054 int multibytep = coding->dst_multibyte;
2055 int *charbuf = coding->charbuf;
2056 int *charbuf_end = charbuf + coding->charbuf_used;
2057 unsigned char *dst = coding->destination + coding->produced;
2058 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2059 int safe_room = 8;
2060 int produced_chars = 0;
2061 Lisp_Object attrs, eol_type, charset_list;
2062 int c;
2063 int preferred_charset_id = -1;
2064
2065 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2066
2067 while (charbuf < charbuf_end)
2068 {
2069 ASSURE_DESTINATION (safe_room);
2070 c = *charbuf++;
2071
2072 if (c < 0)
2073 {
2074 /* Handle an annotation. */
2075 switch (*charbuf)
2076 {
2077 case CODING_ANNOTATE_COMPOSITION_MASK:
2078 /* Not yet implemented. */
2079 break;
2080 case CODING_ANNOTATE_CHARSET_MASK:
2081 preferred_charset_id = charbuf[3];
2082 if (preferred_charset_id >= 0
2083 && NILP (Fmemq (make_number (preferred_charset_id),
2084 charset_list)))
2085 preferred_charset_id = -1;
2086 break;
2087 default:
2088 abort ();
2089 }
2090 charbuf += -c - 1;
2091 continue;
2092 }
2093
2094 if (ASCII_CHAR_P (c))
2095 EMIT_ONE_ASCII_BYTE (c);
2096 else if (CHAR_BYTE8_P (c))
2097 {
2098 c = CHAR_TO_BYTE8 (c);
2099 EMIT_ONE_BYTE (c);
2100 }
2101 else
2102 {
2103 struct charset *charset;
2104 unsigned code;
2105 int dimension;
2106 int emacs_mule_id;
2107 unsigned char leading_codes[2];
2108
2109 if (preferred_charset_id >= 0)
2110 {
2111 charset = CHARSET_FROM_ID (preferred_charset_id);
2112 if (! CHAR_CHARSET_P (c, charset))
2113 charset = char_charset (c, charset_list, NULL);
2114 }
2115 else
2116 charset = char_charset (c, charset_list, &code);
2117 if (! charset)
2118 {
2119 c = coding->default_char;
2120 if (ASCII_CHAR_P (c))
2121 {
2122 EMIT_ONE_ASCII_BYTE (c);
2123 continue;
2124 }
2125 charset = char_charset (c, charset_list, &code);
2126 }
2127 dimension = CHARSET_DIMENSION (charset);
2128 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2129 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2130 EMIT_ONE_BYTE (leading_codes[0]);
2131 if (leading_codes[1])
2132 EMIT_ONE_BYTE (leading_codes[1]);
2133 if (dimension == 1)
2134 EMIT_ONE_BYTE (code);
2135 else
2136 {
2137 EMIT_ONE_BYTE (code >> 8);
2138 EMIT_ONE_BYTE (code & 0xFF);
2139 }
2140 }
2141 }
2142 coding->result = CODING_RESULT_SUCCESS;
2143 coding->produced_char += produced_chars;
2144 coding->produced = dst - coding->destination;
2145 return 0;
2146 }
2147
2148 \f
2149 /*** 7. ISO2022 handlers ***/
2150
2151 /* The following note describes the coding system ISO2022 briefly.
2152 Since the intention of this note is to help understand the
2153 functions in this file, some parts are NOT ACCURATE or are OVERLY
2154 SIMPLIFIED. For thorough understanding, please refer to the
2155 original document of ISO2022. This is equivalent to the standard
2156 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2157
2158 ISO2022 provides many mechanisms to encode several character sets
2159 in 7-bit and 8-bit environments. For 7-bit environments, all text
2160 is encoded using bytes less than 128. This may make the encoded
2161 text a little bit longer, but the text passes more easily through
2162 several types of gateway, some of which strip off the MSB (Most
2163 Significant Bit).
2164
2165 There are two kinds of character sets: control character sets and
2166 graphic character sets. The former contain control characters such
2167 as `newline' and `escape' to provide control functions (control
2168 functions are also provided by escape sequences). The latter
2169 contain graphic characters such as 'A' and '-'. Emacs recognizes
2170 two control character sets and many graphic character sets.
2171
2172 Graphic character sets are classified into one of the following
2173 four classes, according to the number of bytes (DIMENSION) and
2174 number of characters in one dimension (CHARS) of the set:
2175 - DIMENSION1_CHARS94
2176 - DIMENSION1_CHARS96
2177 - DIMENSION2_CHARS94
2178 - DIMENSION2_CHARS96
2179
2180 In addition, each character set is assigned an identification tag,
2181 unique for each set, called the "final character" (denoted as <F>
2182 hereafter). The <F> of each character set is decided by ECMA(*)
2183 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2184 (0x30..0x3F are for private use only).
2185
2186 Note (*): ECMA = European Computer Manufacturers Association
2187
2188 Here are examples of graphic character sets [NAME(<F>)]:
2189 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2190 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2191 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2192 o DIMENSION2_CHARS96 -- none for the moment
2193
2194 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2195 C0 [0x00..0x1F] -- control character plane 0
2196 GL [0x20..0x7F] -- graphic character plane 0
2197 C1 [0x80..0x9F] -- control character plane 1
2198 GR [0xA0..0xFF] -- graphic character plane 1
2199
2200 A control character set is directly designated and invoked to C0 or
2201 C1 by an escape sequence. The most common case is that:
2202 - ISO646's control character set is designated/invoked to C0, and
2203 - ISO6429's control character set is designated/invoked to C1,
2204 and usually these designations/invocations are omitted in encoded
2205 text. In a 7-bit environment, only C0 can be used, and a control
2206 character for C1 is encoded by an appropriate escape sequence to
2207 fit into the environment. All control characters for C1 are
2208 defined to have corresponding escape sequences.
2209
2210 A graphic character set is at first designated to one of four
2211 graphic registers (G0 through G3), then these graphic registers are
2212 invoked to GL or GR. These designations and invocations can be
2213 done independently. The most common case is that G0 is invoked to
2214 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2215 these invocations and designations are omitted in encoded text.
2216 In a 7-bit environment, only GL can be used.
2217
2218 When a graphic character set of CHARS94 is invoked to GL, codes
2219 0x20 and 0x7F of the GL area work as control characters SPACE and
2220 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2221 be used.
2222
2223 There are two ways of invocation: locking-shift and single-shift.
2224 With locking-shift, the invocation lasts until the next different
2225 invocation, whereas with single-shift, the invocation affects the
2226 following character only and doesn't affect the locking-shift
2227 state. Invocations are done by the following control characters or
2228 escape sequences:
2229
2230 ----------------------------------------------------------------------
2231 abbrev function cntrl escape seq description
2232 ----------------------------------------------------------------------
2233 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2234 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2235 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2236 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2237 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2238 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2239 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2240 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2241 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2242 ----------------------------------------------------------------------
2243 (*) These are not used by any known coding system.
2244
2245 Control characters for these functions are defined by macros
2246 ISO_CODE_XXX in `coding.h'.
2247
2248 Designations are done by the following escape sequences:
2249 ----------------------------------------------------------------------
2250 escape sequence description
2251 ----------------------------------------------------------------------
2252 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2253 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2254 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2255 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2256 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2257 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2258 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2259 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2260 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2261 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2262 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2263 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2264 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2265 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2266 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2267 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2268 ----------------------------------------------------------------------
2269
2270 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2271 of dimension 1, chars 94, and final character <F>, etc...
2272
2273 Note (*): Although these designations are not allowed in ISO2022,
2274 Emacs accepts them on decoding, and produces them on encoding
2275 CHARS96 character sets in a coding system which is characterized as
2276 7-bit environment, non-locking-shift, and non-single-shift.
2277
2278 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2279 '(' must be omitted. We refer to this as "short-form" hereafter.
2280
2281 Now you may notice that there are a lot of ways of encoding the
2282 same multilingual text in ISO2022. Actually, there exist many
2283 coding systems such as Compound Text (used in X11's inter client
2284 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2285 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2286 localized platforms), and all of these are variants of ISO2022.
2287
2288 In addition to the above, Emacs handles two more kinds of escape
2289 sequences: ISO6429's direction specification and Emacs' private
2290 sequence for specifying character composition.
2291
2292 ISO6429's direction specification takes the following form:
2293 o CSI ']' -- end of the current direction
2294 o CSI '0' ']' -- end of the current direction
2295 o CSI '1' ']' -- start of left-to-right text
2296 o CSI '2' ']' -- start of right-to-left text
2297 The control character CSI (0x9B: control sequence introducer) is
2298 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2299
2300 Character composition specification takes the following form:
2301 o ESC '0' -- start relative composition
2302 o ESC '1' -- end composition
2303 o ESC '2' -- start rule-base composition (*)
2304 o ESC '3' -- start relative composition with alternate chars (**)
2305 o ESC '4' -- start rule-base composition with alternate chars (**)
2306 Since these are not standard escape sequences of any ISO standard,
2307 the use of them with these meanings is restricted to Emacs only.
2308
2309 (*) This form is used only in Emacs 20.7 and older versions,
2310 but newer versions can safely decode it.
2311 (**) This form is used only in Emacs 21.1 and newer versions,
2312 and older versions can't decode it.
2313
2314 Here's a list of example usages of these composition escape
2315 sequences (categorized by `enum composition_method').
2316
2317 COMPOSITION_RELATIVE:
2318 ESC 0 CHAR [ CHAR ] ESC 1
2319 COMPOSITION_WITH_RULE:
2320 ESC 2 CHAR [ RULE CHAR ] ESC 1
2321 COMPOSITION_WITH_ALTCHARS:
2322 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2323 COMPOSITION_WITH_RULE_ALTCHARS:
2324 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2325
2326 enum iso_code_class_type iso_code_class[256];
2327
2328 #define SAFE_CHARSET_P(coding, id) \
2329 ((id) <= (coding)->max_charset_id \
2330 && (coding)->safe_charsets[id] >= 0)
2331
2332
2333 #define SHIFT_OUT_OK(category) \
2334 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2335
2336 static void
2337 setup_iso_safe_charsets (attrs)
2338 Lisp_Object attrs;
2339 {
2340 Lisp_Object charset_list, safe_charsets;
2341 Lisp_Object request;
2342 Lisp_Object reg_usage;
2343 Lisp_Object tail;
2344 int reg94, reg96;
2345 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2346 int max_charset_id;
2347
2348 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2349 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2350 && ! EQ (charset_list, Viso_2022_charset_list))
2351 {
2352 CODING_ATTR_CHARSET_LIST (attrs)
2353 = charset_list = Viso_2022_charset_list;
2354 ASET (attrs, coding_attr_safe_charsets, Qnil);
2355 }
2356
2357 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2358 return;
2359
2360 max_charset_id = 0;
2361 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2362 {
2363 int id = XINT (XCAR (tail));
2364 if (max_charset_id < id)
2365 max_charset_id = id;
2366 }
2367
2368 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2369 make_number (255));
2370 request = AREF (attrs, coding_attr_iso_request);
2371 reg_usage = AREF (attrs, coding_attr_iso_usage);
2372 reg94 = XINT (XCAR (reg_usage));
2373 reg96 = XINT (XCDR (reg_usage));
2374
2375 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2376 {
2377 Lisp_Object id;
2378 Lisp_Object reg;
2379 struct charset *charset;
2380
2381 id = XCAR (tail);
2382 charset = CHARSET_FROM_ID (XINT (id));
2383 reg = Fcdr (Fassq (id, request));
2384 if (! NILP (reg))
2385 SSET (safe_charsets, XINT (id), XINT (reg));
2386 else if (charset->iso_chars_96)
2387 {
2388 if (reg96 < 4)
2389 SSET (safe_charsets, XINT (id), reg96);
2390 }
2391 else
2392 {
2393 if (reg94 < 4)
2394 SSET (safe_charsets, XINT (id), reg94);
2395 }
2396 }
2397 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2398 }
2399
2400
2401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2402 Check if a text is encoded in one of ISO-2022 based codig systems.
2403 If it is, return 1, else return 0. */
2404
2405 static int
2406 detect_coding_iso_2022 (coding, detect_info)
2407 struct coding_system *coding;
2408 struct coding_detection_info *detect_info;
2409 {
2410 const unsigned char *src = coding->source, *src_base = src;
2411 const unsigned char *src_end = coding->source + coding->src_bytes;
2412 int multibytep = coding->src_multibyte;
2413 int single_shifting = 0;
2414 int id;
2415 int c, c1;
2416 int consumed_chars = 0;
2417 int i;
2418 int rejected = 0;
2419 int found = 0;
2420
2421 detect_info->checked |= CATEGORY_MASK_ISO;
2422
2423 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2424 {
2425 struct coding_system *this = &(coding_categories[i]);
2426 Lisp_Object attrs, val;
2427
2428 attrs = CODING_ID_ATTRS (this->id);
2429 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2430 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2431 setup_iso_safe_charsets (attrs);
2432 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2433 this->max_charset_id = SCHARS (val) - 1;
2434 this->safe_charsets = (char *) SDATA (val);
2435 }
2436
2437 /* A coding system of this category is always ASCII compatible. */
2438 src += coding->head_ascii;
2439
2440 while (rejected != CATEGORY_MASK_ISO)
2441 {
2442 ONE_MORE_BYTE (c);
2443 switch (c)
2444 {
2445 case ISO_CODE_ESC:
2446 if (inhibit_iso_escape_detection)
2447 break;
2448 single_shifting = 0;
2449 ONE_MORE_BYTE (c);
2450 if (c >= '(' && c <= '/')
2451 {
2452 /* Designation sequence for a charset of dimension 1. */
2453 ONE_MORE_BYTE (c1);
2454 if (c1 < ' ' || c1 >= 0x80
2455 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2456 /* Invalid designation sequence. Just ignore. */
2457 break;
2458 }
2459 else if (c == '$')
2460 {
2461 /* Designation sequence for a charset of dimension 2. */
2462 ONE_MORE_BYTE (c);
2463 if (c >= '@' && c <= 'B')
2464 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2465 id = iso_charset_table[1][0][c];
2466 else if (c >= '(' && c <= '/')
2467 {
2468 ONE_MORE_BYTE (c1);
2469 if (c1 < ' ' || c1 >= 0x80
2470 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2471 /* Invalid designation sequence. Just ignore. */
2472 break;
2473 }
2474 else
2475 /* Invalid designation sequence. Just ignore it. */
2476 break;
2477 }
2478 else if (c == 'N' || c == 'O')
2479 {
2480 /* ESC <Fe> for SS2 or SS3. */
2481 single_shifting = 1;
2482 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2483 break;
2484 }
2485 else if (c >= '0' && c <= '4')
2486 {
2487 /* ESC <Fp> for start/end composition. */
2488 found |= CATEGORY_MASK_ISO;
2489 break;
2490 }
2491 else
2492 {
2493 /* Invalid escape sequence. Just ignore it. */
2494 break;
2495 }
2496
2497 /* We found a valid designation sequence for CHARSET. */
2498 rejected |= CATEGORY_MASK_ISO_8BIT;
2499 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2500 id))
2501 found |= CATEGORY_MASK_ISO_7;
2502 else
2503 rejected |= CATEGORY_MASK_ISO_7;
2504 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2505 id))
2506 found |= CATEGORY_MASK_ISO_7_TIGHT;
2507 else
2508 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2509 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2510 id))
2511 found |= CATEGORY_MASK_ISO_7_ELSE;
2512 else
2513 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2514 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2515 id))
2516 found |= CATEGORY_MASK_ISO_8_ELSE;
2517 else
2518 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2519 break;
2520
2521 case ISO_CODE_SO:
2522 case ISO_CODE_SI:
2523 /* Locking shift out/in. */
2524 if (inhibit_iso_escape_detection)
2525 break;
2526 single_shifting = 0;
2527 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2528 found |= CATEGORY_MASK_ISO_ELSE;
2529 break;
2530
2531 case ISO_CODE_CSI:
2532 /* Control sequence introducer. */
2533 single_shifting = 0;
2534 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2535 found |= CATEGORY_MASK_ISO_8_ELSE;
2536 goto check_extra_latin;
2537
2538
2539 case ISO_CODE_SS2:
2540 case ISO_CODE_SS3:
2541 /* Single shift. */
2542 if (inhibit_iso_escape_detection)
2543 break;
2544 single_shifting = 1;
2545 rejected |= CATEGORY_MASK_ISO_7BIT;
2546 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2547 & CODING_ISO_FLAG_SINGLE_SHIFT)
2548 found |= CATEGORY_MASK_ISO_8_1;
2549 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2550 & CODING_ISO_FLAG_SINGLE_SHIFT)
2551 found |= CATEGORY_MASK_ISO_8_2;
2552 goto check_extra_latin;
2553
2554 default:
2555 if (c < 0x80)
2556 {
2557 single_shifting = 0;
2558 break;
2559 }
2560 if (c >= 0xA0)
2561 {
2562 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2563 found |= CATEGORY_MASK_ISO_8_1;
2564 /* Check the length of succeeding codes of the range
2565 0xA0..0FF. If the byte length is even, we include
2566 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2567 only when we are not single shifting. */
2568 if (! single_shifting
2569 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2570 {
2571 int i = 1;
2572 while (src < src_end)
2573 {
2574 ONE_MORE_BYTE (c);
2575 if (c < 0xA0)
2576 break;
2577 i++;
2578 }
2579
2580 if (i & 1 && src < src_end)
2581 rejected |= CATEGORY_MASK_ISO_8_2;
2582 else
2583 found |= CATEGORY_MASK_ISO_8_2;
2584 }
2585 break;
2586 }
2587 check_extra_latin:
2588 single_shifting = 0;
2589 if (! VECTORP (Vlatin_extra_code_table)
2590 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2591 {
2592 rejected = CATEGORY_MASK_ISO;
2593 break;
2594 }
2595 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2596 & CODING_ISO_FLAG_LATIN_EXTRA)
2597 found |= CATEGORY_MASK_ISO_8_1;
2598 else
2599 rejected |= CATEGORY_MASK_ISO_8_1;
2600 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2601 & CODING_ISO_FLAG_LATIN_EXTRA)
2602 found |= CATEGORY_MASK_ISO_8_2;
2603 else
2604 rejected |= CATEGORY_MASK_ISO_8_2;
2605 }
2606 }
2607 detect_info->rejected |= CATEGORY_MASK_ISO;
2608 return 0;
2609
2610 no_more_source:
2611 detect_info->rejected |= rejected;
2612 detect_info->found |= (found & ~rejected);
2613 return 1;
2614 }
2615
2616
2617 /* Set designation state into CODING. */
2618 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2619 do { \
2620 int id, prev; \
2621 \
2622 if (final < '0' || final >= 128 \
2623 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2624 || !SAFE_CHARSET_P (coding, id)) \
2625 { \
2626 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2627 goto invalid_code; \
2628 } \
2629 prev = CODING_ISO_DESIGNATION (coding, reg); \
2630 if (id == charset_jisx0201_roman) \
2631 { \
2632 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2633 id = charset_ascii; \
2634 } \
2635 else if (id == charset_jisx0208_1978) \
2636 { \
2637 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2638 id = charset_jisx0208; \
2639 } \
2640 CODING_ISO_DESIGNATION (coding, reg) = id; \
2641 /* If there was an invalid designation to REG previously, and this \
2642 designation is ASCII to REG, we should keep this designation \
2643 sequence. */ \
2644 if (prev == -2 && id == charset_ascii) \
2645 goto invalid_code; \
2646 } while (0)
2647
2648
2649 #define MAYBE_FINISH_COMPOSITION() \
2650 do { \
2651 int i; \
2652 if (composition_state == COMPOSING_NO) \
2653 break; \
2654 /* It is assured that we have enough room for producing \
2655 characters stored in the table `components'. */ \
2656 if (charbuf + component_idx > charbuf_end) \
2657 goto no_more_source; \
2658 composition_state = COMPOSING_NO; \
2659 if (method == COMPOSITION_RELATIVE \
2660 || method == COMPOSITION_WITH_ALTCHARS) \
2661 { \
2662 for (i = 0; i < component_idx; i++) \
2663 *charbuf++ = components[i]; \
2664 char_offset += component_idx; \
2665 } \
2666 else \
2667 { \
2668 for (i = 0; i < component_idx; i += 2) \
2669 *charbuf++ = components[i]; \
2670 char_offset += (component_idx / 2) + 1; \
2671 } \
2672 } while (0)
2673
2674
2675 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2676 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2677 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2678 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2679 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2680 */
2681
2682 #define DECODE_COMPOSITION_START(c1) \
2683 do { \
2684 if (c1 == '0' \
2685 && composition_state == COMPOSING_COMPONENT_RULE) \
2686 { \
2687 component_len = component_idx; \
2688 composition_state = COMPOSING_CHAR; \
2689 } \
2690 else \
2691 { \
2692 const unsigned char *p; \
2693 \
2694 MAYBE_FINISH_COMPOSITION (); \
2695 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2696 goto no_more_source; \
2697 for (p = src; p < src_end - 1; p++) \
2698 if (*p == ISO_CODE_ESC && p[1] == '1') \
2699 break; \
2700 if (p == src_end - 1) \
2701 { \
2702 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2703 goto invalid_code; \
2704 goto no_more_source; \
2705 } \
2706 \
2707 /* This is surely the start of a composition. */ \
2708 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2709 : c1 == '2' ? COMPOSITION_WITH_RULE \
2710 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2711 : COMPOSITION_WITH_RULE_ALTCHARS); \
2712 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2713 : COMPOSING_COMPONENT_CHAR); \
2714 component_idx = component_len = 0; \
2715 } \
2716 } while (0)
2717
2718
2719 /* Handle compositoin end sequence ESC 1. */
2720
2721 #define DECODE_COMPOSITION_END() \
2722 do { \
2723 int nchars = (component_len > 0 ? component_idx - component_len \
2724 : method == COMPOSITION_RELATIVE ? component_idx \
2725 : (component_idx + 1) / 2); \
2726 int i; \
2727 int *saved_charbuf = charbuf; \
2728 int from = char_offset; \
2729 int to = from + nchars; \
2730 \
2731 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2732 if (method != COMPOSITION_RELATIVE) \
2733 { \
2734 if (component_len == 0) \
2735 for (i = 0; i < component_idx; i++) \
2736 *charbuf++ = components[i]; \
2737 else \
2738 for (i = 0; i < component_len; i++) \
2739 *charbuf++ = components[i]; \
2740 *saved_charbuf = saved_charbuf - charbuf; \
2741 } \
2742 if (method == COMPOSITION_WITH_RULE) \
2743 for (i = 0; i < component_idx; i += 2, char_offset++) \
2744 *charbuf++ = components[i]; \
2745 else \
2746 for (i = component_len; i < component_idx; i++, char_offset++) \
2747 *charbuf++ = components[i]; \
2748 coding->annotated = 1; \
2749 composition_state = COMPOSING_NO; \
2750 } while (0)
2751
2752
2753 /* Decode a composition rule from the byte C1 (and maybe one more byte
2754 from SRC) and store one encoded composition rule in
2755 coding->cmp_data. */
2756
2757 #define DECODE_COMPOSITION_RULE(c1) \
2758 do { \
2759 (c1) -= 32; \
2760 if (c1 < 81) /* old format (before ver.21) */ \
2761 { \
2762 int gref = (c1) / 9; \
2763 int nref = (c1) % 9; \
2764 if (gref == 4) gref = 10; \
2765 if (nref == 4) nref = 10; \
2766 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2767 } \
2768 else if (c1 < 93) /* new format (after ver.21) */ \
2769 { \
2770 ONE_MORE_BYTE (c2); \
2771 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2772 } \
2773 else \
2774 c1 = 0; \
2775 } while (0)
2776
2777
2778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2779
2780 static void
2781 decode_coding_iso_2022 (coding)
2782 struct coding_system *coding;
2783 {
2784 const unsigned char *src = coding->source + coding->consumed;
2785 const unsigned char *src_end = coding->source + coding->src_bytes;
2786 const unsigned char *src_base;
2787 int *charbuf = coding->charbuf;
2788 int *charbuf_end
2789 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2790 int consumed_chars = 0, consumed_chars_base;
2791 int multibytep = coding->src_multibyte;
2792 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2793 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2794 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2795 struct charset *charset;
2796 int c;
2797 /* For handling composition sequence. */
2798 #define COMPOSING_NO 0
2799 #define COMPOSING_CHAR 1
2800 #define COMPOSING_RULE 2
2801 #define COMPOSING_COMPONENT_CHAR 3
2802 #define COMPOSING_COMPONENT_RULE 4
2803
2804 int composition_state = COMPOSING_NO;
2805 enum composition_method method;
2806 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2807 int component_idx;
2808 int component_len;
2809 Lisp_Object attrs, eol_type, charset_list;
2810 int char_offset = coding->produced_char;
2811 int last_offset = char_offset;
2812 int last_id = charset_ascii;
2813
2814 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2815 setup_iso_safe_charsets (attrs);
2816
2817 while (1)
2818 {
2819 int c1, c2;
2820
2821 src_base = src;
2822 consumed_chars_base = consumed_chars;
2823
2824 if (charbuf >= charbuf_end)
2825 break;
2826
2827 ONE_MORE_BYTE (c1);
2828
2829 /* We produce at most one character. */
2830 switch (iso_code_class [c1])
2831 {
2832 case ISO_0x20_or_0x7F:
2833 if (composition_state != COMPOSING_NO)
2834 {
2835 if (composition_state == COMPOSING_RULE
2836 || composition_state == COMPOSING_COMPONENT_RULE)
2837 {
2838 DECODE_COMPOSITION_RULE (c1);
2839 components[component_idx++] = c1;
2840 composition_state--;
2841 continue;
2842 }
2843 }
2844 if (charset_id_0 < 0
2845 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2846 /* This is SPACE or DEL. */
2847 charset = CHARSET_FROM_ID (charset_ascii);
2848 else
2849 charset = CHARSET_FROM_ID (charset_id_0);
2850 break;
2851
2852 case ISO_graphic_plane_0:
2853 if (composition_state != COMPOSING_NO)
2854 {
2855 if (composition_state == COMPOSING_RULE
2856 || composition_state == COMPOSING_COMPONENT_RULE)
2857 {
2858 DECODE_COMPOSITION_RULE (c1);
2859 components[component_idx++] = c1;
2860 composition_state--;
2861 continue;
2862 }
2863 }
2864 charset = CHARSET_FROM_ID (charset_id_0);
2865 break;
2866
2867 case ISO_0xA0_or_0xFF:
2868 if (charset_id_1 < 0
2869 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2870 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2871 goto invalid_code;
2872 /* This is a graphic character, we fall down ... */
2873
2874 case ISO_graphic_plane_1:
2875 if (charset_id_1 < 0)
2876 goto invalid_code;
2877 charset = CHARSET_FROM_ID (charset_id_1);
2878 break;
2879
2880 case ISO_carriage_return:
2881 if (c1 == '\r')
2882 {
2883 if (EQ (eol_type, Qdos))
2884 {
2885 if (src == src_end)
2886 {
2887 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2888 goto no_more_source;
2889 }
2890 if (*src == '\n')
2891 ONE_MORE_BYTE (c1);
2892 }
2893 else if (EQ (eol_type, Qmac))
2894 c1 = '\n';
2895 }
2896 /* fall through */
2897
2898 case ISO_control_0:
2899 MAYBE_FINISH_COMPOSITION ();
2900 charset = CHARSET_FROM_ID (charset_ascii);
2901 break;
2902
2903 case ISO_control_1:
2904 MAYBE_FINISH_COMPOSITION ();
2905 goto invalid_code;
2906
2907 case ISO_shift_out:
2908 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2909 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2910 goto invalid_code;
2911 CODING_ISO_INVOCATION (coding, 0) = 1;
2912 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2913 continue;
2914
2915 case ISO_shift_in:
2916 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2917 goto invalid_code;
2918 CODING_ISO_INVOCATION (coding, 0) = 0;
2919 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2920 continue;
2921
2922 case ISO_single_shift_2_7:
2923 case ISO_single_shift_2:
2924 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2925 goto invalid_code;
2926 /* SS2 is handled as an escape sequence of ESC 'N' */
2927 c1 = 'N';
2928 goto label_escape_sequence;
2929
2930 case ISO_single_shift_3:
2931 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2932 goto invalid_code;
2933 /* SS2 is handled as an escape sequence of ESC 'O' */
2934 c1 = 'O';
2935 goto label_escape_sequence;
2936
2937 case ISO_control_sequence_introducer:
2938 /* CSI is handled as an escape sequence of ESC '[' ... */
2939 c1 = '[';
2940 goto label_escape_sequence;
2941
2942 case ISO_escape:
2943 ONE_MORE_BYTE (c1);
2944 label_escape_sequence:
2945 /* Escape sequences handled here are invocation,
2946 designation, direction specification, and character
2947 composition specification. */
2948 switch (c1)
2949 {
2950 case '&': /* revision of following character set */
2951 ONE_MORE_BYTE (c1);
2952 if (!(c1 >= '@' && c1 <= '~'))
2953 goto invalid_code;
2954 ONE_MORE_BYTE (c1);
2955 if (c1 != ISO_CODE_ESC)
2956 goto invalid_code;
2957 ONE_MORE_BYTE (c1);
2958 goto label_escape_sequence;
2959
2960 case '$': /* designation of 2-byte character set */
2961 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2962 goto invalid_code;
2963 ONE_MORE_BYTE (c1);
2964 if (c1 >= '@' && c1 <= 'B')
2965 { /* designation of JISX0208.1978, GB2312.1980,
2966 or JISX0208.1980 */
2967 DECODE_DESIGNATION (0, 2, 0, c1);
2968 }
2969 else if (c1 >= 0x28 && c1 <= 0x2B)
2970 { /* designation of DIMENSION2_CHARS94 character set */
2971 ONE_MORE_BYTE (c2);
2972 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2973 }
2974 else if (c1 >= 0x2C && c1 <= 0x2F)
2975 { /* designation of DIMENSION2_CHARS96 character set */
2976 ONE_MORE_BYTE (c2);
2977 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2978 }
2979 else
2980 goto invalid_code;
2981 /* We must update these variables now. */
2982 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2983 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2984 continue;
2985
2986 case 'n': /* invocation of locking-shift-2 */
2987 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2988 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2989 goto invalid_code;
2990 CODING_ISO_INVOCATION (coding, 0) = 2;
2991 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2992 continue;
2993
2994 case 'o': /* invocation of locking-shift-3 */
2995 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2996 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2997 goto invalid_code;
2998 CODING_ISO_INVOCATION (coding, 0) = 3;
2999 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3000 continue;
3001
3002 case 'N': /* invocation of single-shift-2 */
3003 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3004 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3005 goto invalid_code;
3006 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3007 ONE_MORE_BYTE (c1);
3008 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3009 goto invalid_code;
3010 break;
3011
3012 case 'O': /* invocation of single-shift-3 */
3013 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3014 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3015 goto invalid_code;
3016 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3017 ONE_MORE_BYTE (c1);
3018 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3019 goto invalid_code;
3020 break;
3021
3022 case '0': case '2': case '3': case '4': /* start composition */
3023 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3024 goto invalid_code;
3025 DECODE_COMPOSITION_START (c1);
3026 continue;
3027
3028 case '1': /* end composition */
3029 if (composition_state == COMPOSING_NO)
3030 goto invalid_code;
3031 DECODE_COMPOSITION_END ();
3032 continue;
3033
3034 case '[': /* specification of direction */
3035 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3036 goto invalid_code;
3037 /* For the moment, nested direction is not supported.
3038 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3039 left-to-right, and nozero means right-to-left. */
3040 ONE_MORE_BYTE (c1);
3041 switch (c1)
3042 {
3043 case ']': /* end of the current direction */
3044 coding->mode &= ~CODING_MODE_DIRECTION;
3045
3046 case '0': /* end of the current direction */
3047 case '1': /* start of left-to-right direction */
3048 ONE_MORE_BYTE (c1);
3049 if (c1 == ']')
3050 coding->mode &= ~CODING_MODE_DIRECTION;
3051 else
3052 goto invalid_code;
3053 break;
3054
3055 case '2': /* start of right-to-left direction */
3056 ONE_MORE_BYTE (c1);
3057 if (c1 == ']')
3058 coding->mode |= CODING_MODE_DIRECTION;
3059 else
3060 goto invalid_code;
3061 break;
3062
3063 default:
3064 goto invalid_code;
3065 }
3066 continue;
3067
3068 case '%':
3069 ONE_MORE_BYTE (c1);
3070 if (c1 == '/')
3071 {
3072 /* CTEXT extended segment:
3073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3074 We keep these bytes as is for the moment.
3075 They may be decoded by post-read-conversion. */
3076 int dim, M, L;
3077 int size;
3078
3079 ONE_MORE_BYTE (dim);
3080 ONE_MORE_BYTE (M);
3081 ONE_MORE_BYTE (L);
3082 size = ((M - 128) * 128) + (L - 128);
3083 if (charbuf + 8 + size > charbuf_end)
3084 goto break_loop;
3085 *charbuf++ = ISO_CODE_ESC;
3086 *charbuf++ = '%';
3087 *charbuf++ = '/';
3088 *charbuf++ = dim;
3089 *charbuf++ = BYTE8_TO_CHAR (M);
3090 *charbuf++ = BYTE8_TO_CHAR (L);
3091 while (size-- > 0)
3092 {
3093 ONE_MORE_BYTE (c1);
3094 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3095 }
3096 }
3097 else if (c1 == 'G')
3098 {
3099 /* XFree86 extension for embedding UTF-8 in CTEXT:
3100 ESC % G --UTF-8-BYTES-- ESC % @
3101 We keep these bytes as is for the moment.
3102 They may be decoded by post-read-conversion. */
3103 int *p = charbuf;
3104
3105 if (p + 6 > charbuf_end)
3106 goto break_loop;
3107 *p++ = ISO_CODE_ESC;
3108 *p++ = '%';
3109 *p++ = 'G';
3110 while (p < charbuf_end)
3111 {
3112 ONE_MORE_BYTE (c1);
3113 if (c1 == ISO_CODE_ESC
3114 && src + 1 < src_end
3115 && src[0] == '%'
3116 && src[1] == '@')
3117 break;
3118 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3119 }
3120 if (p + 3 > charbuf_end)
3121 goto break_loop;
3122 *p++ = ISO_CODE_ESC;
3123 *p++ = '%';
3124 *p++ = '@';
3125 charbuf = p;
3126 }
3127 else
3128 goto invalid_code;
3129 continue;
3130 break;
3131
3132 default:
3133 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3134 goto invalid_code;
3135 if (c1 >= 0x28 && c1 <= 0x2B)
3136 { /* designation of DIMENSION1_CHARS94 character set */
3137 ONE_MORE_BYTE (c2);
3138 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3139 }
3140 else if (c1 >= 0x2C && c1 <= 0x2F)
3141 { /* designation of DIMENSION1_CHARS96 character set */
3142 ONE_MORE_BYTE (c2);
3143 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3144 }
3145 else
3146 goto invalid_code;
3147 /* We must update these variables now. */
3148 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3149 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3150 continue;
3151 }
3152 }
3153
3154 if (charset->id != charset_ascii
3155 && last_id != charset->id)
3156 {
3157 if (last_id != charset_ascii)
3158 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3159 last_id = charset->id;
3160 last_offset = char_offset;
3161 }
3162
3163 /* Now we know CHARSET and 1st position code C1 of a character.
3164 Produce a decoded character while getting 2nd position code
3165 C2 if necessary. */
3166 c1 &= 0x7F;
3167 if (CHARSET_DIMENSION (charset) > 1)
3168 {
3169 ONE_MORE_BYTE (c2);
3170 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3171 /* C2 is not in a valid range. */
3172 goto invalid_code;
3173 c1 = (c1 << 8) | (c2 & 0x7F);
3174 if (CHARSET_DIMENSION (charset) > 2)
3175 {
3176 ONE_MORE_BYTE (c2);
3177 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3178 /* C2 is not in a valid range. */
3179 goto invalid_code;
3180 c1 = (c1 << 8) | (c2 & 0x7F);
3181 }
3182 }
3183
3184 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3185 if (c < 0)
3186 {
3187 MAYBE_FINISH_COMPOSITION ();
3188 for (; src_base < src; src_base++, char_offset++)
3189 {
3190 if (ASCII_BYTE_P (*src_base))
3191 *charbuf++ = *src_base;
3192 else
3193 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3194 }
3195 }
3196 else if (composition_state == COMPOSING_NO)
3197 {
3198 *charbuf++ = c;
3199 char_offset++;
3200 }
3201 else
3202 {
3203 components[component_idx++] = c;
3204 if (method == COMPOSITION_WITH_RULE
3205 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3206 && composition_state == COMPOSING_COMPONENT_CHAR))
3207 composition_state++;
3208 }
3209 continue;
3210
3211 invalid_code:
3212 MAYBE_FINISH_COMPOSITION ();
3213 src = src_base;
3214 consumed_chars = consumed_chars_base;
3215 ONE_MORE_BYTE (c);
3216 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3217 char_offset++;
3218 coding->errors++;
3219 continue;
3220
3221 break_loop:
3222 break;
3223 }
3224
3225 no_more_source:
3226 if (last_id != charset_ascii)
3227 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3228 coding->consumed_char += consumed_chars_base;
3229 coding->consumed = src_base - coding->source;
3230 coding->charbuf_used = charbuf - coding->charbuf;
3231 }
3232
3233
3234 /* ISO2022 encoding stuff. */
3235
3236 /*
3237 It is not enough to say just "ISO2022" on encoding, we have to
3238 specify more details. In Emacs, each coding system of ISO2022
3239 variant has the following specifications:
3240 1. Initial designation to G0 thru G3.
3241 2. Allows short-form designation?
3242 3. ASCII should be designated to G0 before control characters?
3243 4. ASCII should be designated to G0 at end of line?
3244 5. 7-bit environment or 8-bit environment?
3245 6. Use locking-shift?
3246 7. Use Single-shift?
3247 And the following two are only for Japanese:
3248 8. Use ASCII in place of JIS0201-1976-Roman?
3249 9. Use JISX0208-1983 in place of JISX0208-1978?
3250 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3251 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3252 details.
3253 */
3254
3255 /* Produce codes (escape sequence) for designating CHARSET to graphic
3256 register REG at DST, and increment DST. If <final-char> of CHARSET is
3257 '@', 'A', or 'B' and the coding system CODING allows, produce
3258 designation sequence of short-form. */
3259
3260 #define ENCODE_DESIGNATION(charset, reg, coding) \
3261 do { \
3262 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3263 char *intermediate_char_94 = "()*+"; \
3264 char *intermediate_char_96 = ",-./"; \
3265 int revision = -1; \
3266 int c; \
3267 \
3268 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3269 revision = CHARSET_ISO_REVISION (charset); \
3270 \
3271 if (revision >= 0) \
3272 { \
3273 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3274 EMIT_ONE_BYTE ('@' + revision); \
3275 } \
3276 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3277 if (CHARSET_DIMENSION (charset) == 1) \
3278 { \
3279 if (! CHARSET_ISO_CHARS_96 (charset)) \
3280 c = intermediate_char_94[reg]; \
3281 else \
3282 c = intermediate_char_96[reg]; \
3283 EMIT_ONE_ASCII_BYTE (c); \
3284 } \
3285 else \
3286 { \
3287 EMIT_ONE_ASCII_BYTE ('$'); \
3288 if (! CHARSET_ISO_CHARS_96 (charset)) \
3289 { \
3290 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3291 || reg != 0 \
3292 || final_char < '@' || final_char > 'B') \
3293 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3294 } \
3295 else \
3296 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3297 } \
3298 EMIT_ONE_ASCII_BYTE (final_char); \
3299 \
3300 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3301 } while (0)
3302
3303
3304 /* The following two macros produce codes (control character or escape
3305 sequence) for ISO2022 single-shift functions (single-shift-2 and
3306 single-shift-3). */
3307
3308 #define ENCODE_SINGLE_SHIFT_2 \
3309 do { \
3310 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3311 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3312 else \
3313 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3314 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3315 } while (0)
3316
3317
3318 #define ENCODE_SINGLE_SHIFT_3 \
3319 do { \
3320 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3321 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3322 else \
3323 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3324 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3325 } while (0)
3326
3327
3328 /* The following four macros produce codes (control character or
3329 escape sequence) for ISO2022 locking-shift functions (shift-in,
3330 shift-out, locking-shift-2, and locking-shift-3). */
3331
3332 #define ENCODE_SHIFT_IN \
3333 do { \
3334 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3335 CODING_ISO_INVOCATION (coding, 0) = 0; \
3336 } while (0)
3337
3338
3339 #define ENCODE_SHIFT_OUT \
3340 do { \
3341 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3342 CODING_ISO_INVOCATION (coding, 0) = 1; \
3343 } while (0)
3344
3345
3346 #define ENCODE_LOCKING_SHIFT_2 \
3347 do { \
3348 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3349 CODING_ISO_INVOCATION (coding, 0) = 2; \
3350 } while (0)
3351
3352
3353 #define ENCODE_LOCKING_SHIFT_3 \
3354 do { \
3355 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3356 CODING_ISO_INVOCATION (coding, 0) = 3; \
3357 } while (0)
3358
3359
3360 /* Produce codes for a DIMENSION1 character whose character set is
3361 CHARSET and whose position-code is C1. Designation and invocation
3362 sequences are also produced in advance if necessary. */
3363
3364 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3365 do { \
3366 int id = CHARSET_ID (charset); \
3367 \
3368 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3369 && id == charset_ascii) \
3370 { \
3371 id = charset_jisx0201_roman; \
3372 charset = CHARSET_FROM_ID (id); \
3373 } \
3374 \
3375 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3376 { \
3377 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3378 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3379 else \
3380 EMIT_ONE_BYTE (c1 | 0x80); \
3381 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3382 break; \
3383 } \
3384 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3385 { \
3386 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3387 break; \
3388 } \
3389 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3390 { \
3391 EMIT_ONE_BYTE (c1 | 0x80); \
3392 break; \
3393 } \
3394 else \
3395 /* Since CHARSET is not yet invoked to any graphic planes, we \
3396 must invoke it, or, at first, designate it to some graphic \
3397 register. Then repeat the loop to actually produce the \
3398 character. */ \
3399 dst = encode_invocation_designation (charset, coding, dst, \
3400 &produced_chars); \
3401 } while (1)
3402
3403
3404 /* Produce codes for a DIMENSION2 character whose character set is
3405 CHARSET and whose position-codes are C1 and C2. Designation and
3406 invocation codes are also produced in advance if necessary. */
3407
3408 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3409 do { \
3410 int id = CHARSET_ID (charset); \
3411 \
3412 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3413 && id == charset_jisx0208) \
3414 { \
3415 id = charset_jisx0208_1978; \
3416 charset = CHARSET_FROM_ID (id); \
3417 } \
3418 \
3419 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3420 { \
3421 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3422 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3423 else \
3424 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3425 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3426 break; \
3427 } \
3428 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3429 { \
3430 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3431 break; \
3432 } \
3433 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3434 { \
3435 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3436 break; \
3437 } \
3438 else \
3439 /* Since CHARSET is not yet invoked to any graphic planes, we \
3440 must invoke it, or, at first, designate it to some graphic \
3441 register. Then repeat the loop to actually produce the \
3442 character. */ \
3443 dst = encode_invocation_designation (charset, coding, dst, \
3444 &produced_chars); \
3445 } while (1)
3446
3447
3448 #define ENCODE_ISO_CHARACTER(charset, c) \
3449 do { \
3450 int code = ENCODE_CHAR ((charset),(c)); \
3451 \
3452 if (CHARSET_DIMENSION (charset) == 1) \
3453 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3454 else \
3455 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3456 } while (0)
3457
3458
3459 /* Produce designation and invocation codes at a place pointed by DST
3460 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3461 Return new DST. */
3462
3463 unsigned char *
3464 encode_invocation_designation (charset, coding, dst, p_nchars)
3465 struct charset *charset;
3466 struct coding_system *coding;
3467 unsigned char *dst;
3468 int *p_nchars;
3469 {
3470 int multibytep = coding->dst_multibyte;
3471 int produced_chars = *p_nchars;
3472 int reg; /* graphic register number */
3473 int id = CHARSET_ID (charset);
3474
3475 /* At first, check designations. */
3476 for (reg = 0; reg < 4; reg++)
3477 if (id == CODING_ISO_DESIGNATION (coding, reg))
3478 break;
3479
3480 if (reg >= 4)
3481 {
3482 /* CHARSET is not yet designated to any graphic registers. */
3483 /* At first check the requested designation. */
3484 reg = CODING_ISO_REQUEST (coding, id);
3485 if (reg < 0)
3486 /* Since CHARSET requests no special designation, designate it
3487 to graphic register 0. */
3488 reg = 0;
3489
3490 ENCODE_DESIGNATION (charset, reg, coding);
3491 }
3492
3493 if (CODING_ISO_INVOCATION (coding, 0) != reg
3494 && CODING_ISO_INVOCATION (coding, 1) != reg)
3495 {
3496 /* Since the graphic register REG is not invoked to any graphic
3497 planes, invoke it to graphic plane 0. */
3498 switch (reg)
3499 {
3500 case 0: /* graphic register 0 */
3501 ENCODE_SHIFT_IN;
3502 break;
3503
3504 case 1: /* graphic register 1 */
3505 ENCODE_SHIFT_OUT;
3506 break;
3507
3508 case 2: /* graphic register 2 */
3509 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3510 ENCODE_SINGLE_SHIFT_2;
3511 else
3512 ENCODE_LOCKING_SHIFT_2;
3513 break;
3514
3515 case 3: /* graphic register 3 */
3516 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3517 ENCODE_SINGLE_SHIFT_3;
3518 else
3519 ENCODE_LOCKING_SHIFT_3;
3520 break;
3521 }
3522 }
3523
3524 *p_nchars = produced_chars;
3525 return dst;
3526 }
3527
3528 /* The following three macros produce codes for indicating direction
3529 of text. */
3530 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3531 do { \
3532 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3533 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3534 else \
3535 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3536 } while (0)
3537
3538
3539 #define ENCODE_DIRECTION_R2L() \
3540 do { \
3541 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3542 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3543 } while (0)
3544
3545
3546 #define ENCODE_DIRECTION_L2R() \
3547 do { \
3548 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3549 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3550 } while (0)
3551
3552
3553 /* Produce codes for designation and invocation to reset the graphic
3554 planes and registers to initial state. */
3555 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3556 do { \
3557 int reg; \
3558 struct charset *charset; \
3559 \
3560 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3561 ENCODE_SHIFT_IN; \
3562 for (reg = 0; reg < 4; reg++) \
3563 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3564 && (CODING_ISO_DESIGNATION (coding, reg) \
3565 != CODING_ISO_INITIAL (coding, reg))) \
3566 { \
3567 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3568 ENCODE_DESIGNATION (charset, reg, coding); \
3569 } \
3570 } while (0)
3571
3572
3573 /* Produce designation sequences of charsets in the line started from
3574 SRC to a place pointed by DST, and return updated DST.
3575
3576 If the current block ends before any end-of-line, we may fail to
3577 find all the necessary designations. */
3578
3579 static unsigned char *
3580 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3581 struct coding_system *coding;
3582 int *charbuf, *charbuf_end;
3583 unsigned char *dst;
3584 {
3585 struct charset *charset;
3586 /* Table of charsets to be designated to each graphic register. */
3587 int r[4];
3588 int c, found = 0, reg;
3589 int produced_chars = 0;
3590 int multibytep = coding->dst_multibyte;
3591 Lisp_Object attrs;
3592 Lisp_Object charset_list;
3593
3594 attrs = CODING_ID_ATTRS (coding->id);
3595 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3596 if (EQ (charset_list, Qiso_2022))
3597 charset_list = Viso_2022_charset_list;
3598
3599 for (reg = 0; reg < 4; reg++)
3600 r[reg] = -1;
3601
3602 while (found < 4)
3603 {
3604 int id;
3605
3606 c = *charbuf++;
3607 if (c == '\n')
3608 break;
3609 charset = char_charset (c, charset_list, NULL);
3610 id = CHARSET_ID (charset);
3611 reg = CODING_ISO_REQUEST (coding, id);
3612 if (reg >= 0 && r[reg] < 0)
3613 {
3614 found++;
3615 r[reg] = id;
3616 }
3617 }
3618
3619 if (found)
3620 {
3621 for (reg = 0; reg < 4; reg++)
3622 if (r[reg] >= 0
3623 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3624 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3625 }
3626
3627 return dst;
3628 }
3629
3630 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3631
3632 static int
3633 encode_coding_iso_2022 (coding)
3634 struct coding_system *coding;
3635 {
3636 int multibytep = coding->dst_multibyte;
3637 int *charbuf = coding->charbuf;
3638 int *charbuf_end = charbuf + coding->charbuf_used;
3639 unsigned char *dst = coding->destination + coding->produced;
3640 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3641 int safe_room = 16;
3642 int bol_designation
3643 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3644 && CODING_ISO_BOL (coding));
3645 int produced_chars = 0;
3646 Lisp_Object attrs, eol_type, charset_list;
3647 int ascii_compatible;
3648 int c;
3649 int preferred_charset_id = -1;
3650
3651 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3652 setup_iso_safe_charsets (attrs);
3653 /* Charset list may have been changed. */
3654 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3655 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3656
3657 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3658
3659 while (charbuf < charbuf_end)
3660 {
3661 ASSURE_DESTINATION (safe_room);
3662
3663 if (bol_designation)
3664 {
3665 unsigned char *dst_prev = dst;
3666
3667 /* We have to produce designation sequences if any now. */
3668 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3669 bol_designation = 0;
3670 /* We are sure that designation sequences are all ASCII bytes. */
3671 produced_chars += dst - dst_prev;
3672 }
3673
3674 c = *charbuf++;
3675
3676 if (c < 0)
3677 {
3678 /* Handle an annotation. */
3679 switch (*charbuf)
3680 {
3681 case CODING_ANNOTATE_COMPOSITION_MASK:
3682 /* Not yet implemented. */
3683 break;
3684 case CODING_ANNOTATE_CHARSET_MASK:
3685 preferred_charset_id = charbuf[3];
3686 if (preferred_charset_id >= 0
3687 && NILP (Fmemq (make_number (preferred_charset_id),
3688 charset_list)))
3689 preferred_charset_id = -1;
3690 break;
3691 default:
3692 abort ();
3693 }
3694 charbuf += -c - 1;
3695 continue;
3696 }
3697
3698 /* Now encode the character C. */
3699 if (c < 0x20 || c == 0x7F)
3700 {
3701 if (c == '\n'
3702 || (c == '\r' && EQ (eol_type, Qmac)))
3703 {
3704 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3705 ENCODE_RESET_PLANE_AND_REGISTER ();
3706 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3707 {
3708 int i;
3709
3710 for (i = 0; i < 4; i++)
3711 CODING_ISO_DESIGNATION (coding, i)
3712 = CODING_ISO_INITIAL (coding, i);
3713 }
3714 bol_designation
3715 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3716 }
3717 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3718 ENCODE_RESET_PLANE_AND_REGISTER ();
3719 EMIT_ONE_ASCII_BYTE (c);
3720 }
3721 else if (ASCII_CHAR_P (c))
3722 {
3723 if (ascii_compatible)
3724 EMIT_ONE_ASCII_BYTE (c);
3725 else
3726 {
3727 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3728 ENCODE_ISO_CHARACTER (charset, c);
3729 }
3730 }
3731 else if (CHAR_BYTE8_P (c))
3732 {
3733 c = CHAR_TO_BYTE8 (c);
3734 EMIT_ONE_BYTE (c);
3735 }
3736 else
3737 {
3738 struct charset *charset;
3739
3740 if (preferred_charset_id >= 0)
3741 {
3742 charset = CHARSET_FROM_ID (preferred_charset_id);
3743 if (! CHAR_CHARSET_P (c, charset))
3744 charset = char_charset (c, charset_list, NULL);
3745 }
3746 else
3747 charset = char_charset (c, charset_list, NULL);
3748 if (!charset)
3749 {
3750 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3751 {
3752 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3753 charset = CHARSET_FROM_ID (charset_ascii);
3754 }
3755 else
3756 {
3757 c = coding->default_char;
3758 charset = char_charset (c, charset_list, NULL);
3759 }
3760 }
3761 ENCODE_ISO_CHARACTER (charset, c);
3762 }
3763 }
3764
3765 if (coding->mode & CODING_MODE_LAST_BLOCK
3766 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3767 {
3768 ASSURE_DESTINATION (safe_room);
3769 ENCODE_RESET_PLANE_AND_REGISTER ();
3770 }
3771 coding->result = CODING_RESULT_SUCCESS;
3772 CODING_ISO_BOL (coding) = bol_designation;
3773 coding->produced_char += produced_chars;
3774 coding->produced = dst - coding->destination;
3775 return 0;
3776 }
3777
3778 \f
3779 /*** 8,9. SJIS and BIG5 handlers ***/
3780
3781 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3782 quite widely. So, for the moment, Emacs supports them in the bare
3783 C code. But, in the future, they may be supported only by CCL. */
3784
3785 /* SJIS is a coding system encoding three character sets: ASCII, right
3786 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3787 as is. A character of charset katakana-jisx0201 is encoded by
3788 "position-code + 0x80". A character of charset japanese-jisx0208
3789 is encoded in 2-byte but two position-codes are divided and shifted
3790 so that it fit in the range below.
3791
3792 --- CODE RANGE of SJIS ---
3793 (character set) (range)
3794 ASCII 0x00 .. 0x7F
3795 KATAKANA-JISX0201 0xA0 .. 0xDF
3796 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3797 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3798 -------------------------------
3799
3800 */
3801
3802 /* BIG5 is a coding system encoding two character sets: ASCII and
3803 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3804 character set and is encoded in two-byte.
3805
3806 --- CODE RANGE of BIG5 ---
3807 (character set) (range)
3808 ASCII 0x00 .. 0x7F
3809 Big5 (1st byte) 0xA1 .. 0xFE
3810 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3811 --------------------------
3812
3813 */
3814
3815 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3816 Check if a text is encoded in SJIS. If it is, return
3817 CATEGORY_MASK_SJIS, else return 0. */
3818
3819 static int
3820 detect_coding_sjis (coding, detect_info)
3821 struct coding_system *coding;
3822 struct coding_detection_info *detect_info;
3823 {
3824 const unsigned char *src = coding->source, *src_base = src;
3825 const unsigned char *src_end = coding->source + coding->src_bytes;
3826 int multibytep = coding->src_multibyte;
3827 int consumed_chars = 0;
3828 int found = 0;
3829 int c;
3830 int incomplete;
3831
3832 detect_info->checked |= CATEGORY_MASK_SJIS;
3833 /* A coding system of this category is always ASCII compatible. */
3834 src += coding->head_ascii;
3835
3836 while (1)
3837 {
3838 incomplete = 0;
3839 ONE_MORE_BYTE (c);
3840 incomplete = 1;
3841 if (c < 0x80)
3842 continue;
3843 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3844 {
3845 ONE_MORE_BYTE (c);
3846 if (c < 0x40 || c == 0x7F || c > 0xFC)
3847 break;
3848 found = CATEGORY_MASK_SJIS;
3849 }
3850 else if (c >= 0xA0 && c < 0xE0)
3851 found = CATEGORY_MASK_SJIS;
3852 else
3853 break;
3854 }
3855 detect_info->rejected |= CATEGORY_MASK_SJIS;
3856 return 0;
3857
3858 no_more_source:
3859 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3860 {
3861 detect_info->rejected |= CATEGORY_MASK_SJIS;
3862 return 0;
3863 }
3864 detect_info->found |= found;
3865 return 1;
3866 }
3867
3868 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3869 Check if a text is encoded in BIG5. If it is, return
3870 CATEGORY_MASK_BIG5, else return 0. */
3871
3872 static int
3873 detect_coding_big5 (coding, detect_info)
3874 struct coding_system *coding;
3875 struct coding_detection_info *detect_info;
3876 {
3877 const unsigned char *src = coding->source, *src_base = src;
3878 const unsigned char *src_end = coding->source + coding->src_bytes;
3879 int multibytep = coding->src_multibyte;
3880 int consumed_chars = 0;
3881 int found = 0;
3882 int c;
3883 int incomplete;
3884
3885 detect_info->checked |= CATEGORY_MASK_BIG5;
3886 /* A coding system of this category is always ASCII compatible. */
3887 src += coding->head_ascii;
3888
3889 while (1)
3890 {
3891 incomplete = 0;
3892 ONE_MORE_BYTE (c);
3893 incomplete = 1;
3894 if (c < 0x80)
3895 continue;
3896 if (c >= 0xA1)
3897 {
3898 ONE_MORE_BYTE (c);
3899 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3900 return 0;
3901 found = CATEGORY_MASK_BIG5;
3902 }
3903 else
3904 break;
3905 }
3906 detect_info->rejected |= CATEGORY_MASK_BIG5;
3907 return 0;
3908
3909 no_more_source:
3910 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3911 {
3912 detect_info->rejected |= CATEGORY_MASK_BIG5;
3913 return 0;
3914 }
3915 detect_info->found |= found;
3916 return 1;
3917 }
3918
3919 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3920 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3921
3922 static void
3923 decode_coding_sjis (coding)
3924 struct coding_system *coding;
3925 {
3926 const unsigned char *src = coding->source + coding->consumed;
3927 const unsigned char *src_end = coding->source + coding->src_bytes;
3928 const unsigned char *src_base;
3929 int *charbuf = coding->charbuf;
3930 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3931 int consumed_chars = 0, consumed_chars_base;
3932 int multibytep = coding->src_multibyte;
3933 struct charset *charset_roman, *charset_kanji, *charset_kana;
3934 Lisp_Object attrs, eol_type, charset_list, val;
3935 int char_offset = coding->produced_char;
3936 int last_offset = char_offset;
3937 int last_id = charset_ascii;
3938
3939 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3940
3941 val = charset_list;
3942 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3943 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3944 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3945
3946 while (1)
3947 {
3948 int c, c1;
3949
3950 src_base = src;
3951 consumed_chars_base = consumed_chars;
3952
3953 if (charbuf >= charbuf_end)
3954 break;
3955
3956 ONE_MORE_BYTE (c);
3957
3958 if (c == '\r')
3959 {
3960 if (EQ (eol_type, Qdos))
3961 {
3962 if (src == src_end)
3963 {
3964 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3965 goto no_more_source;
3966 }
3967 if (*src == '\n')
3968 ONE_MORE_BYTE (c);
3969 }
3970 else if (EQ (eol_type, Qmac))
3971 c = '\n';
3972 }
3973 else
3974 {
3975 struct charset *charset;
3976
3977 if (c < 0x80)
3978 charset = charset_roman;
3979 else
3980 {
3981 if (c >= 0xF0)
3982 goto invalid_code;
3983 if (c < 0xA0 || c >= 0xE0)
3984 {
3985 /* SJIS -> JISX0208 */
3986 ONE_MORE_BYTE (c1);
3987 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3988 goto invalid_code;
3989 c = (c << 8) | c1;
3990 SJIS_TO_JIS (c);
3991 charset = charset_kanji;
3992 }
3993 else if (c > 0xA0)
3994 {
3995 /* SJIS -> JISX0201-Kana */
3996 c &= 0x7F;
3997 charset = charset_kana;
3998 }
3999 else
4000 goto invalid_code;
4001 }
4002 if (charset->id != charset_ascii
4003 && last_id != charset->id)
4004 {
4005 if (last_id != charset_ascii)
4006 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4007 last_id = charset->id;
4008 last_offset = char_offset;
4009 }
4010 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4011 }
4012 *charbuf++ = c;
4013 char_offset++;
4014 continue;
4015
4016 invalid_code:
4017 src = src_base;
4018 consumed_chars = consumed_chars_base;
4019 ONE_MORE_BYTE (c);
4020 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4021 char_offset++;
4022 coding->errors++;
4023 }
4024
4025 no_more_source:
4026 if (last_id != charset_ascii)
4027 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4028 coding->consumed_char += consumed_chars_base;
4029 coding->consumed = src_base - coding->source;
4030 coding->charbuf_used = charbuf - coding->charbuf;
4031 }
4032
4033 static void
4034 decode_coding_big5 (coding)
4035 struct coding_system *coding;
4036 {
4037 const unsigned char *src = coding->source + coding->consumed;
4038 const unsigned char *src_end = coding->source + coding->src_bytes;
4039 const unsigned char *src_base;
4040 int *charbuf = coding->charbuf;
4041 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4042 int consumed_chars = 0, consumed_chars_base;
4043 int multibytep = coding->src_multibyte;
4044 struct charset *charset_roman, *charset_big5;
4045 Lisp_Object attrs, eol_type, charset_list, val;
4046 int char_offset = coding->produced_char;
4047 int last_offset = char_offset;
4048 int last_id = charset_ascii;
4049
4050 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4051 val = charset_list;
4052 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4053 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4054
4055 while (1)
4056 {
4057 int c, c1;
4058
4059 src_base = src;
4060 consumed_chars_base = consumed_chars;
4061
4062 if (charbuf >= charbuf_end)
4063 break;
4064
4065 ONE_MORE_BYTE (c);
4066
4067 if (c == '\r')
4068 {
4069 if (EQ (eol_type, Qdos))
4070 {
4071 if (src == src_end)
4072 {
4073 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4074 goto no_more_source;
4075 }
4076 if (*src == '\n')
4077 ONE_MORE_BYTE (c);
4078 }
4079 else if (EQ (eol_type, Qmac))
4080 c = '\n';
4081 }
4082 else
4083 {
4084 struct charset *charset;
4085 if (c < 0x80)
4086 charset = charset_roman;
4087 else
4088 {
4089 /* BIG5 -> Big5 */
4090 if (c < 0xA1 || c > 0xFE)
4091 goto invalid_code;
4092 ONE_MORE_BYTE (c1);
4093 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4094 goto invalid_code;
4095 c = c << 8 | c1;
4096 charset = charset_big5;
4097 }
4098 if (charset->id != charset_ascii
4099 && last_id != charset->id)
4100 {
4101 if (last_id != charset_ascii)
4102 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4103 last_id = charset->id;
4104 last_offset = char_offset;
4105 }
4106 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4107 }
4108
4109 *charbuf++ = c;
4110 char_offset++;
4111 continue;
4112
4113 invalid_code:
4114 src = src_base;
4115 consumed_chars = consumed_chars_base;
4116 ONE_MORE_BYTE (c);
4117 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4118 char_offset++;
4119 coding->errors++;
4120 }
4121
4122 no_more_source:
4123 if (last_id != charset_ascii)
4124 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4125 coding->consumed_char += consumed_chars_base;
4126 coding->consumed = src_base - coding->source;
4127 coding->charbuf_used = charbuf - coding->charbuf;
4128 }
4129
4130 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4131 This function can encode charsets `ascii', `katakana-jisx0201',
4132 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4133 are sure that all these charsets are registered as official charset
4134 (i.e. do not have extended leading-codes). Characters of other
4135 charsets are produced without any encoding. If SJIS_P is 1, encode
4136 SJIS text, else encode BIG5 text. */
4137
4138 static int
4139 encode_coding_sjis (coding)
4140 struct coding_system *coding;
4141 {
4142 int multibytep = coding->dst_multibyte;
4143 int *charbuf = coding->charbuf;
4144 int *charbuf_end = charbuf + coding->charbuf_used;
4145 unsigned char *dst = coding->destination + coding->produced;
4146 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4147 int safe_room = 4;
4148 int produced_chars = 0;
4149 Lisp_Object attrs, eol_type, charset_list, val;
4150 int ascii_compatible;
4151 struct charset *charset_roman, *charset_kanji, *charset_kana;
4152 int c;
4153
4154 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4155 val = charset_list;
4156 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4157 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4158 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4159
4160 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4161
4162 while (charbuf < charbuf_end)
4163 {
4164 ASSURE_DESTINATION (safe_room);
4165 c = *charbuf++;
4166 /* Now encode the character C. */
4167 if (ASCII_CHAR_P (c) && ascii_compatible)
4168 EMIT_ONE_ASCII_BYTE (c);
4169 else if (CHAR_BYTE8_P (c))
4170 {
4171 c = CHAR_TO_BYTE8 (c);
4172 EMIT_ONE_BYTE (c);
4173 }
4174 else
4175 {
4176 unsigned code;
4177 struct charset *charset = char_charset (c, charset_list, &code);
4178
4179 if (!charset)
4180 {
4181 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4182 {
4183 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4184 charset = CHARSET_FROM_ID (charset_ascii);
4185 }
4186 else
4187 {
4188 c = coding->default_char;
4189 charset = char_charset (c, charset_list, &code);
4190 }
4191 }
4192 if (code == CHARSET_INVALID_CODE (charset))
4193 abort ();
4194 if (charset == charset_kanji)
4195 {
4196 int c1, c2;
4197 JIS_TO_SJIS (code);
4198 c1 = code >> 8, c2 = code & 0xFF;
4199 EMIT_TWO_BYTES (c1, c2);
4200 }
4201 else if (charset == charset_kana)
4202 EMIT_ONE_BYTE (code | 0x80);
4203 else
4204 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4205 }
4206 }
4207 coding->result = CODING_RESULT_SUCCESS;
4208 coding->produced_char += produced_chars;
4209 coding->produced = dst - coding->destination;
4210 return 0;
4211 }
4212
4213 static int
4214 encode_coding_big5 (coding)
4215 struct coding_system *coding;
4216 {
4217 int multibytep = coding->dst_multibyte;
4218 int *charbuf = coding->charbuf;
4219 int *charbuf_end = charbuf + coding->charbuf_used;
4220 unsigned char *dst = coding->destination + coding->produced;
4221 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4222 int safe_room = 4;
4223 int produced_chars = 0;
4224 Lisp_Object attrs, eol_type, charset_list, val;
4225 int ascii_compatible;
4226 struct charset *charset_roman, *charset_big5;
4227 int c;
4228
4229 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4230 val = charset_list;
4231 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4232 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4233 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4234
4235 while (charbuf < charbuf_end)
4236 {
4237 ASSURE_DESTINATION (safe_room);
4238 c = *charbuf++;
4239 /* Now encode the character C. */
4240 if (ASCII_CHAR_P (c) && ascii_compatible)
4241 EMIT_ONE_ASCII_BYTE (c);
4242 else if (CHAR_BYTE8_P (c))
4243 {
4244 c = CHAR_TO_BYTE8 (c);
4245 EMIT_ONE_BYTE (c);
4246 }
4247 else
4248 {
4249 unsigned code;
4250 struct charset *charset = char_charset (c, charset_list, &code);
4251
4252 if (! charset)
4253 {
4254 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4255 {
4256 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4257 charset = CHARSET_FROM_ID (charset_ascii);
4258 }
4259 else
4260 {
4261 c = coding->default_char;
4262 charset = char_charset (c, charset_list, &code);
4263 }
4264 }
4265 if (code == CHARSET_INVALID_CODE (charset))
4266 abort ();
4267 if (charset == charset_big5)
4268 {
4269 int c1, c2;
4270
4271 c1 = code >> 8, c2 = code & 0xFF;
4272 EMIT_TWO_BYTES (c1, c2);
4273 }
4274 else
4275 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4276 }
4277 }
4278 coding->result = CODING_RESULT_SUCCESS;
4279 coding->produced_char += produced_chars;
4280 coding->produced = dst - coding->destination;
4281 return 0;
4282 }
4283
4284 \f
4285 /*** 10. CCL handlers ***/
4286
4287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4288 Check if a text is encoded in a coding system of which
4289 encoder/decoder are written in CCL program. If it is, return
4290 CATEGORY_MASK_CCL, else return 0. */
4291
4292 static int
4293 detect_coding_ccl (coding, detect_info)
4294 struct coding_system *coding;
4295 struct coding_detection_info *detect_info;
4296 {
4297 const unsigned char *src = coding->source, *src_base = src;
4298 const unsigned char *src_end = coding->source + coding->src_bytes;
4299 int multibytep = coding->src_multibyte;
4300 int consumed_chars = 0;
4301 int found = 0;
4302 unsigned char *valids = CODING_CCL_VALIDS (coding);
4303 int head_ascii = coding->head_ascii;
4304 Lisp_Object attrs;
4305
4306 detect_info->checked |= CATEGORY_MASK_CCL;
4307
4308 coding = &coding_categories[coding_category_ccl];
4309 attrs = CODING_ID_ATTRS (coding->id);
4310 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4311 src += head_ascii;
4312
4313 while (1)
4314 {
4315 int c;
4316 ONE_MORE_BYTE (c);
4317 if (! valids[c])
4318 break;
4319 if ((valids[c] > 1))
4320 found = CATEGORY_MASK_CCL;
4321 }
4322 detect_info->rejected |= CATEGORY_MASK_CCL;
4323 return 0;
4324
4325 no_more_source:
4326 detect_info->found |= found;
4327 return 1;
4328 }
4329
4330 static void
4331 decode_coding_ccl (coding)
4332 struct coding_system *coding;
4333 {
4334 const unsigned char *src = coding->source + coding->consumed;
4335 const unsigned char *src_end = coding->source + coding->src_bytes;
4336 int *charbuf = coding->charbuf;
4337 int *charbuf_end = charbuf + coding->charbuf_size;
4338 int consumed_chars = 0;
4339 int multibytep = coding->src_multibyte;
4340 struct ccl_program ccl;
4341 int source_charbuf[1024];
4342 int source_byteidx[1024];
4343 Lisp_Object attrs, eol_type, charset_list;
4344
4345 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4346 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4347
4348 while (src < src_end)
4349 {
4350 const unsigned char *p = src;
4351 int *source, *source_end;
4352 int i = 0;
4353
4354 if (multibytep)
4355 while (i < 1024 && p < src_end)
4356 {
4357 source_byteidx[i] = p - src;
4358 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4359 }
4360 else
4361 while (i < 1024 && p < src_end)
4362 source_charbuf[i++] = *p++;
4363
4364 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4365 ccl.last_block = 1;
4366
4367 source = source_charbuf;
4368 source_end = source + i;
4369 while (source < source_end)
4370 {
4371 ccl_driver (&ccl, source, charbuf,
4372 source_end - source, charbuf_end - charbuf,
4373 charset_list);
4374 source += ccl.consumed;
4375 charbuf += ccl.produced;
4376 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4377 break;
4378 }
4379 if (source < source_end)
4380 src += source_byteidx[source - source_charbuf];
4381 else
4382 src = p;
4383 consumed_chars += source - source_charbuf;
4384
4385 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4386 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4387 break;
4388 }
4389
4390 switch (ccl.status)
4391 {
4392 case CCL_STAT_SUSPEND_BY_SRC:
4393 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4394 break;
4395 case CCL_STAT_SUSPEND_BY_DST:
4396 break;
4397 case CCL_STAT_QUIT:
4398 case CCL_STAT_INVALID_CMD:
4399 coding->result = CODING_RESULT_INTERRUPT;
4400 break;
4401 default:
4402 coding->result = CODING_RESULT_SUCCESS;
4403 break;
4404 }
4405 coding->consumed_char += consumed_chars;
4406 coding->consumed = src - coding->source;
4407 coding->charbuf_used = charbuf - coding->charbuf;
4408 }
4409
4410 static int
4411 encode_coding_ccl (coding)
4412 struct coding_system *coding;
4413 {
4414 struct ccl_program ccl;
4415 int multibytep = coding->dst_multibyte;
4416 int *charbuf = coding->charbuf;
4417 int *charbuf_end = charbuf + coding->charbuf_used;
4418 unsigned char *dst = coding->destination + coding->produced;
4419 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4420 unsigned char *adjusted_dst_end = dst_end - 1;
4421 int destination_charbuf[1024];
4422 int i, produced_chars = 0;
4423 Lisp_Object attrs, eol_type, charset_list;
4424
4425 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4426 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4427
4428 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4429 ccl.dst_multibyte = coding->dst_multibyte;
4430
4431 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4432 {
4433 int dst_bytes = dst_end - dst;
4434 if (dst_bytes > 1024)
4435 dst_bytes = 1024;
4436
4437 ccl_driver (&ccl, charbuf, destination_charbuf,
4438 charbuf_end - charbuf, dst_bytes, charset_list);
4439 charbuf += ccl.consumed;
4440 if (multibytep)
4441 for (i = 0; i < ccl.produced; i++)
4442 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4443 else
4444 {
4445 for (i = 0; i < ccl.produced; i++)
4446 *dst++ = destination_charbuf[i] & 0xFF;
4447 produced_chars += ccl.produced;
4448 }
4449 }
4450
4451 switch (ccl.status)
4452 {
4453 case CCL_STAT_SUSPEND_BY_SRC:
4454 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4455 break;
4456 case CCL_STAT_SUSPEND_BY_DST:
4457 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4458 break;
4459 case CCL_STAT_QUIT:
4460 case CCL_STAT_INVALID_CMD:
4461 coding->result = CODING_RESULT_INTERRUPT;
4462 break;
4463 default:
4464 coding->result = CODING_RESULT_SUCCESS;
4465 break;
4466 }
4467
4468 coding->produced_char += produced_chars;
4469 coding->produced = dst - coding->destination;
4470 return 0;
4471 }
4472
4473
4474 \f
4475 /*** 10, 11. no-conversion handlers ***/
4476
4477 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4478
4479 static void
4480 decode_coding_raw_text (coding)
4481 struct coding_system *coding;
4482 {
4483 coding->chars_at_source = 1;
4484 coding->consumed_char = 0;
4485 coding->consumed = 0;
4486 coding->result = CODING_RESULT_SUCCESS;
4487 }
4488
4489 static int
4490 encode_coding_raw_text (coding)
4491 struct coding_system *coding;
4492 {
4493 int multibytep = coding->dst_multibyte;
4494 int *charbuf = coding->charbuf;
4495 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4496 unsigned char *dst = coding->destination + coding->produced;
4497 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4498 int produced_chars = 0;
4499 int c;
4500
4501 if (multibytep)
4502 {
4503 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4504
4505 if (coding->src_multibyte)
4506 while (charbuf < charbuf_end)
4507 {
4508 ASSURE_DESTINATION (safe_room);
4509 c = *charbuf++;
4510 if (ASCII_CHAR_P (c))
4511 EMIT_ONE_ASCII_BYTE (c);
4512 else if (CHAR_BYTE8_P (c))
4513 {
4514 c = CHAR_TO_BYTE8 (c);
4515 EMIT_ONE_BYTE (c);
4516 }
4517 else
4518 {
4519 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4520
4521 CHAR_STRING_ADVANCE (c, p1);
4522 while (p0 < p1)
4523 {
4524 EMIT_ONE_BYTE (*p0);
4525 p0++;
4526 }
4527 }
4528 }
4529 else
4530 while (charbuf < charbuf_end)
4531 {
4532 ASSURE_DESTINATION (safe_room);
4533 c = *charbuf++;
4534 EMIT_ONE_BYTE (c);
4535 }
4536 }
4537 else
4538 {
4539 if (coding->src_multibyte)
4540 {
4541 int safe_room = MAX_MULTIBYTE_LENGTH;
4542
4543 while (charbuf < charbuf_end)
4544 {
4545 ASSURE_DESTINATION (safe_room);
4546 c = *charbuf++;
4547 if (ASCII_CHAR_P (c))
4548 *dst++ = c;
4549 else if (CHAR_BYTE8_P (c))
4550 *dst++ = CHAR_TO_BYTE8 (c);
4551 else
4552 CHAR_STRING_ADVANCE (c, dst);
4553 produced_chars++;
4554 }
4555 }
4556 else
4557 {
4558 ASSURE_DESTINATION (charbuf_end - charbuf);
4559 while (charbuf < charbuf_end && dst < dst_end)
4560 *dst++ = *charbuf++;
4561 produced_chars = dst - (coding->destination + coding->dst_bytes);
4562 }
4563 }
4564 coding->result = CODING_RESULT_SUCCESS;
4565 coding->produced_char += produced_chars;
4566 coding->produced = dst - coding->destination;
4567 return 0;
4568 }
4569
4570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4571 Check if a text is encoded in a charset-based coding system. If it
4572 is, return 1, else return 0. */
4573
4574 static int
4575 detect_coding_charset (coding, detect_info)
4576 struct coding_system *coding;
4577 struct coding_detection_info *detect_info;
4578 {
4579 const unsigned char *src = coding->source, *src_base = src;
4580 const unsigned char *src_end = coding->source + coding->src_bytes;
4581 int multibytep = coding->src_multibyte;
4582 int consumed_chars = 0;
4583 Lisp_Object attrs, valids;
4584 int found = 0;
4585
4586 detect_info->checked |= CATEGORY_MASK_CHARSET;
4587
4588 coding = &coding_categories[coding_category_charset];
4589 attrs = CODING_ID_ATTRS (coding->id);
4590 valids = AREF (attrs, coding_attr_charset_valids);
4591
4592 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4593 src += coding->head_ascii;
4594
4595 while (1)
4596 {
4597 int c;
4598
4599 ONE_MORE_BYTE (c);
4600 if (NILP (AREF (valids, c)))
4601 break;
4602 if (c >= 0x80)
4603 found = CATEGORY_MASK_CHARSET;
4604 }
4605 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4606 return 0;
4607
4608 no_more_source:
4609 detect_info->found |= found;
4610 return 1;
4611 }
4612
4613 static void
4614 decode_coding_charset (coding)
4615 struct coding_system *coding;
4616 {
4617 const unsigned char *src = coding->source + coding->consumed;
4618 const unsigned char *src_end = coding->source + coding->src_bytes;
4619 const unsigned char *src_base;
4620 int *charbuf = coding->charbuf;
4621 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4622 int consumed_chars = 0, consumed_chars_base;
4623 int multibytep = coding->src_multibyte;
4624 Lisp_Object attrs, eol_type, charset_list, valids;
4625 int char_offset = coding->produced_char;
4626 int last_offset = char_offset;
4627 int last_id = charset_ascii;
4628
4629 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4630 valids = AREF (attrs, coding_attr_charset_valids);
4631
4632 while (1)
4633 {
4634 int c;
4635
4636 src_base = src;
4637 consumed_chars_base = consumed_chars;
4638
4639 if (charbuf >= charbuf_end)
4640 break;
4641
4642 ONE_MORE_BYTE (c);
4643 if (c == '\r')
4644 {
4645 /* Here we assume that no charset maps '\r' to something
4646 else. */
4647 if (EQ (eol_type, Qdos))
4648 {
4649 if (src == src_end)
4650 {
4651 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4652 goto no_more_source;
4653 }
4654 if (*src == '\n')
4655 ONE_MORE_BYTE (c);
4656 }
4657 else if (EQ (eol_type, Qmac))
4658 c = '\n';
4659 }
4660 else
4661 {
4662 Lisp_Object val;
4663 struct charset *charset;
4664 int dim;
4665 int len = 1;
4666 unsigned code = c;
4667
4668 val = AREF (valids, c);
4669 if (NILP (val))
4670 goto invalid_code;
4671 if (INTEGERP (val))
4672 {
4673 charset = CHARSET_FROM_ID (XFASTINT (val));
4674 dim = CHARSET_DIMENSION (charset);
4675 while (len < dim)
4676 {
4677 ONE_MORE_BYTE (c);
4678 code = (code << 8) | c;
4679 len++;
4680 }
4681 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4682 charset, code, c);
4683 }
4684 else
4685 {
4686 /* VAL is a list of charset IDs. It is assured that the
4687 list is sorted by charset dimensions (smaller one
4688 comes first). */
4689 while (CONSP (val))
4690 {
4691 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4692 dim = CHARSET_DIMENSION (charset);
4693 while (len < dim)
4694 {
4695 ONE_MORE_BYTE (c);
4696 code = (code << 8) | c;
4697 len++;
4698 }
4699 CODING_DECODE_CHAR (coding, src, src_base,
4700 src_end, charset, code, c);
4701 if (c >= 0)
4702 break;
4703 val = XCDR (val);
4704 }
4705 }
4706 if (c < 0)
4707 goto invalid_code;
4708 if (charset->id != charset_ascii
4709 && last_id != charset->id)
4710 {
4711 if (last_id != charset_ascii)
4712 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4713 last_id = charset->id;
4714 last_offset = char_offset;
4715 }
4716 }
4717 *charbuf++ = c;
4718 char_offset++;
4719 continue;
4720
4721 invalid_code:
4722 src = src_base;
4723 consumed_chars = consumed_chars_base;
4724 ONE_MORE_BYTE (c);
4725 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4726 char_offset++;
4727 coding->errors++;
4728 }
4729
4730 no_more_source:
4731 if (last_id != charset_ascii)
4732 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4733 coding->consumed_char += consumed_chars_base;
4734 coding->consumed = src_base - coding->source;
4735 coding->charbuf_used = charbuf - coding->charbuf;
4736 }
4737
4738 static int
4739 encode_coding_charset (coding)
4740 struct coding_system *coding;
4741 {
4742 int multibytep = coding->dst_multibyte;
4743 int *charbuf = coding->charbuf;
4744 int *charbuf_end = charbuf + coding->charbuf_used;
4745 unsigned char *dst = coding->destination + coding->produced;
4746 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4747 int safe_room = MAX_MULTIBYTE_LENGTH;
4748 int produced_chars = 0;
4749 Lisp_Object attrs, eol_type, charset_list;
4750 int ascii_compatible;
4751 int c;
4752
4753 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4754 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4755
4756 while (charbuf < charbuf_end)
4757 {
4758 struct charset *charset;
4759 unsigned code;
4760
4761 ASSURE_DESTINATION (safe_room);
4762 c = *charbuf++;
4763 if (ascii_compatible && ASCII_CHAR_P (c))
4764 EMIT_ONE_ASCII_BYTE (c);
4765 else if (CHAR_BYTE8_P (c))
4766 {
4767 c = CHAR_TO_BYTE8 (c);
4768 EMIT_ONE_BYTE (c);
4769 }
4770 else
4771 {
4772 charset = char_charset (c, charset_list, &code);
4773 if (charset)
4774 {
4775 if (CHARSET_DIMENSION (charset) == 1)
4776 EMIT_ONE_BYTE (code);
4777 else if (CHARSET_DIMENSION (charset) == 2)
4778 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4779 else if (CHARSET_DIMENSION (charset) == 3)
4780 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4781 else
4782 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4783 (code >> 8) & 0xFF, code & 0xFF);
4784 }
4785 else
4786 {
4787 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4788 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4789 else
4790 c = coding->default_char;
4791 EMIT_ONE_BYTE (c);
4792 }
4793 }
4794 }
4795
4796 coding->result = CODING_RESULT_SUCCESS;
4797 coding->produced_char += produced_chars;
4798 coding->produced = dst - coding->destination;
4799 return 0;
4800 }
4801
4802 \f
4803 /*** 7. C library functions ***/
4804
4805 /* Setup coding context CODING from information about CODING_SYSTEM.
4806 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4807 CODING_SYSTEM is invalid, signal an error. */
4808
4809 void
4810 setup_coding_system (coding_system, coding)
4811 Lisp_Object coding_system;
4812 struct coding_system *coding;
4813 {
4814 Lisp_Object attrs;
4815 Lisp_Object eol_type;
4816 Lisp_Object coding_type;
4817 Lisp_Object val;
4818
4819 if (NILP (coding_system))
4820 coding_system = Qno_conversion;
4821
4822 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4823
4824 attrs = CODING_ID_ATTRS (coding->id);
4825 eol_type = CODING_ID_EOL_TYPE (coding->id);
4826
4827 coding->mode = 0;
4828 coding->head_ascii = -1;
4829 coding->common_flags
4830 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4831 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4832 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4833 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4834 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4835 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4836 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4837
4838 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4839 coding->max_charset_id = SCHARS (val) - 1;
4840 coding->safe_charsets = (char *) SDATA (val);
4841 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4842
4843 coding_type = CODING_ATTR_TYPE (attrs);
4844 if (EQ (coding_type, Qundecided))
4845 {
4846 coding->detector = NULL;
4847 coding->decoder = decode_coding_raw_text;
4848 coding->encoder = encode_coding_raw_text;
4849 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4850 }
4851 else if (EQ (coding_type, Qiso_2022))
4852 {
4853 int i;
4854 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4855
4856 /* Invoke graphic register 0 to plane 0. */
4857 CODING_ISO_INVOCATION (coding, 0) = 0;
4858 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4859 CODING_ISO_INVOCATION (coding, 1)
4860 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4861 /* Setup the initial status of designation. */
4862 for (i = 0; i < 4; i++)
4863 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4864 /* Not single shifting initially. */
4865 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4866 /* Beginning of buffer should also be regarded as bol. */
4867 CODING_ISO_BOL (coding) = 1;
4868 coding->detector = detect_coding_iso_2022;
4869 coding->decoder = decode_coding_iso_2022;
4870 coding->encoder = encode_coding_iso_2022;
4871 if (flags & CODING_ISO_FLAG_SAFE)
4872 coding->mode |= CODING_MODE_SAFE_ENCODING;
4873 coding->common_flags
4874 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4875 | CODING_REQUIRE_FLUSHING_MASK);
4876 if (flags & CODING_ISO_FLAG_COMPOSITION)
4877 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4878 if (flags & CODING_ISO_FLAG_DESIGNATION)
4879 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4880 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4881 {
4882 setup_iso_safe_charsets (attrs);
4883 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4884 coding->max_charset_id = SCHARS (val) - 1;
4885 coding->safe_charsets = (char *) SDATA (val);
4886 }
4887 CODING_ISO_FLAGS (coding) = flags;
4888 }
4889 else if (EQ (coding_type, Qcharset))
4890 {
4891 coding->detector = detect_coding_charset;
4892 coding->decoder = decode_coding_charset;
4893 coding->encoder = encode_coding_charset;
4894 coding->common_flags
4895 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4896 }
4897 else if (EQ (coding_type, Qutf_8))
4898 {
4899 coding->detector = detect_coding_utf_8;
4900 coding->decoder = decode_coding_utf_8;
4901 coding->encoder = encode_coding_utf_8;
4902 coding->common_flags
4903 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4904 }
4905 else if (EQ (coding_type, Qutf_16))
4906 {
4907 val = AREF (attrs, coding_attr_utf_16_bom);
4908 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4909 : EQ (val, Qt) ? utf_16_with_bom
4910 : utf_16_without_bom);
4911 val = AREF (attrs, coding_attr_utf_16_endian);
4912 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4913 : utf_16_little_endian);
4914 CODING_UTF_16_SURROGATE (coding) = 0;
4915 coding->detector = detect_coding_utf_16;
4916 coding->decoder = decode_coding_utf_16;
4917 coding->encoder = encode_coding_utf_16;
4918 coding->common_flags
4919 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4920 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4921 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4922 }
4923 else if (EQ (coding_type, Qccl))
4924 {
4925 coding->detector = detect_coding_ccl;
4926 coding->decoder = decode_coding_ccl;
4927 coding->encoder = encode_coding_ccl;
4928 coding->common_flags
4929 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4930 | CODING_REQUIRE_FLUSHING_MASK);
4931 }
4932 else if (EQ (coding_type, Qemacs_mule))
4933 {
4934 coding->detector = detect_coding_emacs_mule;
4935 coding->decoder = decode_coding_emacs_mule;
4936 coding->encoder = encode_coding_emacs_mule;
4937 coding->common_flags
4938 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4939 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4940 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4941 {
4942 Lisp_Object tail, safe_charsets;
4943 int max_charset_id = 0;
4944
4945 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4946 tail = XCDR (tail))
4947 if (max_charset_id < XFASTINT (XCAR (tail)))
4948 max_charset_id = XFASTINT (XCAR (tail));
4949 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4950 make_number (255));
4951 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4952 tail = XCDR (tail))
4953 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4954 coding->max_charset_id = max_charset_id;
4955 coding->safe_charsets = (char *) SDATA (safe_charsets);
4956 }
4957 }
4958 else if (EQ (coding_type, Qshift_jis))
4959 {
4960 coding->detector = detect_coding_sjis;
4961 coding->decoder = decode_coding_sjis;
4962 coding->encoder = encode_coding_sjis;
4963 coding->common_flags
4964 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4965 }
4966 else if (EQ (coding_type, Qbig5))
4967 {
4968 coding->detector = detect_coding_big5;
4969 coding->decoder = decode_coding_big5;
4970 coding->encoder = encode_coding_big5;
4971 coding->common_flags
4972 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4973 }
4974 else /* EQ (coding_type, Qraw_text) */
4975 {
4976 coding->detector = NULL;
4977 coding->decoder = decode_coding_raw_text;
4978 coding->encoder = encode_coding_raw_text;
4979 }
4980
4981 return;
4982 }
4983
4984 /* Return raw-text or one of its subsidiaries that has the same
4985 eol_type as CODING-SYSTEM. */
4986
4987 Lisp_Object
4988 raw_text_coding_system (coding_system)
4989 Lisp_Object coding_system;
4990 {
4991 Lisp_Object spec, attrs;
4992 Lisp_Object eol_type, raw_text_eol_type;
4993
4994 if (NILP (coding_system))
4995 return Qraw_text;
4996 spec = CODING_SYSTEM_SPEC (coding_system);
4997 attrs = AREF (spec, 0);
4998
4999 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5000 return coding_system;
5001
5002 eol_type = AREF (spec, 2);
5003 if (VECTORP (eol_type))
5004 return Qraw_text;
5005 spec = CODING_SYSTEM_SPEC (Qraw_text);
5006 raw_text_eol_type = AREF (spec, 2);
5007 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5008 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5009 : AREF (raw_text_eol_type, 2));
5010 }
5011
5012
5013 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5014 does, return one of the subsidiary that has the same eol-spec as
5015 PARENT. Otherwise, return CODING_SYSTEM. */
5016
5017 Lisp_Object
5018 coding_inherit_eol_type (coding_system, parent)
5019 Lisp_Object coding_system, parent;
5020 {
5021 Lisp_Object spec, attrs, eol_type;
5022
5023 if (NILP (coding_system))
5024 coding_system = Qraw_text;
5025 spec = CODING_SYSTEM_SPEC (coding_system);
5026 attrs = AREF (spec, 0);
5027 eol_type = AREF (spec, 2);
5028 if (VECTORP (eol_type)
5029 && ! NILP (parent))
5030 {
5031 Lisp_Object parent_spec;
5032 Lisp_Object parent_eol_type;
5033
5034 parent_spec
5035 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5036 parent_eol_type = AREF (parent_spec, 2);
5037 if (EQ (parent_eol_type, Qunix))
5038 coding_system = AREF (eol_type, 0);
5039 else if (EQ (parent_eol_type, Qdos))
5040 coding_system = AREF (eol_type, 1);
5041 else if (EQ (parent_eol_type, Qmac))
5042 coding_system = AREF (eol_type, 2);
5043 }
5044 return coding_system;
5045 }
5046
5047 /* Emacs has a mechanism to automatically detect a coding system if it
5048 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5049 it's impossible to distinguish some coding systems accurately
5050 because they use the same range of codes. So, at first, coding
5051 systems are categorized into 7, those are:
5052
5053 o coding-category-emacs-mule
5054
5055 The category for a coding system which has the same code range
5056 as Emacs' internal format. Assigned the coding-system (Lisp
5057 symbol) `emacs-mule' by default.
5058
5059 o coding-category-sjis
5060
5061 The category for a coding system which has the same code range
5062 as SJIS. Assigned the coding-system (Lisp
5063 symbol) `japanese-shift-jis' by default.
5064
5065 o coding-category-iso-7
5066
5067 The category for a coding system which has the same code range
5068 as ISO2022 of 7-bit environment. This doesn't use any locking
5069 shift and single shift functions. This can encode/decode all
5070 charsets. Assigned the coding-system (Lisp symbol)
5071 `iso-2022-7bit' by default.
5072
5073 o coding-category-iso-7-tight
5074
5075 Same as coding-category-iso-7 except that this can
5076 encode/decode only the specified charsets.
5077
5078 o coding-category-iso-8-1
5079
5080 The category for a coding system which has the same code range
5081 as ISO2022 of 8-bit environment and graphic plane 1 used only
5082 for DIMENSION1 charset. This doesn't use any locking shift
5083 and single shift functions. Assigned the coding-system (Lisp
5084 symbol) `iso-latin-1' by default.
5085
5086 o coding-category-iso-8-2
5087
5088 The category for a coding system which has the same code range
5089 as ISO2022 of 8-bit environment and graphic plane 1 used only
5090 for DIMENSION2 charset. This doesn't use any locking shift
5091 and single shift functions. Assigned the coding-system (Lisp
5092 symbol) `japanese-iso-8bit' by default.
5093
5094 o coding-category-iso-7-else
5095
5096 The category for a coding system which has the same code range
5097 as ISO2022 of 7-bit environemnt but uses locking shift or
5098 single shift functions. Assigned the coding-system (Lisp
5099 symbol) `iso-2022-7bit-lock' by default.
5100
5101 o coding-category-iso-8-else
5102
5103 The category for a coding system which has the same code range
5104 as ISO2022 of 8-bit environemnt but uses locking shift or
5105 single shift functions. Assigned the coding-system (Lisp
5106 symbol) `iso-2022-8bit-ss2' by default.
5107
5108 o coding-category-big5
5109
5110 The category for a coding system which has the same code range
5111 as BIG5. Assigned the coding-system (Lisp symbol)
5112 `cn-big5' by default.
5113
5114 o coding-category-utf-8
5115
5116 The category for a coding system which has the same code range
5117 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5118 symbol) `utf-8' by default.
5119
5120 o coding-category-utf-16-be
5121
5122 The category for a coding system in which a text has an
5123 Unicode signature (cf. Unicode Standard) in the order of BIG
5124 endian at the head. Assigned the coding-system (Lisp symbol)
5125 `utf-16-be' by default.
5126
5127 o coding-category-utf-16-le
5128
5129 The category for a coding system in which a text has an
5130 Unicode signature (cf. Unicode Standard) in the order of
5131 LITTLE endian at the head. Assigned the coding-system (Lisp
5132 symbol) `utf-16-le' by default.
5133
5134 o coding-category-ccl
5135
5136 The category for a coding system of which encoder/decoder is
5137 written in CCL programs. The default value is nil, i.e., no
5138 coding system is assigned.
5139
5140 o coding-category-binary
5141
5142 The category for a coding system not categorized in any of the
5143 above. Assigned the coding-system (Lisp symbol)
5144 `no-conversion' by default.
5145
5146 Each of them is a Lisp symbol and the value is an actual
5147 `coding-system's (this is also a Lisp symbol) assigned by a user.
5148 What Emacs does actually is to detect a category of coding system.
5149 Then, it uses a `coding-system' assigned to it. If Emacs can't
5150 decide only one possible category, it selects a category of the
5151 highest priority. Priorities of categories are also specified by a
5152 user in a Lisp variable `coding-category-list'.
5153
5154 */
5155
5156 #define EOL_SEEN_NONE 0
5157 #define EOL_SEEN_LF 1
5158 #define EOL_SEEN_CR 2
5159 #define EOL_SEEN_CRLF 4
5160
5161 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5162 SOURCE is encoded. If CATEGORY is one of
5163 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5164 two-byte, else they are encoded by one-byte.
5165
5166 Return one of EOL_SEEN_XXX. */
5167
5168 #define MAX_EOL_CHECK_COUNT 3
5169
5170 static int
5171 detect_eol (source, src_bytes, category)
5172 unsigned char *source;
5173 EMACS_INT src_bytes;
5174 enum coding_category category;
5175 {
5176 unsigned char *src = source, *src_end = src + src_bytes;
5177 unsigned char c;
5178 int total = 0;
5179 int eol_seen = EOL_SEEN_NONE;
5180
5181 if ((1 << category) & CATEGORY_MASK_UTF_16)
5182 {
5183 int msb, lsb;
5184
5185 msb = category == (coding_category_utf_16_le
5186 | coding_category_utf_16_le_nosig);
5187 lsb = 1 - msb;
5188
5189 while (src + 1 < src_end)
5190 {
5191 c = src[lsb];
5192 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5193 {
5194 int this_eol;
5195
5196 if (c == '\n')
5197 this_eol = EOL_SEEN_LF;
5198 else if (src + 3 >= src_end
5199 || src[msb + 2] != 0
5200 || src[lsb + 2] != '\n')
5201 this_eol = EOL_SEEN_CR;
5202 else
5203 this_eol = EOL_SEEN_CRLF;
5204
5205 if (eol_seen == EOL_SEEN_NONE)
5206 /* This is the first end-of-line. */
5207 eol_seen = this_eol;
5208 else if (eol_seen != this_eol)
5209 {
5210 /* The found type is different from what found before. */
5211 eol_seen = EOL_SEEN_LF;
5212 break;
5213 }
5214 if (++total == MAX_EOL_CHECK_COUNT)
5215 break;
5216 }
5217 src += 2;
5218 }
5219 }
5220 else
5221 {
5222 while (src < src_end)
5223 {
5224 c = *src++;
5225 if (c == '\n' || c == '\r')
5226 {
5227 int this_eol;
5228
5229 if (c == '\n')
5230 this_eol = EOL_SEEN_LF;
5231 else if (src >= src_end || *src != '\n')
5232 this_eol = EOL_SEEN_CR;
5233 else
5234 this_eol = EOL_SEEN_CRLF, src++;
5235
5236 if (eol_seen == EOL_SEEN_NONE)
5237 /* This is the first end-of-line. */
5238 eol_seen = this_eol;
5239 else if (eol_seen != this_eol)
5240 {
5241 /* The found type is different from what found before. */
5242 eol_seen = EOL_SEEN_LF;
5243 break;
5244 }
5245 if (++total == MAX_EOL_CHECK_COUNT)
5246 break;
5247 }
5248 }
5249 }
5250 return eol_seen;
5251 }
5252
5253
5254 static void
5255 adjust_coding_eol_type (coding, eol_seen)
5256 struct coding_system *coding;
5257 int eol_seen;
5258 {
5259 Lisp_Object eol_type;
5260
5261 eol_type = CODING_ID_EOL_TYPE (coding->id);
5262 if (eol_seen & EOL_SEEN_LF)
5263 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5264 else if (eol_seen & EOL_SEEN_CRLF)
5265 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5266 else if (eol_seen & EOL_SEEN_CR)
5267 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5268 }
5269
5270 /* Detect how a text specified in CODING is encoded. If a coding
5271 system is detected, update fields of CODING by the detected coding
5272 system. */
5273
5274 void
5275 detect_coding (coding)
5276 struct coding_system *coding;
5277 {
5278 const unsigned char *src, *src_end;
5279 Lisp_Object attrs, coding_type;
5280
5281 coding->consumed = coding->consumed_char = 0;
5282 coding->produced = coding->produced_char = 0;
5283 coding_set_source (coding);
5284
5285 src_end = coding->source + coding->src_bytes;
5286
5287 /* If we have not yet decided the text encoding type, detect it
5288 now. */
5289 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5290 {
5291 int c, i;
5292
5293 for (src = coding->source; src < src_end; src++)
5294 {
5295 c = *src;
5296 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5297 || c == ISO_CODE_SI
5298 || c == ISO_CODE_SO)))
5299 break;
5300 }
5301 coding->head_ascii = src - (coding->source + coding->consumed);
5302
5303 if (coding->head_ascii < coding->src_bytes)
5304 {
5305 struct coding_detection_info detect_info;
5306 enum coding_category category;
5307 struct coding_system *this;
5308
5309 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5310 for (i = 0; i < coding_category_raw_text; i++)
5311 {
5312 category = coding_priorities[i];
5313 this = coding_categories + category;
5314 if (this->id < 0)
5315 {
5316 /* No coding system of this category is defined. */
5317 detect_info.rejected |= (1 << category);
5318 }
5319 else if (category >= coding_category_raw_text)
5320 continue;
5321 else if (detect_info.checked & (1 << category))
5322 {
5323 if (detect_info.found & (1 << category))
5324 break;
5325 }
5326 else if ((*(this->detector)) (coding, &detect_info)
5327 && detect_info.found & (1 << category))
5328 break;
5329 }
5330 if (i < coding_category_raw_text)
5331 setup_coding_system (CODING_ID_NAME (this->id), coding);
5332 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5333 setup_coding_system (Qraw_text, coding);
5334 else if (detect_info.rejected)
5335 for (i = 0; i < coding_category_raw_text; i++)
5336 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5337 {
5338 this = coding_categories + coding_priorities[i];
5339 setup_coding_system (CODING_ID_NAME (this->id), coding);
5340 break;
5341 }
5342 }
5343 }
5344 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5345 {
5346 Lisp_Object coding_systems;
5347 struct coding_detection_info detect_info;
5348
5349 coding_systems
5350 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5351 detect_info.found = detect_info.rejected = 0;
5352 if (CONSP (coding_systems)
5353 && detect_coding_utf_16 (coding, &detect_info)
5354 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5355 | CATEGORY_MASK_UTF_16_BE)))
5356 {
5357 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5358 setup_coding_system (XCAR (coding_systems), coding);
5359 else
5360 setup_coding_system (XCDR (coding_systems), coding);
5361 }
5362 }
5363
5364 attrs = CODING_ID_ATTRS (coding->id);
5365 coding_type = CODING_ATTR_TYPE (attrs);
5366
5367 /* If we have not yet decided the EOL type, detect it now. But, the
5368 detection is impossible for a CCL based coding system, in which
5369 case, we detct the EOL type after decoding. */
5370 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5371 && ! EQ (coding_type, Qccl))
5372 {
5373 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5374 XINT (CODING_ATTR_CATEGORY (attrs)));
5375
5376 if (eol_seen != EOL_SEEN_NONE)
5377 adjust_coding_eol_type (coding, eol_seen);
5378 }
5379 }
5380
5381
5382 static void
5383 decode_eol (coding)
5384 struct coding_system *coding;
5385 {
5386 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5387 {
5388 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5389 unsigned char *pend = p + coding->produced;
5390 int eol_seen = EOL_SEEN_NONE;
5391
5392 for (; p < pend; p++)
5393 {
5394 if (*p == '\n')
5395 eol_seen |= EOL_SEEN_LF;
5396 else if (*p == '\r')
5397 {
5398 if (p + 1 < pend && *(p + 1) == '\n')
5399 {
5400 eol_seen |= EOL_SEEN_CRLF;
5401 p++;
5402 }
5403 else
5404 eol_seen |= EOL_SEEN_CR;
5405 }
5406 }
5407 if (eol_seen != EOL_SEEN_NONE)
5408 adjust_coding_eol_type (coding, eol_seen);
5409 }
5410
5411 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5412 {
5413 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5414 unsigned char *pend = p + coding->produced;
5415
5416 for (; p < pend; p++)
5417 if (*p == '\r')
5418 *p = '\n';
5419 }
5420 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5421 {
5422 unsigned char *p, *pbeg, *pend;
5423 Lisp_Object undo_list;
5424
5425 move_gap_both (coding->dst_pos + coding->produced_char,
5426 coding->dst_pos_byte + coding->produced);
5427 undo_list = current_buffer->undo_list;
5428 current_buffer->undo_list = Qt;
5429 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5430 current_buffer->undo_list = undo_list;
5431 pbeg = GPT_ADDR;
5432 pend = pbeg + coding->produced;
5433
5434 for (p = pend - 1; p >= pbeg; p--)
5435 if (*p == '\r')
5436 {
5437 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5438 pend--;
5439 }
5440 coding->produced_char -= coding->produced - (pend - pbeg);
5441 coding->produced = pend - pbeg;
5442 insert_from_gap (coding->produced_char, coding->produced);
5443 }
5444 }
5445
5446 static void
5447 translate_chars (coding, table)
5448 struct coding_system *coding;
5449 Lisp_Object table;
5450 {
5451 int *charbuf = coding->charbuf;
5452 int *charbuf_end = charbuf + coding->charbuf_used;
5453 int c;
5454
5455 if (coding->chars_at_source)
5456 return;
5457
5458 while (charbuf < charbuf_end)
5459 {
5460 c = *charbuf;
5461 if (c < 0)
5462 charbuf += c;
5463 else
5464 *charbuf++ = translate_char (table, c);
5465 }
5466 }
5467
5468 static int
5469 produce_chars (coding)
5470 struct coding_system *coding;
5471 {
5472 unsigned char *dst = coding->destination + coding->produced;
5473 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5474 int produced;
5475 int produced_chars = 0;
5476
5477 if (! coding->chars_at_source)
5478 {
5479 /* Characters are in coding->charbuf. */
5480 int *buf = coding->charbuf;
5481 int *buf_end = buf + coding->charbuf_used;
5482 unsigned char *adjusted_dst_end;
5483
5484 if (BUFFERP (coding->src_object)
5485 && EQ (coding->src_object, coding->dst_object))
5486 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5487 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5488
5489 while (buf < buf_end)
5490 {
5491 int c = *buf++;
5492
5493 if (dst >= adjusted_dst_end)
5494 {
5495 dst = alloc_destination (coding,
5496 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5497 dst);
5498 dst_end = coding->destination + coding->dst_bytes;
5499 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5500 }
5501 if (c >= 0)
5502 {
5503 if (coding->dst_multibyte
5504 || ! CHAR_BYTE8_P (c))
5505 CHAR_STRING_ADVANCE (c, dst);
5506 else
5507 *dst++ = CHAR_TO_BYTE8 (c);
5508 produced_chars++;
5509 }
5510 else
5511 /* This is an annotation datum. (-C) is the length of
5512 it. */
5513 buf += -c - 1;
5514 }
5515 }
5516 else
5517 {
5518 const unsigned char *src = coding->source;
5519 const unsigned char *src_end = src + coding->src_bytes;
5520 Lisp_Object eol_type;
5521
5522 eol_type = CODING_ID_EOL_TYPE (coding->id);
5523
5524 if (coding->src_multibyte != coding->dst_multibyte)
5525 {
5526 if (coding->src_multibyte)
5527 {
5528 int multibytep = 1;
5529 int consumed_chars;
5530
5531 while (1)
5532 {
5533 const unsigned char *src_base = src;
5534 int c;
5535
5536 ONE_MORE_BYTE (c);
5537 if (c == '\r')
5538 {
5539 if (EQ (eol_type, Qdos))
5540 {
5541 if (src == src_end)
5542 {
5543 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5544 goto no_more_source;
5545 }
5546 if (*src == '\n')
5547 c = *src++;
5548 }
5549 else if (EQ (eol_type, Qmac))
5550 c = '\n';
5551 }
5552 if (dst == dst_end)
5553 {
5554 coding->consumed = src - coding->source;
5555
5556 if (EQ (coding->src_object, coding->dst_object))
5557 dst_end = (unsigned char *) src;
5558 if (dst == dst_end)
5559 {
5560 dst = alloc_destination (coding, src_end - src + 1,
5561 dst);
5562 dst_end = coding->destination + coding->dst_bytes;
5563 coding_set_source (coding);
5564 src = coding->source + coding->consumed;
5565 src_end = coding->source + coding->src_bytes;
5566 }
5567 }
5568 *dst++ = c;
5569 produced_chars++;
5570 }
5571 no_more_source:
5572 ;
5573 }
5574 else
5575 while (src < src_end)
5576 {
5577 int multibytep = 1;
5578 int c = *src++;
5579
5580 if (c == '\r')
5581 {
5582 if (EQ (eol_type, Qdos))
5583 {
5584 if (src < src_end
5585 && *src == '\n')
5586 c = *src++;
5587 }
5588 else if (EQ (eol_type, Qmac))
5589 c = '\n';
5590 }
5591 if (dst >= dst_end - 1)
5592 {
5593 coding->consumed = src - coding->source;
5594
5595 if (EQ (coding->src_object, coding->dst_object))
5596 dst_end = (unsigned char *) src;
5597 if (dst >= dst_end - 1)
5598 {
5599 dst = alloc_destination (coding, src_end - src + 2,
5600 dst);
5601 dst_end = coding->destination + coding->dst_bytes;
5602 coding_set_source (coding);
5603 src = coding->source + coding->consumed;
5604 src_end = coding->source + coding->src_bytes;
5605 }
5606 }
5607 EMIT_ONE_BYTE (c);
5608 }
5609 }
5610 else
5611 {
5612 if (!EQ (coding->src_object, coding->dst_object))
5613 {
5614 int require = coding->src_bytes - coding->dst_bytes;
5615
5616 if (require > 0)
5617 {
5618 EMACS_INT offset = src - coding->source;
5619
5620 dst = alloc_destination (coding, require, dst);
5621 coding_set_source (coding);
5622 src = coding->source + offset;
5623 src_end = coding->source + coding->src_bytes;
5624 }
5625 }
5626 produced_chars = coding->src_chars;
5627 while (src < src_end)
5628 {
5629 int c = *src++;
5630
5631 if (c == '\r')
5632 {
5633 if (EQ (eol_type, Qdos))
5634 {
5635 if (src < src_end
5636 && *src == '\n')
5637 c = *src++;
5638 produced_chars--;
5639 }
5640 else if (EQ (eol_type, Qmac))
5641 c = '\n';
5642 }
5643 *dst++ = c;
5644 }
5645 }
5646 coding->consumed = coding->src_bytes;
5647 coding->consumed_char = coding->src_chars;
5648 }
5649
5650 produced = dst - (coding->destination + coding->produced);
5651 if (BUFFERP (coding->dst_object))
5652 insert_from_gap (produced_chars, produced);
5653 coding->produced += produced;
5654 coding->produced_char += produced_chars;
5655 return produced_chars;
5656 }
5657
5658 /* Compose text in CODING->object according to the annotation data at
5659 CHARBUF. CHARBUF is an array:
5660 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5661 */
5662
5663 static INLINE void
5664 produce_composition (coding, charbuf)
5665 struct coding_system *coding;
5666 int *charbuf;
5667 {
5668 int len;
5669 EMACS_INT from, to;
5670 enum composition_method method;
5671 Lisp_Object components;
5672
5673 len = -charbuf[0];
5674 from = coding->dst_pos + charbuf[2];
5675 to = coding->dst_pos + charbuf[3];
5676 method = (enum composition_method) (charbuf[4]);
5677
5678 if (method == COMPOSITION_RELATIVE)
5679 components = Qnil;
5680 else
5681 {
5682 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5683 int i;
5684
5685 len -= 5;
5686 charbuf += 5;
5687 for (i = 0; i < len; i++)
5688 args[i] = make_number (charbuf[i]);
5689 components = (method == COMPOSITION_WITH_ALTCHARS
5690 ? Fstring (len, args) : Fvector (len, args));
5691 }
5692 compose_text (from, to, components, Qnil, coding->dst_object);
5693 }
5694
5695
5696 /* Put `charset' property on text in CODING->object according to
5697 the annotation data at CHARBUF. CHARBUF is an array:
5698 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5699 */
5700
5701 static INLINE void
5702 produce_charset (coding, charbuf)
5703 struct coding_system *coding;
5704 int *charbuf;
5705 {
5706 EMACS_INT from = coding->dst_pos + charbuf[2];
5707 EMACS_INT to = coding->dst_pos + charbuf[3];
5708 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5709
5710 Fput_text_property (make_number (from), make_number (to),
5711 Qcharset, CHARSET_NAME (charset),
5712 coding->dst_object);
5713 }
5714
5715
5716 #define CHARBUF_SIZE 0x4000
5717
5718 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5719 do { \
5720 int size = CHARBUF_SIZE;; \
5721 \
5722 coding->charbuf = NULL; \
5723 while (size > 1024) \
5724 { \
5725 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5726 if (coding->charbuf) \
5727 break; \
5728 size >>= 1; \
5729 } \
5730 if (! coding->charbuf) \
5731 { \
5732 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5733 return coding->result; \
5734 } \
5735 coding->charbuf_size = size; \
5736 } while (0)
5737
5738
5739 static void
5740 produce_annotation (coding)
5741 struct coding_system *coding;
5742 {
5743 int *charbuf = coding->charbuf;
5744 int *charbuf_end = charbuf + coding->charbuf_used;
5745
5746 if (NILP (coding->dst_object))
5747 return;
5748
5749 while (charbuf < charbuf_end)
5750 {
5751 if (*charbuf >= 0)
5752 charbuf++;
5753 else
5754 {
5755 int len = -*charbuf;
5756 switch (charbuf[1])
5757 {
5758 case CODING_ANNOTATE_COMPOSITION_MASK:
5759 produce_composition (coding, charbuf);
5760 break;
5761 case CODING_ANNOTATE_CHARSET_MASK:
5762 produce_charset (coding, charbuf);
5763 break;
5764 default:
5765 abort ();
5766 }
5767 charbuf += len;
5768 }
5769 }
5770 }
5771
5772 /* Decode the data at CODING->src_object into CODING->dst_object.
5773 CODING->src_object is a buffer, a string, or nil.
5774 CODING->dst_object is a buffer.
5775
5776 If CODING->src_object is a buffer, it must be the current buffer.
5777 In this case, if CODING->src_pos is positive, it is a position of
5778 the source text in the buffer, otherwise, the source text is in the
5779 gap area of the buffer, and CODING->src_pos specifies the offset of
5780 the text from GPT (which must be the same as PT). If this is the
5781 same buffer as CODING->dst_object, CODING->src_pos must be
5782 negative.
5783
5784 If CODING->src_object is a string, CODING->src_pos in an index to
5785 that string.
5786
5787 If CODING->src_object is nil, CODING->source must already point to
5788 the non-relocatable memory area. In this case, CODING->src_pos is
5789 an offset from CODING->source.
5790
5791 The decoded data is inserted at the current point of the buffer
5792 CODING->dst_object.
5793 */
5794
5795 static int
5796 decode_coding (coding)
5797 struct coding_system *coding;
5798 {
5799 Lisp_Object attrs;
5800
5801 if (BUFFERP (coding->src_object)
5802 && coding->src_pos > 0
5803 && coding->src_pos < GPT
5804 && coding->src_pos + coding->src_chars > GPT)
5805 move_gap_both (coding->src_pos, coding->src_pos_byte);
5806
5807 if (BUFFERP (coding->dst_object))
5808 {
5809 if (current_buffer != XBUFFER (coding->dst_object))
5810 set_buffer_internal (XBUFFER (coding->dst_object));
5811 if (GPT != PT)
5812 move_gap_both (PT, PT_BYTE);
5813 }
5814
5815 coding->consumed = coding->consumed_char = 0;
5816 coding->produced = coding->produced_char = 0;
5817 coding->chars_at_source = 0;
5818 coding->result = CODING_RESULT_SUCCESS;
5819 coding->errors = 0;
5820
5821 ALLOC_CONVERSION_WORK_AREA (coding);
5822
5823 attrs = CODING_ID_ATTRS (coding->id);
5824
5825 do
5826 {
5827 coding_set_source (coding);
5828 coding->annotated = 0;
5829 (*(coding->decoder)) (coding);
5830 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5831 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5832 else if (!NILP (Vstandard_translation_table_for_decode))
5833 translate_chars (coding, Vstandard_translation_table_for_decode);
5834 coding_set_destination (coding);
5835 produce_chars (coding);
5836 if (coding->annotated)
5837 produce_annotation (coding);
5838 }
5839 while (coding->consumed < coding->src_bytes
5840 && ! coding->result);
5841
5842 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5843 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5844 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5845 decode_eol (coding);
5846
5847 coding->carryover_bytes = 0;
5848 if (coding->consumed < coding->src_bytes)
5849 {
5850 int nbytes = coding->src_bytes - coding->consumed;
5851 const unsigned char *src;
5852
5853 coding_set_source (coding);
5854 coding_set_destination (coding);
5855 src = coding->source + coding->consumed;
5856
5857 if (coding->mode & CODING_MODE_LAST_BLOCK)
5858 {
5859 /* Flush out unprocessed data as binary chars. We are sure
5860 that the number of data is less than the size of
5861 coding->charbuf. */
5862 while (nbytes-- > 0)
5863 {
5864 int c = *src++;
5865
5866 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5867 }
5868 produce_chars (coding);
5869 }
5870 else
5871 {
5872 /* Record unprocessed bytes in coding->carryover. We are
5873 sure that the number of data is less than the size of
5874 coding->carryover. */
5875 unsigned char *p = coding->carryover;
5876
5877 coding->carryover_bytes = nbytes;
5878 while (nbytes-- > 0)
5879 *p++ = *src++;
5880 }
5881 coding->consumed = coding->src_bytes;
5882 }
5883
5884 return coding->result;
5885 }
5886
5887
5888 /* Extract an annotation datum from a composition starting at POS and
5889 ending before LIMIT of CODING->src_object (buffer or string), store
5890 the data in BUF, set *STOP to a starting position of the next
5891 composition (if any) or to LIMIT, and return the address of the
5892 next element of BUF.
5893
5894 If such an annotation is not found, set *STOP to a starting
5895 position of a composition after POS (if any) or to LIMIT, and
5896 return BUF. */
5897
5898 static INLINE int *
5899 handle_composition_annotation (pos, limit, coding, buf, stop)
5900 EMACS_INT pos, limit;
5901 struct coding_system *coding;
5902 int *buf;
5903 EMACS_INT *stop;
5904 {
5905 EMACS_INT start, end;
5906 Lisp_Object prop;
5907
5908 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5909 || end > limit)
5910 *stop = limit;
5911 else if (start > pos)
5912 *stop = start;
5913 else
5914 {
5915 if (start == pos)
5916 {
5917 /* We found a composition. Store the corresponding
5918 annotation data in BUF. */
5919 int *head = buf;
5920 enum composition_method method = COMPOSITION_METHOD (prop);
5921 int nchars = COMPOSITION_LENGTH (prop);
5922
5923 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5924 if (method != COMPOSITION_RELATIVE)
5925 {
5926 Lisp_Object components;
5927 int len, i, i_byte;
5928
5929 components = COMPOSITION_COMPONENTS (prop);
5930 if (VECTORP (components))
5931 {
5932 len = XVECTOR (components)->size;
5933 for (i = 0; i < len; i++)
5934 *buf++ = XINT (AREF (components, i));
5935 }
5936 else if (STRINGP (components))
5937 {
5938 len = SCHARS (components);
5939 i = i_byte = 0;
5940 while (i < len)
5941 {
5942 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5943 buf++;
5944 }
5945 }
5946 else if (INTEGERP (components))
5947 {
5948 len = 1;
5949 *buf++ = XINT (components);
5950 }
5951 else if (CONSP (components))
5952 {
5953 for (len = 0; CONSP (components);
5954 len++, components = XCDR (components))
5955 *buf++ = XINT (XCAR (components));
5956 }
5957 else
5958 abort ();
5959 *head -= len;
5960 }
5961 }
5962
5963 if (find_composition (end, limit, &start, &end, &prop,
5964 coding->src_object)
5965 && end <= limit)
5966 *stop = start;
5967 else
5968 *stop = limit;
5969 }
5970 return buf;
5971 }
5972
5973
5974 /* Extract an annotation datum from a text property `charset' at POS of
5975 CODING->src_object (buffer of string), store the data in BUF, set
5976 *STOP to the position where the value of `charset' property changes
5977 (limiting by LIMIT), and return the address of the next element of
5978 BUF.
5979
5980 If the property value is nil, set *STOP to the position where the
5981 property value is non-nil (limiting by LIMIT), and return BUF. */
5982
5983 static INLINE int *
5984 handle_charset_annotation (pos, limit, coding, buf, stop)
5985 EMACS_INT pos, limit;
5986 struct coding_system *coding;
5987 int *buf;
5988 EMACS_INT *stop;
5989 {
5990 Lisp_Object val, next;
5991 int id;
5992
5993 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5994 if (! NILP (val) && CHARSETP (val))
5995 id = XINT (CHARSET_SYMBOL_ID (val));
5996 else
5997 id = -1;
5998 ADD_CHARSET_DATA (buf, 0, 0, id);
5999 next = Fnext_single_property_change (make_number (pos), Qcharset,
6000 coding->src_object,
6001 make_number (limit));
6002 *stop = XINT (next);
6003 return buf;
6004 }
6005
6006
6007 static void
6008 consume_chars (coding)
6009 struct coding_system *coding;
6010 {
6011 int *buf = coding->charbuf;
6012 int *buf_end = coding->charbuf + coding->charbuf_size;
6013 const unsigned char *src = coding->source + coding->consumed;
6014 const unsigned char *src_end = coding->source + coding->src_bytes;
6015 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6016 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6017 int multibytep = coding->src_multibyte;
6018 Lisp_Object eol_type;
6019 int c;
6020 EMACS_INT stop, stop_composition, stop_charset;
6021
6022 eol_type = CODING_ID_EOL_TYPE (coding->id);
6023 if (VECTORP (eol_type))
6024 eol_type = Qunix;
6025
6026 /* Note: composition handling is not yet implemented. */
6027 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6028
6029 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6030 stop = stop_composition = pos;
6031 else
6032 stop = stop_composition = end_pos;
6033 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6034 stop = stop_charset = pos;
6035 else
6036 stop_charset = end_pos;
6037
6038 /* Compensate for CRLF and annotation. */
6039 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6040 while (buf < buf_end)
6041 {
6042 if (pos == stop)
6043 {
6044 if (pos == end_pos)
6045 break;
6046 if (pos == stop_composition)
6047 buf = handle_composition_annotation (pos, end_pos, coding,
6048 buf, &stop_composition);
6049 if (pos == stop_charset)
6050 buf = handle_charset_annotation (pos, end_pos, coding,
6051 buf, &stop_charset);
6052 stop = (stop_composition < stop_charset
6053 ? stop_composition : stop_charset);
6054 }
6055
6056 if (! multibytep)
6057 {
6058 EMACS_INT bytes;
6059
6060 if (! CODING_FOR_UNIBYTE (coding)
6061 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6062 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6063 else
6064 c = *src++, pos++;
6065 }
6066 else
6067 c = STRING_CHAR_ADVANCE (src), pos++;
6068 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6069 c = '\n';
6070 if (! EQ (eol_type, Qunix))
6071 {
6072 if (c == '\n')
6073 {
6074 if (EQ (eol_type, Qdos))
6075 *buf++ = '\r';
6076 else
6077 c = '\r';
6078 }
6079 }
6080 *buf++ = c;
6081 }
6082
6083 coding->consumed = src - coding->source;
6084 coding->consumed_char = pos - coding->src_pos;
6085 coding->charbuf_used = buf - coding->charbuf;
6086 coding->chars_at_source = 0;
6087 }
6088
6089
6090 /* Encode the text at CODING->src_object into CODING->dst_object.
6091 CODING->src_object is a buffer or a string.
6092 CODING->dst_object is a buffer or nil.
6093
6094 If CODING->src_object is a buffer, it must be the current buffer.
6095 In this case, if CODING->src_pos is positive, it is a position of
6096 the source text in the buffer, otherwise. the source text is in the
6097 gap area of the buffer, and coding->src_pos specifies the offset of
6098 the text from GPT (which must be the same as PT). If this is the
6099 same buffer as CODING->dst_object, CODING->src_pos must be
6100 negative and CODING should not have `pre-write-conversion'.
6101
6102 If CODING->src_object is a string, CODING should not have
6103 `pre-write-conversion'.
6104
6105 If CODING->dst_object is a buffer, the encoded data is inserted at
6106 the current point of that buffer.
6107
6108 If CODING->dst_object is nil, the encoded data is placed at the
6109 memory area specified by CODING->destination. */
6110
6111 static int
6112 encode_coding (coding)
6113 struct coding_system *coding;
6114 {
6115 Lisp_Object attrs;
6116
6117 attrs = CODING_ID_ATTRS (coding->id);
6118
6119 if (BUFFERP (coding->dst_object))
6120 {
6121 set_buffer_internal (XBUFFER (coding->dst_object));
6122 coding->dst_multibyte
6123 = ! NILP (current_buffer->enable_multibyte_characters);
6124 }
6125
6126 coding->consumed = coding->consumed_char = 0;
6127 coding->produced = coding->produced_char = 0;
6128 coding->result = CODING_RESULT_SUCCESS;
6129 coding->errors = 0;
6130
6131 ALLOC_CONVERSION_WORK_AREA (coding);
6132
6133 do {
6134 coding_set_source (coding);
6135 consume_chars (coding);
6136
6137 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6138 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6139 else if (!NILP (Vstandard_translation_table_for_encode))
6140 translate_chars (coding, Vstandard_translation_table_for_encode);
6141
6142 coding_set_destination (coding);
6143 (*(coding->encoder)) (coding);
6144 } while (coding->consumed_char < coding->src_chars);
6145
6146 if (BUFFERP (coding->dst_object))
6147 insert_from_gap (coding->produced_char, coding->produced);
6148
6149 return (coding->result);
6150 }
6151
6152
6153 /* Stack of working buffers used in code conversion. An nil element
6154 means that the code conversion of that level is not using a working
6155 buffer. */
6156 Lisp_Object Vcode_conversion_work_buf_list;
6157
6158 /* A working buffer used by the top level conversion. */
6159 Lisp_Object Vcode_conversion_reused_work_buf;
6160
6161
6162 /* Return a working buffer that can be freely used by the following
6163 code conversion. MULTIBYTEP specifies the multibyteness of the
6164 buffer. */
6165
6166 Lisp_Object
6167 make_conversion_work_buffer (multibytep, depth)
6168 int multibytep, depth;
6169 {
6170 struct buffer *current = current_buffer;
6171 Lisp_Object buf, name;
6172
6173 if (depth == 0)
6174 {
6175 if (NILP (Vcode_conversion_reused_work_buf))
6176 Vcode_conversion_reused_work_buf
6177 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6178 buf = Vcode_conversion_reused_work_buf;
6179 }
6180 else
6181 {
6182 if (depth < 0)
6183 {
6184 name = build_string (" *code-converting-work*");
6185 name = Fgenerate_new_buffer_name (name, Qnil);
6186 }
6187 else
6188 {
6189 char str[128];
6190
6191 sprintf (str, " *code-converting-work*<%d>", depth);
6192 name = build_string (str);
6193 }
6194 buf = Fget_buffer_create (name);
6195 }
6196 set_buffer_internal (XBUFFER (buf));
6197 current_buffer->undo_list = Qt;
6198 Ferase_buffer ();
6199 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
6200 set_buffer_internal (current);
6201 return buf;
6202 }
6203
6204 static Lisp_Object
6205 code_conversion_restore (buffer)
6206 Lisp_Object buffer;
6207 {
6208 Lisp_Object workbuf;
6209
6210 workbuf = XCAR (Vcode_conversion_work_buf_list);
6211 if (! NILP (workbuf)
6212 && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6213 && ! NILP (Fbuffer_live_p (workbuf)))
6214 Fkill_buffer (workbuf);
6215 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6216 set_buffer_internal (XBUFFER (buffer));
6217 return Qnil;
6218 }
6219
6220 static Lisp_Object
6221 code_conversion_save (buffer, with_work_buf, multibyte)
6222 Lisp_Object buffer;
6223 int with_work_buf, multibyte;
6224 {
6225 Lisp_Object workbuf;
6226
6227 if (with_work_buf)
6228 {
6229 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6230
6231 workbuf = make_conversion_work_buffer (multibyte, depth);
6232 }
6233 else
6234 workbuf = Qnil;
6235 Vcode_conversion_work_buf_list
6236 = Fcons (workbuf, Vcode_conversion_work_buf_list);
6237 record_unwind_protect (code_conversion_restore, buffer);
6238 return workbuf;
6239 }
6240
6241 int
6242 decode_coding_gap (coding, chars, bytes)
6243 struct coding_system *coding;
6244 EMACS_INT chars, bytes;
6245 {
6246 int count = specpdl_ptr - specpdl;
6247 Lisp_Object attrs;
6248 Lisp_Object buffer;
6249
6250 buffer = Fcurrent_buffer ();
6251 code_conversion_save (buffer, 0, 0);
6252
6253 coding->src_object = buffer;
6254 coding->src_chars = chars;
6255 coding->src_bytes = bytes;
6256 coding->src_pos = -chars;
6257 coding->src_pos_byte = -bytes;
6258 coding->src_multibyte = chars < bytes;
6259 coding->dst_object = buffer;
6260 coding->dst_pos = PT;
6261 coding->dst_pos_byte = PT_BYTE;
6262 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6263 coding->mode |= CODING_MODE_LAST_BLOCK;
6264
6265 if (CODING_REQUIRE_DETECTION (coding))
6266 detect_coding (coding);
6267
6268 decode_coding (coding);
6269
6270 attrs = CODING_ID_ATTRS (coding->id);
6271 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6272 {
6273 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6274 Lisp_Object val;
6275
6276 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6277 val = call1 (CODING_ATTR_POST_READ (attrs),
6278 make_number (coding->produced_char));
6279 CHECK_NATNUM (val);
6280 coding->produced_char += Z - prev_Z;
6281 coding->produced += Z_BYTE - prev_Z_BYTE;
6282 }
6283
6284 unbind_to (count, Qnil);
6285 return coding->result;
6286 }
6287
6288 int
6289 encode_coding_gap (coding, chars, bytes)
6290 struct coding_system *coding;
6291 EMACS_INT chars, bytes;
6292 {
6293 int count = specpdl_ptr - specpdl;
6294 Lisp_Object buffer;
6295
6296 buffer = Fcurrent_buffer ();
6297 code_conversion_save (buffer, 0, 0);
6298
6299 coding->src_object = buffer;
6300 coding->src_chars = chars;
6301 coding->src_bytes = bytes;
6302 coding->src_pos = -chars;
6303 coding->src_pos_byte = -bytes;
6304 coding->src_multibyte = chars < bytes;
6305 coding->dst_object = coding->src_object;
6306 coding->dst_pos = PT;
6307 coding->dst_pos_byte = PT_BYTE;
6308
6309 encode_coding (coding);
6310
6311 unbind_to (count, Qnil);
6312 return coding->result;
6313 }
6314
6315
6316 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6317 SRC_OBJECT into DST_OBJECT by coding context CODING.
6318
6319 SRC_OBJECT is a buffer, a string, or Qnil.
6320
6321 If it is a buffer, the text is at point of the buffer. FROM and TO
6322 are positions in the buffer.
6323
6324 If it is a string, the text is at the beginning of the string.
6325 FROM and TO are indices to the string.
6326
6327 If it is nil, the text is at coding->source. FROM and TO are
6328 indices to coding->source.
6329
6330 DST_OBJECT is a buffer, Qt, or Qnil.
6331
6332 If it is a buffer, the decoded text is inserted at point of the
6333 buffer. If the buffer is the same as SRC_OBJECT, the source text
6334 is deleted.
6335
6336 If it is Qt, a string is made from the decoded text, and
6337 set in CODING->dst_object.
6338
6339 If it is Qnil, the decoded text is stored at CODING->destination.
6340 The caller must allocate CODING->dst_bytes bytes at
6341 CODING->destination by xmalloc. If the decoded text is longer than
6342 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6343 */
6344
6345 void
6346 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6347 dst_object)
6348 struct coding_system *coding;
6349 Lisp_Object src_object;
6350 EMACS_INT from, from_byte, to, to_byte;
6351 Lisp_Object dst_object;
6352 {
6353 int count = specpdl_ptr - specpdl;
6354 unsigned char *destination;
6355 EMACS_INT dst_bytes;
6356 EMACS_INT chars = to - from;
6357 EMACS_INT bytes = to_byte - from_byte;
6358 Lisp_Object attrs;
6359 Lisp_Object buffer;
6360 int saved_pt = -1, saved_pt_byte;
6361
6362 buffer = Fcurrent_buffer ();
6363
6364 if (NILP (dst_object))
6365 {
6366 destination = coding->destination;
6367 dst_bytes = coding->dst_bytes;
6368 }
6369
6370 coding->src_object = src_object;
6371 coding->src_chars = chars;
6372 coding->src_bytes = bytes;
6373 coding->src_multibyte = chars < bytes;
6374
6375 if (STRINGP (src_object))
6376 {
6377 coding->src_pos = from;
6378 coding->src_pos_byte = from_byte;
6379 }
6380 else if (BUFFERP (src_object))
6381 {
6382 set_buffer_internal (XBUFFER (src_object));
6383 if (from != GPT)
6384 move_gap_both (from, from_byte);
6385 if (EQ (src_object, dst_object))
6386 {
6387 saved_pt = PT, saved_pt_byte = PT_BYTE;
6388 TEMP_SET_PT_BOTH (from, from_byte);
6389 del_range_both (from, from_byte, to, to_byte, 1);
6390 coding->src_pos = -chars;
6391 coding->src_pos_byte = -bytes;
6392 }
6393 else
6394 {
6395 coding->src_pos = from;
6396 coding->src_pos_byte = from_byte;
6397 }
6398 }
6399
6400 if (CODING_REQUIRE_DETECTION (coding))
6401 detect_coding (coding);
6402 attrs = CODING_ID_ATTRS (coding->id);
6403
6404 if (EQ (dst_object, Qt)
6405 || (! NILP (CODING_ATTR_POST_READ (attrs))
6406 && NILP (dst_object)))
6407 {
6408 coding->dst_object = code_conversion_save (buffer, 1, 1);
6409 coding->dst_pos = BEG;
6410 coding->dst_pos_byte = BEG_BYTE;
6411 coding->dst_multibyte = 1;
6412 }
6413 else if (BUFFERP (dst_object))
6414 {
6415 code_conversion_save (buffer, 0, 0);
6416 coding->dst_object = dst_object;
6417 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6418 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6419 coding->dst_multibyte
6420 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6421 }
6422 else
6423 {
6424 code_conversion_save (buffer, 0, 0);
6425 coding->dst_object = Qnil;
6426 coding->dst_multibyte = 1;
6427 }
6428
6429 decode_coding (coding);
6430
6431 if (BUFFERP (coding->dst_object))
6432 set_buffer_internal (XBUFFER (coding->dst_object));
6433
6434 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6435 {
6436 struct gcpro gcpro1, gcpro2;
6437 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6438 Lisp_Object val;
6439
6440 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6441 GCPRO2 (coding->src_object, coding->dst_object);
6442 val = call1 (CODING_ATTR_POST_READ (attrs),
6443 make_number (coding->produced_char));
6444 UNGCPRO;
6445 CHECK_NATNUM (val);
6446 coding->produced_char += Z - prev_Z;
6447 coding->produced += Z_BYTE - prev_Z_BYTE;
6448 }
6449
6450 if (EQ (dst_object, Qt))
6451 {
6452 coding->dst_object = Fbuffer_string ();
6453 }
6454 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6455 {
6456 set_buffer_internal (XBUFFER (coding->dst_object));
6457 if (dst_bytes < coding->produced)
6458 {
6459 destination
6460 = (unsigned char *) xrealloc (destination, coding->produced);
6461 if (! destination)
6462 {
6463 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6464 unbind_to (count, Qnil);
6465 return;
6466 }
6467 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6468 move_gap_both (BEGV, BEGV_BYTE);
6469 bcopy (BEGV_ADDR, destination, coding->produced);
6470 coding->destination = destination;
6471 }
6472 }
6473
6474 if (saved_pt >= 0)
6475 {
6476 /* This is the case of:
6477 (BUFFERP (src_object) && EQ (src_object, dst_object))
6478 As we have moved PT while replacing the original buffer
6479 contents, we must recover it now. */
6480 set_buffer_internal (XBUFFER (src_object));
6481 if (saved_pt < from)
6482 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6483 else if (saved_pt < from + chars)
6484 TEMP_SET_PT_BOTH (from, from_byte);
6485 else if (! NILP (current_buffer->enable_multibyte_characters))
6486 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6487 saved_pt_byte + (coding->produced - bytes));
6488 else
6489 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6490 saved_pt_byte + (coding->produced - bytes));
6491 }
6492
6493 unbind_to (count, Qnil);
6494 }
6495
6496
6497 void
6498 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6499 dst_object)
6500 struct coding_system *coding;
6501 Lisp_Object src_object;
6502 EMACS_INT from, from_byte, to, to_byte;
6503 Lisp_Object dst_object;
6504 {
6505 int count = specpdl_ptr - specpdl;
6506 EMACS_INT chars = to - from;
6507 EMACS_INT bytes = to_byte - from_byte;
6508 Lisp_Object attrs;
6509 Lisp_Object buffer;
6510 int saved_pt = -1, saved_pt_byte;
6511
6512 buffer = Fcurrent_buffer ();
6513
6514 coding->src_object = src_object;
6515 coding->src_chars = chars;
6516 coding->src_bytes = bytes;
6517 coding->src_multibyte = chars < bytes;
6518
6519 attrs = CODING_ID_ATTRS (coding->id);
6520
6521 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6522 {
6523 coding->src_object = code_conversion_save (buffer, 1,
6524 coding->src_multibyte);
6525 set_buffer_internal (XBUFFER (coding->src_object));
6526 if (STRINGP (src_object))
6527 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6528 else if (BUFFERP (src_object))
6529 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6530 else
6531 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6532
6533 if (EQ (src_object, dst_object))
6534 {
6535 set_buffer_internal (XBUFFER (src_object));
6536 saved_pt = PT, saved_pt_byte = PT_BYTE;
6537 del_range_both (from, from_byte, to, to_byte, 1);
6538 set_buffer_internal (XBUFFER (coding->src_object));
6539 }
6540
6541 call2 (CODING_ATTR_PRE_WRITE (attrs),
6542 make_number (BEG), make_number (Z));
6543 coding->src_object = Fcurrent_buffer ();
6544 if (BEG != GPT)
6545 move_gap_both (BEG, BEG_BYTE);
6546 coding->src_chars = Z - BEG;
6547 coding->src_bytes = Z_BYTE - BEG_BYTE;
6548 coding->src_pos = BEG;
6549 coding->src_pos_byte = BEG_BYTE;
6550 coding->src_multibyte = Z < Z_BYTE;
6551 }
6552 else if (STRINGP (src_object))
6553 {
6554 code_conversion_save (buffer, 0, 0);
6555 coding->src_pos = from;
6556 coding->src_pos_byte = from_byte;
6557 }
6558 else if (BUFFERP (src_object))
6559 {
6560 code_conversion_save (buffer, 0, 0);
6561 set_buffer_internal (XBUFFER (src_object));
6562 if (EQ (src_object, dst_object))
6563 {
6564 saved_pt = PT, saved_pt_byte = PT_BYTE;
6565 coding->src_object = del_range_1 (from, to, 1, 1);
6566 coding->src_pos = 0;
6567 coding->src_pos_byte = 0;
6568 }
6569 else
6570 {
6571 if (from < GPT && to >= GPT)
6572 move_gap_both (from, from_byte);
6573 coding->src_pos = from;
6574 coding->src_pos_byte = from_byte;
6575 }
6576 }
6577 else
6578 code_conversion_save (buffer, 0, 0);
6579
6580 if (BUFFERP (dst_object))
6581 {
6582 coding->dst_object = dst_object;
6583 if (EQ (src_object, dst_object))
6584 {
6585 coding->dst_pos = from;
6586 coding->dst_pos_byte = from_byte;
6587 }
6588 else
6589 {
6590 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6591 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6592 }
6593 coding->dst_multibyte
6594 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6595 }
6596 else if (EQ (dst_object, Qt))
6597 {
6598 coding->dst_object = Qnil;
6599 coding->dst_bytes = coding->src_chars;
6600 if (coding->dst_bytes == 0)
6601 coding->dst_bytes = 1;
6602 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6603 coding->dst_multibyte = 0;
6604 }
6605 else
6606 {
6607 coding->dst_object = Qnil;
6608 coding->dst_multibyte = 0;
6609 }
6610
6611 encode_coding (coding);
6612
6613 if (EQ (dst_object, Qt))
6614 {
6615 if (BUFFERP (coding->dst_object))
6616 coding->dst_object = Fbuffer_string ();
6617 else
6618 {
6619 coding->dst_object
6620 = make_unibyte_string ((char *) coding->destination,
6621 coding->produced);
6622 xfree (coding->destination);
6623 }
6624 }
6625
6626 if (saved_pt >= 0)
6627 {
6628 /* This is the case of:
6629 (BUFFERP (src_object) && EQ (src_object, dst_object))
6630 As we have moved PT while replacing the original buffer
6631 contents, we must recover it now. */
6632 set_buffer_internal (XBUFFER (src_object));
6633 if (saved_pt < from)
6634 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6635 else if (saved_pt < from + chars)
6636 TEMP_SET_PT_BOTH (from, from_byte);
6637 else if (! NILP (current_buffer->enable_multibyte_characters))
6638 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6639 saved_pt_byte + (coding->produced - bytes));
6640 else
6641 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6642 saved_pt_byte + (coding->produced - bytes));
6643 }
6644
6645 unbind_to (count, Qnil);
6646 }
6647
6648
6649 Lisp_Object
6650 preferred_coding_system ()
6651 {
6652 int id = coding_categories[coding_priorities[0]].id;
6653
6654 return CODING_ID_NAME (id);
6655 }
6656
6657 \f
6658 #ifdef emacs
6659 /*** 8. Emacs Lisp library functions ***/
6660
6661 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6662 doc: /* Return t if OBJECT is nil or a coding-system.
6663 See the documentation of `define-coding-system' for information
6664 about coding-system objects. */)
6665 (obj)
6666 Lisp_Object obj;
6667 {
6668 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6669 }
6670
6671 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6672 Sread_non_nil_coding_system, 1, 1, 0,
6673 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6674 (prompt)
6675 Lisp_Object prompt;
6676 {
6677 Lisp_Object val;
6678 do
6679 {
6680 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6681 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6682 }
6683 while (SCHARS (val) == 0);
6684 return (Fintern (val, Qnil));
6685 }
6686
6687 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6688 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6689 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6690 (prompt, default_coding_system)
6691 Lisp_Object prompt, default_coding_system;
6692 {
6693 Lisp_Object val;
6694 if (SYMBOLP (default_coding_system))
6695 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6696 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6697 Qt, Qnil, Qcoding_system_history,
6698 default_coding_system, Qnil);
6699 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6700 }
6701
6702 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6703 1, 1, 0,
6704 doc: /* Check validity of CODING-SYSTEM.
6705 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6706 (coding_system)
6707 Lisp_Object coding_system;
6708 {
6709 CHECK_SYMBOL (coding_system);
6710 if (!NILP (Fcoding_system_p (coding_system)))
6711 return coding_system;
6712 while (1)
6713 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6714 }
6715
6716 \f
6717 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6718 HIGHEST is nonzero, return the coding system of the highest
6719 priority among the detected coding systems. Otherwize return a
6720 list of detected coding systems sorted by their priorities. If
6721 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6722 multibyte form but contains only ASCII and eight-bit chars.
6723 Otherwise, the bytes are raw bytes.
6724
6725 CODING-SYSTEM controls the detection as below:
6726
6727 If it is nil, detect both text-format and eol-format. If the
6728 text-format part of CODING-SYSTEM is already specified
6729 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6730 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6731 detect only text-format. */
6732
6733 Lisp_Object
6734 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6735 const unsigned char *src;
6736 int src_bytes, highest;
6737 int multibytep;
6738 Lisp_Object coding_system;
6739 {
6740 const unsigned char *src_end = src + src_bytes;
6741 Lisp_Object attrs, eol_type;
6742 Lisp_Object val;
6743 struct coding_system coding;
6744 int id;
6745 struct coding_detection_info detect_info;
6746
6747 if (NILP (coding_system))
6748 coding_system = Qundecided;
6749 setup_coding_system (coding_system, &coding);
6750 attrs = CODING_ID_ATTRS (coding.id);
6751 eol_type = CODING_ID_EOL_TYPE (coding.id);
6752 coding_system = CODING_ATTR_BASE_NAME (attrs);
6753
6754 coding.source = src;
6755 coding.src_bytes = src_bytes;
6756 coding.src_multibyte = multibytep;
6757 coding.consumed = 0;
6758 coding.mode |= CODING_MODE_LAST_BLOCK;
6759
6760 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6761
6762 /* At first, detect text-format if necessary. */
6763 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6764 {
6765 enum coding_category category;
6766 struct coding_system *this;
6767 int c, i;
6768
6769 for (; src < src_end; src++)
6770 {
6771 c = *src;
6772 if (c & 0x80
6773 || (c < 0x20 && (c == ISO_CODE_ESC
6774 || c == ISO_CODE_SI
6775 || c == ISO_CODE_SO)))
6776 break;
6777 }
6778 coding.head_ascii = src - coding.source;
6779
6780 if (src < src_end)
6781 for (i = 0; i < coding_category_raw_text; i++)
6782 {
6783 category = coding_priorities[i];
6784 this = coding_categories + category;
6785
6786 if (this->id < 0)
6787 {
6788 /* No coding system of this category is defined. */
6789 detect_info.rejected |= (1 << category);
6790 }
6791 else if (category >= coding_category_raw_text)
6792 continue;
6793 else if (detect_info.checked & (1 << category))
6794 {
6795 if (highest
6796 && (detect_info.found & (1 << category)))
6797 break;
6798 }
6799 else
6800 {
6801 if ((*(this->detector)) (&coding, &detect_info)
6802 && highest
6803 && (detect_info.found & (1 << category)))
6804 break;
6805 }
6806 }
6807
6808
6809 if (detect_info.rejected == CATEGORY_MASK_ANY)
6810 {
6811 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6812 id = coding_categories[coding_category_raw_text].id;
6813 val = Fcons (make_number (id), Qnil);
6814 }
6815 else if (! detect_info.rejected && ! detect_info.found)
6816 {
6817 detect_info.found = CATEGORY_MASK_ANY;
6818 id = coding_categories[coding_category_undecided].id;
6819 val = Fcons (make_number (id), Qnil);
6820 }
6821 else if (highest)
6822 {
6823 if (detect_info.found)
6824 {
6825 detect_info.found = 1 << category;
6826 val = Fcons (make_number (this->id), Qnil);
6827 }
6828 else
6829 for (i = 0; i < coding_category_raw_text; i++)
6830 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6831 {
6832 detect_info.found = 1 << coding_priorities[i];
6833 id = coding_categories[coding_priorities[i]].id;
6834 val = Fcons (make_number (id), Qnil);
6835 break;
6836 }
6837 }
6838 else
6839 {
6840 int mask = detect_info.rejected | detect_info.found;
6841 int found = 0;
6842 val = Qnil;
6843
6844 for (i = coding_category_raw_text - 1; i >= 0; i--)
6845 {
6846 category = coding_priorities[i];
6847 if (! (mask & (1 << category)))
6848 {
6849 found |= 1 << category;
6850 id = coding_categories[category].id;
6851 val = Fcons (make_number (id), val);
6852 }
6853 }
6854 for (i = coding_category_raw_text - 1; i >= 0; i--)
6855 {
6856 category = coding_priorities[i];
6857 if (detect_info.found & (1 << category))
6858 {
6859 id = coding_categories[category].id;
6860 val = Fcons (make_number (id), val);
6861 }
6862 }
6863 detect_info.found |= found;
6864 }
6865 }
6866 else
6867 {
6868 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6869 val = Fcons (make_number (coding.id), Qnil);
6870 }
6871
6872 /* Then, detect eol-format if necessary. */
6873 {
6874 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6875 Lisp_Object tail;
6876
6877 if (VECTORP (eol_type))
6878 {
6879 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6880 normal_eol = detect_eol (coding.source, src_bytes,
6881 coding_category_raw_text);
6882 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6883 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6884 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6885 coding_category_utf_16_be);
6886 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6887 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6888 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6889 coding_category_utf_16_le);
6890 }
6891 else
6892 {
6893 if (EQ (eol_type, Qunix))
6894 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6895 else if (EQ (eol_type, Qdos))
6896 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6897 else
6898 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6899 }
6900
6901 for (tail = val; CONSP (tail); tail = XCDR (tail))
6902 {
6903 enum coding_category category;
6904 int this_eol;
6905
6906 id = XINT (XCAR (tail));
6907 attrs = CODING_ID_ATTRS (id);
6908 category = XINT (CODING_ATTR_CATEGORY (attrs));
6909 eol_type = CODING_ID_EOL_TYPE (id);
6910 if (VECTORP (eol_type))
6911 {
6912 if (category == coding_category_utf_16_be
6913 || category == coding_category_utf_16_be_nosig)
6914 this_eol = utf_16_be_eol;
6915 else if (category == coding_category_utf_16_le
6916 || category == coding_category_utf_16_le_nosig)
6917 this_eol = utf_16_le_eol;
6918 else
6919 this_eol = normal_eol;
6920
6921 if (this_eol == EOL_SEEN_LF)
6922 XSETCAR (tail, AREF (eol_type, 0));
6923 else if (this_eol == EOL_SEEN_CRLF)
6924 XSETCAR (tail, AREF (eol_type, 1));
6925 else if (this_eol == EOL_SEEN_CR)
6926 XSETCAR (tail, AREF (eol_type, 2));
6927 else
6928 XSETCAR (tail, CODING_ID_NAME (id));
6929 }
6930 else
6931 XSETCAR (tail, CODING_ID_NAME (id));
6932 }
6933 }
6934
6935 return (highest ? XCAR (val) : val);
6936 }
6937
6938
6939 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6940 2, 3, 0,
6941 doc: /* Detect coding system of the text in the region between START and END.
6942 Return a list of possible coding systems ordered by priority.
6943
6944 If only ASCII characters are found, it returns a list of single element
6945 `undecided' or its subsidiary coding system according to a detected
6946 end-of-line format.
6947
6948 If optional argument HIGHEST is non-nil, return the coding system of
6949 highest priority. */)
6950 (start, end, highest)
6951 Lisp_Object start, end, highest;
6952 {
6953 int from, to;
6954 int from_byte, to_byte;
6955
6956 CHECK_NUMBER_COERCE_MARKER (start);
6957 CHECK_NUMBER_COERCE_MARKER (end);
6958
6959 validate_region (&start, &end);
6960 from = XINT (start), to = XINT (end);
6961 from_byte = CHAR_TO_BYTE (from);
6962 to_byte = CHAR_TO_BYTE (to);
6963
6964 if (from < GPT && to >= GPT)
6965 move_gap_both (to, to_byte);
6966
6967 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6968 to_byte - from_byte,
6969 !NILP (highest),
6970 !NILP (current_buffer
6971 ->enable_multibyte_characters),
6972 Qnil);
6973 }
6974
6975 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6976 1, 2, 0,
6977 doc: /* Detect coding system of the text in STRING.
6978 Return a list of possible coding systems ordered by priority.
6979
6980 If only ASCII characters are found, it returns a list of single element
6981 `undecided' or its subsidiary coding system according to a detected
6982 end-of-line format.
6983
6984 If optional argument HIGHEST is non-nil, return the coding system of
6985 highest priority. */)
6986 (string, highest)
6987 Lisp_Object string, highest;
6988 {
6989 CHECK_STRING (string);
6990
6991 return detect_coding_system (SDATA (string), SBYTES (string),
6992 !NILP (highest), STRING_MULTIBYTE (string),
6993 Qnil);
6994 }
6995
6996
6997 static INLINE int
6998 char_encodable_p (c, attrs)
6999 int c;
7000 Lisp_Object attrs;
7001 {
7002 Lisp_Object tail;
7003 struct charset *charset;
7004
7005 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7006 CONSP (tail); tail = XCDR (tail))
7007 {
7008 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7009 if (CHAR_CHARSET_P (c, charset))
7010 break;
7011 }
7012 return (! NILP (tail));
7013 }
7014
7015
7016 /* Return a list of coding systems that safely encode the text between
7017 START and END. If EXCLUDE is non-nil, it is a list of coding
7018 systems not to check. The returned list doesn't contain any such
7019 coding systems. In any case, if the text contains only ASCII or is
7020 unibyte, return t. */
7021
7022 DEFUN ("find-coding-systems-region-internal",
7023 Ffind_coding_systems_region_internal,
7024 Sfind_coding_systems_region_internal, 2, 3, 0,
7025 doc: /* Internal use only. */)
7026 (start, end, exclude)
7027 Lisp_Object start, end, exclude;
7028 {
7029 Lisp_Object coding_attrs_list, safe_codings;
7030 EMACS_INT start_byte, end_byte;
7031 const unsigned char *p, *pbeg, *pend;
7032 int c;
7033 Lisp_Object tail, elt;
7034
7035 if (STRINGP (start))
7036 {
7037 if (!STRING_MULTIBYTE (start)
7038 || SCHARS (start) == SBYTES (start))
7039 return Qt;
7040 start_byte = 0;
7041 end_byte = SBYTES (start);
7042 }
7043 else
7044 {
7045 CHECK_NUMBER_COERCE_MARKER (start);
7046 CHECK_NUMBER_COERCE_MARKER (end);
7047 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7048 args_out_of_range (start, end);
7049 if (NILP (current_buffer->enable_multibyte_characters))
7050 return Qt;
7051 start_byte = CHAR_TO_BYTE (XINT (start));
7052 end_byte = CHAR_TO_BYTE (XINT (end));
7053 if (XINT (end) - XINT (start) == end_byte - start_byte)
7054 return Qt;
7055
7056 if (XINT (start) < GPT && XINT (end) > GPT)
7057 {
7058 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7059 move_gap_both (XINT (start), start_byte);
7060 else
7061 move_gap_both (XINT (end), end_byte);
7062 }
7063 }
7064
7065 coding_attrs_list = Qnil;
7066 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7067 if (NILP (exclude)
7068 || NILP (Fmemq (XCAR (tail), exclude)))
7069 {
7070 Lisp_Object attrs;
7071
7072 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7073 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7074 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7075 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7076 }
7077
7078 if (STRINGP (start))
7079 p = pbeg = SDATA (start);
7080 else
7081 p = pbeg = BYTE_POS_ADDR (start_byte);
7082 pend = p + (end_byte - start_byte);
7083
7084 while (p < pend && ASCII_BYTE_P (*p)) p++;
7085 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7086
7087 while (p < pend)
7088 {
7089 if (ASCII_BYTE_P (*p))
7090 p++;
7091 else
7092 {
7093 c = STRING_CHAR_ADVANCE (p);
7094
7095 charset_map_loaded = 0;
7096 for (tail = coding_attrs_list; CONSP (tail);)
7097 {
7098 elt = XCAR (tail);
7099 if (NILP (elt))
7100 tail = XCDR (tail);
7101 else if (char_encodable_p (c, elt))
7102 tail = XCDR (tail);
7103 else if (CONSP (XCDR (tail)))
7104 {
7105 XSETCAR (tail, XCAR (XCDR (tail)));
7106 XSETCDR (tail, XCDR (XCDR (tail)));
7107 }
7108 else
7109 {
7110 XSETCAR (tail, Qnil);
7111 tail = XCDR (tail);
7112 }
7113 }
7114 if (charset_map_loaded)
7115 {
7116 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7117
7118 if (STRINGP (start))
7119 pbeg = SDATA (start);
7120 else
7121 pbeg = BYTE_POS_ADDR (start_byte);
7122 p = pbeg + p_offset;
7123 pend = pbeg + pend_offset;
7124 }
7125 }
7126 }
7127
7128 safe_codings = Qnil;
7129 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7130 if (! NILP (XCAR (tail)))
7131 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7132
7133 return safe_codings;
7134 }
7135
7136
7137 DEFUN ("unencodable-char-position", Funencodable_char_position,
7138 Sunencodable_char_position, 3, 5, 0,
7139 doc: /*
7140 Return position of first un-encodable character in a region.
7141 START and END specfiy the region and CODING-SYSTEM specifies the
7142 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7143
7144 If optional 4th argument COUNT is non-nil, it specifies at most how
7145 many un-encodable characters to search. In this case, the value is a
7146 list of positions.
7147
7148 If optional 5th argument STRING is non-nil, it is a string to search
7149 for un-encodable characters. In that case, START and END are indexes
7150 to the string. */)
7151 (start, end, coding_system, count, string)
7152 Lisp_Object start, end, coding_system, count, string;
7153 {
7154 int n;
7155 struct coding_system coding;
7156 Lisp_Object attrs, charset_list;
7157 Lisp_Object positions;
7158 int from, to;
7159 const unsigned char *p, *stop, *pend;
7160 int ascii_compatible;
7161
7162 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7163 attrs = CODING_ID_ATTRS (coding.id);
7164 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7165 return Qnil;
7166 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7167 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7168
7169 if (NILP (string))
7170 {
7171 validate_region (&start, &end);
7172 from = XINT (start);
7173 to = XINT (end);
7174 if (NILP (current_buffer->enable_multibyte_characters)
7175 || (ascii_compatible
7176 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7177 return Qnil;
7178 p = CHAR_POS_ADDR (from);
7179 pend = CHAR_POS_ADDR (to);
7180 if (from < GPT && to >= GPT)
7181 stop = GPT_ADDR;
7182 else
7183 stop = pend;
7184 }
7185 else
7186 {
7187 CHECK_STRING (string);
7188 CHECK_NATNUM (start);
7189 CHECK_NATNUM (end);
7190 from = XINT (start);
7191 to = XINT (end);
7192 if (from > to
7193 || to > SCHARS (string))
7194 args_out_of_range_3 (string, start, end);
7195 if (! STRING_MULTIBYTE (string))
7196 return Qnil;
7197 p = SDATA (string) + string_char_to_byte (string, from);
7198 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7199 if (ascii_compatible && (to - from) == (pend - p))
7200 return Qnil;
7201 }
7202
7203 if (NILP (count))
7204 n = 1;
7205 else
7206 {
7207 CHECK_NATNUM (count);
7208 n = XINT (count);
7209 }
7210
7211 positions = Qnil;
7212 while (1)
7213 {
7214 int c;
7215
7216 if (ascii_compatible)
7217 while (p < stop && ASCII_BYTE_P (*p))
7218 p++, from++;
7219 if (p >= stop)
7220 {
7221 if (p >= pend)
7222 break;
7223 stop = pend;
7224 p = GAP_END_ADDR;
7225 }
7226
7227 c = STRING_CHAR_ADVANCE (p);
7228 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7229 && ! char_charset (c, charset_list, NULL))
7230 {
7231 positions = Fcons (make_number (from), positions);
7232 n--;
7233 if (n == 0)
7234 break;
7235 }
7236
7237 from++;
7238 }
7239
7240 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7241 }
7242
7243
7244 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7245 Scheck_coding_systems_region, 3, 3, 0,
7246 doc: /* Check if the region is encodable by coding systems.
7247
7248 START and END are buffer positions specifying the region.
7249 CODING-SYSTEM-LIST is a list of coding systems to check.
7250
7251 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7252 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7253 whole region, POS0, POS1, ... are buffer positions where non-encodable
7254 characters are found.
7255
7256 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7257 value is nil.
7258
7259 START may be a string. In that case, check if the string is
7260 encodable, and the value contains indices to the string instead of
7261 buffer positions. END is ignored. */)
7262 (start, end, coding_system_list)
7263 Lisp_Object start, end, coding_system_list;
7264 {
7265 Lisp_Object list;
7266 EMACS_INT start_byte, end_byte;
7267 int pos;
7268 const unsigned char *p, *pbeg, *pend;
7269 int c;
7270 Lisp_Object tail, elt;
7271
7272 if (STRINGP (start))
7273 {
7274 if (!STRING_MULTIBYTE (start)
7275 && SCHARS (start) != SBYTES (start))
7276 return Qnil;
7277 start_byte = 0;
7278 end_byte = SBYTES (start);
7279 pos = 0;
7280 }
7281 else
7282 {
7283 CHECK_NUMBER_COERCE_MARKER (start);
7284 CHECK_NUMBER_COERCE_MARKER (end);
7285 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7286 args_out_of_range (start, end);
7287 if (NILP (current_buffer->enable_multibyte_characters))
7288 return Qnil;
7289 start_byte = CHAR_TO_BYTE (XINT (start));
7290 end_byte = CHAR_TO_BYTE (XINT (end));
7291 if (XINT (end) - XINT (start) == end_byte - start_byte)
7292 return Qt;
7293
7294 if (XINT (start) < GPT && XINT (end) > GPT)
7295 {
7296 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7297 move_gap_both (XINT (start), start_byte);
7298 else
7299 move_gap_both (XINT (end), end_byte);
7300 }
7301 pos = XINT (start);
7302 }
7303
7304 list = Qnil;
7305 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7306 {
7307 elt = XCAR (tail);
7308 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7309 Qnil)),
7310 list);
7311 }
7312
7313 if (STRINGP (start))
7314 p = pbeg = SDATA (start);
7315 else
7316 p = pbeg = BYTE_POS_ADDR (start_byte);
7317 pend = p + (end_byte - start_byte);
7318
7319 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7320 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7321
7322 while (p < pend)
7323 {
7324 if (ASCII_BYTE_P (*p))
7325 p++;
7326 else
7327 {
7328 c = STRING_CHAR_ADVANCE (p);
7329
7330 charset_map_loaded = 0;
7331 for (tail = list; CONSP (tail); tail = XCDR (tail))
7332 {
7333 elt = XCDR (XCAR (tail));
7334 if (! char_encodable_p (c, XCAR (elt)))
7335 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7336 }
7337 if (charset_map_loaded)
7338 {
7339 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7340
7341 if (STRINGP (start))
7342 pbeg = SDATA (start);
7343 else
7344 pbeg = BYTE_POS_ADDR (start_byte);
7345 p = pbeg + p_offset;
7346 pend = pbeg + pend_offset;
7347 }
7348 }
7349 pos++;
7350 }
7351
7352 tail = list;
7353 list = Qnil;
7354 for (; CONSP (tail); tail = XCDR (tail))
7355 {
7356 elt = XCAR (tail);
7357 if (CONSP (XCDR (XCDR (elt))))
7358 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7359 list);
7360 }
7361
7362 return list;
7363 }
7364
7365
7366
7367 Lisp_Object
7368 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7369 Lisp_Object start, end, coding_system, dst_object;
7370 int encodep, norecord;
7371 {
7372 struct coding_system coding;
7373 EMACS_INT from, from_byte, to, to_byte;
7374 Lisp_Object src_object;
7375
7376 CHECK_NUMBER_COERCE_MARKER (start);
7377 CHECK_NUMBER_COERCE_MARKER (end);
7378 if (NILP (coding_system))
7379 coding_system = Qno_conversion;
7380 else
7381 CHECK_CODING_SYSTEM (coding_system);
7382 src_object = Fcurrent_buffer ();
7383 if (NILP (dst_object))
7384 dst_object = src_object;
7385 else if (! EQ (dst_object, Qt))
7386 CHECK_BUFFER (dst_object);
7387
7388 validate_region (&start, &end);
7389 from = XFASTINT (start);
7390 from_byte = CHAR_TO_BYTE (from);
7391 to = XFASTINT (end);
7392 to_byte = CHAR_TO_BYTE (to);
7393
7394 setup_coding_system (coding_system, &coding);
7395 coding.mode |= CODING_MODE_LAST_BLOCK;
7396
7397 if (encodep)
7398 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7399 dst_object);
7400 else
7401 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7402 dst_object);
7403 if (! norecord)
7404 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7405
7406 if (coding.result != CODING_RESULT_SUCCESS)
7407 error ("Code conversion error: %d", coding.result);
7408
7409 return (BUFFERP (dst_object)
7410 ? make_number (coding.produced_char)
7411 : coding.dst_object);
7412 }
7413
7414
7415 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7416 3, 4, "r\nzCoding system: ",
7417 doc: /* Decode the current region from the specified coding system.
7418 When called from a program, takes four arguments:
7419 START, END, CODING-SYSTEM, and DESTINATION.
7420 START and END are buffer positions.
7421
7422 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7423 If nil, the region between START and END is replace by the decoded text.
7424 If buffer, the decoded text is inserted in the buffer.
7425 If t, the decoded text is returned.
7426
7427 This function sets `last-coding-system-used' to the precise coding system
7428 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7429 not fully specified.)
7430 It returns the length of the decoded text. */)
7431 (start, end, coding_system, destination)
7432 Lisp_Object start, end, coding_system, destination;
7433 {
7434 return code_convert_region (start, end, coding_system, destination, 0, 0);
7435 }
7436
7437 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7438 3, 4, "r\nzCoding system: ",
7439 doc: /* Encode the current region by specified coding system.
7440 When called from a program, takes three arguments:
7441 START, END, and CODING-SYSTEM. START and END are buffer positions.
7442
7443 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7444 If nil, the region between START and END is replace by the encoded text.
7445 If buffer, the encoded text is inserted in the buffer.
7446 If t, the encoded text is returned.
7447
7448 This function sets `last-coding-system-used' to the precise coding system
7449 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7450 not fully specified.)
7451 It returns the length of the encoded text. */)
7452 (start, end, coding_system, destination)
7453 Lisp_Object start, end, coding_system, destination;
7454 {
7455 return code_convert_region (start, end, coding_system, destination, 1, 0);
7456 }
7457
7458 Lisp_Object
7459 code_convert_string (string, coding_system, dst_object,
7460 encodep, nocopy, norecord)
7461 Lisp_Object string, coding_system, dst_object;
7462 int encodep, nocopy, norecord;
7463 {
7464 struct coding_system coding;
7465 EMACS_INT chars, bytes;
7466
7467 CHECK_STRING (string);
7468 if (NILP (coding_system))
7469 {
7470 if (! norecord)
7471 Vlast_coding_system_used = Qno_conversion;
7472 if (NILP (dst_object))
7473 return (nocopy ? Fcopy_sequence (string) : string);
7474 }
7475
7476 if (NILP (coding_system))
7477 coding_system = Qno_conversion;
7478 else
7479 CHECK_CODING_SYSTEM (coding_system);
7480 if (NILP (dst_object))
7481 dst_object = Qt;
7482 else if (! EQ (dst_object, Qt))
7483 CHECK_BUFFER (dst_object);
7484
7485 setup_coding_system (coding_system, &coding);
7486 coding.mode |= CODING_MODE_LAST_BLOCK;
7487 chars = SCHARS (string);
7488 bytes = SBYTES (string);
7489 if (encodep)
7490 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7491 else
7492 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7493 if (! norecord)
7494 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7495
7496 if (coding.result != CODING_RESULT_SUCCESS)
7497 error ("Code conversion error: %d", coding.result);
7498
7499 return (BUFFERP (dst_object)
7500 ? make_number (coding.produced_char)
7501 : coding.dst_object);
7502 }
7503
7504
7505 /* Encode or decode STRING according to CODING_SYSTEM.
7506 Do not set Vlast_coding_system_used.
7507
7508 This function is called only from macros DECODE_FILE and
7509 ENCODE_FILE, thus we ignore character composition. */
7510
7511 Lisp_Object
7512 code_convert_string_norecord (string, coding_system, encodep)
7513 Lisp_Object string, coding_system;
7514 int encodep;
7515 {
7516 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7517 }
7518
7519
7520 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7521 2, 4, 0,
7522 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7523
7524 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7525 if the decoding operation is trivial.
7526
7527 Optional fourth arg BUFFER non-nil meant that the decoded text is
7528 inserted in BUFFER instead of returned as a string. In this case,
7529 the return value is BUFFER.
7530
7531 This function sets `last-coding-system-used' to the precise coding system
7532 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7533 not fully specified. */)
7534 (string, coding_system, nocopy, buffer)
7535 Lisp_Object string, coding_system, nocopy, buffer;
7536 {
7537 return code_convert_string (string, coding_system, buffer,
7538 0, ! NILP (nocopy), 0);
7539 }
7540
7541 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7542 2, 4, 0,
7543 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7544
7545 Optional third arg NOCOPY non-nil means it is OK to return STRING
7546 itself if the encoding operation is trivial.
7547
7548 Optional fourth arg BUFFER non-nil meant that the encoded text is
7549 inserted in BUFFER instead of returned as a string. In this case,
7550 the return value is BUFFER.
7551
7552 This function sets `last-coding-system-used' to the precise coding system
7553 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7554 not fully specified.) */)
7555 (string, coding_system, nocopy, buffer)
7556 Lisp_Object string, coding_system, nocopy, buffer;
7557 {
7558 return code_convert_string (string, coding_system, buffer,
7559 1, ! NILP (nocopy), 1);
7560 }
7561
7562 \f
7563 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7564 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7565 Return the corresponding character. */)
7566 (code)
7567 Lisp_Object code;
7568 {
7569 Lisp_Object spec, attrs, val;
7570 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7571 int c;
7572
7573 CHECK_NATNUM (code);
7574 c = XFASTINT (code);
7575 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7576 attrs = AREF (spec, 0);
7577
7578 if (ASCII_BYTE_P (c)
7579 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7580 return code;
7581
7582 val = CODING_ATTR_CHARSET_LIST (attrs);
7583 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7584 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7585 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7586
7587 if (c <= 0x7F)
7588 charset = charset_roman;
7589 else if (c >= 0xA0 && c < 0xDF)
7590 {
7591 charset = charset_kana;
7592 c -= 0x80;
7593 }
7594 else
7595 {
7596 int s1 = c >> 8, s2 = c & 0xFF;
7597
7598 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7599 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7600 error ("Invalid code: %d", code);
7601 SJIS_TO_JIS (c);
7602 charset = charset_kanji;
7603 }
7604 c = DECODE_CHAR (charset, c);
7605 if (c < 0)
7606 error ("Invalid code: %d", code);
7607 return make_number (c);
7608 }
7609
7610
7611 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7612 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7613 Return the corresponding code in SJIS. */)
7614 (ch)
7615 Lisp_Object ch;
7616 {
7617 Lisp_Object spec, attrs, charset_list;
7618 int c;
7619 struct charset *charset;
7620 unsigned code;
7621
7622 CHECK_CHARACTER (ch);
7623 c = XFASTINT (ch);
7624 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7625 attrs = AREF (spec, 0);
7626
7627 if (ASCII_CHAR_P (c)
7628 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7629 return ch;
7630
7631 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7632 charset = char_charset (c, charset_list, &code);
7633 if (code == CHARSET_INVALID_CODE (charset))
7634 error ("Can't encode by shift_jis encoding: %d", c);
7635 JIS_TO_SJIS (code);
7636
7637 return make_number (code);
7638 }
7639
7640 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7641 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7642 Return the corresponding character. */)
7643 (code)
7644 Lisp_Object code;
7645 {
7646 Lisp_Object spec, attrs, val;
7647 struct charset *charset_roman, *charset_big5, *charset;
7648 int c;
7649
7650 CHECK_NATNUM (code);
7651 c = XFASTINT (code);
7652 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7653 attrs = AREF (spec, 0);
7654
7655 if (ASCII_BYTE_P (c)
7656 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7657 return code;
7658
7659 val = CODING_ATTR_CHARSET_LIST (attrs);
7660 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7661 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7662
7663 if (c <= 0x7F)
7664 charset = charset_roman;
7665 else
7666 {
7667 int b1 = c >> 8, b2 = c & 0x7F;
7668 if (b1 < 0xA1 || b1 > 0xFE
7669 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7670 error ("Invalid code: %d", code);
7671 charset = charset_big5;
7672 }
7673 c = DECODE_CHAR (charset, (unsigned )c);
7674 if (c < 0)
7675 error ("Invalid code: %d", code);
7676 return make_number (c);
7677 }
7678
7679 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7680 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7681 Return the corresponding character code in Big5. */)
7682 (ch)
7683 Lisp_Object ch;
7684 {
7685 Lisp_Object spec, attrs, charset_list;
7686 struct charset *charset;
7687 int c;
7688 unsigned code;
7689
7690 CHECK_CHARACTER (ch);
7691 c = XFASTINT (ch);
7692 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7693 attrs = AREF (spec, 0);
7694 if (ASCII_CHAR_P (c)
7695 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7696 return ch;
7697
7698 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7699 charset = char_charset (c, charset_list, &code);
7700 if (code == CHARSET_INVALID_CODE (charset))
7701 error ("Can't encode by Big5 encoding: %d", c);
7702
7703 return make_number (code);
7704 }
7705
7706 \f
7707 DEFUN ("set-terminal-coding-system-internal",
7708 Fset_terminal_coding_system_internal,
7709 Sset_terminal_coding_system_internal, 1, 1, 0,
7710 doc: /* Internal use only. */)
7711 (coding_system)
7712 Lisp_Object coding_system;
7713 {
7714 CHECK_SYMBOL (coding_system);
7715 setup_coding_system (Fcheck_coding_system (coding_system),
7716 &terminal_coding);
7717
7718 /* We had better not send unsafe characters to terminal. */
7719 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7720 /* Characer composition should be disabled. */
7721 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7722 terminal_coding.src_multibyte = 1;
7723 terminal_coding.dst_multibyte = 0;
7724 return Qnil;
7725 }
7726
7727 DEFUN ("set-safe-terminal-coding-system-internal",
7728 Fset_safe_terminal_coding_system_internal,
7729 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7730 doc: /* Internal use only. */)
7731 (coding_system)
7732 Lisp_Object coding_system;
7733 {
7734 CHECK_SYMBOL (coding_system);
7735 setup_coding_system (Fcheck_coding_system (coding_system),
7736 &safe_terminal_coding);
7737 /* Characer composition should be disabled. */
7738 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7739 safe_terminal_coding.src_multibyte = 1;
7740 safe_terminal_coding.dst_multibyte = 0;
7741 return Qnil;
7742 }
7743
7744 DEFUN ("terminal-coding-system",
7745 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7746 doc: /* Return coding system specified for terminal output. */)
7747 ()
7748 {
7749 return CODING_ID_NAME (terminal_coding.id);
7750 }
7751
7752 DEFUN ("set-keyboard-coding-system-internal",
7753 Fset_keyboard_coding_system_internal,
7754 Sset_keyboard_coding_system_internal, 1, 1, 0,
7755 doc: /* Internal use only. */)
7756 (coding_system)
7757 Lisp_Object coding_system;
7758 {
7759 CHECK_SYMBOL (coding_system);
7760 setup_coding_system (Fcheck_coding_system (coding_system),
7761 &keyboard_coding);
7762 /* Characer composition should be disabled. */
7763 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7764 return Qnil;
7765 }
7766
7767 DEFUN ("keyboard-coding-system",
7768 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7769 doc: /* Return coding system specified for decoding keyboard input. */)
7770 ()
7771 {
7772 return CODING_ID_NAME (keyboard_coding.id);
7773 }
7774
7775 \f
7776 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7777 Sfind_operation_coding_system, 1, MANY, 0,
7778 doc: /* Choose a coding system for an operation based on the target name.
7779 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7780 DECODING-SYSTEM is the coding system to use for decoding
7781 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7782 for encoding (in case OPERATION does encoding).
7783
7784 The first argument OPERATION specifies an I/O primitive:
7785 For file I/O, `insert-file-contents' or `write-region'.
7786 For process I/O, `call-process', `call-process-region', or `start-process'.
7787 For network I/O, `open-network-stream'.
7788
7789 The remaining arguments should be the same arguments that were passed
7790 to the primitive. Depending on which primitive, one of those arguments
7791 is selected as the TARGET. For example, if OPERATION does file I/O,
7792 whichever argument specifies the file name is TARGET.
7793
7794 TARGET has a meaning which depends on OPERATION:
7795 For file I/O, TARGET is a file name.
7796 For process I/O, TARGET is a process name.
7797 For network I/O, TARGET is a service name or a port number
7798
7799 This function looks up what specified for TARGET in,
7800 `file-coding-system-alist', `process-coding-system-alist',
7801 or `network-coding-system-alist' depending on OPERATION.
7802 They may specify a coding system, a cons of coding systems,
7803 or a function symbol to call.
7804 In the last case, we call the function with one argument,
7805 which is a list of all the arguments given to this function.
7806
7807 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7808 (nargs, args)
7809 int nargs;
7810 Lisp_Object *args;
7811 {
7812 Lisp_Object operation, target_idx, target, val;
7813 register Lisp_Object chain;
7814
7815 if (nargs < 2)
7816 error ("Too few arguments");
7817 operation = args[0];
7818 if (!SYMBOLP (operation)
7819 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7820 error ("Invalid first arguement");
7821 if (nargs < 1 + XINT (target_idx))
7822 error ("Too few arguments for operation: %s",
7823 SDATA (SYMBOL_NAME (operation)));
7824 target = args[XINT (target_idx) + 1];
7825 if (!(STRINGP (target)
7826 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7827 error ("Invalid %dth argument", XINT (target_idx) + 1);
7828
7829 chain = ((EQ (operation, Qinsert_file_contents)
7830 || EQ (operation, Qwrite_region))
7831 ? Vfile_coding_system_alist
7832 : (EQ (operation, Qopen_network_stream)
7833 ? Vnetwork_coding_system_alist
7834 : Vprocess_coding_system_alist));
7835 if (NILP (chain))
7836 return Qnil;
7837
7838 for (; CONSP (chain); chain = XCDR (chain))
7839 {
7840 Lisp_Object elt;
7841
7842 elt = XCAR (chain);
7843 if (CONSP (elt)
7844 && ((STRINGP (target)
7845 && STRINGP (XCAR (elt))
7846 && fast_string_match (XCAR (elt), target) >= 0)
7847 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7848 {
7849 val = XCDR (elt);
7850 /* Here, if VAL is both a valid coding system and a valid
7851 function symbol, we return VAL as a coding system. */
7852 if (CONSP (val))
7853 return val;
7854 if (! SYMBOLP (val))
7855 return Qnil;
7856 if (! NILP (Fcoding_system_p (val)))
7857 return Fcons (val, val);
7858 if (! NILP (Ffboundp (val)))
7859 {
7860 val = call1 (val, Flist (nargs, args));
7861 if (CONSP (val))
7862 return val;
7863 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7864 return Fcons (val, val);
7865 }
7866 return Qnil;
7867 }
7868 }
7869 return Qnil;
7870 }
7871
7872 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7873 Sset_coding_system_priority, 0, MANY, 0,
7874 doc: /* Assign higher priority to the coding systems given as arguments.
7875 If multiple coding systems belongs to the same category,
7876 all but the first one are ignored.
7877
7878 usage: (set-coding-system-priority ...) */)
7879 (nargs, args)
7880 int nargs;
7881 Lisp_Object *args;
7882 {
7883 int i, j;
7884 int changed[coding_category_max];
7885 enum coding_category priorities[coding_category_max];
7886
7887 bzero (changed, sizeof changed);
7888
7889 for (i = j = 0; i < nargs; i++)
7890 {
7891 enum coding_category category;
7892 Lisp_Object spec, attrs;
7893
7894 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7895 attrs = AREF (spec, 0);
7896 category = XINT (CODING_ATTR_CATEGORY (attrs));
7897 if (changed[category])
7898 /* Ignore this coding system because a coding system of the
7899 same category already had a higher priority. */
7900 continue;
7901 changed[category] = 1;
7902 priorities[j++] = category;
7903 if (coding_categories[category].id >= 0
7904 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7905 setup_coding_system (args[i], &coding_categories[category]);
7906 Fset (AREF (Vcoding_category_table, category), args[i]);
7907 }
7908
7909 /* Now we have decided top J priorities. Reflect the order of the
7910 original priorities to the remaining priorities. */
7911
7912 for (i = j, j = 0; i < coding_category_max; i++, j++)
7913 {
7914 while (j < coding_category_max
7915 && changed[coding_priorities[j]])
7916 j++;
7917 if (j == coding_category_max)
7918 abort ();
7919 priorities[i] = coding_priorities[j];
7920 }
7921
7922 bcopy (priorities, coding_priorities, sizeof priorities);
7923
7924 /* Update `coding-category-list'. */
7925 Vcoding_category_list = Qnil;
7926 for (i = coding_category_max - 1; i >= 0; i--)
7927 Vcoding_category_list
7928 = Fcons (AREF (Vcoding_category_table, priorities[i]),
7929 Vcoding_category_list);
7930
7931 return Qnil;
7932 }
7933
7934 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7935 Scoding_system_priority_list, 0, 1, 0,
7936 doc: /* Return a list of coding systems ordered by their priorities.
7937 HIGHESTP non-nil means just return the highest priority one. */)
7938 (highestp)
7939 Lisp_Object highestp;
7940 {
7941 int i;
7942 Lisp_Object val;
7943
7944 for (i = 0, val = Qnil; i < coding_category_max; i++)
7945 {
7946 enum coding_category category = coding_priorities[i];
7947 int id = coding_categories[category].id;
7948 Lisp_Object attrs;
7949
7950 if (id < 0)
7951 continue;
7952 attrs = CODING_ID_ATTRS (id);
7953 if (! NILP (highestp))
7954 return CODING_ATTR_BASE_NAME (attrs);
7955 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7956 }
7957 return Fnreverse (val);
7958 }
7959
7960 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7961
7962 static Lisp_Object
7963 make_subsidiaries (base)
7964 Lisp_Object base;
7965 {
7966 Lisp_Object subsidiaries;
7967 int base_name_len = SBYTES (SYMBOL_NAME (base));
7968 char *buf = (char *) alloca (base_name_len + 6);
7969 int i;
7970
7971 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
7972 subsidiaries = Fmake_vector (make_number (3), Qnil);
7973 for (i = 0; i < 3; i++)
7974 {
7975 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7976 ASET (subsidiaries, i, intern (buf));
7977 }
7978 return subsidiaries;
7979 }
7980
7981
7982 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7983 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7984 doc: /* For internal use only.
7985 usage: (define-coding-system-internal ...) */)
7986 (nargs, args)
7987 int nargs;
7988 Lisp_Object *args;
7989 {
7990 Lisp_Object name;
7991 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7992 Lisp_Object attrs; /* Vector of attributes. */
7993 Lisp_Object eol_type;
7994 Lisp_Object aliases;
7995 Lisp_Object coding_type, charset_list, safe_charsets;
7996 enum coding_category category;
7997 Lisp_Object tail, val;
7998 int max_charset_id = 0;
7999 int i;
8000
8001 if (nargs < coding_arg_max)
8002 goto short_args;
8003
8004 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8005
8006 name = args[coding_arg_name];
8007 CHECK_SYMBOL (name);
8008 CODING_ATTR_BASE_NAME (attrs) = name;
8009
8010 val = args[coding_arg_mnemonic];
8011 if (! STRINGP (val))
8012 CHECK_CHARACTER (val);
8013 CODING_ATTR_MNEMONIC (attrs) = val;
8014
8015 coding_type = args[coding_arg_coding_type];
8016 CHECK_SYMBOL (coding_type);
8017 CODING_ATTR_TYPE (attrs) = coding_type;
8018
8019 charset_list = args[coding_arg_charset_list];
8020 if (SYMBOLP (charset_list))
8021 {
8022 if (EQ (charset_list, Qiso_2022))
8023 {
8024 if (! EQ (coding_type, Qiso_2022))
8025 error ("Invalid charset-list");
8026 charset_list = Viso_2022_charset_list;
8027 }
8028 else if (EQ (charset_list, Qemacs_mule))
8029 {
8030 if (! EQ (coding_type, Qemacs_mule))
8031 error ("Invalid charset-list");
8032 charset_list = Vemacs_mule_charset_list;
8033 }
8034 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8035 if (max_charset_id < XFASTINT (XCAR (tail)))
8036 max_charset_id = XFASTINT (XCAR (tail));
8037 }
8038 else
8039 {
8040 charset_list = Fcopy_sequence (charset_list);
8041 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8042 {
8043 struct charset *charset;
8044
8045 val = Fcar (tail);
8046 CHECK_CHARSET_GET_CHARSET (val, charset);
8047 if (EQ (coding_type, Qiso_2022)
8048 ? CHARSET_ISO_FINAL (charset) < 0
8049 : EQ (coding_type, Qemacs_mule)
8050 ? CHARSET_EMACS_MULE_ID (charset) < 0
8051 : 0)
8052 error ("Can't handle charset `%s'",
8053 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8054
8055 XSETCAR (tail, make_number (charset->id));
8056 if (max_charset_id < charset->id)
8057 max_charset_id = charset->id;
8058 }
8059 }
8060 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8061
8062 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8063 make_number (255));
8064 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8065 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8066 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8067
8068 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8069
8070 val = args[coding_arg_decode_translation_table];
8071 if (! NILP (val))
8072 CHECK_CHAR_TABLE (val);
8073 CODING_ATTR_DECODE_TBL (attrs) = val;
8074
8075 val = args[coding_arg_encode_translation_table];
8076 if (! NILP (val))
8077 CHECK_CHAR_TABLE (val);
8078 CODING_ATTR_ENCODE_TBL (attrs) = val;
8079
8080 val = args[coding_arg_post_read_conversion];
8081 CHECK_SYMBOL (val);
8082 CODING_ATTR_POST_READ (attrs) = val;
8083
8084 val = args[coding_arg_pre_write_conversion];
8085 CHECK_SYMBOL (val);
8086 CODING_ATTR_PRE_WRITE (attrs) = val;
8087
8088 val = args[coding_arg_default_char];
8089 if (NILP (val))
8090 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8091 else
8092 {
8093 CHECK_CHARACTER (val);
8094 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8095 }
8096
8097 val = args[coding_arg_for_unibyte];
8098 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8099
8100 val = args[coding_arg_plist];
8101 CHECK_LIST (val);
8102 CODING_ATTR_PLIST (attrs) = val;
8103
8104 if (EQ (coding_type, Qcharset))
8105 {
8106 Lisp_Object list;
8107 /* Generate a lisp vector of 256 elements. Each element is nil,
8108 integer, or a list of charset IDs.
8109
8110 If Nth element is nil, the byte code N is invalid in this
8111 coding system.
8112
8113 If Nth element is a number NUM, N is the first byte of a
8114 charset whose ID is NUM.
8115
8116 If Nth element is a list of charset IDs, N is the first byte
8117 of one of them. The list is sorted by dimensions of the
8118 charsets. A charset of smaller dimension comes firtst.
8119 */
8120 for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
8121 {
8122 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8123
8124 if (charset->method == CHARSET_METHOD_SUPERSET)
8125 {
8126 val = CHARSET_SUPERSET (charset);
8127 for (; CONSP (val); val = XCDR (val))
8128 list = Fcons (XCAR (XCAR (val)), list);
8129 }
8130 else
8131 list = Fcons (XCAR (tail), list);
8132 }
8133
8134 val = Fmake_vector (make_number (256), Qnil);
8135
8136 for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
8137 {
8138 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8139 int dim = CHARSET_DIMENSION (charset);
8140 int idx = (dim - 1) * 4;
8141
8142 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8143 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8144
8145 for (i = charset->code_space[idx];
8146 i <= charset->code_space[idx + 1]; i++)
8147 {
8148 Lisp_Object tmp, tmp2;
8149 int dim2;
8150
8151 tmp = AREF (val, i);
8152 if (NILP (tmp))
8153 tmp = XCAR (tail);
8154 else if (NUMBERP (tmp))
8155 {
8156 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8157 if (dim < dim2)
8158 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8159 else
8160 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8161 }
8162 else
8163 {
8164 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8165 {
8166 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8167 if (dim < dim2)
8168 break;
8169 }
8170 if (NILP (tmp2))
8171 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8172 else
8173 {
8174 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8175 XSETCAR (tmp2, XCAR (tail));
8176 }
8177 }
8178 ASET (val, i, tmp);
8179 }
8180 }
8181 ASET (attrs, coding_attr_charset_valids, val);
8182 category = coding_category_charset;
8183 }
8184 else if (EQ (coding_type, Qccl))
8185 {
8186 Lisp_Object valids;
8187
8188 if (nargs < coding_arg_ccl_max)
8189 goto short_args;
8190
8191 val = args[coding_arg_ccl_decoder];
8192 CHECK_CCL_PROGRAM (val);
8193 if (VECTORP (val))
8194 val = Fcopy_sequence (val);
8195 ASET (attrs, coding_attr_ccl_decoder, val);
8196
8197 val = args[coding_arg_ccl_encoder];
8198 CHECK_CCL_PROGRAM (val);
8199 if (VECTORP (val))
8200 val = Fcopy_sequence (val);
8201 ASET (attrs, coding_attr_ccl_encoder, val);
8202
8203 val = args[coding_arg_ccl_valids];
8204 valids = Fmake_string (make_number (256), make_number (0));
8205 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8206 {
8207 int from, to;
8208
8209 val = Fcar (tail);
8210 if (INTEGERP (val))
8211 {
8212 from = to = XINT (val);
8213 if (from < 0 || from > 255)
8214 args_out_of_range_3 (val, make_number (0), make_number (255));
8215 }
8216 else
8217 {
8218 CHECK_CONS (val);
8219 CHECK_NATNUM_CAR (val);
8220 CHECK_NATNUM_CDR (val);
8221 from = XINT (XCAR (val));
8222 if (from > 255)
8223 args_out_of_range_3 (XCAR (val),
8224 make_number (0), make_number (255));
8225 to = XINT (XCDR (val));
8226 if (to < from || to > 255)
8227 args_out_of_range_3 (XCDR (val),
8228 XCAR (val), make_number (255));
8229 }
8230 for (i = from; i <= to; i++)
8231 SSET (valids, i, 1);
8232 }
8233 ASET (attrs, coding_attr_ccl_valids, valids);
8234
8235 category = coding_category_ccl;
8236 }
8237 else if (EQ (coding_type, Qutf_16))
8238 {
8239 Lisp_Object bom, endian;
8240
8241 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8242
8243 if (nargs < coding_arg_utf16_max)
8244 goto short_args;
8245
8246 bom = args[coding_arg_utf16_bom];
8247 if (! NILP (bom) && ! EQ (bom, Qt))
8248 {
8249 CHECK_CONS (bom);
8250 val = XCAR (bom);
8251 CHECK_CODING_SYSTEM (val);
8252 val = XCDR (bom);
8253 CHECK_CODING_SYSTEM (val);
8254 }
8255 ASET (attrs, coding_attr_utf_16_bom, bom);
8256
8257 endian = args[coding_arg_utf16_endian];
8258 CHECK_SYMBOL (endian);
8259 if (NILP (endian))
8260 endian = Qbig;
8261 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8262 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8263 ASET (attrs, coding_attr_utf_16_endian, endian);
8264
8265 category = (CONSP (bom)
8266 ? coding_category_utf_16_auto
8267 : NILP (bom)
8268 ? (EQ (endian, Qbig)
8269 ? coding_category_utf_16_be_nosig
8270 : coding_category_utf_16_le_nosig)
8271 : (EQ (endian, Qbig)
8272 ? coding_category_utf_16_be
8273 : coding_category_utf_16_le));
8274 }
8275 else if (EQ (coding_type, Qiso_2022))
8276 {
8277 Lisp_Object initial, reg_usage, request, flags;
8278 int i;
8279
8280 if (nargs < coding_arg_iso2022_max)
8281 goto short_args;
8282
8283 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8284 CHECK_VECTOR (initial);
8285 for (i = 0; i < 4; i++)
8286 {
8287 val = Faref (initial, make_number (i));
8288 if (! NILP (val))
8289 {
8290 struct charset *charset;
8291
8292 CHECK_CHARSET_GET_CHARSET (val, charset);
8293 ASET (initial, i, make_number (CHARSET_ID (charset)));
8294 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8295 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8296 }
8297 else
8298 ASET (initial, i, make_number (-1));
8299 }
8300
8301 reg_usage = args[coding_arg_iso2022_reg_usage];
8302 CHECK_CONS (reg_usage);
8303 CHECK_NUMBER_CAR (reg_usage);
8304 CHECK_NUMBER_CDR (reg_usage);
8305
8306 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8307 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8308 {
8309 int id;
8310 Lisp_Object tmp;
8311
8312 val = Fcar (tail);
8313 CHECK_CONS (val);
8314 tmp = XCAR (val);
8315 CHECK_CHARSET_GET_ID (tmp, id);
8316 CHECK_NATNUM_CDR (val);
8317 if (XINT (XCDR (val)) >= 4)
8318 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8319 XSETCAR (val, make_number (id));
8320 }
8321
8322 flags = args[coding_arg_iso2022_flags];
8323 CHECK_NATNUM (flags);
8324 i = XINT (flags);
8325 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8326 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8327
8328 ASET (attrs, coding_attr_iso_initial, initial);
8329 ASET (attrs, coding_attr_iso_usage, reg_usage);
8330 ASET (attrs, coding_attr_iso_request, request);
8331 ASET (attrs, coding_attr_iso_flags, flags);
8332 setup_iso_safe_charsets (attrs);
8333
8334 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8335 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8336 | CODING_ISO_FLAG_SINGLE_SHIFT))
8337 ? coding_category_iso_7_else
8338 : EQ (args[coding_arg_charset_list], Qiso_2022)
8339 ? coding_category_iso_7
8340 : coding_category_iso_7_tight);
8341 else
8342 {
8343 int id = XINT (AREF (initial, 1));
8344
8345 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8346 || EQ (args[coding_arg_charset_list], Qiso_2022)
8347 || id < 0)
8348 ? coding_category_iso_8_else
8349 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8350 ? coding_category_iso_8_1
8351 : coding_category_iso_8_2);
8352 }
8353 if (category != coding_category_iso_8_1
8354 && category != coding_category_iso_8_2)
8355 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8356 }
8357 else if (EQ (coding_type, Qemacs_mule))
8358 {
8359 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8360 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8361 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8362 category = coding_category_emacs_mule;
8363 }
8364 else if (EQ (coding_type, Qshift_jis))
8365 {
8366
8367 struct charset *charset;
8368
8369 if (XINT (Flength (charset_list)) != 3)
8370 error ("There should be just three charsets");
8371
8372 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8373 if (CHARSET_DIMENSION (charset) != 1)
8374 error ("Dimension of charset %s is not one",
8375 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8376 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8377 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8378
8379 charset_list = XCDR (charset_list);
8380 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8381 if (CHARSET_DIMENSION (charset) != 1)
8382 error ("Dimension of charset %s is not one",
8383 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8384
8385 charset_list = XCDR (charset_list);
8386 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8387 if (CHARSET_DIMENSION (charset) != 2)
8388 error ("Dimension of charset %s is not two",
8389 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8390
8391 category = coding_category_sjis;
8392 Vsjis_coding_system = name;
8393 }
8394 else if (EQ (coding_type, Qbig5))
8395 {
8396 struct charset *charset;
8397
8398 if (XINT (Flength (charset_list)) != 2)
8399 error ("There should be just two charsets");
8400
8401 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8402 if (CHARSET_DIMENSION (charset) != 1)
8403 error ("Dimension of charset %s is not one",
8404 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8405 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8406 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8407
8408 charset_list = XCDR (charset_list);
8409 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8410 if (CHARSET_DIMENSION (charset) != 2)
8411 error ("Dimension of charset %s is not two",
8412 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8413
8414 category = coding_category_big5;
8415 Vbig5_coding_system = name;
8416 }
8417 else if (EQ (coding_type, Qraw_text))
8418 {
8419 category = coding_category_raw_text;
8420 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8421 }
8422 else if (EQ (coding_type, Qutf_8))
8423 {
8424 category = coding_category_utf_8;
8425 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8426 }
8427 else if (EQ (coding_type, Qundecided))
8428 category = coding_category_undecided;
8429 else
8430 error ("Invalid coding system type: %s",
8431 SDATA (SYMBOL_NAME (coding_type)));
8432
8433 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8434 CODING_ATTR_PLIST (attrs)
8435 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8436 CODING_ATTR_PLIST (attrs)));
8437
8438 eol_type = args[coding_arg_eol_type];
8439 if (! NILP (eol_type)
8440 && ! EQ (eol_type, Qunix)
8441 && ! EQ (eol_type, Qdos)
8442 && ! EQ (eol_type, Qmac))
8443 error ("Invalid eol-type");
8444
8445 aliases = Fcons (name, Qnil);
8446
8447 if (NILP (eol_type))
8448 {
8449 eol_type = make_subsidiaries (name);
8450 for (i = 0; i < 3; i++)
8451 {
8452 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8453
8454 this_name = AREF (eol_type, i);
8455 this_aliases = Fcons (this_name, Qnil);
8456 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8457 this_spec = Fmake_vector (make_number (3), attrs);
8458 ASET (this_spec, 1, this_aliases);
8459 ASET (this_spec, 2, this_eol_type);
8460 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8461 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8462 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8463 Vcoding_system_alist);
8464 }
8465 }
8466
8467 spec_vec = Fmake_vector (make_number (3), attrs);
8468 ASET (spec_vec, 1, aliases);
8469 ASET (spec_vec, 2, eol_type);
8470
8471 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8472 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8473 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8474 Vcoding_system_alist);
8475
8476 {
8477 int id = coding_categories[category].id;
8478
8479 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8480 setup_coding_system (name, &coding_categories[category]);
8481 }
8482
8483 return Qnil;
8484
8485 short_args:
8486 return Fsignal (Qwrong_number_of_arguments,
8487 Fcons (intern ("define-coding-system-internal"),
8488 make_number (nargs)));
8489 }
8490
8491 /* Fixme: should this record the alias relationships for
8492 diagnostics? Should it update coding-system-list? */
8493 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8494 Sdefine_coding_system_alias, 2, 2, 0,
8495 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8496 (alias, coding_system)
8497 Lisp_Object alias, coding_system;
8498 {
8499 Lisp_Object spec, aliases, eol_type;
8500
8501 CHECK_SYMBOL (alias);
8502 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8503 aliases = AREF (spec, 1);
8504 while (!NILP (XCDR (aliases)))
8505 aliases = XCDR (aliases);
8506 XSETCDR (aliases, Fcons (alias, Qnil));
8507
8508 eol_type = AREF (spec, 2);
8509 if (VECTORP (eol_type))
8510 {
8511 Lisp_Object subsidiaries;
8512 int i;
8513
8514 subsidiaries = make_subsidiaries (alias);
8515 for (i = 0; i < 3; i++)
8516 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8517 AREF (eol_type, i));
8518
8519 ASET (spec, 2, subsidiaries);
8520 }
8521
8522 Fputhash (alias, spec, Vcoding_system_hash_table);
8523 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8524 Vcoding_system_alist);
8525
8526 return Qnil;
8527 }
8528
8529 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8530 1, 1, 0,
8531 doc: /* Return the base of CODING-SYSTEM.
8532 Any alias or subsidiary coding system is not a base coding system. */)
8533 (coding_system)
8534 Lisp_Object coding_system;
8535 {
8536 Lisp_Object spec, attrs;
8537
8538 if (NILP (coding_system))
8539 return (Qno_conversion);
8540 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8541 attrs = AREF (spec, 0);
8542 return CODING_ATTR_BASE_NAME (attrs);
8543 }
8544
8545 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8546 1, 1, 0,
8547 doc: "Return the property list of CODING-SYSTEM.")
8548 (coding_system)
8549 Lisp_Object coding_system;
8550 {
8551 Lisp_Object spec, attrs;
8552
8553 if (NILP (coding_system))
8554 coding_system = Qno_conversion;
8555 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8556 attrs = AREF (spec, 0);
8557 return CODING_ATTR_PLIST (attrs);
8558 }
8559
8560
8561 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8562 1, 1, 0,
8563 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8564 (coding_system)
8565 Lisp_Object coding_system;
8566 {
8567 Lisp_Object spec;
8568
8569 if (NILP (coding_system))
8570 coding_system = Qno_conversion;
8571 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8572 return AREF (spec, 1);
8573 }
8574
8575 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8576 Scoding_system_eol_type, 1, 1, 0,
8577 doc: /* Return eol-type of CODING-SYSTEM.
8578 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8579
8580 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8581 and CR respectively.
8582
8583 A vector value indicates that a format of end-of-line should be
8584 detected automatically. Nth element of the vector is the subsidiary
8585 coding system whose eol-type is N. */)
8586 (coding_system)
8587 Lisp_Object coding_system;
8588 {
8589 Lisp_Object spec, eol_type;
8590 int n;
8591
8592 if (NILP (coding_system))
8593 coding_system = Qno_conversion;
8594 if (! CODING_SYSTEM_P (coding_system))
8595 return Qnil;
8596 spec = CODING_SYSTEM_SPEC (coding_system);
8597 eol_type = AREF (spec, 2);
8598 if (VECTORP (eol_type))
8599 return Fcopy_sequence (eol_type);
8600 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8601 return make_number (n);
8602 }
8603
8604 #endif /* emacs */
8605
8606 \f
8607 /*** 9. Post-amble ***/
8608
8609 void
8610 init_coding_once ()
8611 {
8612 int i;
8613
8614 for (i = 0; i < coding_category_max; i++)
8615 {
8616 coding_categories[i].id = -1;
8617 coding_priorities[i] = i;
8618 }
8619
8620 /* ISO2022 specific initialize routine. */
8621 for (i = 0; i < 0x20; i++)
8622 iso_code_class[i] = ISO_control_0;
8623 for (i = 0x21; i < 0x7F; i++)
8624 iso_code_class[i] = ISO_graphic_plane_0;
8625 for (i = 0x80; i < 0xA0; i++)
8626 iso_code_class[i] = ISO_control_1;
8627 for (i = 0xA1; i < 0xFF; i++)
8628 iso_code_class[i] = ISO_graphic_plane_1;
8629 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8630 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8631 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8632 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8633 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8634 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8635 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8636 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8637 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8638 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8639
8640 inhibit_pre_post_conversion = 0;
8641
8642 for (i = 0; i < 256; i++)
8643 {
8644 emacs_mule_bytes[i] = 1;
8645 }
8646 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8647 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8648 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8649 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8650 }
8651
8652 #ifdef emacs
8653
8654 void
8655 syms_of_coding ()
8656 {
8657 staticpro (&Vcoding_system_hash_table);
8658 {
8659 Lisp_Object args[2];
8660 args[0] = QCtest;
8661 args[1] = Qeq;
8662 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8663 }
8664
8665 staticpro (&Vsjis_coding_system);
8666 Vsjis_coding_system = Qnil;
8667
8668 staticpro (&Vbig5_coding_system);
8669 Vbig5_coding_system = Qnil;
8670
8671 staticpro (&Vcode_conversion_work_buf_list);
8672 Vcode_conversion_work_buf_list = Qnil;
8673
8674 staticpro (&Vcode_conversion_reused_work_buf);
8675 Vcode_conversion_reused_work_buf = Qnil;
8676
8677 DEFSYM (Qcharset, "charset");
8678 DEFSYM (Qtarget_idx, "target-idx");
8679 DEFSYM (Qcoding_system_history, "coding-system-history");
8680 Fset (Qcoding_system_history, Qnil);
8681
8682 /* Target FILENAME is the first argument. */
8683 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8684 /* Target FILENAME is the third argument. */
8685 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8686
8687 DEFSYM (Qcall_process, "call-process");
8688 /* Target PROGRAM is the first argument. */
8689 Fput (Qcall_process, Qtarget_idx, make_number (0));
8690
8691 DEFSYM (Qcall_process_region, "call-process-region");
8692 /* Target PROGRAM is the third argument. */
8693 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8694
8695 DEFSYM (Qstart_process, "start-process");
8696 /* Target PROGRAM is the third argument. */
8697 Fput (Qstart_process, Qtarget_idx, make_number (2));
8698
8699 DEFSYM (Qopen_network_stream, "open-network-stream");
8700 /* Target SERVICE is the fourth argument. */
8701 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8702
8703 DEFSYM (Qcoding_system, "coding-system");
8704 DEFSYM (Qcoding_aliases, "coding-aliases");
8705
8706 DEFSYM (Qeol_type, "eol-type");
8707 DEFSYM (Qunix, "unix");
8708 DEFSYM (Qdos, "dos");
8709
8710 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8711 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8712 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8713 DEFSYM (Qdefault_char, "default-char");
8714 DEFSYM (Qundecided, "undecided");
8715 DEFSYM (Qno_conversion, "no-conversion");
8716 DEFSYM (Qraw_text, "raw-text");
8717
8718 DEFSYM (Qiso_2022, "iso-2022");
8719
8720 DEFSYM (Qutf_8, "utf-8");
8721 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8722
8723 DEFSYM (Qutf_16, "utf-16");
8724 DEFSYM (Qbig, "big");
8725 DEFSYM (Qlittle, "little");
8726
8727 DEFSYM (Qshift_jis, "shift-jis");
8728 DEFSYM (Qbig5, "big5");
8729
8730 DEFSYM (Qcoding_system_p, "coding-system-p");
8731
8732 DEFSYM (Qcoding_system_error, "coding-system-error");
8733 Fput (Qcoding_system_error, Qerror_conditions,
8734 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8735 Fput (Qcoding_system_error, Qerror_message,
8736 build_string ("Invalid coding system"));
8737
8738 /* Intern this now in case it isn't already done.
8739 Setting this variable twice is harmless.
8740 But don't staticpro it here--that is done in alloc.c. */
8741 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8742
8743 DEFSYM (Qtranslation_table, "translation-table");
8744 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8745 DEFSYM (Qtranslation_table_id, "translation-table-id");
8746 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8747 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8748
8749 DEFSYM (Qvalid_codes, "valid-codes");
8750
8751 DEFSYM (Qemacs_mule, "emacs-mule");
8752
8753 DEFSYM (QCcategory, ":category");
8754
8755 Vcoding_category_table
8756 = Fmake_vector (make_number (coding_category_max), Qnil);
8757 staticpro (&Vcoding_category_table);
8758 /* Followings are target of code detection. */
8759 ASET (Vcoding_category_table, coding_category_iso_7,
8760 intern ("coding-category-iso-7"));
8761 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8762 intern ("coding-category-iso-7-tight"));
8763 ASET (Vcoding_category_table, coding_category_iso_8_1,
8764 intern ("coding-category-iso-8-1"));
8765 ASET (Vcoding_category_table, coding_category_iso_8_2,
8766 intern ("coding-category-iso-8-2"));
8767 ASET (Vcoding_category_table, coding_category_iso_7_else,
8768 intern ("coding-category-iso-7-else"));
8769 ASET (Vcoding_category_table, coding_category_iso_8_else,
8770 intern ("coding-category-iso-8-else"));
8771 ASET (Vcoding_category_table, coding_category_utf_8,
8772 intern ("coding-category-utf-8"));
8773 ASET (Vcoding_category_table, coding_category_utf_16_be,
8774 intern ("coding-category-utf-16-be"));
8775 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8776 intern ("coding-category-utf-16-auto"));
8777 ASET (Vcoding_category_table, coding_category_utf_16_le,
8778 intern ("coding-category-utf-16-le"));
8779 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8780 intern ("coding-category-utf-16-be-nosig"));
8781 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8782 intern ("coding-category-utf-16-le-nosig"));
8783 ASET (Vcoding_category_table, coding_category_charset,
8784 intern ("coding-category-charset"));
8785 ASET (Vcoding_category_table, coding_category_sjis,
8786 intern ("coding-category-sjis"));
8787 ASET (Vcoding_category_table, coding_category_big5,
8788 intern ("coding-category-big5"));
8789 ASET (Vcoding_category_table, coding_category_ccl,
8790 intern ("coding-category-ccl"));
8791 ASET (Vcoding_category_table, coding_category_emacs_mule,
8792 intern ("coding-category-emacs-mule"));
8793 /* Followings are NOT target of code detection. */
8794 ASET (Vcoding_category_table, coding_category_raw_text,
8795 intern ("coding-category-raw-text"));
8796 ASET (Vcoding_category_table, coding_category_undecided,
8797 intern ("coding-category-undecided"));
8798
8799 defsubr (&Scoding_system_p);
8800 defsubr (&Sread_coding_system);
8801 defsubr (&Sread_non_nil_coding_system);
8802 defsubr (&Scheck_coding_system);
8803 defsubr (&Sdetect_coding_region);
8804 defsubr (&Sdetect_coding_string);
8805 defsubr (&Sfind_coding_systems_region_internal);
8806 defsubr (&Sunencodable_char_position);
8807 defsubr (&Scheck_coding_systems_region);
8808 defsubr (&Sdecode_coding_region);
8809 defsubr (&Sencode_coding_region);
8810 defsubr (&Sdecode_coding_string);
8811 defsubr (&Sencode_coding_string);
8812 defsubr (&Sdecode_sjis_char);
8813 defsubr (&Sencode_sjis_char);
8814 defsubr (&Sdecode_big5_char);
8815 defsubr (&Sencode_big5_char);
8816 defsubr (&Sset_terminal_coding_system_internal);
8817 defsubr (&Sset_safe_terminal_coding_system_internal);
8818 defsubr (&Sterminal_coding_system);
8819 defsubr (&Sset_keyboard_coding_system_internal);
8820 defsubr (&Skeyboard_coding_system);
8821 defsubr (&Sfind_operation_coding_system);
8822 defsubr (&Sset_coding_system_priority);
8823 defsubr (&Sdefine_coding_system_internal);
8824 defsubr (&Sdefine_coding_system_alias);
8825 defsubr (&Scoding_system_base);
8826 defsubr (&Scoding_system_plist);
8827 defsubr (&Scoding_system_aliases);
8828 defsubr (&Scoding_system_eol_type);
8829 defsubr (&Scoding_system_priority_list);
8830
8831 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8832 doc: /* List of coding systems.
8833
8834 Do not alter the value of this variable manually. This variable should be
8835 updated by the functions `define-coding-system' and
8836 `define-coding-system-alias'. */);
8837 Vcoding_system_list = Qnil;
8838
8839 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8840 doc: /* Alist of coding system names.
8841 Each element is one element list of coding system name.
8842 This variable is given to `completing-read' as TABLE argument.
8843
8844 Do not alter the value of this variable manually. This variable should be
8845 updated by the functions `make-coding-system' and
8846 `define-coding-system-alias'. */);
8847 Vcoding_system_alist = Qnil;
8848
8849 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8850 doc: /* List of coding-categories (symbols) ordered by priority.
8851
8852 On detecting a coding system, Emacs tries code detection algorithms
8853 associated with each coding-category one by one in this order. When
8854 one algorithm agrees with a byte sequence of source text, the coding
8855 system bound to the corresponding coding-category is selected. */);
8856 {
8857 int i;
8858
8859 Vcoding_category_list = Qnil;
8860 for (i = coding_category_max - 1; i >= 0; i--)
8861 Vcoding_category_list
8862 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8863 Vcoding_category_list);
8864 }
8865
8866 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8867 doc: /* Specify the coding system for read operations.
8868 It is useful to bind this variable with `let', but do not set it globally.
8869 If the value is a coding system, it is used for decoding on read operation.
8870 If not, an appropriate element is used from one of the coding system alists:
8871 There are three such tables, `file-coding-system-alist',
8872 `process-coding-system-alist', and `network-coding-system-alist'. */);
8873 Vcoding_system_for_read = Qnil;
8874
8875 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8876 doc: /* Specify the coding system for write operations.
8877 Programs bind this variable with `let', but you should not set it globally.
8878 If the value is a coding system, it is used for encoding of output,
8879 when writing it to a file and when sending it to a file or subprocess.
8880
8881 If this does not specify a coding system, an appropriate element
8882 is used from one of the coding system alists:
8883 There are three such tables, `file-coding-system-alist',
8884 `process-coding-system-alist', and `network-coding-system-alist'.
8885 For output to files, if the above procedure does not specify a coding system,
8886 the value of `buffer-file-coding-system' is used. */);
8887 Vcoding_system_for_write = Qnil;
8888
8889 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8890 doc: /*
8891 Coding system used in the latest file or process I/O. */);
8892 Vlast_coding_system_used = Qnil;
8893
8894 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8895 doc: /*
8896 *Non-nil means always inhibit code conversion of end-of-line format.
8897 See info node `Coding Systems' and info node `Text and Binary' concerning
8898 such conversion. */);
8899 inhibit_eol_conversion = 0;
8900
8901 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8902 doc: /*
8903 Non-nil means process buffer inherits coding system of process output.
8904 Bind it to t if the process output is to be treated as if it were a file
8905 read from some filesystem. */);
8906 inherit_process_coding_system = 0;
8907
8908 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8909 doc: /*
8910 Alist to decide a coding system to use for a file I/O operation.
8911 The format is ((PATTERN . VAL) ...),
8912 where PATTERN is a regular expression matching a file name,
8913 VAL is a coding system, a cons of coding systems, or a function symbol.
8914 If VAL is a coding system, it is used for both decoding and encoding
8915 the file contents.
8916 If VAL is a cons of coding systems, the car part is used for decoding,
8917 and the cdr part is used for encoding.
8918 If VAL is a function symbol, the function must return a coding system
8919 or a cons of coding systems which are used as above. The function gets
8920 the arguments with which `find-operation-coding-systems' was called.
8921
8922 See also the function `find-operation-coding-system'
8923 and the variable `auto-coding-alist'. */);
8924 Vfile_coding_system_alist = Qnil;
8925
8926 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8927 doc: /*
8928 Alist to decide a coding system to use for a process I/O operation.
8929 The format is ((PATTERN . VAL) ...),
8930 where PATTERN is a regular expression matching a program name,
8931 VAL is a coding system, a cons of coding systems, or a function symbol.
8932 If VAL is a coding system, it is used for both decoding what received
8933 from the program and encoding what sent to the program.
8934 If VAL is a cons of coding systems, the car part is used for decoding,
8935 and the cdr part is used for encoding.
8936 If VAL is a function symbol, the function must return a coding system
8937 or a cons of coding systems which are used as above.
8938
8939 See also the function `find-operation-coding-system'. */);
8940 Vprocess_coding_system_alist = Qnil;
8941
8942 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8943 doc: /*
8944 Alist to decide a coding system to use for a network I/O operation.
8945 The format is ((PATTERN . VAL) ...),
8946 where PATTERN is a regular expression matching a network service name
8947 or is a port number to connect to,
8948 VAL is a coding system, a cons of coding systems, or a function symbol.
8949 If VAL is a coding system, it is used for both decoding what received
8950 from the network stream and encoding what sent to the network stream.
8951 If VAL is a cons of coding systems, the car part is used for decoding,
8952 and the cdr part is used for encoding.
8953 If VAL is a function symbol, the function must return a coding system
8954 or a cons of coding systems which are used as above.
8955
8956 See also the function `find-operation-coding-system'. */);
8957 Vnetwork_coding_system_alist = Qnil;
8958
8959 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8960 doc: /* Coding system to use with system messages.
8961 Also used for decoding keyboard input on X Window system. */);
8962 Vlocale_coding_system = Qnil;
8963
8964 /* The eol mnemonics are reset in startup.el system-dependently. */
8965 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8966 doc: /*
8967 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8968 eol_mnemonic_unix = build_string (":");
8969
8970 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8971 doc: /*
8972 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8973 eol_mnemonic_dos = build_string ("\\");
8974
8975 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8976 doc: /*
8977 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8978 eol_mnemonic_mac = build_string ("/");
8979
8980 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8981 doc: /*
8982 *String displayed in mode line when end-of-line format is not yet determined. */);
8983 eol_mnemonic_undecided = build_string (":");
8984
8985 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8986 doc: /*
8987 *Non-nil enables character translation while encoding and decoding. */);
8988 Venable_character_translation = Qt;
8989
8990 DEFVAR_LISP ("standard-translation-table-for-decode",
8991 &Vstandard_translation_table_for_decode,
8992 doc: /* Table for translating characters while decoding. */);
8993 Vstandard_translation_table_for_decode = Qnil;
8994
8995 DEFVAR_LISP ("standard-translation-table-for-encode",
8996 &Vstandard_translation_table_for_encode,
8997 doc: /* Table for translating characters while encoding. */);
8998 Vstandard_translation_table_for_encode = Qnil;
8999
9000 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9001 doc: /* Alist of charsets vs revision numbers.
9002 While encoding, if a charset (car part of an element) is found,
9003 designate it with the escape sequence identifying revision (cdr part
9004 of the element). */);
9005 Vcharset_revision_table = Qnil;
9006
9007 DEFVAR_LISP ("default-process-coding-system",
9008 &Vdefault_process_coding_system,
9009 doc: /* Cons of coding systems used for process I/O by default.
9010 The car part is used for decoding a process output,
9011 the cdr part is used for encoding a text to be sent to a process. */);
9012 Vdefault_process_coding_system = Qnil;
9013
9014 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9015 doc: /*
9016 Table of extra Latin codes in the range 128..159 (inclusive).
9017 This is a vector of length 256.
9018 If Nth element is non-nil, the existence of code N in a file
9019 \(or output of subprocess) doesn't prevent it to be detected as
9020 a coding system of ISO 2022 variant which has a flag
9021 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9022 or reading output of a subprocess.
9023 Only 128th through 159th elements has a meaning. */);
9024 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9025
9026 DEFVAR_LISP ("select-safe-coding-system-function",
9027 &Vselect_safe_coding_system_function,
9028 doc: /*
9029 Function to call to select safe coding system for encoding a text.
9030
9031 If set, this function is called to force a user to select a proper
9032 coding system which can encode the text in the case that a default
9033 coding system used in each operation can't encode the text.
9034
9035 The default value is `select-safe-coding-system' (which see). */);
9036 Vselect_safe_coding_system_function = Qnil;
9037
9038 DEFVAR_BOOL ("coding-system-require-warning",
9039 &coding_system_require_warning,
9040 doc: /* Internal use only.
9041 If non-nil, on writing a file, `select-safe-coding-system-function' is
9042 called even if `coding-system-for-write' is non-nil. The command
9043 `universal-coding-system-argument' binds this variable to t temporarily. */);
9044 coding_system_require_warning = 0;
9045
9046
9047 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9048 &inhibit_iso_escape_detection,
9049 doc: /*
9050 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9051
9052 By default, on reading a file, Emacs tries to detect how the text is
9053 encoded. This code detection is sensitive to escape sequences. If
9054 the sequence is valid as ISO2022, the code is determined as one of
9055 the ISO2022 encodings, and the file is decoded by the corresponding
9056 coding system (e.g. `iso-2022-7bit').
9057
9058 However, there may be a case that you want to read escape sequences in
9059 a file as is. In such a case, you can set this variable to non-nil.
9060 Then, as the code detection ignores any escape sequences, no file is
9061 detected as encoded in some ISO2022 encoding. The result is that all
9062 escape sequences become visible in a buffer.
9063
9064 The default value is nil, and it is strongly recommended not to change
9065 it. That is because many Emacs Lisp source files that contain
9066 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9067 in Emacs's distribution, and they won't be decoded correctly on
9068 reading if you suppress escape sequence detection.
9069
9070 The other way to read escape sequences in a file without decoding is
9071 to explicitly specify some coding system that doesn't use ISO2022's
9072 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9073 inhibit_iso_escape_detection = 0;
9074
9075 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9076 doc: /* Char table for translating self-inserting characters.
9077 This is applied to the result of input methods, not their input. See also
9078 `keyboard-translate-table'. */);
9079 Vtranslation_table_for_input = Qnil;
9080
9081 {
9082 Lisp_Object args[coding_arg_max];
9083 Lisp_Object plist[16];
9084 int i;
9085
9086 for (i = 0; i < coding_arg_max; i++)
9087 args[i] = Qnil;
9088
9089 plist[0] = intern (":name");
9090 plist[1] = args[coding_arg_name] = Qno_conversion;
9091 plist[2] = intern (":mnemonic");
9092 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9093 plist[4] = intern (":coding-type");
9094 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9095 plist[6] = intern (":ascii-compatible-p");
9096 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9097 plist[8] = intern (":default-char");
9098 plist[9] = args[coding_arg_default_char] = make_number (0);
9099 plist[10] = intern (":for-unibyte");
9100 plist[11] = args[coding_arg_for_unibyte] = Qt;
9101 plist[12] = intern (":docstring");
9102 plist[13] = build_string ("Do no conversion.\n\
9103 \n\
9104 When you visit a file with this coding, the file is read into a\n\
9105 unibyte buffer as is, thus each byte of a file is treated as a\n\
9106 character.");
9107 plist[14] = intern (":eol-type");
9108 plist[15] = args[coding_arg_eol_type] = Qunix;
9109 args[coding_arg_plist] = Flist (16, plist);
9110 Fdefine_coding_system_internal (coding_arg_max, args);
9111 }
9112
9113 setup_coding_system (Qno_conversion, &keyboard_coding);
9114 setup_coding_system (Qno_conversion, &terminal_coding);
9115 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9116
9117 {
9118 int i;
9119
9120 for (i = 0; i < coding_category_max; i++)
9121 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9122 }
9123 }
9124
9125 char *
9126 emacs_strerror (error_number)
9127 int error_number;
9128 {
9129 char *str;
9130
9131 synchronize_system_messages_locale ();
9132 str = strerror (error_number);
9133
9134 if (! NILP (Vlocale_coding_system))
9135 {
9136 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9137 Vlocale_coding_system,
9138 0);
9139 str = (char *) SDATA (dec);
9140 }
9141
9142 return str;
9143 }
9144
9145 #endif /* emacs */