(get_translation_table): New arg max_lookup. Caller changed.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf + coding->charbuf_used;
216 int *charbuf_end = coding->charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
317
318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
320 Lisp_Object Qstart_process, Qopen_network_stream;
321 Lisp_Object Qtarget_idx;
322
323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
324 Lisp_Object Qinterrupted, Qinsufficient_memory;
325
326 int coding_system_require_warning;
327
328 Lisp_Object Vselect_safe_coding_system_function;
329
330 /* Mnemonic string for each format of end-of-line. */
331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
332 /* Mnemonic string to indicate format of end-of-line is not yet
333 decided. */
334 Lisp_Object eol_mnemonic_undecided;
335
336 #ifdef emacs
337
338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
339
340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
341
342 /* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344 Lisp_Object Qemacs_mule, Qraw_text;
345 Lisp_Object Qutf_8_emacs;
346
347 /* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */
349 /* Coding-system for reading files and receiving data from process. */
350 Lisp_Object Vcoding_system_for_read;
351 /* Coding-system for writing files and sending data to process. */
352 Lisp_Object Vcoding_system_for_write;
353 /* Coding-system actually used in the latest I/O. */
354 Lisp_Object Vlast_coding_system_used;
355 /* Set to non-nil when an error is detected while code conversion. */
356 Lisp_Object Vlast_code_conversion_error;
357 /* A vector of length 256 which contains information about special
358 Latin codes (especially for dealing with Microsoft codes). */
359 Lisp_Object Vlatin_extra_code_table;
360
361 /* Flag to inhibit code conversion of end-of-line format. */
362 int inhibit_eol_conversion;
363
364 /* Flag to inhibit ISO2022 escape sequence detection. */
365 int inhibit_iso_escape_detection;
366
367 /* Flag to make buffer-file-coding-system inherit from process-coding. */
368 int inherit_process_coding_system;
369
370 /* Coding system to be used to encode text for terminal display. */
371 struct coding_system terminal_coding;
372
373 /* Coding system to be used to encode text for terminal display when
374 terminal coding system is nil. */
375 struct coding_system safe_terminal_coding;
376
377 /* Coding system of what is sent from terminal keyboard. */
378 struct coding_system keyboard_coding;
379
380 Lisp_Object Vfile_coding_system_alist;
381 Lisp_Object Vprocess_coding_system_alist;
382 Lisp_Object Vnetwork_coding_system_alist;
383
384 Lisp_Object Vlocale_coding_system;
385
386 #endif /* emacs */
387
388 /* Flag to tell if we look up translation table on character code
389 conversion. */
390 Lisp_Object Venable_character_translation;
391 /* Standard translation table to look up on decoding (reading). */
392 Lisp_Object Vstandard_translation_table_for_decode;
393 /* Standard translation table to look up on encoding (writing). */
394 Lisp_Object Vstandard_translation_table_for_encode;
395
396 Lisp_Object Qtranslation_table;
397 Lisp_Object Qtranslation_table_id;
398 Lisp_Object Qtranslation_table_for_decode;
399 Lisp_Object Qtranslation_table_for_encode;
400
401 /* Alist of charsets vs revision number. */
402 static Lisp_Object Vcharset_revision_table;
403
404 /* Default coding systems used for process I/O. */
405 Lisp_Object Vdefault_process_coding_system;
406
407 /* Char table for translating Quail and self-inserting input. */
408 Lisp_Object Vtranslation_table_for_input;
409
410 /* Two special coding systems. */
411 Lisp_Object Vsjis_coding_system;
412 Lisp_Object Vbig5_coding_system;
413
414 static void record_conversion_result (struct coding_system *coding,
415 enum coding_result_code result);
416 static int detect_coding_utf_8 P_ ((struct coding_system *,
417 struct coding_detection_info *info));
418 static void decode_coding_utf_8 P_ ((struct coding_system *));
419 static int encode_coding_utf_8 P_ ((struct coding_system *));
420
421 static int detect_coding_utf_16 P_ ((struct coding_system *,
422 struct coding_detection_info *info));
423 static void decode_coding_utf_16 P_ ((struct coding_system *));
424 static int encode_coding_utf_16 P_ ((struct coding_system *));
425
426 static int detect_coding_iso_2022 P_ ((struct coding_system *,
427 struct coding_detection_info *info));
428 static void decode_coding_iso_2022 P_ ((struct coding_system *));
429 static int encode_coding_iso_2022 P_ ((struct coding_system *));
430
431 static int detect_coding_emacs_mule P_ ((struct coding_system *,
432 struct coding_detection_info *info));
433 static void decode_coding_emacs_mule P_ ((struct coding_system *));
434 static int encode_coding_emacs_mule P_ ((struct coding_system *));
435
436 static int detect_coding_sjis P_ ((struct coding_system *,
437 struct coding_detection_info *info));
438 static void decode_coding_sjis P_ ((struct coding_system *));
439 static int encode_coding_sjis P_ ((struct coding_system *));
440
441 static int detect_coding_big5 P_ ((struct coding_system *,
442 struct coding_detection_info *info));
443 static void decode_coding_big5 P_ ((struct coding_system *));
444 static int encode_coding_big5 P_ ((struct coding_system *));
445
446 static int detect_coding_ccl P_ ((struct coding_system *,
447 struct coding_detection_info *info));
448 static void decode_coding_ccl P_ ((struct coding_system *));
449 static int encode_coding_ccl P_ ((struct coding_system *));
450
451 static void decode_coding_raw_text P_ ((struct coding_system *));
452 static int encode_coding_raw_text P_ ((struct coding_system *));
453
454
455 /* ISO2022 section */
456
457 #define CODING_ISO_INITIAL(coding, reg) \
458 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
459 coding_attr_iso_initial), \
460 reg)))
461
462
463 #define CODING_ISO_REQUEST(coding, charset_id) \
464 ((charset_id <= (coding)->max_charset_id \
465 ? (coding)->safe_charsets[charset_id] \
466 : -1))
467
468
469 #define CODING_ISO_FLAGS(coding) \
470 ((coding)->spec.iso_2022.flags)
471 #define CODING_ISO_DESIGNATION(coding, reg) \
472 ((coding)->spec.iso_2022.current_designation[reg])
473 #define CODING_ISO_INVOCATION(coding, plane) \
474 ((coding)->spec.iso_2022.current_invocation[plane])
475 #define CODING_ISO_SINGLE_SHIFTING(coding) \
476 ((coding)->spec.iso_2022.single_shifting)
477 #define CODING_ISO_BOL(coding) \
478 ((coding)->spec.iso_2022.bol)
479 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
480 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
481
482 /* Control characters of ISO2022. */
483 /* code */ /* function */
484 #define ISO_CODE_LF 0x0A /* line-feed */
485 #define ISO_CODE_CR 0x0D /* carriage-return */
486 #define ISO_CODE_SO 0x0E /* shift-out */
487 #define ISO_CODE_SI 0x0F /* shift-in */
488 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
489 #define ISO_CODE_ESC 0x1B /* escape */
490 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
491 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
492 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
493
494 /* All code (1-byte) of ISO2022 is classified into one of the
495 followings. */
496 enum iso_code_class_type
497 {
498 ISO_control_0, /* Control codes in the range
499 0x00..0x1F and 0x7F, except for the
500 following 5 codes. */
501 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
502 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
503 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
504 ISO_escape, /* ISO_CODE_SO (0x1B) */
505 ISO_control_1, /* Control codes in the range
506 0x80..0x9F, except for the
507 following 3 codes. */
508 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
509 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
510 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
511 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
512 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
513 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
514 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
515 };
516
517 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
518 `iso-flags' attribute of an iso2022 coding system. */
519
520 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
521 instead of the correct short-form sequence (e.g. ESC $ A). */
522 #define CODING_ISO_FLAG_LONG_FORM 0x0001
523
524 /* If set, reset graphic planes and registers at end-of-line to the
525 initial state. */
526 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
527
528 /* If set, reset graphic planes and registers before any control
529 characters to the initial state. */
530 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
531
532 /* If set, encode by 7-bit environment. */
533 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
534
535 /* If set, use locking-shift function. */
536 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
537
538 /* If set, use single-shift function. Overwrite
539 CODING_ISO_FLAG_LOCKING_SHIFT. */
540 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
541
542 /* If set, use designation escape sequence. */
543 #define CODING_ISO_FLAG_DESIGNATION 0x0040
544
545 /* If set, produce revision number sequence. */
546 #define CODING_ISO_FLAG_REVISION 0x0080
547
548 /* If set, produce ISO6429's direction specifying sequence. */
549 #define CODING_ISO_FLAG_DIRECTION 0x0100
550
551 /* If set, assume designation states are reset at beginning of line on
552 output. */
553 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
554
555 /* If set, designation sequence should be placed at beginning of line
556 on output. */
557 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
558
559 /* If set, do not encode unsafe charactes on output. */
560 #define CODING_ISO_FLAG_SAFE 0x0800
561
562 /* If set, extra latin codes (128..159) are accepted as a valid code
563 on input. */
564 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
565
566 #define CODING_ISO_FLAG_COMPOSITION 0x2000
567
568 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
569
570 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
571
572 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
573
574 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
575
576 /* A character to be produced on output if encoding of the original
577 character is prohibited by CODING_ISO_FLAG_SAFE. */
578 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
579
580
581 /* UTF-16 section */
582 #define CODING_UTF_16_BOM(coding) \
583 ((coding)->spec.utf_16.bom)
584
585 #define CODING_UTF_16_ENDIAN(coding) \
586 ((coding)->spec.utf_16.endian)
587
588 #define CODING_UTF_16_SURROGATE(coding) \
589 ((coding)->spec.utf_16.surrogate)
590
591
592 /* CCL section */
593 #define CODING_CCL_DECODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
595 #define CODING_CCL_ENCODER(coding) \
596 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
597 #define CODING_CCL_VALIDS(coding) \
598 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
599
600 /* Index for each coding category in `coding_categories' */
601
602 enum coding_category
603 {
604 coding_category_iso_7,
605 coding_category_iso_7_tight,
606 coding_category_iso_8_1,
607 coding_category_iso_8_2,
608 coding_category_iso_7_else,
609 coding_category_iso_8_else,
610 coding_category_utf_8,
611 coding_category_utf_16_auto,
612 coding_category_utf_16_be,
613 coding_category_utf_16_le,
614 coding_category_utf_16_be_nosig,
615 coding_category_utf_16_le_nosig,
616 coding_category_charset,
617 coding_category_sjis,
618 coding_category_big5,
619 coding_category_ccl,
620 coding_category_emacs_mule,
621 /* All above are targets of code detection. */
622 coding_category_raw_text,
623 coding_category_undecided,
624 coding_category_max
625 };
626
627 /* Definitions of flag bits used in detect_coding_XXXX. */
628 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
629 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
630 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
631 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
632 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
633 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
634 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
635 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
636 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
637 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
638 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
639 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
640 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
641 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
642 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
643 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
644 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
645 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
646
647 /* This value is returned if detect_coding_mask () find nothing other
648 than ASCII characters. */
649 #define CATEGORY_MASK_ANY \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_8_1 \
653 | CATEGORY_MASK_ISO_8_2 \
654 | CATEGORY_MASK_ISO_7_ELSE \
655 | CATEGORY_MASK_ISO_8_ELSE \
656 | CATEGORY_MASK_UTF_8 \
657 | CATEGORY_MASK_UTF_16_BE \
658 | CATEGORY_MASK_UTF_16_LE \
659 | CATEGORY_MASK_UTF_16_BE_NOSIG \
660 | CATEGORY_MASK_UTF_16_LE_NOSIG \
661 | CATEGORY_MASK_CHARSET \
662 | CATEGORY_MASK_SJIS \
663 | CATEGORY_MASK_BIG5 \
664 | CATEGORY_MASK_CCL \
665 | CATEGORY_MASK_EMACS_MULE)
666
667
668 #define CATEGORY_MASK_ISO_7BIT \
669 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
670
671 #define CATEGORY_MASK_ISO_8BIT \
672 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
673
674 #define CATEGORY_MASK_ISO_ELSE \
675 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
676
677 #define CATEGORY_MASK_ISO_ESCAPE \
678 (CATEGORY_MASK_ISO_7 \
679 | CATEGORY_MASK_ISO_7_TIGHT \
680 | CATEGORY_MASK_ISO_7_ELSE \
681 | CATEGORY_MASK_ISO_8_ELSE)
682
683 #define CATEGORY_MASK_ISO \
684 ( CATEGORY_MASK_ISO_7BIT \
685 | CATEGORY_MASK_ISO_8BIT \
686 | CATEGORY_MASK_ISO_ELSE)
687
688 #define CATEGORY_MASK_UTF_16 \
689 (CATEGORY_MASK_UTF_16_BE \
690 | CATEGORY_MASK_UTF_16_LE \
691 | CATEGORY_MASK_UTF_16_BE_NOSIG \
692 | CATEGORY_MASK_UTF_16_LE_NOSIG)
693
694
695 /* List of symbols `coding-category-xxx' ordered by priority. This
696 variable is exposed to Emacs Lisp. */
697 static Lisp_Object Vcoding_category_list;
698
699 /* Table of coding categories (Lisp symbols). This variable is for
700 internal use oly. */
701 static Lisp_Object Vcoding_category_table;
702
703 /* Table of coding-categories ordered by priority. */
704 static enum coding_category coding_priorities[coding_category_max];
705
706 /* Nth element is a coding context for the coding system bound to the
707 Nth coding category. */
708 static struct coding_system coding_categories[coding_category_max];
709
710 /*** Commonly used macros and functions ***/
711
712 #ifndef min
713 #define min(a, b) ((a) < (b) ? (a) : (b))
714 #endif
715 #ifndef max
716 #define max(a, b) ((a) > (b) ? (a) : (b))
717 #endif
718
719 #define CODING_GET_INFO(coding, attrs, charset_list) \
720 do { \
721 (attrs) = CODING_ID_ATTRS ((coding)->id); \
722 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
723 } while (0)
724
725
726 /* Safely get one byte from the source text pointed by SRC which ends
727 at SRC_END, and set C to that byte. If there are not enough bytes
728 in the source, it jumps to `no_more_source'. If multibytep is
729 nonzero, and a multibyte character is found at SRC, set C to the
730 negative value of the character code. The caller should declare
731 and set these variables appropriately in advance:
732 src, src_end, multibytep */
733
734 #define ONE_MORE_BYTE(c) \
735 do { \
736 if (src == src_end) \
737 { \
738 if (src_base < src) \
739 record_conversion_result \
740 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
741 goto no_more_source; \
742 } \
743 c = *src++; \
744 if (multibytep && (c & 0x80)) \
745 { \
746 if ((c & 0xFE) == 0xC0) \
747 c = ((c & 1) << 6) | *src++; \
748 else \
749 { \
750 c = - string_char (--src, &src, NULL); \
751 record_conversion_result \
752 (coding, CODING_RESULT_INVALID_SRC); \
753 } \
754 } \
755 consumed_chars++; \
756 } while (0)
757
758
759 #define ONE_MORE_BYTE_NO_CHECK(c) \
760 do { \
761 c = *src++; \
762 if (multibytep && (c & 0x80)) \
763 { \
764 if ((c & 0xFE) == 0xC0) \
765 c = ((c & 1) << 6) | *src++; \
766 else \
767 { \
768 c = - string_char (--src, &src, NULL); \
769 record_conversion_result \
770 (coding, CODING_RESULT_INVALID_SRC); \
771 } \
772 } \
773 consumed_chars++; \
774 } while (0)
775
776
777 /* Store a byte C in the place pointed by DST and increment DST to the
778 next free point, and increment PRODUCED_CHARS. The caller should
779 assure that C is 0..127, and declare and set the variable `dst'
780 appropriately in advance.
781 */
782
783
784 #define EMIT_ONE_ASCII_BYTE(c) \
785 do { \
786 produced_chars++; \
787 *dst++ = (c); \
788 } while (0)
789
790
791 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
792
793 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
794 do { \
795 produced_chars += 2; \
796 *dst++ = (c1), *dst++ = (c2); \
797 } while (0)
798
799
800 /* Store a byte C in the place pointed by DST and increment DST to the
801 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
802 nonzero, store in an appropriate multibyte from. The caller should
803 declare and set the variables `dst' and `multibytep' appropriately
804 in advance. */
805
806 #define EMIT_ONE_BYTE(c) \
807 do { \
808 produced_chars++; \
809 if (multibytep) \
810 { \
811 int ch = (c); \
812 if (ch >= 0x80) \
813 ch = BYTE8_TO_CHAR (ch); \
814 CHAR_STRING_ADVANCE (ch, dst); \
815 } \
816 else \
817 *dst++ = (c); \
818 } while (0)
819
820
821 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
822
823 #define EMIT_TWO_BYTES(c1, c2) \
824 do { \
825 produced_chars += 2; \
826 if (multibytep) \
827 { \
828 int ch; \
829 \
830 ch = (c1); \
831 if (ch >= 0x80) \
832 ch = BYTE8_TO_CHAR (ch); \
833 CHAR_STRING_ADVANCE (ch, dst); \
834 ch = (c2); \
835 if (ch >= 0x80) \
836 ch = BYTE8_TO_CHAR (ch); \
837 CHAR_STRING_ADVANCE (ch, dst); \
838 } \
839 else \
840 { \
841 *dst++ = (c1); \
842 *dst++ = (c2); \
843 } \
844 } while (0)
845
846
847 #define EMIT_THREE_BYTES(c1, c2, c3) \
848 do { \
849 EMIT_ONE_BYTE (c1); \
850 EMIT_TWO_BYTES (c2, c3); \
851 } while (0)
852
853
854 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
855 do { \
856 EMIT_TWO_BYTES (c1, c2); \
857 EMIT_TWO_BYTES (c3, c4); \
858 } while (0)
859
860
861 static void
862 record_conversion_result (struct coding_system *coding,
863 enum coding_result_code result)
864 {
865 coding->result = result;
866 switch (result)
867 {
868 case CODING_RESULT_INSUFFICIENT_SRC:
869 Vlast_code_conversion_error = Qinsufficient_source;
870 break;
871 case CODING_RESULT_INCONSISTENT_EOL:
872 Vlast_code_conversion_error = Qinconsistent_eol;
873 break;
874 case CODING_RESULT_INVALID_SRC:
875 Vlast_code_conversion_error = Qinvalid_source;
876 break;
877 case CODING_RESULT_INTERRUPT:
878 Vlast_code_conversion_error = Qinterrupted;
879 break;
880 case CODING_RESULT_INSUFFICIENT_MEM:
881 Vlast_code_conversion_error = Qinsufficient_memory;
882 break;
883 }
884 }
885
886 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
887 do { \
888 charset_map_loaded = 0; \
889 c = DECODE_CHAR (charset, code); \
890 if (charset_map_loaded) \
891 { \
892 const unsigned char *orig = coding->source; \
893 EMACS_INT offset; \
894 \
895 coding_set_source (coding); \
896 offset = coding->source - orig; \
897 src += offset; \
898 src_base += offset; \
899 src_end += offset; \
900 } \
901 } while (0)
902
903
904 #define ASSURE_DESTINATION(bytes) \
905 do { \
906 if (dst + (bytes) >= dst_end) \
907 { \
908 int more_bytes = charbuf_end - charbuf + (bytes); \
909 \
910 dst = alloc_destination (coding, more_bytes, dst); \
911 dst_end = coding->destination + coding->dst_bytes; \
912 } \
913 } while (0)
914
915
916
917 static void
918 coding_set_source (coding)
919 struct coding_system *coding;
920 {
921 if (BUFFERP (coding->src_object))
922 {
923 struct buffer *buf = XBUFFER (coding->src_object);
924
925 if (coding->src_pos < 0)
926 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
927 else
928 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
929 }
930 else if (STRINGP (coding->src_object))
931 {
932 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
933 }
934 else
935 /* Otherwise, the source is C string and is never relocated
936 automatically. Thus we don't have to update anything. */
937 ;
938 }
939
940 static void
941 coding_set_destination (coding)
942 struct coding_system *coding;
943 {
944 if (BUFFERP (coding->dst_object))
945 {
946 if (coding->src_pos < 0)
947 {
948 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
949 coding->dst_bytes = (GAP_END_ADDR
950 - (coding->src_bytes - coding->consumed)
951 - coding->destination);
952 }
953 else
954 {
955 /* We are sure that coding->dst_pos_byte is before the gap
956 of the buffer. */
957 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
958 + coding->dst_pos_byte - 1);
959 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
960 - coding->destination);
961 }
962 }
963 else
964 /* Otherwise, the destination is C string and is never relocated
965 automatically. Thus we don't have to update anything. */
966 ;
967 }
968
969
970 static void
971 coding_alloc_by_realloc (coding, bytes)
972 struct coding_system *coding;
973 EMACS_INT bytes;
974 {
975 coding->destination = (unsigned char *) xrealloc (coding->destination,
976 coding->dst_bytes + bytes);
977 coding->dst_bytes += bytes;
978 }
979
980 static void
981 coding_alloc_by_making_gap (coding, bytes)
982 struct coding_system *coding;
983 EMACS_INT bytes;
984 {
985 if (BUFFERP (coding->dst_object)
986 && EQ (coding->src_object, coding->dst_object))
987 {
988 EMACS_INT add = coding->src_bytes - coding->consumed;
989
990 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
991 make_gap (bytes);
992 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
993 }
994 else
995 {
996 Lisp_Object this_buffer;
997
998 this_buffer = Fcurrent_buffer ();
999 set_buffer_internal (XBUFFER (coding->dst_object));
1000 make_gap (bytes);
1001 set_buffer_internal (XBUFFER (this_buffer));
1002 }
1003 }
1004
1005
1006 static unsigned char *
1007 alloc_destination (coding, nbytes, dst)
1008 struct coding_system *coding;
1009 EMACS_INT nbytes;
1010 unsigned char *dst;
1011 {
1012 EMACS_INT offset = dst - coding->destination;
1013
1014 if (BUFFERP (coding->dst_object))
1015 coding_alloc_by_making_gap (coding, nbytes);
1016 else
1017 coding_alloc_by_realloc (coding, nbytes);
1018 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1019 coding_set_destination (coding);
1020 dst = coding->destination + offset;
1021 return dst;
1022 }
1023
1024 /** Macros for annotations. */
1025
1026 /* Maximum length of annotation data (sum of annotations for
1027 composition and charset). */
1028 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1029
1030 /* An annotation data is stored in the array coding->charbuf in this
1031 format:
1032 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1033 LENGTH is the number of elements in the annotation.
1034 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1035 NCHARS is the number of characters in the text annotated.
1036
1037 The format of the following elements depend on ANNOTATION_MASK.
1038
1039 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1040 follows:
1041 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1042 METHOD is one of enum composition_method.
1043 Optionnal COMPOSITION-COMPONENTS are characters and composition
1044 rules.
1045
1046 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1047 follows. */
1048
1049 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1050 do { \
1051 *(buf)++ = -(len); \
1052 *(buf)++ = (mask); \
1053 *(buf)++ = (nchars); \
1054 coding->annotated = 1; \
1055 } while (0);
1056
1057 #define ADD_COMPOSITION_DATA(buf, nchars, method) \
1058 do { \
1059 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1060 *buf++ = method; \
1061 } while (0)
1062
1063
1064 #define ADD_CHARSET_DATA(buf, nchars, id) \
1065 do { \
1066 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1067 *buf++ = id; \
1068 } while (0)
1069
1070 \f
1071 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1072
1073
1074
1075 \f
1076 /*** 3. UTF-8 ***/
1077
1078 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1079 Check if a text is encoded in UTF-8. If it is, return 1, else
1080 return 0. */
1081
1082 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1083 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1084 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1085 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1086 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1087 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1088
1089 static int
1090 detect_coding_utf_8 (coding, detect_info)
1091 struct coding_system *coding;
1092 struct coding_detection_info *detect_info;
1093 {
1094 const unsigned char *src = coding->source, *src_base;
1095 const unsigned char *src_end = coding->source + coding->src_bytes;
1096 int multibytep = coding->src_multibyte;
1097 int consumed_chars = 0;
1098 int found = 0;
1099
1100 detect_info->checked |= CATEGORY_MASK_UTF_8;
1101 /* A coding system of this category is always ASCII compatible. */
1102 src += coding->head_ascii;
1103
1104 while (1)
1105 {
1106 int c, c1, c2, c3, c4;
1107
1108 src_base = src;
1109 ONE_MORE_BYTE (c);
1110 if (c < 0 || UTF_8_1_OCTET_P (c))
1111 continue;
1112 ONE_MORE_BYTE (c1);
1113 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1114 break;
1115 if (UTF_8_2_OCTET_LEADING_P (c))
1116 {
1117 found = CATEGORY_MASK_UTF_8;
1118 continue;
1119 }
1120 ONE_MORE_BYTE (c2);
1121 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1122 break;
1123 if (UTF_8_3_OCTET_LEADING_P (c))
1124 {
1125 found = CATEGORY_MASK_UTF_8;
1126 continue;
1127 }
1128 ONE_MORE_BYTE (c3);
1129 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1130 break;
1131 if (UTF_8_4_OCTET_LEADING_P (c))
1132 {
1133 found = CATEGORY_MASK_UTF_8;
1134 continue;
1135 }
1136 ONE_MORE_BYTE (c4);
1137 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1138 break;
1139 if (UTF_8_5_OCTET_LEADING_P (c))
1140 {
1141 found = CATEGORY_MASK_UTF_8;
1142 continue;
1143 }
1144 break;
1145 }
1146 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1147 return 0;
1148
1149 no_more_source:
1150 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1151 {
1152 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1153 return 0;
1154 }
1155 detect_info->found |= found;
1156 return 1;
1157 }
1158
1159
1160 static void
1161 decode_coding_utf_8 (coding)
1162 struct coding_system *coding;
1163 {
1164 const unsigned char *src = coding->source + coding->consumed;
1165 const unsigned char *src_end = coding->source + coding->src_bytes;
1166 const unsigned char *src_base;
1167 int *charbuf = coding->charbuf + coding->charbuf_used;
1168 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1169 int consumed_chars = 0, consumed_chars_base;
1170 int multibytep = coding->src_multibyte;
1171 Lisp_Object attr, charset_list;
1172
1173 CODING_GET_INFO (coding, attr, charset_list);
1174
1175 while (1)
1176 {
1177 int c, c1, c2, c3, c4, c5;
1178
1179 src_base = src;
1180 consumed_chars_base = consumed_chars;
1181
1182 if (charbuf >= charbuf_end)
1183 break;
1184
1185 ONE_MORE_BYTE (c1);
1186 if (c1 < 0)
1187 {
1188 c = - c1;
1189 }
1190 else if (UTF_8_1_OCTET_P(c1))
1191 {
1192 c = c1;
1193 }
1194 else
1195 {
1196 ONE_MORE_BYTE (c2);
1197 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1198 goto invalid_code;
1199 if (UTF_8_2_OCTET_LEADING_P (c1))
1200 {
1201 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1202 /* Reject overlong sequences here and below. Encoders
1203 producing them are incorrect, they can be misleading,
1204 and they mess up read/write invariance. */
1205 if (c < 128)
1206 goto invalid_code;
1207 }
1208 else
1209 {
1210 ONE_MORE_BYTE (c3);
1211 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1212 goto invalid_code;
1213 if (UTF_8_3_OCTET_LEADING_P (c1))
1214 {
1215 c = (((c1 & 0xF) << 12)
1216 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1217 if (c < 0x800
1218 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1219 goto invalid_code;
1220 }
1221 else
1222 {
1223 ONE_MORE_BYTE (c4);
1224 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1225 goto invalid_code;
1226 if (UTF_8_4_OCTET_LEADING_P (c1))
1227 {
1228 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1229 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1230 if (c < 0x10000)
1231 goto invalid_code;
1232 }
1233 else
1234 {
1235 ONE_MORE_BYTE (c5);
1236 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1237 goto invalid_code;
1238 if (UTF_8_5_OCTET_LEADING_P (c1))
1239 {
1240 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1241 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1242 | (c5 & 0x3F));
1243 if ((c > MAX_CHAR) || (c < 0x200000))
1244 goto invalid_code;
1245 }
1246 else
1247 goto invalid_code;
1248 }
1249 }
1250 }
1251 }
1252
1253 *charbuf++ = c;
1254 continue;
1255
1256 invalid_code:
1257 src = src_base;
1258 consumed_chars = consumed_chars_base;
1259 ONE_MORE_BYTE (c);
1260 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1261 coding->errors++;
1262 }
1263
1264 no_more_source:
1265 coding->consumed_char += consumed_chars_base;
1266 coding->consumed = src_base - coding->source;
1267 coding->charbuf_used = charbuf - coding->charbuf;
1268 }
1269
1270
1271 static int
1272 encode_coding_utf_8 (coding)
1273 struct coding_system *coding;
1274 {
1275 int multibytep = coding->dst_multibyte;
1276 int *charbuf = coding->charbuf;
1277 int *charbuf_end = charbuf + coding->charbuf_used;
1278 unsigned char *dst = coding->destination + coding->produced;
1279 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1280 int produced_chars = 0;
1281 int c;
1282
1283 if (multibytep)
1284 {
1285 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1286
1287 while (charbuf < charbuf_end)
1288 {
1289 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1290
1291 ASSURE_DESTINATION (safe_room);
1292 c = *charbuf++;
1293 if (CHAR_BYTE8_P (c))
1294 {
1295 c = CHAR_TO_BYTE8 (c);
1296 EMIT_ONE_BYTE (c);
1297 }
1298 else
1299 {
1300 CHAR_STRING_ADVANCE (c, pend);
1301 for (p = str; p < pend; p++)
1302 EMIT_ONE_BYTE (*p);
1303 }
1304 }
1305 }
1306 else
1307 {
1308 int safe_room = MAX_MULTIBYTE_LENGTH;
1309
1310 while (charbuf < charbuf_end)
1311 {
1312 ASSURE_DESTINATION (safe_room);
1313 c = *charbuf++;
1314 dst += CHAR_STRING (c, dst);
1315 produced_chars++;
1316 }
1317 }
1318 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1319 coding->produced_char += produced_chars;
1320 coding->produced = dst - coding->destination;
1321 return 0;
1322 }
1323
1324
1325 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1326 Check if a text is encoded in one of UTF-16 based coding systems.
1327 If it is, return 1, else return 0. */
1328
1329 #define UTF_16_HIGH_SURROGATE_P(val) \
1330 (((val) & 0xFC00) == 0xD800)
1331
1332 #define UTF_16_LOW_SURROGATE_P(val) \
1333 (((val) & 0xFC00) == 0xDC00)
1334
1335 #define UTF_16_INVALID_P(val) \
1336 (((val) == 0xFFFE) \
1337 || ((val) == 0xFFFF) \
1338 || UTF_16_LOW_SURROGATE_P (val))
1339
1340
1341 static int
1342 detect_coding_utf_16 (coding, detect_info)
1343 struct coding_system *coding;
1344 struct coding_detection_info *detect_info;
1345 {
1346 const unsigned char *src = coding->source, *src_base = src;
1347 const unsigned char *src_end = coding->source + coding->src_bytes;
1348 int multibytep = coding->src_multibyte;
1349 int consumed_chars = 0;
1350 int c1, c2;
1351
1352 detect_info->checked |= CATEGORY_MASK_UTF_16;
1353 if (coding->mode & CODING_MODE_LAST_BLOCK
1354 && (coding->src_chars & 1))
1355 {
1356 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1357 return 0;
1358 }
1359
1360 ONE_MORE_BYTE (c1);
1361 ONE_MORE_BYTE (c2);
1362 if ((c1 == 0xFF) && (c2 == 0xFE))
1363 {
1364 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1365 | CATEGORY_MASK_UTF_16_AUTO);
1366 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1367 | CATEGORY_MASK_UTF_16_BE_NOSIG
1368 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1369 }
1370 else if ((c1 == 0xFE) && (c2 == 0xFF))
1371 {
1372 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1373 | CATEGORY_MASK_UTF_16_AUTO);
1374 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1375 | CATEGORY_MASK_UTF_16_BE_NOSIG
1376 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1377 }
1378 else if (c1 >= 0 && c2 >= 0)
1379 {
1380 unsigned char b1[256], b2[256];
1381 int b1_variants = 1, b2_variants = 1;
1382 int n;
1383
1384 bzero (b1, 256), bzero (b2, 256);
1385 b1[c1]++, b2[c2]++;
1386 for (n = 0; n < 256 && src < src_end; n++)
1387 {
1388 src_base = src;
1389 ONE_MORE_BYTE (c1);
1390 ONE_MORE_BYTE (c2);
1391 if (c1 < 0 || c2 < 0)
1392 break;
1393 if (! b1[c1++]) b1_variants++;
1394 if (! b2[c2++]) b2_variants++;
1395 }
1396 if (b1_variants < b2_variants)
1397 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1398 else
1399 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1400 detect_info->rejected
1401 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1402 }
1403 no_more_source:
1404 return 1;
1405 }
1406
1407 static void
1408 decode_coding_utf_16 (coding)
1409 struct coding_system *coding;
1410 {
1411 const unsigned char *src = coding->source + coding->consumed;
1412 const unsigned char *src_end = coding->source + coding->src_bytes;
1413 const unsigned char *src_base;
1414 int *charbuf = coding->charbuf + coding->charbuf_used;
1415 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1416 int consumed_chars = 0, consumed_chars_base;
1417 int multibytep = coding->src_multibyte;
1418 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1419 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1420 int surrogate = CODING_UTF_16_SURROGATE (coding);
1421 Lisp_Object attr, charset_list;
1422
1423 CODING_GET_INFO (coding, attr, charset_list);
1424
1425 if (bom == utf_16_with_bom)
1426 {
1427 int c, c1, c2;
1428
1429 src_base = src;
1430 ONE_MORE_BYTE (c1);
1431 ONE_MORE_BYTE (c2);
1432 c = (c1 << 8) | c2;
1433
1434 if (endian == utf_16_big_endian
1435 ? c != 0xFEFF : c != 0xFFFE)
1436 {
1437 /* The first two bytes are not BOM. Treat them as bytes
1438 for a normal character. */
1439 src = src_base;
1440 coding->errors++;
1441 }
1442 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1443 }
1444 else if (bom == utf_16_detect_bom)
1445 {
1446 /* We have already tried to detect BOM and failed in
1447 detect_coding. */
1448 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1449 }
1450
1451 while (1)
1452 {
1453 int c, c1, c2;
1454
1455 src_base = src;
1456 consumed_chars_base = consumed_chars;
1457
1458 if (charbuf + 2 >= charbuf_end)
1459 break;
1460
1461 ONE_MORE_BYTE (c1);
1462 if (c1 < 0)
1463 {
1464 *charbuf++ = -c1;
1465 continue;
1466 }
1467 ONE_MORE_BYTE (c2);
1468 if (c2 < 0)
1469 {
1470 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1471 *charbuf++ = -c2;
1472 continue;
1473 }
1474 c = (endian == utf_16_big_endian
1475 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1476 if (surrogate)
1477 {
1478 if (! UTF_16_LOW_SURROGATE_P (c))
1479 {
1480 if (endian == utf_16_big_endian)
1481 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1482 else
1483 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1484 *charbuf++ = c1;
1485 *charbuf++ = c2;
1486 coding->errors++;
1487 if (UTF_16_HIGH_SURROGATE_P (c))
1488 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1489 else
1490 *charbuf++ = c;
1491 }
1492 else
1493 {
1494 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1495 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1496 *charbuf++ = 0x10000 + c;
1497 }
1498 }
1499 else
1500 {
1501 if (UTF_16_HIGH_SURROGATE_P (c))
1502 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1503 else
1504 *charbuf++ = c;
1505 }
1506 }
1507
1508 no_more_source:
1509 coding->consumed_char += consumed_chars_base;
1510 coding->consumed = src_base - coding->source;
1511 coding->charbuf_used = charbuf - coding->charbuf;
1512 }
1513
1514 static int
1515 encode_coding_utf_16 (coding)
1516 struct coding_system *coding;
1517 {
1518 int multibytep = coding->dst_multibyte;
1519 int *charbuf = coding->charbuf;
1520 int *charbuf_end = charbuf + coding->charbuf_used;
1521 unsigned char *dst = coding->destination + coding->produced;
1522 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1523 int safe_room = 8;
1524 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1525 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1526 int produced_chars = 0;
1527 Lisp_Object attrs, charset_list;
1528 int c;
1529
1530 CODING_GET_INFO (coding, attrs, charset_list);
1531
1532 if (bom != utf_16_without_bom)
1533 {
1534 ASSURE_DESTINATION (safe_room);
1535 if (big_endian)
1536 EMIT_TWO_BYTES (0xFE, 0xFF);
1537 else
1538 EMIT_TWO_BYTES (0xFF, 0xFE);
1539 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1540 }
1541
1542 while (charbuf < charbuf_end)
1543 {
1544 ASSURE_DESTINATION (safe_room);
1545 c = *charbuf++;
1546 if (c >= MAX_UNICODE_CHAR)
1547 c = coding->default_char;
1548
1549 if (c < 0x10000)
1550 {
1551 if (big_endian)
1552 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1553 else
1554 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1555 }
1556 else
1557 {
1558 int c1, c2;
1559
1560 c -= 0x10000;
1561 c1 = (c >> 10) + 0xD800;
1562 c2 = (c & 0x3FF) + 0xDC00;
1563 if (big_endian)
1564 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1565 else
1566 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1567 }
1568 }
1569 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1570 coding->produced = dst - coding->destination;
1571 coding->produced_char += produced_chars;
1572 return 0;
1573 }
1574
1575 \f
1576 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1577
1578 /* Emacs' internal format for representation of multiple character
1579 sets is a kind of multi-byte encoding, i.e. characters are
1580 represented by variable-length sequences of one-byte codes.
1581
1582 ASCII characters and control characters (e.g. `tab', `newline') are
1583 represented by one-byte sequences which are their ASCII codes, in
1584 the range 0x00 through 0x7F.
1585
1586 8-bit characters of the range 0x80..0x9F are represented by
1587 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1588 code + 0x20).
1589
1590 8-bit characters of the range 0xA0..0xFF are represented by
1591 one-byte sequences which are their 8-bit code.
1592
1593 The other characters are represented by a sequence of `base
1594 leading-code', optional `extended leading-code', and one or two
1595 `position-code's. The length of the sequence is determined by the
1596 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1597 whereas extended leading-code and position-code take the range 0xA0
1598 through 0xFF. See `charset.h' for more details about leading-code
1599 and position-code.
1600
1601 --- CODE RANGE of Emacs' internal format ---
1602 character set range
1603 ------------- -----
1604 ascii 0x00..0x7F
1605 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1606 eight-bit-graphic 0xA0..0xBF
1607 ELSE 0x81..0x9D + [0xA0..0xFF]+
1608 ---------------------------------------------
1609
1610 As this is the internal character representation, the format is
1611 usually not used externally (i.e. in a file or in a data sent to a
1612 process). But, it is possible to have a text externally in this
1613 format (i.e. by encoding by the coding system `emacs-mule').
1614
1615 In that case, a sequence of one-byte codes has a slightly different
1616 form.
1617
1618 At first, all characters in eight-bit-control are represented by
1619 one-byte sequences which are their 8-bit code.
1620
1621 Next, character composition data are represented by the byte
1622 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1623 where,
1624 METHOD is 0xF0 plus one of composition method (enum
1625 composition_method),
1626
1627 BYTES is 0xA0 plus a byte length of this composition data,
1628
1629 CHARS is 0x20 plus a number of characters composed by this
1630 data,
1631
1632 COMPONENTs are characters of multibye form or composition
1633 rules encoded by two-byte of ASCII codes.
1634
1635 In addition, for backward compatibility, the following formats are
1636 also recognized as composition data on decoding.
1637
1638 0x80 MSEQ ...
1639 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1640
1641 Here,
1642 MSEQ is a multibyte form but in these special format:
1643 ASCII: 0xA0 ASCII_CODE+0x80,
1644 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1645 RULE is a one byte code of the range 0xA0..0xF0 that
1646 represents a composition rule.
1647 */
1648
1649 char emacs_mule_bytes[256];
1650
1651 int
1652 emacs_mule_char (coding, src, nbytes, nchars, id)
1653 struct coding_system *coding;
1654 const unsigned char *src;
1655 int *nbytes, *nchars, *id;
1656 {
1657 const unsigned char *src_end = coding->source + coding->src_bytes;
1658 const unsigned char *src_base = src;
1659 int multibytep = coding->src_multibyte;
1660 struct charset *charset;
1661 unsigned code;
1662 int c;
1663 int consumed_chars = 0;
1664
1665 ONE_MORE_BYTE (c);
1666 if (c < 0)
1667 {
1668 c = -c;
1669 charset = emacs_mule_charset[0];
1670 }
1671 else
1672 {
1673 switch (emacs_mule_bytes[c])
1674 {
1675 case 2:
1676 if (! (charset = emacs_mule_charset[c]))
1677 goto invalid_code;
1678 ONE_MORE_BYTE (c);
1679 if (c < 0)
1680 goto invalid_code;
1681 code = c & 0x7F;
1682 break;
1683
1684 case 3:
1685 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1686 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1687 {
1688 ONE_MORE_BYTE (c);
1689 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1690 goto invalid_code;
1691 ONE_MORE_BYTE (c);
1692 if (c < 0)
1693 goto invalid_code;
1694 code = c & 0x7F;
1695 }
1696 else
1697 {
1698 if (! (charset = emacs_mule_charset[c]))
1699 goto invalid_code;
1700 ONE_MORE_BYTE (c);
1701 if (c < 0)
1702 goto invalid_code;
1703 code = (c & 0x7F) << 8;
1704 ONE_MORE_BYTE (c);
1705 if (c < 0)
1706 goto invalid_code;
1707 code |= c & 0x7F;
1708 }
1709 break;
1710
1711 case 4:
1712 ONE_MORE_BYTE (c);
1713 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1714 goto invalid_code;
1715 ONE_MORE_BYTE (c);
1716 if (c < 0)
1717 goto invalid_code;
1718 code = (c & 0x7F) << 8;
1719 ONE_MORE_BYTE (c);
1720 if (c < 0)
1721 goto invalid_code;
1722 code |= c & 0x7F;
1723 break;
1724
1725 case 1:
1726 code = c;
1727 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1728 ? charset_ascii : charset_eight_bit);
1729 break;
1730
1731 default:
1732 abort ();
1733 }
1734 c = DECODE_CHAR (charset, code);
1735 if (c < 0)
1736 goto invalid_code;
1737 }
1738 *nbytes = src - src_base;
1739 *nchars = consumed_chars;
1740 if (id)
1741 *id = charset->id;
1742 return c;
1743
1744 no_more_source:
1745 return -2;
1746
1747 invalid_code:
1748 return -1;
1749 }
1750
1751
1752 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1753 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1754 else return 0. */
1755
1756 static int
1757 detect_coding_emacs_mule (coding, detect_info)
1758 struct coding_system *coding;
1759 struct coding_detection_info *detect_info;
1760 {
1761 const unsigned char *src = coding->source, *src_base;
1762 const unsigned char *src_end = coding->source + coding->src_bytes;
1763 int multibytep = coding->src_multibyte;
1764 int consumed_chars = 0;
1765 int c;
1766 int found = 0;
1767
1768 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1769 /* A coding system of this category is always ASCII compatible. */
1770 src += coding->head_ascii;
1771
1772 while (1)
1773 {
1774 src_base = src;
1775 ONE_MORE_BYTE (c);
1776 if (c < 0)
1777 continue;
1778 if (c == 0x80)
1779 {
1780 /* Perhaps the start of composite character. We simple skip
1781 it because analyzing it is too heavy for detecting. But,
1782 at least, we check that the composite character
1783 constitues of more than 4 bytes. */
1784 const unsigned char *src_base;
1785
1786 repeat:
1787 src_base = src;
1788 do
1789 {
1790 ONE_MORE_BYTE (c);
1791 }
1792 while (c >= 0xA0);
1793
1794 if (src - src_base <= 4)
1795 break;
1796 found = CATEGORY_MASK_EMACS_MULE;
1797 if (c == 0x80)
1798 goto repeat;
1799 }
1800
1801 if (c < 0x80)
1802 {
1803 if (c < 0x20
1804 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1805 break;
1806 }
1807 else
1808 {
1809 int more_bytes = emacs_mule_bytes[*src_base] - 1;
1810
1811 while (more_bytes > 0)
1812 {
1813 ONE_MORE_BYTE (c);
1814 if (c < 0xA0)
1815 {
1816 src--; /* Unread the last byte. */
1817 break;
1818 }
1819 more_bytes--;
1820 }
1821 if (more_bytes != 0)
1822 break;
1823 found = CATEGORY_MASK_EMACS_MULE;
1824 }
1825 }
1826 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1827 return 0;
1828
1829 no_more_source:
1830 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1831 {
1832 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1833 return 0;
1834 }
1835 detect_info->found |= found;
1836 return 1;
1837 }
1838
1839
1840 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1841
1842 /* Decode a character represented as a component of composition
1843 sequence of Emacs 20/21 style at SRC. Set C to that character and
1844 update SRC to the head of next character (or an encoded composition
1845 rule). If SRC doesn't points a composition component, set C to -1.
1846 If SRC points an invalid byte sequence, global exit by a return
1847 value 0. */
1848
1849 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1850 if (1) \
1851 { \
1852 int c; \
1853 int nbytes, nchars; \
1854 \
1855 if (src == src_end) \
1856 break; \
1857 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1858 if (c < 0) \
1859 { \
1860 if (c == -2) \
1861 break; \
1862 goto invalid_code; \
1863 } \
1864 *buf++ = c; \
1865 src += nbytes; \
1866 consumed_chars += nchars; \
1867 } \
1868 else
1869
1870
1871 /* Decode a composition rule represented as a component of composition
1872 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1873 and increment BUF. If SRC points an invalid byte sequence, set C
1874 to -1. */
1875
1876 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1877 do { \
1878 int c, gref, nref; \
1879 \
1880 if (src >= src_end) \
1881 goto invalid_code; \
1882 ONE_MORE_BYTE_NO_CHECK (c); \
1883 c -= 0x20; \
1884 if (c < 0 || c >= 81) \
1885 goto invalid_code; \
1886 \
1887 gref = c / 9, nref = c % 9; \
1888 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1889 } while (0)
1890
1891
1892 /* Decode a composition rule represented as a component of composition
1893 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1894 and increment BUF. If SRC points an invalid byte sequence, set C
1895 to -1. */
1896
1897 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1898 do { \
1899 int gref, nref; \
1900 \
1901 if (src + 1>= src_end) \
1902 goto invalid_code; \
1903 ONE_MORE_BYTE_NO_CHECK (gref); \
1904 gref -= 0x20; \
1905 ONE_MORE_BYTE_NO_CHECK (nref); \
1906 nref -= 0x20; \
1907 if (gref < 0 || gref >= 81 \
1908 || nref < 0 || nref >= 81) \
1909 goto invalid_code; \
1910 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1911 } while (0)
1912
1913
1914 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1915 do { \
1916 /* Emacs 21 style format. The first three bytes at SRC are \
1917 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1918 the byte length of this composition information, CHARS is the \
1919 number of characters composed by this composition. */ \
1920 enum composition_method method = c - 0xF2; \
1921 int *charbuf_base = charbuf; \
1922 int consumed_chars_limit; \
1923 int nbytes, nchars; \
1924 \
1925 ONE_MORE_BYTE (c); \
1926 if (c < 0) \
1927 goto invalid_code; \
1928 nbytes = c - 0xA0; \
1929 if (nbytes < 3) \
1930 goto invalid_code; \
1931 ONE_MORE_BYTE (c); \
1932 if (c < 0) \
1933 goto invalid_code; \
1934 nchars = c - 0xA0; \
1935 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
1936 consumed_chars_limit = consumed_chars_base + nbytes; \
1937 if (method != COMPOSITION_RELATIVE) \
1938 { \
1939 int i = 0; \
1940 while (consumed_chars < consumed_chars_limit) \
1941 { \
1942 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1943 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1944 else \
1945 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1946 i++; \
1947 } \
1948 if (consumed_chars < consumed_chars_limit) \
1949 goto invalid_code; \
1950 charbuf_base[0] -= i; \
1951 } \
1952 } while (0)
1953
1954
1955 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1956 do { \
1957 /* Emacs 20 style format for relative composition. */ \
1958 /* Store multibyte form of characters to be composed. */ \
1959 enum composition_method method = COMPOSITION_RELATIVE; \
1960 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1961 int *buf = components; \
1962 int i, j; \
1963 \
1964 src = src_base; \
1965 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1966 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1967 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1968 if (i < 2) \
1969 goto invalid_code; \
1970 ADD_COMPOSITION_DATA (charbuf, i, method); \
1971 for (j = 0; j < i; j++) \
1972 *charbuf++ = components[j]; \
1973 } while (0)
1974
1975
1976 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1977 do { \
1978 /* Emacs 20 style format for rule-base composition. */ \
1979 /* Store multibyte form of characters to be composed. */ \
1980 enum composition_method method = COMPOSITION_WITH_RULE; \
1981 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1982 int *buf = components; \
1983 int i, j; \
1984 \
1985 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1986 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1987 { \
1988 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1989 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1990 } \
1991 if (i < 1 || (buf - components) % 2 == 0) \
1992 goto invalid_code; \
1993 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1994 goto no_more_source; \
1995 ADD_COMPOSITION_DATA (buf, i, method); \
1996 for (j = 0; j < i; j++) \
1997 *charbuf++ = components[j]; \
1998 for (j = 0; j < i; j += 2) \
1999 *charbuf++ = components[j]; \
2000 } while (0)
2001
2002
2003 static void
2004 decode_coding_emacs_mule (coding)
2005 struct coding_system *coding;
2006 {
2007 const unsigned char *src = coding->source + coding->consumed;
2008 const unsigned char *src_end = coding->source + coding->src_bytes;
2009 const unsigned char *src_base;
2010 int *charbuf = coding->charbuf + coding->charbuf_used;
2011 int *charbuf_end
2012 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2013 int consumed_chars = 0, consumed_chars_base;
2014 int multibytep = coding->src_multibyte;
2015 Lisp_Object attrs, charset_list;
2016 int char_offset = coding->produced_char;
2017 int last_offset = char_offset;
2018 int last_id = charset_ascii;
2019
2020 CODING_GET_INFO (coding, attrs, charset_list);
2021
2022 while (1)
2023 {
2024 int c;
2025
2026 src_base = src;
2027 consumed_chars_base = consumed_chars;
2028
2029 if (charbuf >= charbuf_end)
2030 break;
2031
2032 ONE_MORE_BYTE (c);
2033 if (c < 0)
2034 {
2035 *charbuf++ = -c;
2036 char_offset++;
2037 }
2038 else if (c < 0x80)
2039 {
2040 *charbuf++ = c;
2041 char_offset++;
2042 }
2043 else if (c == 0x80)
2044 {
2045 ONE_MORE_BYTE (c);
2046 if (c < 0)
2047 goto invalid_code;
2048 if (c - 0xF2 >= COMPOSITION_RELATIVE
2049 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2050 DECODE_EMACS_MULE_21_COMPOSITION (c);
2051 else if (c < 0xC0)
2052 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2053 else if (c == 0xFF)
2054 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2055 else
2056 goto invalid_code;
2057 }
2058 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2059 {
2060 int nbytes, nchars;
2061 int id;
2062
2063 src = src_base;
2064 consumed_chars = consumed_chars_base;
2065 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2066 if (c < 0)
2067 {
2068 if (c == -2)
2069 break;
2070 goto invalid_code;
2071 }
2072 if (last_id != id)
2073 {
2074 if (last_id != charset_ascii)
2075 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2076 last_id = id;
2077 last_offset = char_offset;
2078 }
2079 *charbuf++ = c;
2080 src += nbytes;
2081 consumed_chars += nchars;
2082 char_offset++;
2083 }
2084 continue;
2085
2086 invalid_code:
2087 src = src_base;
2088 consumed_chars = consumed_chars_base;
2089 ONE_MORE_BYTE (c);
2090 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2091 char_offset++;
2092 coding->errors++;
2093 }
2094
2095 no_more_source:
2096 if (last_id != charset_ascii)
2097 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2098 coding->consumed_char += consumed_chars_base;
2099 coding->consumed = src_base - coding->source;
2100 coding->charbuf_used = charbuf - coding->charbuf;
2101 }
2102
2103
2104 #define EMACS_MULE_LEADING_CODES(id, codes) \
2105 do { \
2106 if (id < 0xA0) \
2107 codes[0] = id, codes[1] = 0; \
2108 else if (id < 0xE0) \
2109 codes[0] = 0x9A, codes[1] = id; \
2110 else if (id < 0xF0) \
2111 codes[0] = 0x9B, codes[1] = id; \
2112 else if (id < 0xF5) \
2113 codes[0] = 0x9C, codes[1] = id; \
2114 else \
2115 codes[0] = 0x9D, codes[1] = id; \
2116 } while (0);
2117
2118
2119 static int
2120 encode_coding_emacs_mule (coding)
2121 struct coding_system *coding;
2122 {
2123 int multibytep = coding->dst_multibyte;
2124 int *charbuf = coding->charbuf;
2125 int *charbuf_end = charbuf + coding->charbuf_used;
2126 unsigned char *dst = coding->destination + coding->produced;
2127 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2128 int safe_room = 8;
2129 int produced_chars = 0;
2130 Lisp_Object attrs, charset_list;
2131 int c;
2132 int preferred_charset_id = -1;
2133
2134 CODING_GET_INFO (coding, attrs, charset_list);
2135 if (! EQ (charset_list, Vemacs_mule_charset_list))
2136 {
2137 CODING_ATTR_CHARSET_LIST (attrs)
2138 = charset_list = Vemacs_mule_charset_list;
2139 }
2140
2141 while (charbuf < charbuf_end)
2142 {
2143 ASSURE_DESTINATION (safe_room);
2144 c = *charbuf++;
2145
2146 if (c < 0)
2147 {
2148 /* Handle an annotation. */
2149 switch (*charbuf)
2150 {
2151 case CODING_ANNOTATE_COMPOSITION_MASK:
2152 /* Not yet implemented. */
2153 break;
2154 case CODING_ANNOTATE_CHARSET_MASK:
2155 preferred_charset_id = charbuf[3];
2156 if (preferred_charset_id >= 0
2157 && NILP (Fmemq (make_number (preferred_charset_id),
2158 charset_list)))
2159 preferred_charset_id = -1;
2160 break;
2161 default:
2162 abort ();
2163 }
2164 charbuf += -c - 1;
2165 continue;
2166 }
2167
2168 if (ASCII_CHAR_P (c))
2169 EMIT_ONE_ASCII_BYTE (c);
2170 else if (CHAR_BYTE8_P (c))
2171 {
2172 c = CHAR_TO_BYTE8 (c);
2173 EMIT_ONE_BYTE (c);
2174 }
2175 else
2176 {
2177 struct charset *charset;
2178 unsigned code;
2179 int dimension;
2180 int emacs_mule_id;
2181 unsigned char leading_codes[2];
2182
2183 if (preferred_charset_id >= 0)
2184 {
2185 charset = CHARSET_FROM_ID (preferred_charset_id);
2186 if (! CHAR_CHARSET_P (c, charset))
2187 charset = char_charset (c, charset_list, NULL);
2188 }
2189 else
2190 charset = char_charset (c, charset_list, &code);
2191 if (! charset)
2192 {
2193 c = coding->default_char;
2194 if (ASCII_CHAR_P (c))
2195 {
2196 EMIT_ONE_ASCII_BYTE (c);
2197 continue;
2198 }
2199 charset = char_charset (c, charset_list, &code);
2200 }
2201 dimension = CHARSET_DIMENSION (charset);
2202 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2203 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2204 EMIT_ONE_BYTE (leading_codes[0]);
2205 if (leading_codes[1])
2206 EMIT_ONE_BYTE (leading_codes[1]);
2207 if (dimension == 1)
2208 EMIT_ONE_BYTE (code | 0x80);
2209 else
2210 {
2211 code |= 0x8080;
2212 EMIT_ONE_BYTE (code >> 8);
2213 EMIT_ONE_BYTE (code & 0xFF);
2214 }
2215 }
2216 }
2217 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2218 coding->produced_char += produced_chars;
2219 coding->produced = dst - coding->destination;
2220 return 0;
2221 }
2222
2223 \f
2224 /*** 7. ISO2022 handlers ***/
2225
2226 /* The following note describes the coding system ISO2022 briefly.
2227 Since the intention of this note is to help understand the
2228 functions in this file, some parts are NOT ACCURATE or are OVERLY
2229 SIMPLIFIED. For thorough understanding, please refer to the
2230 original document of ISO2022. This is equivalent to the standard
2231 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2232
2233 ISO2022 provides many mechanisms to encode several character sets
2234 in 7-bit and 8-bit environments. For 7-bit environments, all text
2235 is encoded using bytes less than 128. This may make the encoded
2236 text a little bit longer, but the text passes more easily through
2237 several types of gateway, some of which strip off the MSB (Most
2238 Significant Bit).
2239
2240 There are two kinds of character sets: control character sets and
2241 graphic character sets. The former contain control characters such
2242 as `newline' and `escape' to provide control functions (control
2243 functions are also provided by escape sequences). The latter
2244 contain graphic characters such as 'A' and '-'. Emacs recognizes
2245 two control character sets and many graphic character sets.
2246
2247 Graphic character sets are classified into one of the following
2248 four classes, according to the number of bytes (DIMENSION) and
2249 number of characters in one dimension (CHARS) of the set:
2250 - DIMENSION1_CHARS94
2251 - DIMENSION1_CHARS96
2252 - DIMENSION2_CHARS94
2253 - DIMENSION2_CHARS96
2254
2255 In addition, each character set is assigned an identification tag,
2256 unique for each set, called the "final character" (denoted as <F>
2257 hereafter). The <F> of each character set is decided by ECMA(*)
2258 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2259 (0x30..0x3F are for private use only).
2260
2261 Note (*): ECMA = European Computer Manufacturers Association
2262
2263 Here are examples of graphic character sets [NAME(<F>)]:
2264 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2265 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2266 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2267 o DIMENSION2_CHARS96 -- none for the moment
2268
2269 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2270 C0 [0x00..0x1F] -- control character plane 0
2271 GL [0x20..0x7F] -- graphic character plane 0
2272 C1 [0x80..0x9F] -- control character plane 1
2273 GR [0xA0..0xFF] -- graphic character plane 1
2274
2275 A control character set is directly designated and invoked to C0 or
2276 C1 by an escape sequence. The most common case is that:
2277 - ISO646's control character set is designated/invoked to C0, and
2278 - ISO6429's control character set is designated/invoked to C1,
2279 and usually these designations/invocations are omitted in encoded
2280 text. In a 7-bit environment, only C0 can be used, and a control
2281 character for C1 is encoded by an appropriate escape sequence to
2282 fit into the environment. All control characters for C1 are
2283 defined to have corresponding escape sequences.
2284
2285 A graphic character set is at first designated to one of four
2286 graphic registers (G0 through G3), then these graphic registers are
2287 invoked to GL or GR. These designations and invocations can be
2288 done independently. The most common case is that G0 is invoked to
2289 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2290 these invocations and designations are omitted in encoded text.
2291 In a 7-bit environment, only GL can be used.
2292
2293 When a graphic character set of CHARS94 is invoked to GL, codes
2294 0x20 and 0x7F of the GL area work as control characters SPACE and
2295 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2296 be used.
2297
2298 There are two ways of invocation: locking-shift and single-shift.
2299 With locking-shift, the invocation lasts until the next different
2300 invocation, whereas with single-shift, the invocation affects the
2301 following character only and doesn't affect the locking-shift
2302 state. Invocations are done by the following control characters or
2303 escape sequences:
2304
2305 ----------------------------------------------------------------------
2306 abbrev function cntrl escape seq description
2307 ----------------------------------------------------------------------
2308 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2309 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2310 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2311 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2312 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2313 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2314 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2315 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2316 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2317 ----------------------------------------------------------------------
2318 (*) These are not used by any known coding system.
2319
2320 Control characters for these functions are defined by macros
2321 ISO_CODE_XXX in `coding.h'.
2322
2323 Designations are done by the following escape sequences:
2324 ----------------------------------------------------------------------
2325 escape sequence description
2326 ----------------------------------------------------------------------
2327 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2328 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2329 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2330 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2331 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2332 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2333 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2334 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2335 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2336 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2337 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2338 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2339 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2340 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2341 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2342 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2343 ----------------------------------------------------------------------
2344
2345 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2346 of dimension 1, chars 94, and final character <F>, etc...
2347
2348 Note (*): Although these designations are not allowed in ISO2022,
2349 Emacs accepts them on decoding, and produces them on encoding
2350 CHARS96 character sets in a coding system which is characterized as
2351 7-bit environment, non-locking-shift, and non-single-shift.
2352
2353 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2354 '(' must be omitted. We refer to this as "short-form" hereafter.
2355
2356 Now you may notice that there are a lot of ways of encoding the
2357 same multilingual text in ISO2022. Actually, there exist many
2358 coding systems such as Compound Text (used in X11's inter client
2359 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2360 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2361 localized platforms), and all of these are variants of ISO2022.
2362
2363 In addition to the above, Emacs handles two more kinds of escape
2364 sequences: ISO6429's direction specification and Emacs' private
2365 sequence for specifying character composition.
2366
2367 ISO6429's direction specification takes the following form:
2368 o CSI ']' -- end of the current direction
2369 o CSI '0' ']' -- end of the current direction
2370 o CSI '1' ']' -- start of left-to-right text
2371 o CSI '2' ']' -- start of right-to-left text
2372 The control character CSI (0x9B: control sequence introducer) is
2373 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2374
2375 Character composition specification takes the following form:
2376 o ESC '0' -- start relative composition
2377 o ESC '1' -- end composition
2378 o ESC '2' -- start rule-base composition (*)
2379 o ESC '3' -- start relative composition with alternate chars (**)
2380 o ESC '4' -- start rule-base composition with alternate chars (**)
2381 Since these are not standard escape sequences of any ISO standard,
2382 the use of them with these meanings is restricted to Emacs only.
2383
2384 (*) This form is used only in Emacs 20.7 and older versions,
2385 but newer versions can safely decode it.
2386 (**) This form is used only in Emacs 21.1 and newer versions,
2387 and older versions can't decode it.
2388
2389 Here's a list of example usages of these composition escape
2390 sequences (categorized by `enum composition_method').
2391
2392 COMPOSITION_RELATIVE:
2393 ESC 0 CHAR [ CHAR ] ESC 1
2394 COMPOSITION_WITH_RULE:
2395 ESC 2 CHAR [ RULE CHAR ] ESC 1
2396 COMPOSITION_WITH_ALTCHARS:
2397 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2398 COMPOSITION_WITH_RULE_ALTCHARS:
2399 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2400
2401 enum iso_code_class_type iso_code_class[256];
2402
2403 #define SAFE_CHARSET_P(coding, id) \
2404 ((id) <= (coding)->max_charset_id \
2405 && (coding)->safe_charsets[id] >= 0)
2406
2407
2408 #define SHIFT_OUT_OK(category) \
2409 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2410
2411 static void
2412 setup_iso_safe_charsets (attrs)
2413 Lisp_Object attrs;
2414 {
2415 Lisp_Object charset_list, safe_charsets;
2416 Lisp_Object request;
2417 Lisp_Object reg_usage;
2418 Lisp_Object tail;
2419 int reg94, reg96;
2420 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2421 int max_charset_id;
2422
2423 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2424 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2425 && ! EQ (charset_list, Viso_2022_charset_list))
2426 {
2427 CODING_ATTR_CHARSET_LIST (attrs)
2428 = charset_list = Viso_2022_charset_list;
2429 ASET (attrs, coding_attr_safe_charsets, Qnil);
2430 }
2431
2432 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2433 return;
2434
2435 max_charset_id = 0;
2436 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2437 {
2438 int id = XINT (XCAR (tail));
2439 if (max_charset_id < id)
2440 max_charset_id = id;
2441 }
2442
2443 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2444 make_number (255));
2445 request = AREF (attrs, coding_attr_iso_request);
2446 reg_usage = AREF (attrs, coding_attr_iso_usage);
2447 reg94 = XINT (XCAR (reg_usage));
2448 reg96 = XINT (XCDR (reg_usage));
2449
2450 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2451 {
2452 Lisp_Object id;
2453 Lisp_Object reg;
2454 struct charset *charset;
2455
2456 id = XCAR (tail);
2457 charset = CHARSET_FROM_ID (XINT (id));
2458 reg = Fcdr (Fassq (id, request));
2459 if (! NILP (reg))
2460 SSET (safe_charsets, XINT (id), XINT (reg));
2461 else if (charset->iso_chars_96)
2462 {
2463 if (reg96 < 4)
2464 SSET (safe_charsets, XINT (id), reg96);
2465 }
2466 else
2467 {
2468 if (reg94 < 4)
2469 SSET (safe_charsets, XINT (id), reg94);
2470 }
2471 }
2472 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2473 }
2474
2475
2476 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2477 Check if a text is encoded in one of ISO-2022 based codig systems.
2478 If it is, return 1, else return 0. */
2479
2480 static int
2481 detect_coding_iso_2022 (coding, detect_info)
2482 struct coding_system *coding;
2483 struct coding_detection_info *detect_info;
2484 {
2485 const unsigned char *src = coding->source, *src_base = src;
2486 const unsigned char *src_end = coding->source + coding->src_bytes;
2487 int multibytep = coding->src_multibyte;
2488 int single_shifting = 0;
2489 int id;
2490 int c, c1;
2491 int consumed_chars = 0;
2492 int i;
2493 int rejected = 0;
2494 int found = 0;
2495
2496 detect_info->checked |= CATEGORY_MASK_ISO;
2497
2498 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2499 {
2500 struct coding_system *this = &(coding_categories[i]);
2501 Lisp_Object attrs, val;
2502
2503 attrs = CODING_ID_ATTRS (this->id);
2504 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2505 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2506 setup_iso_safe_charsets (attrs);
2507 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2508 this->max_charset_id = SCHARS (val) - 1;
2509 this->safe_charsets = (char *) SDATA (val);
2510 }
2511
2512 /* A coding system of this category is always ASCII compatible. */
2513 src += coding->head_ascii;
2514
2515 while (rejected != CATEGORY_MASK_ISO)
2516 {
2517 src_base = src;
2518 ONE_MORE_BYTE (c);
2519 switch (c)
2520 {
2521 case ISO_CODE_ESC:
2522 if (inhibit_iso_escape_detection)
2523 break;
2524 single_shifting = 0;
2525 ONE_MORE_BYTE (c);
2526 if (c >= '(' && c <= '/')
2527 {
2528 /* Designation sequence for a charset of dimension 1. */
2529 ONE_MORE_BYTE (c1);
2530 if (c1 < ' ' || c1 >= 0x80
2531 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2532 /* Invalid designation sequence. Just ignore. */
2533 break;
2534 }
2535 else if (c == '$')
2536 {
2537 /* Designation sequence for a charset of dimension 2. */
2538 ONE_MORE_BYTE (c);
2539 if (c >= '@' && c <= 'B')
2540 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2541 id = iso_charset_table[1][0][c];
2542 else if (c >= '(' && c <= '/')
2543 {
2544 ONE_MORE_BYTE (c1);
2545 if (c1 < ' ' || c1 >= 0x80
2546 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2547 /* Invalid designation sequence. Just ignore. */
2548 break;
2549 }
2550 else
2551 /* Invalid designation sequence. Just ignore it. */
2552 break;
2553 }
2554 else if (c == 'N' || c == 'O')
2555 {
2556 /* ESC <Fe> for SS2 or SS3. */
2557 single_shifting = 1;
2558 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2559 break;
2560 }
2561 else if (c >= '0' && c <= '4')
2562 {
2563 /* ESC <Fp> for start/end composition. */
2564 found |= CATEGORY_MASK_ISO;
2565 break;
2566 }
2567 else
2568 {
2569 /* Invalid escape sequence. Just ignore it. */
2570 break;
2571 }
2572
2573 /* We found a valid designation sequence for CHARSET. */
2574 rejected |= CATEGORY_MASK_ISO_8BIT;
2575 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2576 id))
2577 found |= CATEGORY_MASK_ISO_7;
2578 else
2579 rejected |= CATEGORY_MASK_ISO_7;
2580 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2581 id))
2582 found |= CATEGORY_MASK_ISO_7_TIGHT;
2583 else
2584 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2585 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2586 id))
2587 found |= CATEGORY_MASK_ISO_7_ELSE;
2588 else
2589 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2590 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2591 id))
2592 found |= CATEGORY_MASK_ISO_8_ELSE;
2593 else
2594 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2595 break;
2596
2597 case ISO_CODE_SO:
2598 case ISO_CODE_SI:
2599 /* Locking shift out/in. */
2600 if (inhibit_iso_escape_detection)
2601 break;
2602 single_shifting = 0;
2603 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2604 found |= CATEGORY_MASK_ISO_ELSE;
2605 break;
2606
2607 case ISO_CODE_CSI:
2608 /* Control sequence introducer. */
2609 single_shifting = 0;
2610 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2611 found |= CATEGORY_MASK_ISO_8_ELSE;
2612 goto check_extra_latin;
2613
2614 case ISO_CODE_SS2:
2615 case ISO_CODE_SS3:
2616 /* Single shift. */
2617 if (inhibit_iso_escape_detection)
2618 break;
2619 single_shifting = 0;
2620 rejected |= CATEGORY_MASK_ISO_7BIT;
2621 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2622 & CODING_ISO_FLAG_SINGLE_SHIFT)
2623 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2624 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2625 & CODING_ISO_FLAG_SINGLE_SHIFT)
2626 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2627 if (single_shifting)
2628 break;
2629 goto check_extra_latin;
2630
2631 default:
2632 if (c < 0)
2633 continue;
2634 if (c < 0x80)
2635 {
2636 single_shifting = 0;
2637 break;
2638 }
2639 if (c >= 0xA0)
2640 {
2641 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2642 found |= CATEGORY_MASK_ISO_8_1;
2643 /* Check the length of succeeding codes of the range
2644 0xA0..0FF. If the byte length is even, we include
2645 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2646 only when we are not single shifting. */
2647 if (! single_shifting
2648 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2649 {
2650 int i = 1;
2651 while (src < src_end)
2652 {
2653 ONE_MORE_BYTE (c);
2654 if (c < 0xA0)
2655 break;
2656 i++;
2657 }
2658
2659 if (i & 1 && src < src_end)
2660 rejected |= CATEGORY_MASK_ISO_8_2;
2661 else
2662 found |= CATEGORY_MASK_ISO_8_2;
2663 }
2664 break;
2665 }
2666 check_extra_latin:
2667 single_shifting = 0;
2668 if (! VECTORP (Vlatin_extra_code_table)
2669 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2670 {
2671 rejected = CATEGORY_MASK_ISO;
2672 break;
2673 }
2674 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2675 & CODING_ISO_FLAG_LATIN_EXTRA)
2676 found |= CATEGORY_MASK_ISO_8_1;
2677 else
2678 rejected |= CATEGORY_MASK_ISO_8_1;
2679 rejected |= CATEGORY_MASK_ISO_8_2;
2680 }
2681 }
2682 detect_info->rejected |= CATEGORY_MASK_ISO;
2683 return 0;
2684
2685 no_more_source:
2686 detect_info->rejected |= rejected;
2687 detect_info->found |= (found & ~rejected);
2688 return 1;
2689 }
2690
2691
2692 /* Set designation state into CODING. */
2693 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2694 do { \
2695 int id, prev; \
2696 \
2697 if (final < '0' || final >= 128 \
2698 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2699 || !SAFE_CHARSET_P (coding, id)) \
2700 { \
2701 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2702 goto invalid_code; \
2703 } \
2704 prev = CODING_ISO_DESIGNATION (coding, reg); \
2705 if (id == charset_jisx0201_roman) \
2706 { \
2707 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2708 id = charset_ascii; \
2709 } \
2710 else if (id == charset_jisx0208_1978) \
2711 { \
2712 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2713 id = charset_jisx0208; \
2714 } \
2715 CODING_ISO_DESIGNATION (coding, reg) = id; \
2716 /* If there was an invalid designation to REG previously, and this \
2717 designation is ASCII to REG, we should keep this designation \
2718 sequence. */ \
2719 if (prev == -2 && id == charset_ascii) \
2720 goto invalid_code; \
2721 } while (0)
2722
2723
2724 #define MAYBE_FINISH_COMPOSITION() \
2725 do { \
2726 int i; \
2727 if (composition_state == COMPOSING_NO) \
2728 break; \
2729 /* It is assured that we have enough room for producing \
2730 characters stored in the table `components'. */ \
2731 if (charbuf + component_idx > charbuf_end) \
2732 goto no_more_source; \
2733 composition_state = COMPOSING_NO; \
2734 if (method == COMPOSITION_RELATIVE \
2735 || method == COMPOSITION_WITH_ALTCHARS) \
2736 { \
2737 for (i = 0; i < component_idx; i++) \
2738 *charbuf++ = components[i]; \
2739 char_offset += component_idx; \
2740 } \
2741 else \
2742 { \
2743 for (i = 0; i < component_idx; i += 2) \
2744 *charbuf++ = components[i]; \
2745 char_offset += (component_idx / 2) + 1; \
2746 } \
2747 } while (0)
2748
2749
2750 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2751 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2752 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2753 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2754 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2755 */
2756
2757 #define DECODE_COMPOSITION_START(c1) \
2758 do { \
2759 if (c1 == '0' \
2760 && composition_state == COMPOSING_COMPONENT_RULE) \
2761 { \
2762 component_len = component_idx; \
2763 composition_state = COMPOSING_CHAR; \
2764 } \
2765 else \
2766 { \
2767 const unsigned char *p; \
2768 \
2769 MAYBE_FINISH_COMPOSITION (); \
2770 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2771 goto no_more_source; \
2772 for (p = src; p < src_end - 1; p++) \
2773 if (*p == ISO_CODE_ESC && p[1] == '1') \
2774 break; \
2775 if (p == src_end - 1) \
2776 { \
2777 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2778 goto invalid_code; \
2779 goto no_more_source; \
2780 } \
2781 \
2782 /* This is surely the start of a composition. */ \
2783 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2784 : c1 == '2' ? COMPOSITION_WITH_RULE \
2785 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2786 : COMPOSITION_WITH_RULE_ALTCHARS); \
2787 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2788 : COMPOSING_COMPONENT_CHAR); \
2789 component_idx = component_len = 0; \
2790 } \
2791 } while (0)
2792
2793
2794 /* Handle compositoin end sequence ESC 1. */
2795
2796 #define DECODE_COMPOSITION_END() \
2797 do { \
2798 int nchars = (component_len > 0 ? component_idx - component_len \
2799 : method == COMPOSITION_RELATIVE ? component_idx \
2800 : (component_idx + 1) / 2); \
2801 int i; \
2802 int *saved_charbuf = charbuf; \
2803 \
2804 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2805 if (method != COMPOSITION_RELATIVE) \
2806 { \
2807 if (component_len == 0) \
2808 for (i = 0; i < component_idx; i++) \
2809 *charbuf++ = components[i]; \
2810 else \
2811 for (i = 0; i < component_len; i++) \
2812 *charbuf++ = components[i]; \
2813 *saved_charbuf = saved_charbuf - charbuf; \
2814 } \
2815 if (method == COMPOSITION_WITH_RULE) \
2816 for (i = 0; i < component_idx; i += 2, char_offset++) \
2817 *charbuf++ = components[i]; \
2818 else \
2819 for (i = component_len; i < component_idx; i++, char_offset++) \
2820 *charbuf++ = components[i]; \
2821 coding->annotated = 1; \
2822 composition_state = COMPOSING_NO; \
2823 } while (0)
2824
2825
2826 /* Decode a composition rule from the byte C1 (and maybe one more byte
2827 from SRC) and store one encoded composition rule in
2828 coding->cmp_data. */
2829
2830 #define DECODE_COMPOSITION_RULE(c1) \
2831 do { \
2832 (c1) -= 32; \
2833 if (c1 < 81) /* old format (before ver.21) */ \
2834 { \
2835 int gref = (c1) / 9; \
2836 int nref = (c1) % 9; \
2837 if (gref == 4) gref = 10; \
2838 if (nref == 4) nref = 10; \
2839 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2840 } \
2841 else if (c1 < 93) /* new format (after ver.21) */ \
2842 { \
2843 ONE_MORE_BYTE (c2); \
2844 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2845 } \
2846 else \
2847 c1 = 0; \
2848 } while (0)
2849
2850
2851 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2852
2853 static void
2854 decode_coding_iso_2022 (coding)
2855 struct coding_system *coding;
2856 {
2857 const unsigned char *src = coding->source + coding->consumed;
2858 const unsigned char *src_end = coding->source + coding->src_bytes;
2859 const unsigned char *src_base;
2860 int *charbuf = coding->charbuf + coding->charbuf_used;
2861 int *charbuf_end
2862 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2863 int consumed_chars = 0, consumed_chars_base;
2864 int multibytep = coding->src_multibyte;
2865 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2866 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2867 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2868 struct charset *charset;
2869 int c;
2870 /* For handling composition sequence. */
2871 #define COMPOSING_NO 0
2872 #define COMPOSING_CHAR 1
2873 #define COMPOSING_RULE 2
2874 #define COMPOSING_COMPONENT_CHAR 3
2875 #define COMPOSING_COMPONENT_RULE 4
2876
2877 int composition_state = COMPOSING_NO;
2878 enum composition_method method;
2879 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2880 int component_idx;
2881 int component_len;
2882 Lisp_Object attrs, charset_list;
2883 int char_offset = coding->produced_char;
2884 int last_offset = char_offset;
2885 int last_id = charset_ascii;
2886
2887 CODING_GET_INFO (coding, attrs, charset_list);
2888 setup_iso_safe_charsets (attrs);
2889
2890 while (1)
2891 {
2892 int c1, c2;
2893
2894 src_base = src;
2895 consumed_chars_base = consumed_chars;
2896
2897 if (charbuf >= charbuf_end)
2898 break;
2899
2900 ONE_MORE_BYTE (c1);
2901 if (c1 < 0)
2902 goto invalid_code;
2903
2904 /* We produce at most one character. */
2905 switch (iso_code_class [c1])
2906 {
2907 case ISO_0x20_or_0x7F:
2908 if (composition_state != COMPOSING_NO)
2909 {
2910 if (composition_state == COMPOSING_RULE
2911 || composition_state == COMPOSING_COMPONENT_RULE)
2912 {
2913 DECODE_COMPOSITION_RULE (c1);
2914 components[component_idx++] = c1;
2915 composition_state--;
2916 continue;
2917 }
2918 }
2919 if (charset_id_0 < 0
2920 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2921 /* This is SPACE or DEL. */
2922 charset = CHARSET_FROM_ID (charset_ascii);
2923 else
2924 charset = CHARSET_FROM_ID (charset_id_0);
2925 break;
2926
2927 case ISO_graphic_plane_0:
2928 if (composition_state != COMPOSING_NO)
2929 {
2930 if (composition_state == COMPOSING_RULE
2931 || composition_state == COMPOSING_COMPONENT_RULE)
2932 {
2933 DECODE_COMPOSITION_RULE (c1);
2934 components[component_idx++] = c1;
2935 composition_state--;
2936 continue;
2937 }
2938 }
2939 charset = CHARSET_FROM_ID (charset_id_0);
2940 break;
2941
2942 case ISO_0xA0_or_0xFF:
2943 if (charset_id_1 < 0
2944 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2945 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2946 goto invalid_code;
2947 /* This is a graphic character, we fall down ... */
2948
2949 case ISO_graphic_plane_1:
2950 if (charset_id_1 < 0)
2951 goto invalid_code;
2952 charset = CHARSET_FROM_ID (charset_id_1);
2953 break;
2954
2955 case ISO_control_0:
2956 MAYBE_FINISH_COMPOSITION ();
2957 charset = CHARSET_FROM_ID (charset_ascii);
2958 break;
2959
2960 case ISO_control_1:
2961 MAYBE_FINISH_COMPOSITION ();
2962 goto invalid_code;
2963
2964 case ISO_shift_out:
2965 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2966 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2967 goto invalid_code;
2968 CODING_ISO_INVOCATION (coding, 0) = 1;
2969 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2970 continue;
2971
2972 case ISO_shift_in:
2973 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2974 goto invalid_code;
2975 CODING_ISO_INVOCATION (coding, 0) = 0;
2976 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2977 continue;
2978
2979 case ISO_single_shift_2_7:
2980 case ISO_single_shift_2:
2981 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2982 goto invalid_code;
2983 /* SS2 is handled as an escape sequence of ESC 'N' */
2984 c1 = 'N';
2985 goto label_escape_sequence;
2986
2987 case ISO_single_shift_3:
2988 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2989 goto invalid_code;
2990 /* SS2 is handled as an escape sequence of ESC 'O' */
2991 c1 = 'O';
2992 goto label_escape_sequence;
2993
2994 case ISO_control_sequence_introducer:
2995 /* CSI is handled as an escape sequence of ESC '[' ... */
2996 c1 = '[';
2997 goto label_escape_sequence;
2998
2999 case ISO_escape:
3000 ONE_MORE_BYTE (c1);
3001 label_escape_sequence:
3002 /* Escape sequences handled here are invocation,
3003 designation, direction specification, and character
3004 composition specification. */
3005 switch (c1)
3006 {
3007 case '&': /* revision of following character set */
3008 ONE_MORE_BYTE (c1);
3009 if (!(c1 >= '@' && c1 <= '~'))
3010 goto invalid_code;
3011 ONE_MORE_BYTE (c1);
3012 if (c1 != ISO_CODE_ESC)
3013 goto invalid_code;
3014 ONE_MORE_BYTE (c1);
3015 goto label_escape_sequence;
3016
3017 case '$': /* designation of 2-byte character set */
3018 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3019 goto invalid_code;
3020 ONE_MORE_BYTE (c1);
3021 if (c1 >= '@' && c1 <= 'B')
3022 { /* designation of JISX0208.1978, GB2312.1980,
3023 or JISX0208.1980 */
3024 DECODE_DESIGNATION (0, 2, 0, c1);
3025 }
3026 else if (c1 >= 0x28 && c1 <= 0x2B)
3027 { /* designation of DIMENSION2_CHARS94 character set */
3028 ONE_MORE_BYTE (c2);
3029 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3030 }
3031 else if (c1 >= 0x2C && c1 <= 0x2F)
3032 { /* designation of DIMENSION2_CHARS96 character set */
3033 ONE_MORE_BYTE (c2);
3034 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3035 }
3036 else
3037 goto invalid_code;
3038 /* We must update these variables now. */
3039 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3040 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3041 continue;
3042
3043 case 'n': /* invocation of locking-shift-2 */
3044 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3045 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3046 goto invalid_code;
3047 CODING_ISO_INVOCATION (coding, 0) = 2;
3048 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3049 continue;
3050
3051 case 'o': /* invocation of locking-shift-3 */
3052 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3053 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3054 goto invalid_code;
3055 CODING_ISO_INVOCATION (coding, 0) = 3;
3056 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3057 continue;
3058
3059 case 'N': /* invocation of single-shift-2 */
3060 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3061 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3062 goto invalid_code;
3063 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3064 ONE_MORE_BYTE (c1);
3065 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3066 goto invalid_code;
3067 break;
3068
3069 case 'O': /* invocation of single-shift-3 */
3070 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3071 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3072 goto invalid_code;
3073 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3074 ONE_MORE_BYTE (c1);
3075 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3076 goto invalid_code;
3077 break;
3078
3079 case '0': case '2': case '3': case '4': /* start composition */
3080 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3081 goto invalid_code;
3082 DECODE_COMPOSITION_START (c1);
3083 continue;
3084
3085 case '1': /* end composition */
3086 if (composition_state == COMPOSING_NO)
3087 goto invalid_code;
3088 DECODE_COMPOSITION_END ();
3089 continue;
3090
3091 case '[': /* specification of direction */
3092 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3093 goto invalid_code;
3094 /* For the moment, nested direction is not supported.
3095 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3096 left-to-right, and nozero means right-to-left. */
3097 ONE_MORE_BYTE (c1);
3098 switch (c1)
3099 {
3100 case ']': /* end of the current direction */
3101 coding->mode &= ~CODING_MODE_DIRECTION;
3102
3103 case '0': /* end of the current direction */
3104 case '1': /* start of left-to-right direction */
3105 ONE_MORE_BYTE (c1);
3106 if (c1 == ']')
3107 coding->mode &= ~CODING_MODE_DIRECTION;
3108 else
3109 goto invalid_code;
3110 break;
3111
3112 case '2': /* start of right-to-left direction */
3113 ONE_MORE_BYTE (c1);
3114 if (c1 == ']')
3115 coding->mode |= CODING_MODE_DIRECTION;
3116 else
3117 goto invalid_code;
3118 break;
3119
3120 default:
3121 goto invalid_code;
3122 }
3123 continue;
3124
3125 case '%':
3126 ONE_MORE_BYTE (c1);
3127 if (c1 == '/')
3128 {
3129 /* CTEXT extended segment:
3130 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3131 We keep these bytes as is for the moment.
3132 They may be decoded by post-read-conversion. */
3133 int dim, M, L;
3134 int size;
3135
3136 ONE_MORE_BYTE (dim);
3137 ONE_MORE_BYTE (M);
3138 ONE_MORE_BYTE (L);
3139 size = ((M - 128) * 128) + (L - 128);
3140 if (charbuf + 8 + size > charbuf_end)
3141 goto break_loop;
3142 *charbuf++ = ISO_CODE_ESC;
3143 *charbuf++ = '%';
3144 *charbuf++ = '/';
3145 *charbuf++ = dim;
3146 *charbuf++ = BYTE8_TO_CHAR (M);
3147 *charbuf++ = BYTE8_TO_CHAR (L);
3148 while (size-- > 0)
3149 {
3150 ONE_MORE_BYTE (c1);
3151 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3152 }
3153 }
3154 else if (c1 == 'G')
3155 {
3156 /* XFree86 extension for embedding UTF-8 in CTEXT:
3157 ESC % G --UTF-8-BYTES-- ESC % @
3158 We keep these bytes as is for the moment.
3159 They may be decoded by post-read-conversion. */
3160 int *p = charbuf;
3161
3162 if (p + 6 > charbuf_end)
3163 goto break_loop;
3164 *p++ = ISO_CODE_ESC;
3165 *p++ = '%';
3166 *p++ = 'G';
3167 while (p < charbuf_end)
3168 {
3169 ONE_MORE_BYTE (c1);
3170 if (c1 == ISO_CODE_ESC
3171 && src + 1 < src_end
3172 && src[0] == '%'
3173 && src[1] == '@')
3174 break;
3175 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3176 }
3177 if (p + 3 > charbuf_end)
3178 goto break_loop;
3179 *p++ = ISO_CODE_ESC;
3180 *p++ = '%';
3181 *p++ = '@';
3182 charbuf = p;
3183 }
3184 else
3185 goto invalid_code;
3186 continue;
3187 break;
3188
3189 default:
3190 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3191 goto invalid_code;
3192 if (c1 >= 0x28 && c1 <= 0x2B)
3193 { /* designation of DIMENSION1_CHARS94 character set */
3194 ONE_MORE_BYTE (c2);
3195 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3196 }
3197 else if (c1 >= 0x2C && c1 <= 0x2F)
3198 { /* designation of DIMENSION1_CHARS96 character set */
3199 ONE_MORE_BYTE (c2);
3200 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3201 }
3202 else
3203 goto invalid_code;
3204 /* We must update these variables now. */
3205 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3206 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3207 continue;
3208 }
3209 }
3210
3211 if (charset->id != charset_ascii
3212 && last_id != charset->id)
3213 {
3214 if (last_id != charset_ascii)
3215 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3216 last_id = charset->id;
3217 last_offset = char_offset;
3218 }
3219
3220 /* Now we know CHARSET and 1st position code C1 of a character.
3221 Produce a decoded character while getting 2nd position code
3222 C2 if necessary. */
3223 c1 &= 0x7F;
3224 if (CHARSET_DIMENSION (charset) > 1)
3225 {
3226 ONE_MORE_BYTE (c2);
3227 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3228 /* C2 is not in a valid range. */
3229 goto invalid_code;
3230 c1 = (c1 << 8) | (c2 & 0x7F);
3231 if (CHARSET_DIMENSION (charset) > 2)
3232 {
3233 ONE_MORE_BYTE (c2);
3234 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3235 /* C2 is not in a valid range. */
3236 goto invalid_code;
3237 c1 = (c1 << 8) | (c2 & 0x7F);
3238 }
3239 }
3240
3241 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3242 if (c < 0)
3243 {
3244 MAYBE_FINISH_COMPOSITION ();
3245 for (; src_base < src; src_base++, char_offset++)
3246 {
3247 if (ASCII_BYTE_P (*src_base))
3248 *charbuf++ = *src_base;
3249 else
3250 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3251 }
3252 }
3253 else if (composition_state == COMPOSING_NO)
3254 {
3255 *charbuf++ = c;
3256 char_offset++;
3257 }
3258 else
3259 {
3260 components[component_idx++] = c;
3261 if (method == COMPOSITION_WITH_RULE
3262 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3263 && composition_state == COMPOSING_COMPONENT_CHAR))
3264 composition_state++;
3265 }
3266 continue;
3267
3268 invalid_code:
3269 MAYBE_FINISH_COMPOSITION ();
3270 src = src_base;
3271 consumed_chars = consumed_chars_base;
3272 ONE_MORE_BYTE (c);
3273 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3274 char_offset++;
3275 coding->errors++;
3276 continue;
3277
3278 break_loop:
3279 break;
3280 }
3281
3282 no_more_source:
3283 if (last_id != charset_ascii)
3284 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3285 coding->consumed_char += consumed_chars_base;
3286 coding->consumed = src_base - coding->source;
3287 coding->charbuf_used = charbuf - coding->charbuf;
3288 }
3289
3290
3291 /* ISO2022 encoding stuff. */
3292
3293 /*
3294 It is not enough to say just "ISO2022" on encoding, we have to
3295 specify more details. In Emacs, each coding system of ISO2022
3296 variant has the following specifications:
3297 1. Initial designation to G0 thru G3.
3298 2. Allows short-form designation?
3299 3. ASCII should be designated to G0 before control characters?
3300 4. ASCII should be designated to G0 at end of line?
3301 5. 7-bit environment or 8-bit environment?
3302 6. Use locking-shift?
3303 7. Use Single-shift?
3304 And the following two are only for Japanese:
3305 8. Use ASCII in place of JIS0201-1976-Roman?
3306 9. Use JISX0208-1983 in place of JISX0208-1978?
3307 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3308 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3309 details.
3310 */
3311
3312 /* Produce codes (escape sequence) for designating CHARSET to graphic
3313 register REG at DST, and increment DST. If <final-char> of CHARSET is
3314 '@', 'A', or 'B' and the coding system CODING allows, produce
3315 designation sequence of short-form. */
3316
3317 #define ENCODE_DESIGNATION(charset, reg, coding) \
3318 do { \
3319 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3320 char *intermediate_char_94 = "()*+"; \
3321 char *intermediate_char_96 = ",-./"; \
3322 int revision = -1; \
3323 int c; \
3324 \
3325 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3326 revision = CHARSET_ISO_REVISION (charset); \
3327 \
3328 if (revision >= 0) \
3329 { \
3330 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3331 EMIT_ONE_BYTE ('@' + revision); \
3332 } \
3333 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3334 if (CHARSET_DIMENSION (charset) == 1) \
3335 { \
3336 if (! CHARSET_ISO_CHARS_96 (charset)) \
3337 c = intermediate_char_94[reg]; \
3338 else \
3339 c = intermediate_char_96[reg]; \
3340 EMIT_ONE_ASCII_BYTE (c); \
3341 } \
3342 else \
3343 { \
3344 EMIT_ONE_ASCII_BYTE ('$'); \
3345 if (! CHARSET_ISO_CHARS_96 (charset)) \
3346 { \
3347 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3348 || reg != 0 \
3349 || final_char < '@' || final_char > 'B') \
3350 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3351 } \
3352 else \
3353 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3354 } \
3355 EMIT_ONE_ASCII_BYTE (final_char); \
3356 \
3357 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3358 } while (0)
3359
3360
3361 /* The following two macros produce codes (control character or escape
3362 sequence) for ISO2022 single-shift functions (single-shift-2 and
3363 single-shift-3). */
3364
3365 #define ENCODE_SINGLE_SHIFT_2 \
3366 do { \
3367 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3368 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3369 else \
3370 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3371 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3372 } while (0)
3373
3374
3375 #define ENCODE_SINGLE_SHIFT_3 \
3376 do { \
3377 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3378 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3379 else \
3380 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3381 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3382 } while (0)
3383
3384
3385 /* The following four macros produce codes (control character or
3386 escape sequence) for ISO2022 locking-shift functions (shift-in,
3387 shift-out, locking-shift-2, and locking-shift-3). */
3388
3389 #define ENCODE_SHIFT_IN \
3390 do { \
3391 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3392 CODING_ISO_INVOCATION (coding, 0) = 0; \
3393 } while (0)
3394
3395
3396 #define ENCODE_SHIFT_OUT \
3397 do { \
3398 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3399 CODING_ISO_INVOCATION (coding, 0) = 1; \
3400 } while (0)
3401
3402
3403 #define ENCODE_LOCKING_SHIFT_2 \
3404 do { \
3405 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3406 CODING_ISO_INVOCATION (coding, 0) = 2; \
3407 } while (0)
3408
3409
3410 #define ENCODE_LOCKING_SHIFT_3 \
3411 do { \
3412 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3413 CODING_ISO_INVOCATION (coding, 0) = 3; \
3414 } while (0)
3415
3416
3417 /* Produce codes for a DIMENSION1 character whose character set is
3418 CHARSET and whose position-code is C1. Designation and invocation
3419 sequences are also produced in advance if necessary. */
3420
3421 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3422 do { \
3423 int id = CHARSET_ID (charset); \
3424 \
3425 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3426 && id == charset_ascii) \
3427 { \
3428 id = charset_jisx0201_roman; \
3429 charset = CHARSET_FROM_ID (id); \
3430 } \
3431 \
3432 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3433 { \
3434 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3435 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3436 else \
3437 EMIT_ONE_BYTE (c1 | 0x80); \
3438 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3439 break; \
3440 } \
3441 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3442 { \
3443 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3444 break; \
3445 } \
3446 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3447 { \
3448 EMIT_ONE_BYTE (c1 | 0x80); \
3449 break; \
3450 } \
3451 else \
3452 /* Since CHARSET is not yet invoked to any graphic planes, we \
3453 must invoke it, or, at first, designate it to some graphic \
3454 register. Then repeat the loop to actually produce the \
3455 character. */ \
3456 dst = encode_invocation_designation (charset, coding, dst, \
3457 &produced_chars); \
3458 } while (1)
3459
3460
3461 /* Produce codes for a DIMENSION2 character whose character set is
3462 CHARSET and whose position-codes are C1 and C2. Designation and
3463 invocation codes are also produced in advance if necessary. */
3464
3465 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3466 do { \
3467 int id = CHARSET_ID (charset); \
3468 \
3469 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3470 && id == charset_jisx0208) \
3471 { \
3472 id = charset_jisx0208_1978; \
3473 charset = CHARSET_FROM_ID (id); \
3474 } \
3475 \
3476 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3477 { \
3478 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3479 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3480 else \
3481 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3482 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3483 break; \
3484 } \
3485 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3486 { \
3487 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3488 break; \
3489 } \
3490 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3491 { \
3492 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3493 break; \
3494 } \
3495 else \
3496 /* Since CHARSET is not yet invoked to any graphic planes, we \
3497 must invoke it, or, at first, designate it to some graphic \
3498 register. Then repeat the loop to actually produce the \
3499 character. */ \
3500 dst = encode_invocation_designation (charset, coding, dst, \
3501 &produced_chars); \
3502 } while (1)
3503
3504
3505 #define ENCODE_ISO_CHARACTER(charset, c) \
3506 do { \
3507 int code = ENCODE_CHAR ((charset),(c)); \
3508 \
3509 if (CHARSET_DIMENSION (charset) == 1) \
3510 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3511 else \
3512 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3513 } while (0)
3514
3515
3516 /* Produce designation and invocation codes at a place pointed by DST
3517 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3518 Return new DST. */
3519
3520 unsigned char *
3521 encode_invocation_designation (charset, coding, dst, p_nchars)
3522 struct charset *charset;
3523 struct coding_system *coding;
3524 unsigned char *dst;
3525 int *p_nchars;
3526 {
3527 int multibytep = coding->dst_multibyte;
3528 int produced_chars = *p_nchars;
3529 int reg; /* graphic register number */
3530 int id = CHARSET_ID (charset);
3531
3532 /* At first, check designations. */
3533 for (reg = 0; reg < 4; reg++)
3534 if (id == CODING_ISO_DESIGNATION (coding, reg))
3535 break;
3536
3537 if (reg >= 4)
3538 {
3539 /* CHARSET is not yet designated to any graphic registers. */
3540 /* At first check the requested designation. */
3541 reg = CODING_ISO_REQUEST (coding, id);
3542 if (reg < 0)
3543 /* Since CHARSET requests no special designation, designate it
3544 to graphic register 0. */
3545 reg = 0;
3546
3547 ENCODE_DESIGNATION (charset, reg, coding);
3548 }
3549
3550 if (CODING_ISO_INVOCATION (coding, 0) != reg
3551 && CODING_ISO_INVOCATION (coding, 1) != reg)
3552 {
3553 /* Since the graphic register REG is not invoked to any graphic
3554 planes, invoke it to graphic plane 0. */
3555 switch (reg)
3556 {
3557 case 0: /* graphic register 0 */
3558 ENCODE_SHIFT_IN;
3559 break;
3560
3561 case 1: /* graphic register 1 */
3562 ENCODE_SHIFT_OUT;
3563 break;
3564
3565 case 2: /* graphic register 2 */
3566 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3567 ENCODE_SINGLE_SHIFT_2;
3568 else
3569 ENCODE_LOCKING_SHIFT_2;
3570 break;
3571
3572 case 3: /* graphic register 3 */
3573 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3574 ENCODE_SINGLE_SHIFT_3;
3575 else
3576 ENCODE_LOCKING_SHIFT_3;
3577 break;
3578 }
3579 }
3580
3581 *p_nchars = produced_chars;
3582 return dst;
3583 }
3584
3585 /* The following three macros produce codes for indicating direction
3586 of text. */
3587 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3588 do { \
3589 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3590 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3591 else \
3592 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3593 } while (0)
3594
3595
3596 #define ENCODE_DIRECTION_R2L() \
3597 do { \
3598 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3599 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3600 } while (0)
3601
3602
3603 #define ENCODE_DIRECTION_L2R() \
3604 do { \
3605 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3606 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3607 } while (0)
3608
3609
3610 /* Produce codes for designation and invocation to reset the graphic
3611 planes and registers to initial state. */
3612 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3613 do { \
3614 int reg; \
3615 struct charset *charset; \
3616 \
3617 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3618 ENCODE_SHIFT_IN; \
3619 for (reg = 0; reg < 4; reg++) \
3620 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3621 && (CODING_ISO_DESIGNATION (coding, reg) \
3622 != CODING_ISO_INITIAL (coding, reg))) \
3623 { \
3624 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3625 ENCODE_DESIGNATION (charset, reg, coding); \
3626 } \
3627 } while (0)
3628
3629
3630 /* Produce designation sequences of charsets in the line started from
3631 SRC to a place pointed by DST, and return updated DST.
3632
3633 If the current block ends before any end-of-line, we may fail to
3634 find all the necessary designations. */
3635
3636 static unsigned char *
3637 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3638 struct coding_system *coding;
3639 int *charbuf, *charbuf_end;
3640 unsigned char *dst;
3641 {
3642 struct charset *charset;
3643 /* Table of charsets to be designated to each graphic register. */
3644 int r[4];
3645 int c, found = 0, reg;
3646 int produced_chars = 0;
3647 int multibytep = coding->dst_multibyte;
3648 Lisp_Object attrs;
3649 Lisp_Object charset_list;
3650
3651 attrs = CODING_ID_ATTRS (coding->id);
3652 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3653 if (EQ (charset_list, Qiso_2022))
3654 charset_list = Viso_2022_charset_list;
3655
3656 for (reg = 0; reg < 4; reg++)
3657 r[reg] = -1;
3658
3659 while (found < 4)
3660 {
3661 int id;
3662
3663 c = *charbuf++;
3664 if (c == '\n')
3665 break;
3666 charset = char_charset (c, charset_list, NULL);
3667 id = CHARSET_ID (charset);
3668 reg = CODING_ISO_REQUEST (coding, id);
3669 if (reg >= 0 && r[reg] < 0)
3670 {
3671 found++;
3672 r[reg] = id;
3673 }
3674 }
3675
3676 if (found)
3677 {
3678 for (reg = 0; reg < 4; reg++)
3679 if (r[reg] >= 0
3680 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3681 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3682 }
3683
3684 return dst;
3685 }
3686
3687 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3688
3689 static int
3690 encode_coding_iso_2022 (coding)
3691 struct coding_system *coding;
3692 {
3693 int multibytep = coding->dst_multibyte;
3694 int *charbuf = coding->charbuf;
3695 int *charbuf_end = charbuf + coding->charbuf_used;
3696 unsigned char *dst = coding->destination + coding->produced;
3697 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3698 int safe_room = 16;
3699 int bol_designation
3700 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3701 && CODING_ISO_BOL (coding));
3702 int produced_chars = 0;
3703 Lisp_Object attrs, eol_type, charset_list;
3704 int ascii_compatible;
3705 int c;
3706 int preferred_charset_id = -1;
3707
3708 CODING_GET_INFO (coding, attrs, charset_list);
3709 eol_type = CODING_ID_EOL_TYPE (coding->id);
3710 if (VECTORP (eol_type))
3711 eol_type = Qunix;
3712
3713 setup_iso_safe_charsets (attrs);
3714 /* Charset list may have been changed. */
3715 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3716 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3717
3718 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3719
3720 while (charbuf < charbuf_end)
3721 {
3722 ASSURE_DESTINATION (safe_room);
3723
3724 if (bol_designation)
3725 {
3726 unsigned char *dst_prev = dst;
3727
3728 /* We have to produce designation sequences if any now. */
3729 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3730 bol_designation = 0;
3731 /* We are sure that designation sequences are all ASCII bytes. */
3732 produced_chars += dst - dst_prev;
3733 }
3734
3735 c = *charbuf++;
3736
3737 if (c < 0)
3738 {
3739 /* Handle an annotation. */
3740 switch (*charbuf)
3741 {
3742 case CODING_ANNOTATE_COMPOSITION_MASK:
3743 /* Not yet implemented. */
3744 break;
3745 case CODING_ANNOTATE_CHARSET_MASK:
3746 preferred_charset_id = charbuf[3];
3747 if (preferred_charset_id >= 0
3748 && NILP (Fmemq (make_number (preferred_charset_id),
3749 charset_list)))
3750 preferred_charset_id = -1;
3751 break;
3752 default:
3753 abort ();
3754 }
3755 charbuf += -c - 1;
3756 continue;
3757 }
3758
3759 /* Now encode the character C. */
3760 if (c < 0x20 || c == 0x7F)
3761 {
3762 if (c == '\n'
3763 || (c == '\r' && EQ (eol_type, Qmac)))
3764 {
3765 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3766 ENCODE_RESET_PLANE_AND_REGISTER ();
3767 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3768 {
3769 int i;
3770
3771 for (i = 0; i < 4; i++)
3772 CODING_ISO_DESIGNATION (coding, i)
3773 = CODING_ISO_INITIAL (coding, i);
3774 }
3775 bol_designation
3776 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3777 }
3778 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3779 ENCODE_RESET_PLANE_AND_REGISTER ();
3780 EMIT_ONE_ASCII_BYTE (c);
3781 }
3782 else if (ASCII_CHAR_P (c))
3783 {
3784 if (ascii_compatible)
3785 EMIT_ONE_ASCII_BYTE (c);
3786 else
3787 {
3788 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3789 ENCODE_ISO_CHARACTER (charset, c);
3790 }
3791 }
3792 else if (CHAR_BYTE8_P (c))
3793 {
3794 c = CHAR_TO_BYTE8 (c);
3795 EMIT_ONE_BYTE (c);
3796 }
3797 else
3798 {
3799 struct charset *charset;
3800
3801 if (preferred_charset_id >= 0)
3802 {
3803 charset = CHARSET_FROM_ID (preferred_charset_id);
3804 if (! CHAR_CHARSET_P (c, charset))
3805 charset = char_charset (c, charset_list, NULL);
3806 }
3807 else
3808 charset = char_charset (c, charset_list, NULL);
3809 if (!charset)
3810 {
3811 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3812 {
3813 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3814 charset = CHARSET_FROM_ID (charset_ascii);
3815 }
3816 else
3817 {
3818 c = coding->default_char;
3819 charset = char_charset (c, charset_list, NULL);
3820 }
3821 }
3822 ENCODE_ISO_CHARACTER (charset, c);
3823 }
3824 }
3825
3826 if (coding->mode & CODING_MODE_LAST_BLOCK
3827 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3828 {
3829 ASSURE_DESTINATION (safe_room);
3830 ENCODE_RESET_PLANE_AND_REGISTER ();
3831 }
3832 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3833 CODING_ISO_BOL (coding) = bol_designation;
3834 coding->produced_char += produced_chars;
3835 coding->produced = dst - coding->destination;
3836 return 0;
3837 }
3838
3839 \f
3840 /*** 8,9. SJIS and BIG5 handlers ***/
3841
3842 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3843 quite widely. So, for the moment, Emacs supports them in the bare
3844 C code. But, in the future, they may be supported only by CCL. */
3845
3846 /* SJIS is a coding system encoding three character sets: ASCII, right
3847 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3848 as is. A character of charset katakana-jisx0201 is encoded by
3849 "position-code + 0x80". A character of charset japanese-jisx0208
3850 is encoded in 2-byte but two position-codes are divided and shifted
3851 so that it fit in the range below.
3852
3853 --- CODE RANGE of SJIS ---
3854 (character set) (range)
3855 ASCII 0x00 .. 0x7F
3856 KATAKANA-JISX0201 0xA0 .. 0xDF
3857 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3858 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3859 -------------------------------
3860
3861 */
3862
3863 /* BIG5 is a coding system encoding two character sets: ASCII and
3864 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3865 character set and is encoded in two-byte.
3866
3867 --- CODE RANGE of BIG5 ---
3868 (character set) (range)
3869 ASCII 0x00 .. 0x7F
3870 Big5 (1st byte) 0xA1 .. 0xFE
3871 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3872 --------------------------
3873
3874 */
3875
3876 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3877 Check if a text is encoded in SJIS. If it is, return
3878 CATEGORY_MASK_SJIS, else return 0. */
3879
3880 static int
3881 detect_coding_sjis (coding, detect_info)
3882 struct coding_system *coding;
3883 struct coding_detection_info *detect_info;
3884 {
3885 const unsigned char *src = coding->source, *src_base;
3886 const unsigned char *src_end = coding->source + coding->src_bytes;
3887 int multibytep = coding->src_multibyte;
3888 int consumed_chars = 0;
3889 int found = 0;
3890 int c;
3891
3892 detect_info->checked |= CATEGORY_MASK_SJIS;
3893 /* A coding system of this category is always ASCII compatible. */
3894 src += coding->head_ascii;
3895
3896 while (1)
3897 {
3898 src_base = src;
3899 ONE_MORE_BYTE (c);
3900 if (c < 0x80)
3901 continue;
3902 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3903 {
3904 ONE_MORE_BYTE (c);
3905 if (c < 0x40 || c == 0x7F || c > 0xFC)
3906 break;
3907 found = CATEGORY_MASK_SJIS;
3908 }
3909 else if (c >= 0xA0 && c < 0xE0)
3910 found = CATEGORY_MASK_SJIS;
3911 else
3912 break;
3913 }
3914 detect_info->rejected |= CATEGORY_MASK_SJIS;
3915 return 0;
3916
3917 no_more_source:
3918 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3919 {
3920 detect_info->rejected |= CATEGORY_MASK_SJIS;
3921 return 0;
3922 }
3923 detect_info->found |= found;
3924 return 1;
3925 }
3926
3927 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3928 Check if a text is encoded in BIG5. If it is, return
3929 CATEGORY_MASK_BIG5, else return 0. */
3930
3931 static int
3932 detect_coding_big5 (coding, detect_info)
3933 struct coding_system *coding;
3934 struct coding_detection_info *detect_info;
3935 {
3936 const unsigned char *src = coding->source, *src_base;
3937 const unsigned char *src_end = coding->source + coding->src_bytes;
3938 int multibytep = coding->src_multibyte;
3939 int consumed_chars = 0;
3940 int found = 0;
3941 int c;
3942
3943 detect_info->checked |= CATEGORY_MASK_BIG5;
3944 /* A coding system of this category is always ASCII compatible. */
3945 src += coding->head_ascii;
3946
3947 while (1)
3948 {
3949 src_base = src;
3950 ONE_MORE_BYTE (c);
3951 if (c < 0x80)
3952 continue;
3953 if (c >= 0xA1)
3954 {
3955 ONE_MORE_BYTE (c);
3956 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3957 return 0;
3958 found = CATEGORY_MASK_BIG5;
3959 }
3960 else
3961 break;
3962 }
3963 detect_info->rejected |= CATEGORY_MASK_BIG5;
3964 return 0;
3965
3966 no_more_source:
3967 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3968 {
3969 detect_info->rejected |= CATEGORY_MASK_BIG5;
3970 return 0;
3971 }
3972 detect_info->found |= found;
3973 return 1;
3974 }
3975
3976 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3977 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3978
3979 static void
3980 decode_coding_sjis (coding)
3981 struct coding_system *coding;
3982 {
3983 const unsigned char *src = coding->source + coding->consumed;
3984 const unsigned char *src_end = coding->source + coding->src_bytes;
3985 const unsigned char *src_base;
3986 int *charbuf = coding->charbuf + coding->charbuf_used;
3987 int *charbuf_end
3988 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3989 int consumed_chars = 0, consumed_chars_base;
3990 int multibytep = coding->src_multibyte;
3991 struct charset *charset_roman, *charset_kanji, *charset_kana;
3992 struct charset *charset_kanji2;
3993 Lisp_Object attrs, charset_list, val;
3994 int char_offset = coding->produced_char;
3995 int last_offset = char_offset;
3996 int last_id = charset_ascii;
3997
3998 CODING_GET_INFO (coding, attrs, charset_list);
3999
4000 val = charset_list;
4001 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4002 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4003 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4004 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4005
4006 while (1)
4007 {
4008 int c, c1;
4009 struct charset *charset;
4010
4011 src_base = src;
4012 consumed_chars_base = consumed_chars;
4013
4014 if (charbuf >= charbuf_end)
4015 break;
4016
4017 ONE_MORE_BYTE (c);
4018 if (c < 0)
4019 goto invalid_code;
4020 if (c < 0x80)
4021 charset = charset_roman;
4022 else if (c == 0x80 || c == 0xA0)
4023 goto invalid_code;
4024 else if (c >= 0xA1 && c <= 0xDF)
4025 {
4026 /* SJIS -> JISX0201-Kana */
4027 c &= 0x7F;
4028 charset = charset_kana;
4029 }
4030 else if (c <= 0xEF)
4031 {
4032 /* SJIS -> JISX0208 */
4033 ONE_MORE_BYTE (c1);
4034 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4035 goto invalid_code;
4036 c = (c << 8) | c1;
4037 SJIS_TO_JIS (c);
4038 charset = charset_kanji;
4039 }
4040 else if (c <= 0xFC && charset_kanji2)
4041 {
4042 /* SJIS -> JISX0213-2 */
4043 ONE_MORE_BYTE (c1);
4044 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4045 goto invalid_code;
4046 c = (c << 8) | c1;
4047 SJIS_TO_JIS2 (c);
4048 charset = charset_kanji2;
4049 }
4050 else
4051 goto invalid_code;
4052 if (charset->id != charset_ascii
4053 && last_id != charset->id)
4054 {
4055 if (last_id != charset_ascii)
4056 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4057 last_id = charset->id;
4058 last_offset = char_offset;
4059 }
4060 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4061 *charbuf++ = c;
4062 char_offset++;
4063 continue;
4064
4065 invalid_code:
4066 src = src_base;
4067 consumed_chars = consumed_chars_base;
4068 ONE_MORE_BYTE (c);
4069 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4070 char_offset++;
4071 coding->errors++;
4072 }
4073
4074 no_more_source:
4075 if (last_id != charset_ascii)
4076 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4077 coding->consumed_char += consumed_chars_base;
4078 coding->consumed = src_base - coding->source;
4079 coding->charbuf_used = charbuf - coding->charbuf;
4080 }
4081
4082 static void
4083 decode_coding_big5 (coding)
4084 struct coding_system *coding;
4085 {
4086 const unsigned char *src = coding->source + coding->consumed;
4087 const unsigned char *src_end = coding->source + coding->src_bytes;
4088 const unsigned char *src_base;
4089 int *charbuf = coding->charbuf + coding->charbuf_used;
4090 int *charbuf_end
4091 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4092 int consumed_chars = 0, consumed_chars_base;
4093 int multibytep = coding->src_multibyte;
4094 struct charset *charset_roman, *charset_big5;
4095 Lisp_Object attrs, charset_list, val;
4096 int char_offset = coding->produced_char;
4097 int last_offset = char_offset;
4098 int last_id = charset_ascii;
4099
4100 CODING_GET_INFO (coding, attrs, charset_list);
4101 val = charset_list;
4102 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4103 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4104
4105 while (1)
4106 {
4107 int c, c1;
4108 struct charset *charset;
4109
4110 src_base = src;
4111 consumed_chars_base = consumed_chars;
4112
4113 if (charbuf >= charbuf_end)
4114 break;
4115
4116 ONE_MORE_BYTE (c);
4117
4118 if (c < 0)
4119 goto invalid_code;
4120 if (c < 0x80)
4121 charset = charset_roman;
4122 else
4123 {
4124 /* BIG5 -> Big5 */
4125 if (c < 0xA1 || c > 0xFE)
4126 goto invalid_code;
4127 ONE_MORE_BYTE (c1);
4128 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4129 goto invalid_code;
4130 c = c << 8 | c1;
4131 charset = charset_big5;
4132 }
4133 if (charset->id != charset_ascii
4134 && last_id != charset->id)
4135 {
4136 if (last_id != charset_ascii)
4137 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4138 last_id = charset->id;
4139 last_offset = char_offset;
4140 }
4141 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4142 *charbuf++ = c;
4143 char_offset++;
4144 continue;
4145
4146 invalid_code:
4147 src = src_base;
4148 consumed_chars = consumed_chars_base;
4149 ONE_MORE_BYTE (c);
4150 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4151 char_offset++;
4152 coding->errors++;
4153 }
4154
4155 no_more_source:
4156 if (last_id != charset_ascii)
4157 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4158 coding->consumed_char += consumed_chars_base;
4159 coding->consumed = src_base - coding->source;
4160 coding->charbuf_used = charbuf - coding->charbuf;
4161 }
4162
4163 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4164 This function can encode charsets `ascii', `katakana-jisx0201',
4165 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4166 are sure that all these charsets are registered as official charset
4167 (i.e. do not have extended leading-codes). Characters of other
4168 charsets are produced without any encoding. If SJIS_P is 1, encode
4169 SJIS text, else encode BIG5 text. */
4170
4171 static int
4172 encode_coding_sjis (coding)
4173 struct coding_system *coding;
4174 {
4175 int multibytep = coding->dst_multibyte;
4176 int *charbuf = coding->charbuf;
4177 int *charbuf_end = charbuf + coding->charbuf_used;
4178 unsigned char *dst = coding->destination + coding->produced;
4179 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4180 int safe_room = 4;
4181 int produced_chars = 0;
4182 Lisp_Object attrs, charset_list, val;
4183 int ascii_compatible;
4184 struct charset *charset_roman, *charset_kanji, *charset_kana;
4185 struct charset *charset_kanji2;
4186 int c;
4187
4188 CODING_GET_INFO (coding, attrs, charset_list);
4189 val = charset_list;
4190 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4191 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4192 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4193 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4194
4195 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4196
4197 while (charbuf < charbuf_end)
4198 {
4199 ASSURE_DESTINATION (safe_room);
4200 c = *charbuf++;
4201 /* Now encode the character C. */
4202 if (ASCII_CHAR_P (c) && ascii_compatible)
4203 EMIT_ONE_ASCII_BYTE (c);
4204 else if (CHAR_BYTE8_P (c))
4205 {
4206 c = CHAR_TO_BYTE8 (c);
4207 EMIT_ONE_BYTE (c);
4208 }
4209 else
4210 {
4211 unsigned code;
4212 struct charset *charset = char_charset (c, charset_list, &code);
4213
4214 if (!charset)
4215 {
4216 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4217 {
4218 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4219 charset = CHARSET_FROM_ID (charset_ascii);
4220 }
4221 else
4222 {
4223 c = coding->default_char;
4224 charset = char_charset (c, charset_list, &code);
4225 }
4226 }
4227 if (code == CHARSET_INVALID_CODE (charset))
4228 abort ();
4229 if (charset == charset_kanji)
4230 {
4231 int c1, c2;
4232 JIS_TO_SJIS (code);
4233 c1 = code >> 8, c2 = code & 0xFF;
4234 EMIT_TWO_BYTES (c1, c2);
4235 }
4236 else if (charset == charset_kana)
4237 EMIT_ONE_BYTE (code | 0x80);
4238 else if (charset_kanji2 && charset == charset_kanji2)
4239 {
4240 int c1, c2;
4241
4242 c1 = code >> 8;
4243 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4244 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4245 {
4246 JIS_TO_SJIS2 (code);
4247 c1 = code >> 8, c2 = code & 0xFF;
4248 EMIT_TWO_BYTES (c1, c2);
4249 }
4250 else
4251 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4252 }
4253 else
4254 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4255 }
4256 }
4257 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4258 coding->produced_char += produced_chars;
4259 coding->produced = dst - coding->destination;
4260 return 0;
4261 }
4262
4263 static int
4264 encode_coding_big5 (coding)
4265 struct coding_system *coding;
4266 {
4267 int multibytep = coding->dst_multibyte;
4268 int *charbuf = coding->charbuf;
4269 int *charbuf_end = charbuf + coding->charbuf_used;
4270 unsigned char *dst = coding->destination + coding->produced;
4271 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4272 int safe_room = 4;
4273 int produced_chars = 0;
4274 Lisp_Object attrs, charset_list, val;
4275 int ascii_compatible;
4276 struct charset *charset_roman, *charset_big5;
4277 int c;
4278
4279 CODING_GET_INFO (coding, attrs, charset_list);
4280 val = charset_list;
4281 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4282 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4283 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4284
4285 while (charbuf < charbuf_end)
4286 {
4287 ASSURE_DESTINATION (safe_room);
4288 c = *charbuf++;
4289 /* Now encode the character C. */
4290 if (ASCII_CHAR_P (c) && ascii_compatible)
4291 EMIT_ONE_ASCII_BYTE (c);
4292 else if (CHAR_BYTE8_P (c))
4293 {
4294 c = CHAR_TO_BYTE8 (c);
4295 EMIT_ONE_BYTE (c);
4296 }
4297 else
4298 {
4299 unsigned code;
4300 struct charset *charset = char_charset (c, charset_list, &code);
4301
4302 if (! charset)
4303 {
4304 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4305 {
4306 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4307 charset = CHARSET_FROM_ID (charset_ascii);
4308 }
4309 else
4310 {
4311 c = coding->default_char;
4312 charset = char_charset (c, charset_list, &code);
4313 }
4314 }
4315 if (code == CHARSET_INVALID_CODE (charset))
4316 abort ();
4317 if (charset == charset_big5)
4318 {
4319 int c1, c2;
4320
4321 c1 = code >> 8, c2 = code & 0xFF;
4322 EMIT_TWO_BYTES (c1, c2);
4323 }
4324 else
4325 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4326 }
4327 }
4328 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4329 coding->produced_char += produced_chars;
4330 coding->produced = dst - coding->destination;
4331 return 0;
4332 }
4333
4334 \f
4335 /*** 10. CCL handlers ***/
4336
4337 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4338 Check if a text is encoded in a coding system of which
4339 encoder/decoder are written in CCL program. If it is, return
4340 CATEGORY_MASK_CCL, else return 0. */
4341
4342 static int
4343 detect_coding_ccl (coding, detect_info)
4344 struct coding_system *coding;
4345 struct coding_detection_info *detect_info;
4346 {
4347 const unsigned char *src = coding->source, *src_base;
4348 const unsigned char *src_end = coding->source + coding->src_bytes;
4349 int multibytep = coding->src_multibyte;
4350 int consumed_chars = 0;
4351 int found = 0;
4352 unsigned char *valids;
4353 int head_ascii = coding->head_ascii;
4354 Lisp_Object attrs;
4355
4356 detect_info->checked |= CATEGORY_MASK_CCL;
4357
4358 coding = &coding_categories[coding_category_ccl];
4359 valids = CODING_CCL_VALIDS (coding);
4360 attrs = CODING_ID_ATTRS (coding->id);
4361 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4362 src += head_ascii;
4363
4364 while (1)
4365 {
4366 int c;
4367
4368 src_base = src;
4369 ONE_MORE_BYTE (c);
4370 if (c < 0 || ! valids[c])
4371 break;
4372 if ((valids[c] > 1))
4373 found = CATEGORY_MASK_CCL;
4374 }
4375 detect_info->rejected |= CATEGORY_MASK_CCL;
4376 return 0;
4377
4378 no_more_source:
4379 detect_info->found |= found;
4380 return 1;
4381 }
4382
4383 static void
4384 decode_coding_ccl (coding)
4385 struct coding_system *coding;
4386 {
4387 const unsigned char *src = coding->source + coding->consumed;
4388 const unsigned char *src_end = coding->source + coding->src_bytes;
4389 int *charbuf = coding->charbuf + coding->charbuf_used;
4390 int *charbuf_end = coding->charbuf + coding->charbuf_size;
4391 int consumed_chars = 0;
4392 int multibytep = coding->src_multibyte;
4393 struct ccl_program ccl;
4394 int source_charbuf[1024];
4395 int source_byteidx[1024];
4396 Lisp_Object attrs, charset_list;
4397
4398 CODING_GET_INFO (coding, attrs, charset_list);
4399 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4400
4401 while (src < src_end)
4402 {
4403 const unsigned char *p = src;
4404 int *source, *source_end;
4405 int i = 0;
4406
4407 if (multibytep)
4408 while (i < 1024 && p < src_end)
4409 {
4410 source_byteidx[i] = p - src;
4411 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4412 }
4413 else
4414 while (i < 1024 && p < src_end)
4415 source_charbuf[i++] = *p++;
4416
4417 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4418 ccl.last_block = 1;
4419
4420 source = source_charbuf;
4421 source_end = source + i;
4422 while (source < source_end)
4423 {
4424 ccl_driver (&ccl, source, charbuf,
4425 source_end - source, charbuf_end - charbuf,
4426 charset_list);
4427 source += ccl.consumed;
4428 charbuf += ccl.produced;
4429 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4430 break;
4431 }
4432 if (source < source_end)
4433 src += source_byteidx[source - source_charbuf];
4434 else
4435 src = p;
4436 consumed_chars += source - source_charbuf;
4437
4438 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4439 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4440 break;
4441 }
4442
4443 switch (ccl.status)
4444 {
4445 case CCL_STAT_SUSPEND_BY_SRC:
4446 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4447 break;
4448 case CCL_STAT_SUSPEND_BY_DST:
4449 break;
4450 case CCL_STAT_QUIT:
4451 case CCL_STAT_INVALID_CMD:
4452 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4453 break;
4454 default:
4455 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4456 break;
4457 }
4458 coding->consumed_char += consumed_chars;
4459 coding->consumed = src - coding->source;
4460 coding->charbuf_used = charbuf - coding->charbuf;
4461 }
4462
4463 static int
4464 encode_coding_ccl (coding)
4465 struct coding_system *coding;
4466 {
4467 struct ccl_program ccl;
4468 int multibytep = coding->dst_multibyte;
4469 int *charbuf = coding->charbuf;
4470 int *charbuf_end = charbuf + coding->charbuf_used;
4471 unsigned char *dst = coding->destination + coding->produced;
4472 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4473 unsigned char *adjusted_dst_end = dst_end - 1;
4474 int destination_charbuf[1024];
4475 int i, produced_chars = 0;
4476 Lisp_Object attrs, charset_list;
4477
4478 CODING_GET_INFO (coding, attrs, charset_list);
4479 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4480
4481 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4482 ccl.dst_multibyte = coding->dst_multibyte;
4483
4484 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4485 {
4486 int dst_bytes = dst_end - dst;
4487 if (dst_bytes > 1024)
4488 dst_bytes = 1024;
4489
4490 ccl_driver (&ccl, charbuf, destination_charbuf,
4491 charbuf_end - charbuf, dst_bytes, charset_list);
4492 charbuf += ccl.consumed;
4493 if (multibytep)
4494 for (i = 0; i < ccl.produced; i++)
4495 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4496 else
4497 {
4498 for (i = 0; i < ccl.produced; i++)
4499 *dst++ = destination_charbuf[i] & 0xFF;
4500 produced_chars += ccl.produced;
4501 }
4502 }
4503
4504 switch (ccl.status)
4505 {
4506 case CCL_STAT_SUSPEND_BY_SRC:
4507 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4508 break;
4509 case CCL_STAT_SUSPEND_BY_DST:
4510 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4511 break;
4512 case CCL_STAT_QUIT:
4513 case CCL_STAT_INVALID_CMD:
4514 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4515 break;
4516 default:
4517 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4518 break;
4519 }
4520
4521 coding->produced_char += produced_chars;
4522 coding->produced = dst - coding->destination;
4523 return 0;
4524 }
4525
4526
4527 \f
4528 /*** 10, 11. no-conversion handlers ***/
4529
4530 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4531
4532 static void
4533 decode_coding_raw_text (coding)
4534 struct coding_system *coding;
4535 {
4536 coding->chars_at_source = 1;
4537 coding->consumed_char = 0;
4538 coding->consumed = 0;
4539 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4540 }
4541
4542 static int
4543 encode_coding_raw_text (coding)
4544 struct coding_system *coding;
4545 {
4546 int multibytep = coding->dst_multibyte;
4547 int *charbuf = coding->charbuf;
4548 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4549 unsigned char *dst = coding->destination + coding->produced;
4550 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4551 int produced_chars = 0;
4552 int c;
4553
4554 if (multibytep)
4555 {
4556 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4557
4558 if (coding->src_multibyte)
4559 while (charbuf < charbuf_end)
4560 {
4561 ASSURE_DESTINATION (safe_room);
4562 c = *charbuf++;
4563 if (ASCII_CHAR_P (c))
4564 EMIT_ONE_ASCII_BYTE (c);
4565 else if (CHAR_BYTE8_P (c))
4566 {
4567 c = CHAR_TO_BYTE8 (c);
4568 EMIT_ONE_BYTE (c);
4569 }
4570 else
4571 {
4572 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4573
4574 CHAR_STRING_ADVANCE (c, p1);
4575 while (p0 < p1)
4576 {
4577 EMIT_ONE_BYTE (*p0);
4578 p0++;
4579 }
4580 }
4581 }
4582 else
4583 while (charbuf < charbuf_end)
4584 {
4585 ASSURE_DESTINATION (safe_room);
4586 c = *charbuf++;
4587 EMIT_ONE_BYTE (c);
4588 }
4589 }
4590 else
4591 {
4592 if (coding->src_multibyte)
4593 {
4594 int safe_room = MAX_MULTIBYTE_LENGTH;
4595
4596 while (charbuf < charbuf_end)
4597 {
4598 ASSURE_DESTINATION (safe_room);
4599 c = *charbuf++;
4600 if (ASCII_CHAR_P (c))
4601 *dst++ = c;
4602 else if (CHAR_BYTE8_P (c))
4603 *dst++ = CHAR_TO_BYTE8 (c);
4604 else
4605 CHAR_STRING_ADVANCE (c, dst);
4606 produced_chars++;
4607 }
4608 }
4609 else
4610 {
4611 ASSURE_DESTINATION (charbuf_end - charbuf);
4612 while (charbuf < charbuf_end && dst < dst_end)
4613 *dst++ = *charbuf++;
4614 produced_chars = dst - (coding->destination + coding->dst_bytes);
4615 }
4616 }
4617 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4618 coding->produced_char += produced_chars;
4619 coding->produced = dst - coding->destination;
4620 return 0;
4621 }
4622
4623 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4624 Check if a text is encoded in a charset-based coding system. If it
4625 is, return 1, else return 0. */
4626
4627 static int
4628 detect_coding_charset (coding, detect_info)
4629 struct coding_system *coding;
4630 struct coding_detection_info *detect_info;
4631 {
4632 const unsigned char *src = coding->source, *src_base;
4633 const unsigned char *src_end = coding->source + coding->src_bytes;
4634 int multibytep = coding->src_multibyte;
4635 int consumed_chars = 0;
4636 Lisp_Object attrs, valids;
4637 int found = 0;
4638
4639 detect_info->checked |= CATEGORY_MASK_CHARSET;
4640
4641 coding = &coding_categories[coding_category_charset];
4642 attrs = CODING_ID_ATTRS (coding->id);
4643 valids = AREF (attrs, coding_attr_charset_valids);
4644
4645 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4646 src += coding->head_ascii;
4647
4648 while (1)
4649 {
4650 int c;
4651
4652 src_base = src;
4653 ONE_MORE_BYTE (c);
4654 if (c < 0)
4655 continue;
4656 if (NILP (AREF (valids, c)))
4657 break;
4658 if (c >= 0x80)
4659 found = CATEGORY_MASK_CHARSET;
4660 }
4661 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4662 return 0;
4663
4664 no_more_source:
4665 detect_info->found |= found;
4666 return 1;
4667 }
4668
4669 static void
4670 decode_coding_charset (coding)
4671 struct coding_system *coding;
4672 {
4673 const unsigned char *src = coding->source + coding->consumed;
4674 const unsigned char *src_end = coding->source + coding->src_bytes;
4675 const unsigned char *src_base;
4676 int *charbuf = coding->charbuf + coding->charbuf_used;
4677 int *charbuf_end
4678 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4679 int consumed_chars = 0, consumed_chars_base;
4680 int multibytep = coding->src_multibyte;
4681 Lisp_Object attrs, charset_list, valids;
4682 int char_offset = coding->produced_char;
4683 int last_offset = char_offset;
4684 int last_id = charset_ascii;
4685
4686 CODING_GET_INFO (coding, attrs, charset_list);
4687 valids = AREF (attrs, coding_attr_charset_valids);
4688
4689 while (1)
4690 {
4691 int c;
4692 Lisp_Object val;
4693 struct charset *charset;
4694 int dim;
4695 int len = 1;
4696 unsigned code;
4697
4698 src_base = src;
4699 consumed_chars_base = consumed_chars;
4700
4701 if (charbuf >= charbuf_end)
4702 break;
4703
4704 ONE_MORE_BYTE (c);
4705 if (c < 0)
4706 goto invalid_code;
4707 code = c;
4708
4709 val = AREF (valids, c);
4710 if (NILP (val))
4711 goto invalid_code;
4712 if (INTEGERP (val))
4713 {
4714 charset = CHARSET_FROM_ID (XFASTINT (val));
4715 dim = CHARSET_DIMENSION (charset);
4716 while (len < dim)
4717 {
4718 ONE_MORE_BYTE (c);
4719 code = (code << 8) | c;
4720 len++;
4721 }
4722 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4723 charset, code, c);
4724 }
4725 else
4726 {
4727 /* VAL is a list of charset IDs. It is assured that the
4728 list is sorted by charset dimensions (smaller one
4729 comes first). */
4730 while (CONSP (val))
4731 {
4732 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4733 dim = CHARSET_DIMENSION (charset);
4734 while (len < dim)
4735 {
4736 ONE_MORE_BYTE (c);
4737 code = (code << 8) | c;
4738 len++;
4739 }
4740 CODING_DECODE_CHAR (coding, src, src_base,
4741 src_end, charset, code, c);
4742 if (c >= 0)
4743 break;
4744 val = XCDR (val);
4745 }
4746 }
4747 if (c < 0)
4748 goto invalid_code;
4749 if (charset->id != charset_ascii
4750 && last_id != charset->id)
4751 {
4752 if (last_id != charset_ascii)
4753 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4754 last_id = charset->id;
4755 last_offset = char_offset;
4756 }
4757
4758 *charbuf++ = c;
4759 char_offset++;
4760 continue;
4761
4762 invalid_code:
4763 src = src_base;
4764 consumed_chars = consumed_chars_base;
4765 ONE_MORE_BYTE (c);
4766 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4767 char_offset++;
4768 coding->errors++;
4769 }
4770
4771 no_more_source:
4772 if (last_id != charset_ascii)
4773 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4774 coding->consumed_char += consumed_chars_base;
4775 coding->consumed = src_base - coding->source;
4776 coding->charbuf_used = charbuf - coding->charbuf;
4777 }
4778
4779 static int
4780 encode_coding_charset (coding)
4781 struct coding_system *coding;
4782 {
4783 int multibytep = coding->dst_multibyte;
4784 int *charbuf = coding->charbuf;
4785 int *charbuf_end = charbuf + coding->charbuf_used;
4786 unsigned char *dst = coding->destination + coding->produced;
4787 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4788 int safe_room = MAX_MULTIBYTE_LENGTH;
4789 int produced_chars = 0;
4790 Lisp_Object attrs, charset_list;
4791 int ascii_compatible;
4792 int c;
4793
4794 CODING_GET_INFO (coding, attrs, charset_list);
4795 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4796
4797 while (charbuf < charbuf_end)
4798 {
4799 struct charset *charset;
4800 unsigned code;
4801
4802 ASSURE_DESTINATION (safe_room);
4803 c = *charbuf++;
4804 if (ascii_compatible && ASCII_CHAR_P (c))
4805 EMIT_ONE_ASCII_BYTE (c);
4806 else if (CHAR_BYTE8_P (c))
4807 {
4808 c = CHAR_TO_BYTE8 (c);
4809 EMIT_ONE_BYTE (c);
4810 }
4811 else
4812 {
4813 charset = char_charset (c, charset_list, &code);
4814 if (charset)
4815 {
4816 if (CHARSET_DIMENSION (charset) == 1)
4817 EMIT_ONE_BYTE (code);
4818 else if (CHARSET_DIMENSION (charset) == 2)
4819 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4820 else if (CHARSET_DIMENSION (charset) == 3)
4821 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4822 else
4823 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4824 (code >> 8) & 0xFF, code & 0xFF);
4825 }
4826 else
4827 {
4828 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4829 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4830 else
4831 c = coding->default_char;
4832 EMIT_ONE_BYTE (c);
4833 }
4834 }
4835 }
4836
4837 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4838 coding->produced_char += produced_chars;
4839 coding->produced = dst - coding->destination;
4840 return 0;
4841 }
4842
4843 \f
4844 /*** 7. C library functions ***/
4845
4846 /* Setup coding context CODING from information about CODING_SYSTEM.
4847 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4848 CODING_SYSTEM is invalid, signal an error. */
4849
4850 void
4851 setup_coding_system (coding_system, coding)
4852 Lisp_Object coding_system;
4853 struct coding_system *coding;
4854 {
4855 Lisp_Object attrs;
4856 Lisp_Object eol_type;
4857 Lisp_Object coding_type;
4858 Lisp_Object val;
4859
4860 if (NILP (coding_system))
4861 coding_system = Qno_conversion;
4862
4863 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4864
4865 attrs = CODING_ID_ATTRS (coding->id);
4866 eol_type = CODING_ID_EOL_TYPE (coding->id);
4867
4868 coding->mode = 0;
4869 coding->head_ascii = -1;
4870 coding->common_flags
4871 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4872 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4873 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4874 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4875 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4876 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4877 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4878
4879 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4880 coding->max_charset_id = SCHARS (val) - 1;
4881 coding->safe_charsets = (char *) SDATA (val);
4882 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4883
4884 coding_type = CODING_ATTR_TYPE (attrs);
4885 if (EQ (coding_type, Qundecided))
4886 {
4887 coding->detector = NULL;
4888 coding->decoder = decode_coding_raw_text;
4889 coding->encoder = encode_coding_raw_text;
4890 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4891 }
4892 else if (EQ (coding_type, Qiso_2022))
4893 {
4894 int i;
4895 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4896
4897 /* Invoke graphic register 0 to plane 0. */
4898 CODING_ISO_INVOCATION (coding, 0) = 0;
4899 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4900 CODING_ISO_INVOCATION (coding, 1)
4901 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4902 /* Setup the initial status of designation. */
4903 for (i = 0; i < 4; i++)
4904 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4905 /* Not single shifting initially. */
4906 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4907 /* Beginning of buffer should also be regarded as bol. */
4908 CODING_ISO_BOL (coding) = 1;
4909 coding->detector = detect_coding_iso_2022;
4910 coding->decoder = decode_coding_iso_2022;
4911 coding->encoder = encode_coding_iso_2022;
4912 if (flags & CODING_ISO_FLAG_SAFE)
4913 coding->mode |= CODING_MODE_SAFE_ENCODING;
4914 coding->common_flags
4915 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4916 | CODING_REQUIRE_FLUSHING_MASK);
4917 if (flags & CODING_ISO_FLAG_COMPOSITION)
4918 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4919 if (flags & CODING_ISO_FLAG_DESIGNATION)
4920 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4921 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4922 {
4923 setup_iso_safe_charsets (attrs);
4924 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4925 coding->max_charset_id = SCHARS (val) - 1;
4926 coding->safe_charsets = (char *) SDATA (val);
4927 }
4928 CODING_ISO_FLAGS (coding) = flags;
4929 }
4930 else if (EQ (coding_type, Qcharset))
4931 {
4932 coding->detector = detect_coding_charset;
4933 coding->decoder = decode_coding_charset;
4934 coding->encoder = encode_coding_charset;
4935 coding->common_flags
4936 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4937 }
4938 else if (EQ (coding_type, Qutf_8))
4939 {
4940 coding->detector = detect_coding_utf_8;
4941 coding->decoder = decode_coding_utf_8;
4942 coding->encoder = encode_coding_utf_8;
4943 coding->common_flags
4944 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4945 }
4946 else if (EQ (coding_type, Qutf_16))
4947 {
4948 val = AREF (attrs, coding_attr_utf_16_bom);
4949 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4950 : EQ (val, Qt) ? utf_16_with_bom
4951 : utf_16_without_bom);
4952 val = AREF (attrs, coding_attr_utf_16_endian);
4953 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4954 : utf_16_little_endian);
4955 CODING_UTF_16_SURROGATE (coding) = 0;
4956 coding->detector = detect_coding_utf_16;
4957 coding->decoder = decode_coding_utf_16;
4958 coding->encoder = encode_coding_utf_16;
4959 coding->common_flags
4960 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4961 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4962 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4963 }
4964 else if (EQ (coding_type, Qccl))
4965 {
4966 coding->detector = detect_coding_ccl;
4967 coding->decoder = decode_coding_ccl;
4968 coding->encoder = encode_coding_ccl;
4969 coding->common_flags
4970 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4971 | CODING_REQUIRE_FLUSHING_MASK);
4972 }
4973 else if (EQ (coding_type, Qemacs_mule))
4974 {
4975 coding->detector = detect_coding_emacs_mule;
4976 coding->decoder = decode_coding_emacs_mule;
4977 coding->encoder = encode_coding_emacs_mule;
4978 coding->common_flags
4979 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4980 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4981 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4982 {
4983 Lisp_Object tail, safe_charsets;
4984 int max_charset_id = 0;
4985
4986 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4987 tail = XCDR (tail))
4988 if (max_charset_id < XFASTINT (XCAR (tail)))
4989 max_charset_id = XFASTINT (XCAR (tail));
4990 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4991 make_number (255));
4992 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4993 tail = XCDR (tail))
4994 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4995 coding->max_charset_id = max_charset_id;
4996 coding->safe_charsets = (char *) SDATA (safe_charsets);
4997 }
4998 }
4999 else if (EQ (coding_type, Qshift_jis))
5000 {
5001 coding->detector = detect_coding_sjis;
5002 coding->decoder = decode_coding_sjis;
5003 coding->encoder = encode_coding_sjis;
5004 coding->common_flags
5005 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5006 }
5007 else if (EQ (coding_type, Qbig5))
5008 {
5009 coding->detector = detect_coding_big5;
5010 coding->decoder = decode_coding_big5;
5011 coding->encoder = encode_coding_big5;
5012 coding->common_flags
5013 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5014 }
5015 else /* EQ (coding_type, Qraw_text) */
5016 {
5017 coding->detector = NULL;
5018 coding->decoder = decode_coding_raw_text;
5019 coding->encoder = encode_coding_raw_text;
5020 }
5021
5022 return;
5023 }
5024
5025 /* Return raw-text or one of its subsidiaries that has the same
5026 eol_type as CODING-SYSTEM. */
5027
5028 Lisp_Object
5029 raw_text_coding_system (coding_system)
5030 Lisp_Object coding_system;
5031 {
5032 Lisp_Object spec, attrs;
5033 Lisp_Object eol_type, raw_text_eol_type;
5034
5035 if (NILP (coding_system))
5036 return Qraw_text;
5037 spec = CODING_SYSTEM_SPEC (coding_system);
5038 attrs = AREF (spec, 0);
5039
5040 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5041 return coding_system;
5042
5043 eol_type = AREF (spec, 2);
5044 if (VECTORP (eol_type))
5045 return Qraw_text;
5046 spec = CODING_SYSTEM_SPEC (Qraw_text);
5047 raw_text_eol_type = AREF (spec, 2);
5048 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5049 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5050 : AREF (raw_text_eol_type, 2));
5051 }
5052
5053
5054 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5055 does, return one of the subsidiary that has the same eol-spec as
5056 PARENT. Otherwise, return CODING_SYSTEM. */
5057
5058 Lisp_Object
5059 coding_inherit_eol_type (coding_system, parent)
5060 Lisp_Object coding_system, parent;
5061 {
5062 Lisp_Object spec, eol_type;
5063
5064 if (NILP (coding_system))
5065 coding_system = Qraw_text;
5066 spec = CODING_SYSTEM_SPEC (coding_system);
5067 eol_type = AREF (spec, 2);
5068 if (VECTORP (eol_type)
5069 && ! NILP (parent))
5070 {
5071 Lisp_Object parent_spec;
5072 Lisp_Object parent_eol_type;
5073
5074 parent_spec
5075 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5076 parent_eol_type = AREF (parent_spec, 2);
5077 if (EQ (parent_eol_type, Qunix))
5078 coding_system = AREF (eol_type, 0);
5079 else if (EQ (parent_eol_type, Qdos))
5080 coding_system = AREF (eol_type, 1);
5081 else if (EQ (parent_eol_type, Qmac))
5082 coding_system = AREF (eol_type, 2);
5083 }
5084 return coding_system;
5085 }
5086
5087 /* Emacs has a mechanism to automatically detect a coding system if it
5088 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5089 it's impossible to distinguish some coding systems accurately
5090 because they use the same range of codes. So, at first, coding
5091 systems are categorized into 7, those are:
5092
5093 o coding-category-emacs-mule
5094
5095 The category for a coding system which has the same code range
5096 as Emacs' internal format. Assigned the coding-system (Lisp
5097 symbol) `emacs-mule' by default.
5098
5099 o coding-category-sjis
5100
5101 The category for a coding system which has the same code range
5102 as SJIS. Assigned the coding-system (Lisp
5103 symbol) `japanese-shift-jis' by default.
5104
5105 o coding-category-iso-7
5106
5107 The category for a coding system which has the same code range
5108 as ISO2022 of 7-bit environment. This doesn't use any locking
5109 shift and single shift functions. This can encode/decode all
5110 charsets. Assigned the coding-system (Lisp symbol)
5111 `iso-2022-7bit' by default.
5112
5113 o coding-category-iso-7-tight
5114
5115 Same as coding-category-iso-7 except that this can
5116 encode/decode only the specified charsets.
5117
5118 o coding-category-iso-8-1
5119
5120 The category for a coding system which has the same code range
5121 as ISO2022 of 8-bit environment and graphic plane 1 used only
5122 for DIMENSION1 charset. This doesn't use any locking shift
5123 and single shift functions. Assigned the coding-system (Lisp
5124 symbol) `iso-latin-1' by default.
5125
5126 o coding-category-iso-8-2
5127
5128 The category for a coding system which has the same code range
5129 as ISO2022 of 8-bit environment and graphic plane 1 used only
5130 for DIMENSION2 charset. This doesn't use any locking shift
5131 and single shift functions. Assigned the coding-system (Lisp
5132 symbol) `japanese-iso-8bit' by default.
5133
5134 o coding-category-iso-7-else
5135
5136 The category for a coding system which has the same code range
5137 as ISO2022 of 7-bit environemnt but uses locking shift or
5138 single shift functions. Assigned the coding-system (Lisp
5139 symbol) `iso-2022-7bit-lock' by default.
5140
5141 o coding-category-iso-8-else
5142
5143 The category for a coding system which has the same code range
5144 as ISO2022 of 8-bit environemnt but uses locking shift or
5145 single shift functions. Assigned the coding-system (Lisp
5146 symbol) `iso-2022-8bit-ss2' by default.
5147
5148 o coding-category-big5
5149
5150 The category for a coding system which has the same code range
5151 as BIG5. Assigned the coding-system (Lisp symbol)
5152 `cn-big5' by default.
5153
5154 o coding-category-utf-8
5155
5156 The category for a coding system which has the same code range
5157 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5158 symbol) `utf-8' by default.
5159
5160 o coding-category-utf-16-be
5161
5162 The category for a coding system in which a text has an
5163 Unicode signature (cf. Unicode Standard) in the order of BIG
5164 endian at the head. Assigned the coding-system (Lisp symbol)
5165 `utf-16-be' by default.
5166
5167 o coding-category-utf-16-le
5168
5169 The category for a coding system in which a text has an
5170 Unicode signature (cf. Unicode Standard) in the order of
5171 LITTLE endian at the head. Assigned the coding-system (Lisp
5172 symbol) `utf-16-le' by default.
5173
5174 o coding-category-ccl
5175
5176 The category for a coding system of which encoder/decoder is
5177 written in CCL programs. The default value is nil, i.e., no
5178 coding system is assigned.
5179
5180 o coding-category-binary
5181
5182 The category for a coding system not categorized in any of the
5183 above. Assigned the coding-system (Lisp symbol)
5184 `no-conversion' by default.
5185
5186 Each of them is a Lisp symbol and the value is an actual
5187 `coding-system's (this is also a Lisp symbol) assigned by a user.
5188 What Emacs does actually is to detect a category of coding system.
5189 Then, it uses a `coding-system' assigned to it. If Emacs can't
5190 decide only one possible category, it selects a category of the
5191 highest priority. Priorities of categories are also specified by a
5192 user in a Lisp variable `coding-category-list'.
5193
5194 */
5195
5196 #define EOL_SEEN_NONE 0
5197 #define EOL_SEEN_LF 1
5198 #define EOL_SEEN_CR 2
5199 #define EOL_SEEN_CRLF 4
5200
5201 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5202 SOURCE is encoded. If CATEGORY is one of
5203 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5204 two-byte, else they are encoded by one-byte.
5205
5206 Return one of EOL_SEEN_XXX. */
5207
5208 #define MAX_EOL_CHECK_COUNT 3
5209
5210 static int
5211 detect_eol (source, src_bytes, category)
5212 unsigned char *source;
5213 EMACS_INT src_bytes;
5214 enum coding_category category;
5215 {
5216 unsigned char *src = source, *src_end = src + src_bytes;
5217 unsigned char c;
5218 int total = 0;
5219 int eol_seen = EOL_SEEN_NONE;
5220
5221 if ((1 << category) & CATEGORY_MASK_UTF_16)
5222 {
5223 int msb, lsb;
5224
5225 msb = category == (coding_category_utf_16_le
5226 | coding_category_utf_16_le_nosig);
5227 lsb = 1 - msb;
5228
5229 while (src + 1 < src_end)
5230 {
5231 c = src[lsb];
5232 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5233 {
5234 int this_eol;
5235
5236 if (c == '\n')
5237 this_eol = EOL_SEEN_LF;
5238 else if (src + 3 >= src_end
5239 || src[msb + 2] != 0
5240 || src[lsb + 2] != '\n')
5241 this_eol = EOL_SEEN_CR;
5242 else
5243 this_eol = EOL_SEEN_CRLF;
5244
5245 if (eol_seen == EOL_SEEN_NONE)
5246 /* This is the first end-of-line. */
5247 eol_seen = this_eol;
5248 else if (eol_seen != this_eol)
5249 {
5250 /* The found type is different from what found before. */
5251 eol_seen = EOL_SEEN_LF;
5252 break;
5253 }
5254 if (++total == MAX_EOL_CHECK_COUNT)
5255 break;
5256 }
5257 src += 2;
5258 }
5259 }
5260 else
5261 {
5262 while (src < src_end)
5263 {
5264 c = *src++;
5265 if (c == '\n' || c == '\r')
5266 {
5267 int this_eol;
5268
5269 if (c == '\n')
5270 this_eol = EOL_SEEN_LF;
5271 else if (src >= src_end || *src != '\n')
5272 this_eol = EOL_SEEN_CR;
5273 else
5274 this_eol = EOL_SEEN_CRLF, src++;
5275
5276 if (eol_seen == EOL_SEEN_NONE)
5277 /* This is the first end-of-line. */
5278 eol_seen = this_eol;
5279 else if (eol_seen != this_eol)
5280 {
5281 /* The found type is different from what found before. */
5282 eol_seen = EOL_SEEN_LF;
5283 break;
5284 }
5285 if (++total == MAX_EOL_CHECK_COUNT)
5286 break;
5287 }
5288 }
5289 }
5290 return eol_seen;
5291 }
5292
5293
5294 static Lisp_Object
5295 adjust_coding_eol_type (coding, eol_seen)
5296 struct coding_system *coding;
5297 int eol_seen;
5298 {
5299 Lisp_Object eol_type;
5300
5301 eol_type = CODING_ID_EOL_TYPE (coding->id);
5302 if (eol_seen & EOL_SEEN_LF)
5303 {
5304 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5305 eol_type = Qunix;
5306 }
5307 else if (eol_seen & EOL_SEEN_CRLF)
5308 {
5309 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5310 eol_type = Qdos;
5311 }
5312 else if (eol_seen & EOL_SEEN_CR)
5313 {
5314 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5315 eol_type = Qmac;
5316 }
5317 return eol_type;
5318 }
5319
5320 /* Detect how a text specified in CODING is encoded. If a coding
5321 system is detected, update fields of CODING by the detected coding
5322 system. */
5323
5324 void
5325 detect_coding (coding)
5326 struct coding_system *coding;
5327 {
5328 const unsigned char *src, *src_end;
5329 Lisp_Object attrs, coding_type;
5330
5331 coding->consumed = coding->consumed_char = 0;
5332 coding->produced = coding->produced_char = 0;
5333 coding_set_source (coding);
5334
5335 src_end = coding->source + coding->src_bytes;
5336
5337 /* If we have not yet decided the text encoding type, detect it
5338 now. */
5339 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5340 {
5341 int c, i;
5342
5343 for (i = 0, src = coding->source; src < src_end; i++, src++)
5344 {
5345 c = *src;
5346 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5347 || c == ISO_CODE_SI
5348 || c == ISO_CODE_SO)))
5349 break;
5350 }
5351 coding->head_ascii = src - (coding->source + coding->consumed);
5352
5353 if (coding->head_ascii < coding->src_bytes)
5354 {
5355 struct coding_detection_info detect_info;
5356 enum coding_category category;
5357 struct coding_system *this;
5358
5359 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5360 for (i = 0; i < coding_category_raw_text; i++)
5361 {
5362 category = coding_priorities[i];
5363 this = coding_categories + category;
5364 if (this->id < 0)
5365 {
5366 /* No coding system of this category is defined. */
5367 detect_info.rejected |= (1 << category);
5368 }
5369 else if (category >= coding_category_raw_text)
5370 continue;
5371 else if (detect_info.checked & (1 << category))
5372 {
5373 if (detect_info.found & (1 << category))
5374 break;
5375 }
5376 else if ((*(this->detector)) (coding, &detect_info)
5377 && detect_info.found & (1 << category))
5378 {
5379 if (category == coding_category_utf_16_auto)
5380 {
5381 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5382 category = coding_category_utf_16_le;
5383 else
5384 category = coding_category_utf_16_be;
5385 }
5386 break;
5387 }
5388 }
5389 if (i < coding_category_raw_text)
5390 setup_coding_system (CODING_ID_NAME (this->id), coding);
5391 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5392 setup_coding_system (Qraw_text, coding);
5393 else if (detect_info.rejected)
5394 for (i = 0; i < coding_category_raw_text; i++)
5395 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5396 {
5397 this = coding_categories + coding_priorities[i];
5398 setup_coding_system (CODING_ID_NAME (this->id), coding);
5399 break;
5400 }
5401 }
5402 }
5403 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5404 == coding_category_utf_16_auto)
5405 {
5406 Lisp_Object coding_systems;
5407 struct coding_detection_info detect_info;
5408
5409 coding_systems
5410 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5411 detect_info.found = detect_info.rejected = 0;
5412 if (CONSP (coding_systems)
5413 && detect_coding_utf_16 (coding, &detect_info))
5414 {
5415 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5416 setup_coding_system (XCAR (coding_systems), coding);
5417 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5418 setup_coding_system (XCDR (coding_systems), coding);
5419 }
5420 }
5421 }
5422
5423
5424 static void
5425 decode_eol (coding)
5426 struct coding_system *coding;
5427 {
5428 Lisp_Object eol_type;
5429 unsigned char *p, *pbeg, *pend;
5430
5431 eol_type = CODING_ID_EOL_TYPE (coding->id);
5432 if (EQ (eol_type, Qunix))
5433 return;
5434
5435 if (NILP (coding->dst_object))
5436 pbeg = coding->destination;
5437 else
5438 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5439 pend = pbeg + coding->produced;
5440
5441 if (VECTORP (eol_type))
5442 {
5443 int eol_seen = EOL_SEEN_NONE;
5444
5445 for (p = pbeg; p < pend; p++)
5446 {
5447 if (*p == '\n')
5448 eol_seen |= EOL_SEEN_LF;
5449 else if (*p == '\r')
5450 {
5451 if (p + 1 < pend && *(p + 1) == '\n')
5452 {
5453 eol_seen |= EOL_SEEN_CRLF;
5454 p++;
5455 }
5456 else
5457 eol_seen |= EOL_SEEN_CR;
5458 }
5459 }
5460 if (eol_seen != EOL_SEEN_NONE
5461 && eol_seen != EOL_SEEN_LF
5462 && eol_seen != EOL_SEEN_CRLF
5463 && eol_seen != EOL_SEEN_CR)
5464 eol_seen = EOL_SEEN_LF;
5465 if (eol_seen != EOL_SEEN_NONE)
5466 eol_type = adjust_coding_eol_type (coding, eol_seen);
5467 }
5468
5469 if (EQ (eol_type, Qmac))
5470 {
5471 for (p = pbeg; p < pend; p++)
5472 if (*p == '\r')
5473 *p = '\n';
5474 }
5475 else if (EQ (eol_type, Qdos))
5476 {
5477 int n = 0;
5478
5479 if (NILP (coding->dst_object))
5480 {
5481 for (p = pend - 2; p >= pbeg; p--)
5482 if (*p == '\r')
5483 {
5484 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5485 n++;
5486 }
5487 }
5488 else
5489 {
5490 for (p = pend - 2; p >= pbeg; p--)
5491 if (*p == '\r')
5492 {
5493 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5494 int pos = BYTE_TO_CHAR (pos_byte);
5495
5496 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5497 n++;
5498 }
5499 }
5500 coding->produced -= n;
5501 coding->produced_char -= n;
5502 }
5503 }
5504
5505
5506 /* Return a translation table (or list of them) from coding system
5507 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5508 decoding (ENCODEP is zero). */
5509
5510 static Lisp_Object
5511 get_translation_table (attrs, encodep, max_lookup)
5512 Lisp_Object attrs;
5513 int encodep, *max_lookup;
5514 {
5515 Lisp_Object standard, translation_table;
5516 Lisp_Object val;
5517
5518 if (encodep)
5519 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5520 standard = Vstandard_translation_table_for_encode;
5521 else
5522 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5523 standard = Vstandard_translation_table_for_decode;
5524 if (NILP (translation_table))
5525 translation_table = standard;
5526 else
5527 {
5528 if (SYMBOLP (translation_table))
5529 translation_table = Fget (translation_table, Qtranslation_table);
5530 else if (CONSP (translation_table))
5531 {
5532 translation_table = Fcopy_sequence (translation_table);
5533 for (val = translation_table; CONSP (val); val = XCDR (val))
5534 if (SYMBOLP (XCAR (val)))
5535 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5536 }
5537 if (CHAR_TABLE_P (standard))
5538 {
5539 if (CONSP (translation_table))
5540 translation_table = nconc2 (translation_table,
5541 Fcons (standard, Qnil));
5542 else
5543 translation_table = Fcons (translation_table,
5544 Fcons (standard, Qnil));
5545 }
5546 }
5547 *max_lookup = 1;
5548 if (CHAR_TABLE_P (translation_table)
5549 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5550 {
5551 val = XCHAR_TABLE (translation_table)->extras[1];
5552 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5553 *max_lookup = XFASTINT (val);
5554 }
5555 else if (CONSP (translation_table))
5556 {
5557 Lisp_Object tail, val;
5558
5559 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5560 if (CHAR_TABLE_P (XCAR (tail))
5561 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5562 {
5563 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5564 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5565 *max_lookup = XFASTINT (val);
5566 }
5567 }
5568 return translation_table;
5569 }
5570
5571 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
5572 do { \
5573 trans = Qnil; \
5574 if (CHAR_TABLE_P (table)) \
5575 { \
5576 trans = CHAR_TABLE_REF (table, c); \
5577 if (CHARACTERP (trans)) \
5578 c = XFASTINT (trans), trans = Qnil; \
5579 } \
5580 else if (CONSP (table)) \
5581 { \
5582 Lisp_Object tail; \
5583 \
5584 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
5585 if (CHAR_TABLE_P (XCAR (tail))) \
5586 { \
5587 trans = CHAR_TABLE_REF (XCAR (tail), c); \
5588 if (CHARACTERP (trans)) \
5589 c = XFASTINT (trans), trans = Qnil; \
5590 else if (! NILP (trans)) \
5591 break; \
5592 } \
5593 } \
5594 } while (0)
5595
5596
5597 static Lisp_Object
5598 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5599 Lisp_Object val;
5600 int *buf, *buf_end;
5601 int last_block;
5602 int *from_nchars, *to_nchars;
5603 {
5604 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
5605 [TO-CHAR ...]. */
5606 if (CONSP (val))
5607 {
5608 Lisp_Object from, tail;
5609 int i, len;
5610
5611 for (tail = val; CONSP (tail); tail = XCDR (tail))
5612 {
5613 val = XCAR (tail);
5614 from = XCAR (val);
5615 len = ASIZE (from);
5616 for (i = 0; i < len; i++)
5617 {
5618 if (buf + i == buf_end)
5619 {
5620 if (! last_block)
5621 return Qt;
5622 break;
5623 }
5624 if (XINT (AREF (from, i)) != buf[i])
5625 break;
5626 }
5627 if (i == len)
5628 {
5629 val = XCDR (val);
5630 *from_nchars = len;
5631 break;
5632 }
5633 }
5634 if (! CONSP (tail))
5635 return Qnil;
5636 }
5637 if (VECTORP (val))
5638 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5639 else
5640 *buf = XINT (val);
5641 return val;
5642 }
5643
5644
5645 static int
5646 produce_chars (coding, translation_table, last_block)
5647 struct coding_system *coding;
5648 Lisp_Object translation_table;
5649 int last_block;
5650 {
5651 unsigned char *dst = coding->destination + coding->produced;
5652 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5653 int produced;
5654 int produced_chars = 0;
5655 int carryover = 0;
5656
5657 if (! coding->chars_at_source)
5658 {
5659 /* Characters are in coding->charbuf. */
5660 int *buf = coding->charbuf;
5661 int *buf_end = buf + coding->charbuf_used;
5662
5663 if (BUFFERP (coding->src_object)
5664 && EQ (coding->src_object, coding->dst_object))
5665 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5666
5667 while (buf < buf_end)
5668 {
5669 int c = *buf, i;
5670
5671 if (c >= 0)
5672 {
5673 int from_nchars = 1, to_nchars = 1;
5674 Lisp_Object trans = Qnil;
5675
5676 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5677 if (! NILP (trans))
5678 {
5679 trans = get_translation (trans, buf, buf_end, last_block,
5680 &from_nchars, &to_nchars);
5681 if (EQ (trans, Qt))
5682 break;
5683 c = *buf;
5684 }
5685
5686 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5687 {
5688 dst = alloc_destination (coding,
5689 buf_end - buf
5690 + MAX_MULTIBYTE_LENGTH * to_nchars,
5691 dst);
5692 dst_end = coding->destination + coding->dst_bytes;
5693 }
5694
5695 for (i = 0; i < to_nchars; i++)
5696 {
5697 if (i > 0)
5698 c = XINT (AREF (trans, i));
5699 if (coding->dst_multibyte
5700 || ! CHAR_BYTE8_P (c))
5701 CHAR_STRING_ADVANCE (c, dst);
5702 else
5703 *dst++ = CHAR_TO_BYTE8 (c);
5704 }
5705 produced_chars += to_nchars;
5706 *buf++ = to_nchars;
5707 while (--from_nchars > 0)
5708 *buf++ = 0;
5709 }
5710 else
5711 /* This is an annotation datum. (-C) is the length. */
5712 buf += -c;
5713 }
5714 carryover = buf_end - buf;
5715 }
5716 else
5717 {
5718 const unsigned char *src = coding->source;
5719 const unsigned char *src_end = src + coding->src_bytes;
5720 Lisp_Object eol_type;
5721
5722 eol_type = CODING_ID_EOL_TYPE (coding->id);
5723
5724 if (coding->src_multibyte != coding->dst_multibyte)
5725 {
5726 if (coding->src_multibyte)
5727 {
5728 int multibytep = 1;
5729 int consumed_chars;
5730
5731 while (1)
5732 {
5733 const unsigned char *src_base = src;
5734 int c;
5735
5736 ONE_MORE_BYTE (c);
5737 if (c == '\r')
5738 {
5739 if (EQ (eol_type, Qdos))
5740 {
5741 if (src == src_end)
5742 {
5743 record_conversion_result
5744 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5745 goto no_more_source;
5746 }
5747 if (*src == '\n')
5748 c = *src++;
5749 }
5750 else if (EQ (eol_type, Qmac))
5751 c = '\n';
5752 }
5753 if (dst == dst_end)
5754 {
5755 coding->consumed = src - coding->source;
5756
5757 if (EQ (coding->src_object, coding->dst_object))
5758 dst_end = (unsigned char *) src;
5759 if (dst == dst_end)
5760 {
5761 dst = alloc_destination (coding, src_end - src + 1,
5762 dst);
5763 dst_end = coding->destination + coding->dst_bytes;
5764 coding_set_source (coding);
5765 src = coding->source + coding->consumed;
5766 src_end = coding->source + coding->src_bytes;
5767 }
5768 }
5769 *dst++ = c;
5770 produced_chars++;
5771 }
5772 no_more_source:
5773 ;
5774 }
5775 else
5776 while (src < src_end)
5777 {
5778 int multibytep = 1;
5779 int c = *src++;
5780
5781 if (c == '\r')
5782 {
5783 if (EQ (eol_type, Qdos))
5784 {
5785 if (src < src_end
5786 && *src == '\n')
5787 c = *src++;
5788 }
5789 else if (EQ (eol_type, Qmac))
5790 c = '\n';
5791 }
5792 if (dst >= dst_end - 1)
5793 {
5794 coding->consumed = src - coding->source;
5795
5796 if (EQ (coding->src_object, coding->dst_object))
5797 dst_end = (unsigned char *) src;
5798 if (dst >= dst_end - 1)
5799 {
5800 dst = alloc_destination (coding, src_end - src + 2,
5801 dst);
5802 dst_end = coding->destination + coding->dst_bytes;
5803 coding_set_source (coding);
5804 src = coding->source + coding->consumed;
5805 src_end = coding->source + coding->src_bytes;
5806 }
5807 }
5808 EMIT_ONE_BYTE (c);
5809 }
5810 }
5811 else
5812 {
5813 if (!EQ (coding->src_object, coding->dst_object))
5814 {
5815 int require = coding->src_bytes - coding->dst_bytes;
5816
5817 if (require > 0)
5818 {
5819 EMACS_INT offset = src - coding->source;
5820
5821 dst = alloc_destination (coding, require, dst);
5822 coding_set_source (coding);
5823 src = coding->source + offset;
5824 src_end = coding->source + coding->src_bytes;
5825 }
5826 }
5827 produced_chars = coding->src_chars;
5828 while (src < src_end)
5829 {
5830 int c = *src++;
5831
5832 if (c == '\r')
5833 {
5834 if (EQ (eol_type, Qdos))
5835 {
5836 if (src < src_end
5837 && *src == '\n')
5838 c = *src++;
5839 produced_chars--;
5840 }
5841 else if (EQ (eol_type, Qmac))
5842 c = '\n';
5843 }
5844 *dst++ = c;
5845 }
5846 }
5847 coding->consumed = coding->src_bytes;
5848 coding->consumed_char = coding->src_chars;
5849 }
5850
5851 produced = dst - (coding->destination + coding->produced);
5852 if (BUFFERP (coding->dst_object))
5853 insert_from_gap (produced_chars, produced);
5854 coding->produced += produced;
5855 coding->produced_char += produced_chars;
5856 return carryover;
5857 }
5858
5859 /* Compose text in CODING->object according to the annotation data at
5860 CHARBUF. CHARBUF is an array:
5861 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5862 */
5863
5864 static INLINE void
5865 produce_composition (coding, charbuf, pos)
5866 struct coding_system *coding;
5867 int *charbuf;
5868 EMACS_INT pos;
5869 {
5870 int len;
5871 EMACS_INT to;
5872 enum composition_method method;
5873 Lisp_Object components;
5874
5875 len = -charbuf[0];
5876 to = pos + charbuf[2];
5877 method = (enum composition_method) (charbuf[3]);
5878
5879 if (method == COMPOSITION_RELATIVE)
5880 components = Qnil;
5881 else
5882 {
5883 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5884 int i;
5885
5886 len -= 4;
5887 charbuf += 4;
5888 for (i = 0; i < len; i++)
5889 args[i] = make_number (charbuf[i]);
5890 components = (method == COMPOSITION_WITH_ALTCHARS
5891 ? Fstring (len, args) : Fvector (len, args));
5892 }
5893 compose_text (pos, to, components, Qnil, coding->dst_object);
5894 }
5895
5896
5897 /* Put `charset' property on text in CODING->object according to
5898 the annotation data at CHARBUF. CHARBUF is an array:
5899 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
5900 */
5901
5902 static INLINE void
5903 produce_charset (coding, charbuf, pos)
5904 struct coding_system *coding;
5905 int *charbuf;
5906 EMACS_INT pos;
5907 {
5908 EMACS_INT from = pos - charbuf[2];
5909 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
5910
5911 Fput_text_property (make_number (from), make_number (pos),
5912 Qcharset, CHARSET_NAME (charset),
5913 coding->dst_object);
5914 }
5915
5916
5917 #define CHARBUF_SIZE 0x4000
5918
5919 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5920 do { \
5921 int size = CHARBUF_SIZE;; \
5922 \
5923 coding->charbuf = NULL; \
5924 while (size > 1024) \
5925 { \
5926 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5927 if (coding->charbuf) \
5928 break; \
5929 size >>= 1; \
5930 } \
5931 if (! coding->charbuf) \
5932 { \
5933 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5934 return coding->result; \
5935 } \
5936 coding->charbuf_size = size; \
5937 } while (0)
5938
5939
5940 static void
5941 produce_annotation (coding, pos)
5942 struct coding_system *coding;
5943 EMACS_INT pos;
5944 {
5945 int *charbuf = coding->charbuf;
5946 int *charbuf_end = charbuf + coding->charbuf_used;
5947
5948 if (NILP (coding->dst_object))
5949 return;
5950
5951 while (charbuf < charbuf_end)
5952 {
5953 if (*charbuf >= 0)
5954 pos += *charbuf++;
5955 else
5956 {
5957 int len = -*charbuf;
5958 switch (charbuf[1])
5959 {
5960 case CODING_ANNOTATE_COMPOSITION_MASK:
5961 produce_composition (coding, charbuf, pos);
5962 break;
5963 case CODING_ANNOTATE_CHARSET_MASK:
5964 produce_charset (coding, charbuf, pos);
5965 break;
5966 default:
5967 abort ();
5968 }
5969 charbuf += len;
5970 }
5971 }
5972 }
5973
5974 /* Decode the data at CODING->src_object into CODING->dst_object.
5975 CODING->src_object is a buffer, a string, or nil.
5976 CODING->dst_object is a buffer.
5977
5978 If CODING->src_object is a buffer, it must be the current buffer.
5979 In this case, if CODING->src_pos is positive, it is a position of
5980 the source text in the buffer, otherwise, the source text is in the
5981 gap area of the buffer, and CODING->src_pos specifies the offset of
5982 the text from GPT (which must be the same as PT). If this is the
5983 same buffer as CODING->dst_object, CODING->src_pos must be
5984 negative.
5985
5986 If CODING->src_object is a string, CODING->src_pos in an index to
5987 that string.
5988
5989 If CODING->src_object is nil, CODING->source must already point to
5990 the non-relocatable memory area. In this case, CODING->src_pos is
5991 an offset from CODING->source.
5992
5993 The decoded data is inserted at the current point of the buffer
5994 CODING->dst_object.
5995 */
5996
5997 static int
5998 decode_coding (coding)
5999 struct coding_system *coding;
6000 {
6001 Lisp_Object attrs;
6002 Lisp_Object undo_list;
6003 Lisp_Object translation_table;
6004 int carryover;
6005 int max_lookup;
6006 int i;
6007
6008 if (BUFFERP (coding->src_object)
6009 && coding->src_pos > 0
6010 && coding->src_pos < GPT
6011 && coding->src_pos + coding->src_chars > GPT)
6012 move_gap_both (coding->src_pos, coding->src_pos_byte);
6013
6014 undo_list = Qt;
6015 if (BUFFERP (coding->dst_object))
6016 {
6017 if (current_buffer != XBUFFER (coding->dst_object))
6018 set_buffer_internal (XBUFFER (coding->dst_object));
6019 if (GPT != PT)
6020 move_gap_both (PT, PT_BYTE);
6021 undo_list = current_buffer->undo_list;
6022 current_buffer->undo_list = Qt;
6023 }
6024
6025 coding->consumed = coding->consumed_char = 0;
6026 coding->produced = coding->produced_char = 0;
6027 coding->chars_at_source = 0;
6028 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6029 coding->errors = 0;
6030
6031 ALLOC_CONVERSION_WORK_AREA (coding);
6032
6033 attrs = CODING_ID_ATTRS (coding->id);
6034 translation_table = get_translation_table (attrs, 0, &max_lookup);
6035
6036 carryover = 0;
6037 do
6038 {
6039 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6040
6041 coding_set_source (coding);
6042 coding->annotated = 0;
6043 coding->charbuf_used = carryover;
6044 (*(coding->decoder)) (coding);
6045 coding_set_destination (coding);
6046 carryover = produce_chars (coding, translation_table, 0);
6047 if (coding->annotated)
6048 produce_annotation (coding, pos);
6049 for (i = 0; i < carryover; i++)
6050 coding->charbuf[i]
6051 = coding->charbuf[coding->charbuf_used - carryover + i];
6052 }
6053 while (coding->consumed < coding->src_bytes
6054 && ! coding->result);
6055
6056 if (carryover > 0)
6057 {
6058 coding_set_destination (coding);
6059 coding->charbuf_used = carryover;
6060 produce_chars (coding, translation_table, 1);
6061 }
6062
6063 coding->carryover_bytes = 0;
6064 if (coding->consumed < coding->src_bytes)
6065 {
6066 int nbytes = coding->src_bytes - coding->consumed;
6067 const unsigned char *src;
6068
6069 coding_set_source (coding);
6070 coding_set_destination (coding);
6071 src = coding->source + coding->consumed;
6072
6073 if (coding->mode & CODING_MODE_LAST_BLOCK)
6074 {
6075 /* Flush out unprocessed data as binary chars. We are sure
6076 that the number of data is less than the size of
6077 coding->charbuf. */
6078 coding->charbuf_used = 0;
6079 while (nbytes-- > 0)
6080 {
6081 int c = *src++;
6082
6083 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
6084 }
6085 produce_chars (coding);
6086 }
6087 else
6088 {
6089 /* Record unprocessed bytes in coding->carryover. We are
6090 sure that the number of data is less than the size of
6091 coding->carryover. */
6092 unsigned char *p = coding->carryover;
6093
6094 coding->carryover_bytes = nbytes;
6095 while (nbytes-- > 0)
6096 *p++ = *src++;
6097 }
6098 coding->consumed = coding->src_bytes;
6099 }
6100
6101 if (BUFFERP (coding->dst_object))
6102 {
6103 current_buffer->undo_list = undo_list;
6104 record_insert (coding->dst_pos, coding->produced_char);
6105 }
6106 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6107 decode_eol (coding);
6108 return coding->result;
6109 }
6110
6111
6112 /* Extract an annotation datum from a composition starting at POS and
6113 ending before LIMIT of CODING->src_object (buffer or string), store
6114 the data in BUF, set *STOP to a starting position of the next
6115 composition (if any) or to LIMIT, and return the address of the
6116 next element of BUF.
6117
6118 If such an annotation is not found, set *STOP to a starting
6119 position of a composition after POS (if any) or to LIMIT, and
6120 return BUF. */
6121
6122 static INLINE int *
6123 handle_composition_annotation (pos, limit, coding, buf, stop)
6124 EMACS_INT pos, limit;
6125 struct coding_system *coding;
6126 int *buf;
6127 EMACS_INT *stop;
6128 {
6129 EMACS_INT start, end;
6130 Lisp_Object prop;
6131
6132 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6133 || end > limit)
6134 *stop = limit;
6135 else if (start > pos)
6136 *stop = start;
6137 else
6138 {
6139 if (start == pos)
6140 {
6141 /* We found a composition. Store the corresponding
6142 annotation data in BUF. */
6143 int *head = buf;
6144 enum composition_method method = COMPOSITION_METHOD (prop);
6145 int nchars = COMPOSITION_LENGTH (prop);
6146
6147 ADD_COMPOSITION_DATA (buf, nchars, method);
6148 if (method != COMPOSITION_RELATIVE)
6149 {
6150 Lisp_Object components;
6151 int len, i, i_byte;
6152
6153 components = COMPOSITION_COMPONENTS (prop);
6154 if (VECTORP (components))
6155 {
6156 len = XVECTOR (components)->size;
6157 for (i = 0; i < len; i++)
6158 *buf++ = XINT (AREF (components, i));
6159 }
6160 else if (STRINGP (components))
6161 {
6162 len = SCHARS (components);
6163 i = i_byte = 0;
6164 while (i < len)
6165 {
6166 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6167 buf++;
6168 }
6169 }
6170 else if (INTEGERP (components))
6171 {
6172 len = 1;
6173 *buf++ = XINT (components);
6174 }
6175 else if (CONSP (components))
6176 {
6177 for (len = 0; CONSP (components);
6178 len++, components = XCDR (components))
6179 *buf++ = XINT (XCAR (components));
6180 }
6181 else
6182 abort ();
6183 *head -= len;
6184 }
6185 }
6186
6187 if (find_composition (end, limit, &start, &end, &prop,
6188 coding->src_object)
6189 && end <= limit)
6190 *stop = start;
6191 else
6192 *stop = limit;
6193 }
6194 return buf;
6195 }
6196
6197
6198 /* Extract an annotation datum from a text property `charset' at POS of
6199 CODING->src_object (buffer of string), store the data in BUF, set
6200 *STOP to the position where the value of `charset' property changes
6201 (limiting by LIMIT), and return the address of the next element of
6202 BUF.
6203
6204 If the property value is nil, set *STOP to the position where the
6205 property value is non-nil (limiting by LIMIT), and return BUF. */
6206
6207 static INLINE int *
6208 handle_charset_annotation (pos, limit, coding, buf, stop)
6209 EMACS_INT pos, limit;
6210 struct coding_system *coding;
6211 int *buf;
6212 EMACS_INT *stop;
6213 {
6214 Lisp_Object val, next;
6215 int id;
6216
6217 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6218 if (! NILP (val) && CHARSETP (val))
6219 id = XINT (CHARSET_SYMBOL_ID (val));
6220 else
6221 id = -1;
6222 ADD_CHARSET_DATA (buf, 0, id);
6223 next = Fnext_single_property_change (make_number (pos), Qcharset,
6224 coding->src_object,
6225 make_number (limit));
6226 *stop = XINT (next);
6227 return buf;
6228 }
6229
6230
6231 static void
6232 consume_chars (coding, translation_table, max_lookup)
6233 struct coding_system *coding;
6234 Lisp_Object translation_table;
6235 int max_lookup;
6236 {
6237 int *buf = coding->charbuf;
6238 int *buf_end = coding->charbuf + coding->charbuf_size;
6239 const unsigned char *src = coding->source + coding->consumed;
6240 const unsigned char *src_end = coding->source + coding->src_bytes;
6241 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6242 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6243 int multibytep = coding->src_multibyte;
6244 Lisp_Object eol_type;
6245 int c;
6246 EMACS_INT stop, stop_composition, stop_charset;
6247 int *lookup_buf = NULL;
6248
6249 if (! NILP (translation_table))
6250 lookup_buf = alloca (sizeof (int) * max_lookup);
6251
6252 eol_type = CODING_ID_EOL_TYPE (coding->id);
6253 if (VECTORP (eol_type))
6254 eol_type = Qunix;
6255
6256 /* Note: composition handling is not yet implemented. */
6257 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6258
6259 if (NILP (coding->src_object))
6260 stop = stop_composition = stop_charset = end_pos;
6261 else
6262 {
6263 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6264 stop = stop_composition = pos;
6265 else
6266 stop = stop_composition = end_pos;
6267 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6268 stop = stop_charset = pos;
6269 else
6270 stop_charset = end_pos;
6271 }
6272
6273 /* Compensate for CRLF and conversion. */
6274 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6275 while (buf < buf_end)
6276 {
6277 Lisp_Object trans;
6278
6279 if (pos == stop)
6280 {
6281 if (pos == end_pos)
6282 break;
6283 if (pos == stop_composition)
6284 buf = handle_composition_annotation (pos, end_pos, coding,
6285 buf, &stop_composition);
6286 if (pos == stop_charset)
6287 buf = handle_charset_annotation (pos, end_pos, coding,
6288 buf, &stop_charset);
6289 stop = (stop_composition < stop_charset
6290 ? stop_composition : stop_charset);
6291 }
6292
6293 if (! multibytep)
6294 {
6295 EMACS_INT bytes;
6296
6297 if (! CODING_FOR_UNIBYTE (coding)
6298 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6299 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6300 else
6301 c = *src++, pos++;
6302 }
6303 else
6304 c = STRING_CHAR_ADVANCE (src), pos++;
6305 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6306 c = '\n';
6307 if (! EQ (eol_type, Qunix))
6308 {
6309 if (c == '\n')
6310 {
6311 if (EQ (eol_type, Qdos))
6312 *buf++ = '\r';
6313 else
6314 c = '\r';
6315 }
6316 }
6317
6318 trans = Qnil;
6319 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6320 if (NILP (trans))
6321 *buf++ = c;
6322 else
6323 {
6324 int from_nchars = 1, to_nchars = 1;
6325 int *lookup_buf_end;
6326 const unsigned char *p = src;
6327 int i;
6328
6329 lookup_buf[0] = c;
6330 for (i = 1; i < max_lookup && p < src_end; i++)
6331 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6332 lookup_buf_end = lookup_buf + i;
6333 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6334 &from_nchars, &to_nchars);
6335 if (EQ (trans, Qt)
6336 || buf + to_nchars > buf_end)
6337 break;
6338 *buf++ = *lookup_buf;
6339 for (i = 1; i < to_nchars; i++)
6340 *buf++ = XINT (AREF (trans, i));
6341 for (i = 1; i < from_nchars; i++, pos++)
6342 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6343 }
6344 }
6345
6346 coding->consumed = src - coding->source;
6347 coding->consumed_char = pos - coding->src_pos;
6348 coding->charbuf_used = buf - coding->charbuf;
6349 coding->chars_at_source = 0;
6350 }
6351
6352
6353 /* Encode the text at CODING->src_object into CODING->dst_object.
6354 CODING->src_object is a buffer or a string.
6355 CODING->dst_object is a buffer or nil.
6356
6357 If CODING->src_object is a buffer, it must be the current buffer.
6358 In this case, if CODING->src_pos is positive, it is a position of
6359 the source text in the buffer, otherwise. the source text is in the
6360 gap area of the buffer, and coding->src_pos specifies the offset of
6361 the text from GPT (which must be the same as PT). If this is the
6362 same buffer as CODING->dst_object, CODING->src_pos must be
6363 negative and CODING should not have `pre-write-conversion'.
6364
6365 If CODING->src_object is a string, CODING should not have
6366 `pre-write-conversion'.
6367
6368 If CODING->dst_object is a buffer, the encoded data is inserted at
6369 the current point of that buffer.
6370
6371 If CODING->dst_object is nil, the encoded data is placed at the
6372 memory area specified by CODING->destination. */
6373
6374 static int
6375 encode_coding (coding)
6376 struct coding_system *coding;
6377 {
6378 Lisp_Object attrs;
6379 Lisp_Object translation_table;
6380 int max_lookup;
6381
6382 attrs = CODING_ID_ATTRS (coding->id);
6383 translation_table = get_translation_table (attrs, 1, &max_lookup);
6384
6385 if (BUFFERP (coding->dst_object))
6386 {
6387 set_buffer_internal (XBUFFER (coding->dst_object));
6388 coding->dst_multibyte
6389 = ! NILP (current_buffer->enable_multibyte_characters);
6390 }
6391
6392 coding->consumed = coding->consumed_char = 0;
6393 coding->produced = coding->produced_char = 0;
6394 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6395 coding->errors = 0;
6396
6397 ALLOC_CONVERSION_WORK_AREA (coding);
6398
6399 do {
6400 coding_set_source (coding);
6401 consume_chars (coding, translation_table, max_lookup);
6402 coding_set_destination (coding);
6403 (*(coding->encoder)) (coding);
6404 } while (coding->consumed_char < coding->src_chars);
6405
6406 if (BUFFERP (coding->dst_object))
6407 insert_from_gap (coding->produced_char, coding->produced);
6408
6409 return (coding->result);
6410 }
6411
6412
6413 /* Name (or base name) of work buffer for code conversion. */
6414 static Lisp_Object Vcode_conversion_workbuf_name;
6415
6416 /* A working buffer used by the top level conversion. Once it is
6417 created, it is never destroyed. It has the name
6418 Vcode_conversion_workbuf_name. The other working buffers are
6419 destroyed after the use is finished, and their names are modified
6420 versions of Vcode_conversion_workbuf_name. */
6421 static Lisp_Object Vcode_conversion_reused_workbuf;
6422
6423 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6424 static int reused_workbuf_in_use;
6425
6426
6427 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6428 multibyteness of returning buffer. */
6429
6430 Lisp_Object
6431 make_conversion_work_buffer (multibyte)
6432 {
6433 Lisp_Object name, workbuf;
6434 struct buffer *current;
6435
6436 if (reused_workbuf_in_use++)
6437 {
6438 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6439 workbuf = Fget_buffer_create (name);
6440 }
6441 else
6442 {
6443 name = Vcode_conversion_workbuf_name;
6444 workbuf = Fget_buffer_create (name);
6445 if (NILP (Vcode_conversion_reused_workbuf))
6446 Vcode_conversion_reused_workbuf = workbuf;
6447 }
6448 current = current_buffer;
6449 set_buffer_internal (XBUFFER (workbuf));
6450 Ferase_buffer ();
6451 current_buffer->undo_list = Qt;
6452 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6453 set_buffer_internal (current);
6454 return workbuf;
6455 }
6456
6457
6458 static Lisp_Object
6459 code_conversion_restore (arg)
6460 Lisp_Object arg;
6461 {
6462 Lisp_Object current, workbuf;
6463
6464 current = XCAR (arg);
6465 workbuf = XCDR (arg);
6466 if (! NILP (workbuf))
6467 {
6468 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6469 reused_workbuf_in_use = 0;
6470 else if (! NILP (Fbuffer_live_p (workbuf)))
6471 Fkill_buffer (workbuf);
6472 }
6473 set_buffer_internal (XBUFFER (current));
6474 return Qnil;
6475 }
6476
6477 Lisp_Object
6478 code_conversion_save (with_work_buf, multibyte)
6479 int with_work_buf, multibyte;
6480 {
6481 Lisp_Object workbuf = Qnil;
6482
6483 if (with_work_buf)
6484 workbuf = make_conversion_work_buffer (multibyte);
6485 record_unwind_protect (code_conversion_restore,
6486 Fcons (Fcurrent_buffer (), workbuf));
6487 return workbuf;
6488 }
6489
6490 int
6491 decode_coding_gap (coding, chars, bytes)
6492 struct coding_system *coding;
6493 EMACS_INT chars, bytes;
6494 {
6495 int count = specpdl_ptr - specpdl;
6496 Lisp_Object attrs;
6497
6498 code_conversion_save (0, 0);
6499
6500 coding->src_object = Fcurrent_buffer ();
6501 coding->src_chars = chars;
6502 coding->src_bytes = bytes;
6503 coding->src_pos = -chars;
6504 coding->src_pos_byte = -bytes;
6505 coding->src_multibyte = chars < bytes;
6506 coding->dst_object = coding->src_object;
6507 coding->dst_pos = PT;
6508 coding->dst_pos_byte = PT_BYTE;
6509 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6510 coding->mode |= CODING_MODE_LAST_BLOCK;
6511
6512 if (CODING_REQUIRE_DETECTION (coding))
6513 detect_coding (coding);
6514
6515 decode_coding (coding);
6516
6517 attrs = CODING_ID_ATTRS (coding->id);
6518 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6519 {
6520 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6521 Lisp_Object val;
6522
6523 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6524 val = call1 (CODING_ATTR_POST_READ (attrs),
6525 make_number (coding->produced_char));
6526 CHECK_NATNUM (val);
6527 coding->produced_char += Z - prev_Z;
6528 coding->produced += Z_BYTE - prev_Z_BYTE;
6529 }
6530
6531 unbind_to (count, Qnil);
6532 return coding->result;
6533 }
6534
6535 int
6536 encode_coding_gap (coding, chars, bytes)
6537 struct coding_system *coding;
6538 EMACS_INT chars, bytes;
6539 {
6540 int count = specpdl_ptr - specpdl;
6541
6542 code_conversion_save (0, 0);
6543
6544 coding->src_object = Fcurrent_buffer ();
6545 coding->src_chars = chars;
6546 coding->src_bytes = bytes;
6547 coding->src_pos = -chars;
6548 coding->src_pos_byte = -bytes;
6549 coding->src_multibyte = chars < bytes;
6550 coding->dst_object = coding->src_object;
6551 coding->dst_pos = PT;
6552 coding->dst_pos_byte = PT_BYTE;
6553
6554 encode_coding (coding);
6555
6556 unbind_to (count, Qnil);
6557 return coding->result;
6558 }
6559
6560
6561 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6562 SRC_OBJECT into DST_OBJECT by coding context CODING.
6563
6564 SRC_OBJECT is a buffer, a string, or Qnil.
6565
6566 If it is a buffer, the text is at point of the buffer. FROM and TO
6567 are positions in the buffer.
6568
6569 If it is a string, the text is at the beginning of the string.
6570 FROM and TO are indices to the string.
6571
6572 If it is nil, the text is at coding->source. FROM and TO are
6573 indices to coding->source.
6574
6575 DST_OBJECT is a buffer, Qt, or Qnil.
6576
6577 If it is a buffer, the decoded text is inserted at point of the
6578 buffer. If the buffer is the same as SRC_OBJECT, the source text
6579 is deleted.
6580
6581 If it is Qt, a string is made from the decoded text, and
6582 set in CODING->dst_object.
6583
6584 If it is Qnil, the decoded text is stored at CODING->destination.
6585 The caller must allocate CODING->dst_bytes bytes at
6586 CODING->destination by xmalloc. If the decoded text is longer than
6587 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6588 */
6589
6590 void
6591 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6592 dst_object)
6593 struct coding_system *coding;
6594 Lisp_Object src_object;
6595 EMACS_INT from, from_byte, to, to_byte;
6596 Lisp_Object dst_object;
6597 {
6598 int count = specpdl_ptr - specpdl;
6599 unsigned char *destination;
6600 EMACS_INT dst_bytes;
6601 EMACS_INT chars = to - from;
6602 EMACS_INT bytes = to_byte - from_byte;
6603 Lisp_Object attrs;
6604 Lisp_Object buffer;
6605 int saved_pt = -1, saved_pt_byte;
6606
6607 buffer = Fcurrent_buffer ();
6608
6609 if (NILP (dst_object))
6610 {
6611 destination = coding->destination;
6612 dst_bytes = coding->dst_bytes;
6613 }
6614
6615 coding->src_object = src_object;
6616 coding->src_chars = chars;
6617 coding->src_bytes = bytes;
6618 coding->src_multibyte = chars < bytes;
6619
6620 if (STRINGP (src_object))
6621 {
6622 coding->src_pos = from;
6623 coding->src_pos_byte = from_byte;
6624 }
6625 else if (BUFFERP (src_object))
6626 {
6627 set_buffer_internal (XBUFFER (src_object));
6628 if (from != GPT)
6629 move_gap_both (from, from_byte);
6630 if (EQ (src_object, dst_object))
6631 {
6632 saved_pt = PT, saved_pt_byte = PT_BYTE;
6633 TEMP_SET_PT_BOTH (from, from_byte);
6634 del_range_both (from, from_byte, to, to_byte, 1);
6635 coding->src_pos = -chars;
6636 coding->src_pos_byte = -bytes;
6637 }
6638 else
6639 {
6640 coding->src_pos = from;
6641 coding->src_pos_byte = from_byte;
6642 }
6643 }
6644
6645 if (CODING_REQUIRE_DETECTION (coding))
6646 detect_coding (coding);
6647 attrs = CODING_ID_ATTRS (coding->id);
6648
6649 if (EQ (dst_object, Qt)
6650 || (! NILP (CODING_ATTR_POST_READ (attrs))
6651 && NILP (dst_object)))
6652 {
6653 coding->dst_object = code_conversion_save (1, 1);
6654 coding->dst_pos = BEG;
6655 coding->dst_pos_byte = BEG_BYTE;
6656 coding->dst_multibyte = 1;
6657 }
6658 else if (BUFFERP (dst_object))
6659 {
6660 code_conversion_save (0, 0);
6661 coding->dst_object = dst_object;
6662 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6663 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6664 coding->dst_multibyte
6665 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6666 }
6667 else
6668 {
6669 code_conversion_save (0, 0);
6670 coding->dst_object = Qnil;
6671 coding->dst_multibyte = 1;
6672 }
6673
6674 decode_coding (coding);
6675
6676 if (BUFFERP (coding->dst_object))
6677 set_buffer_internal (XBUFFER (coding->dst_object));
6678
6679 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6680 {
6681 struct gcpro gcpro1, gcpro2;
6682 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6683 Lisp_Object val;
6684
6685 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6686 GCPRO2 (coding->src_object, coding->dst_object);
6687 val = call1 (CODING_ATTR_POST_READ (attrs),
6688 make_number (coding->produced_char));
6689 UNGCPRO;
6690 CHECK_NATNUM (val);
6691 coding->produced_char += Z - prev_Z;
6692 coding->produced += Z_BYTE - prev_Z_BYTE;
6693 }
6694
6695 if (EQ (dst_object, Qt))
6696 {
6697 coding->dst_object = Fbuffer_string ();
6698 }
6699 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6700 {
6701 set_buffer_internal (XBUFFER (coding->dst_object));
6702 if (dst_bytes < coding->produced)
6703 {
6704 destination
6705 = (unsigned char *) xrealloc (destination, coding->produced);
6706 if (! destination)
6707 {
6708 record_conversion_result (coding,
6709 CODING_RESULT_INSUFFICIENT_DST);
6710 unbind_to (count, Qnil);
6711 return;
6712 }
6713 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6714 move_gap_both (BEGV, BEGV_BYTE);
6715 bcopy (BEGV_ADDR, destination, coding->produced);
6716 coding->destination = destination;
6717 }
6718 }
6719
6720 if (saved_pt >= 0)
6721 {
6722 /* This is the case of:
6723 (BUFFERP (src_object) && EQ (src_object, dst_object))
6724 As we have moved PT while replacing the original buffer
6725 contents, we must recover it now. */
6726 set_buffer_internal (XBUFFER (src_object));
6727 if (saved_pt < from)
6728 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6729 else if (saved_pt < from + chars)
6730 TEMP_SET_PT_BOTH (from, from_byte);
6731 else if (! NILP (current_buffer->enable_multibyte_characters))
6732 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6733 saved_pt_byte + (coding->produced - bytes));
6734 else
6735 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6736 saved_pt_byte + (coding->produced - bytes));
6737 }
6738
6739 unbind_to (count, coding->dst_object);
6740 }
6741
6742
6743 void
6744 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6745 dst_object)
6746 struct coding_system *coding;
6747 Lisp_Object src_object;
6748 EMACS_INT from, from_byte, to, to_byte;
6749 Lisp_Object dst_object;
6750 {
6751 int count = specpdl_ptr - specpdl;
6752 EMACS_INT chars = to - from;
6753 EMACS_INT bytes = to_byte - from_byte;
6754 Lisp_Object attrs;
6755 Lisp_Object buffer;
6756 int saved_pt = -1, saved_pt_byte;
6757
6758 buffer = Fcurrent_buffer ();
6759
6760 coding->src_object = src_object;
6761 coding->src_chars = chars;
6762 coding->src_bytes = bytes;
6763 coding->src_multibyte = chars < bytes;
6764
6765 attrs = CODING_ID_ATTRS (coding->id);
6766
6767 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6768 {
6769 coding->src_object = code_conversion_save (1, coding->src_multibyte);
6770 set_buffer_internal (XBUFFER (coding->src_object));
6771 if (STRINGP (src_object))
6772 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6773 else if (BUFFERP (src_object))
6774 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6775 else
6776 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6777
6778 if (EQ (src_object, dst_object))
6779 {
6780 set_buffer_internal (XBUFFER (src_object));
6781 saved_pt = PT, saved_pt_byte = PT_BYTE;
6782 del_range_both (from, from_byte, to, to_byte, 1);
6783 set_buffer_internal (XBUFFER (coding->src_object));
6784 }
6785
6786 call2 (CODING_ATTR_PRE_WRITE (attrs),
6787 make_number (BEG), make_number (Z));
6788 coding->src_object = Fcurrent_buffer ();
6789 if (BEG != GPT)
6790 move_gap_both (BEG, BEG_BYTE);
6791 coding->src_chars = Z - BEG;
6792 coding->src_bytes = Z_BYTE - BEG_BYTE;
6793 coding->src_pos = BEG;
6794 coding->src_pos_byte = BEG_BYTE;
6795 coding->src_multibyte = Z < Z_BYTE;
6796 }
6797 else if (STRINGP (src_object))
6798 {
6799 code_conversion_save (0, 0);
6800 coding->src_pos = from;
6801 coding->src_pos_byte = from_byte;
6802 }
6803 else if (BUFFERP (src_object))
6804 {
6805 code_conversion_save (0, 0);
6806 set_buffer_internal (XBUFFER (src_object));
6807 if (EQ (src_object, dst_object))
6808 {
6809 saved_pt = PT, saved_pt_byte = PT_BYTE;
6810 coding->src_object = del_range_1 (from, to, 1, 1);
6811 coding->src_pos = 0;
6812 coding->src_pos_byte = 0;
6813 }
6814 else
6815 {
6816 if (from < GPT && to >= GPT)
6817 move_gap_both (from, from_byte);
6818 coding->src_pos = from;
6819 coding->src_pos_byte = from_byte;
6820 }
6821 }
6822 else
6823 code_conversion_save (0, 0);
6824
6825 if (BUFFERP (dst_object))
6826 {
6827 coding->dst_object = dst_object;
6828 if (EQ (src_object, dst_object))
6829 {
6830 coding->dst_pos = from;
6831 coding->dst_pos_byte = from_byte;
6832 }
6833 else
6834 {
6835 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6836 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6837 }
6838 coding->dst_multibyte
6839 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6840 }
6841 else if (EQ (dst_object, Qt))
6842 {
6843 coding->dst_object = Qnil;
6844 coding->dst_bytes = coding->src_chars;
6845 if (coding->dst_bytes == 0)
6846 coding->dst_bytes = 1;
6847 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6848 coding->dst_multibyte = 0;
6849 }
6850 else
6851 {
6852 coding->dst_object = Qnil;
6853 coding->dst_multibyte = 0;
6854 }
6855
6856 encode_coding (coding);
6857
6858 if (EQ (dst_object, Qt))
6859 {
6860 if (BUFFERP (coding->dst_object))
6861 coding->dst_object = Fbuffer_string ();
6862 else
6863 {
6864 coding->dst_object
6865 = make_unibyte_string ((char *) coding->destination,
6866 coding->produced);
6867 xfree (coding->destination);
6868 }
6869 }
6870
6871 if (saved_pt >= 0)
6872 {
6873 /* This is the case of:
6874 (BUFFERP (src_object) && EQ (src_object, dst_object))
6875 As we have moved PT while replacing the original buffer
6876 contents, we must recover it now. */
6877 set_buffer_internal (XBUFFER (src_object));
6878 if (saved_pt < from)
6879 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6880 else if (saved_pt < from + chars)
6881 TEMP_SET_PT_BOTH (from, from_byte);
6882 else if (! NILP (current_buffer->enable_multibyte_characters))
6883 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6884 saved_pt_byte + (coding->produced - bytes));
6885 else
6886 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6887 saved_pt_byte + (coding->produced - bytes));
6888 }
6889
6890 unbind_to (count, Qnil);
6891 }
6892
6893
6894 Lisp_Object
6895 preferred_coding_system ()
6896 {
6897 int id = coding_categories[coding_priorities[0]].id;
6898
6899 return CODING_ID_NAME (id);
6900 }
6901
6902 \f
6903 #ifdef emacs
6904 /*** 8. Emacs Lisp library functions ***/
6905
6906 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6907 doc: /* Return t if OBJECT is nil or a coding-system.
6908 See the documentation of `define-coding-system' for information
6909 about coding-system objects. */)
6910 (obj)
6911 Lisp_Object obj;
6912 {
6913 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6914 }
6915
6916 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6917 Sread_non_nil_coding_system, 1, 1, 0,
6918 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6919 (prompt)
6920 Lisp_Object prompt;
6921 {
6922 Lisp_Object val;
6923 do
6924 {
6925 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6926 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6927 }
6928 while (SCHARS (val) == 0);
6929 return (Fintern (val, Qnil));
6930 }
6931
6932 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6933 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6934 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6935 (prompt, default_coding_system)
6936 Lisp_Object prompt, default_coding_system;
6937 {
6938 Lisp_Object val;
6939 if (SYMBOLP (default_coding_system))
6940 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6941 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6942 Qt, Qnil, Qcoding_system_history,
6943 default_coding_system, Qnil);
6944 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6945 }
6946
6947 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6948 1, 1, 0,
6949 doc: /* Check validity of CODING-SYSTEM.
6950 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6951 (coding_system)
6952 Lisp_Object coding_system;
6953 {
6954 CHECK_SYMBOL (coding_system);
6955 if (!NILP (Fcoding_system_p (coding_system)))
6956 return coding_system;
6957 while (1)
6958 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6959 }
6960
6961 \f
6962 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6963 HIGHEST is nonzero, return the coding system of the highest
6964 priority among the detected coding systems. Otherwize return a
6965 list of detected coding systems sorted by their priorities. If
6966 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6967 multibyte form but contains only ASCII and eight-bit chars.
6968 Otherwise, the bytes are raw bytes.
6969
6970 CODING-SYSTEM controls the detection as below:
6971
6972 If it is nil, detect both text-format and eol-format. If the
6973 text-format part of CODING-SYSTEM is already specified
6974 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6975 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6976 detect only text-format. */
6977
6978 Lisp_Object
6979 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6980 coding_system)
6981 const unsigned char *src;
6982 int src_chars, src_bytes, highest;
6983 int multibytep;
6984 Lisp_Object coding_system;
6985 {
6986 const unsigned char *src_end = src + src_bytes;
6987 Lisp_Object attrs, eol_type;
6988 Lisp_Object val;
6989 struct coding_system coding;
6990 int id;
6991 struct coding_detection_info detect_info;
6992 enum coding_category base_category;
6993
6994 if (NILP (coding_system))
6995 coding_system = Qundecided;
6996 setup_coding_system (coding_system, &coding);
6997 attrs = CODING_ID_ATTRS (coding.id);
6998 eol_type = CODING_ID_EOL_TYPE (coding.id);
6999 coding_system = CODING_ATTR_BASE_NAME (attrs);
7000
7001 coding.source = src;
7002 coding.src_chars = src_chars;
7003 coding.src_bytes = src_bytes;
7004 coding.src_multibyte = multibytep;
7005 coding.consumed = 0;
7006 coding.mode |= CODING_MODE_LAST_BLOCK;
7007
7008 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7009
7010 /* At first, detect text-format if necessary. */
7011 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7012 if (base_category == coding_category_undecided)
7013 {
7014 enum coding_category category;
7015 struct coding_system *this;
7016 int c, i;
7017
7018 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7019 for (i = 0; src < src_end; i++, src++)
7020 {
7021 c = *src;
7022 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
7023 || c == ISO_CODE_SI
7024 || c == ISO_CODE_SO)))
7025 break;
7026 }
7027 coding.head_ascii = src - coding.source;
7028
7029 if (src < src_end)
7030 for (i = 0; i < coding_category_raw_text; i++)
7031 {
7032 category = coding_priorities[i];
7033 this = coding_categories + category;
7034
7035 if (this->id < 0)
7036 {
7037 /* No coding system of this category is defined. */
7038 detect_info.rejected |= (1 << category);
7039 }
7040 else if (category >= coding_category_raw_text)
7041 continue;
7042 else if (detect_info.checked & (1 << category))
7043 {
7044 if (highest
7045 && (detect_info.found & (1 << category)))
7046 break;
7047 }
7048 else
7049 {
7050 if ((*(this->detector)) (&coding, &detect_info)
7051 && highest
7052 && (detect_info.found & (1 << category)))
7053 {
7054 if (category == coding_category_utf_16_auto)
7055 {
7056 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7057 category = coding_category_utf_16_le;
7058 else
7059 category = coding_category_utf_16_be;
7060 }
7061 break;
7062 }
7063 }
7064 }
7065
7066 if (detect_info.rejected == CATEGORY_MASK_ANY)
7067 {
7068 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7069 id = coding_categories[coding_category_raw_text].id;
7070 val = Fcons (make_number (id), Qnil);
7071 }
7072 else if (! detect_info.rejected && ! detect_info.found)
7073 {
7074 detect_info.found = CATEGORY_MASK_ANY;
7075 id = coding_categories[coding_category_undecided].id;
7076 val = Fcons (make_number (id), Qnil);
7077 }
7078 else if (highest)
7079 {
7080 if (detect_info.found)
7081 {
7082 detect_info.found = 1 << category;
7083 val = Fcons (make_number (this->id), Qnil);
7084 }
7085 else
7086 for (i = 0; i < coding_category_raw_text; i++)
7087 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7088 {
7089 detect_info.found = 1 << coding_priorities[i];
7090 id = coding_categories[coding_priorities[i]].id;
7091 val = Fcons (make_number (id), Qnil);
7092 break;
7093 }
7094 }
7095 else
7096 {
7097 int mask = detect_info.rejected | detect_info.found;
7098 int found = 0;
7099 val = Qnil;
7100
7101 for (i = coding_category_raw_text - 1; i >= 0; i--)
7102 {
7103 category = coding_priorities[i];
7104 if (! (mask & (1 << category)))
7105 {
7106 found |= 1 << category;
7107 id = coding_categories[category].id;
7108 val = Fcons (make_number (id), val);
7109 }
7110 }
7111 for (i = coding_category_raw_text - 1; i >= 0; i--)
7112 {
7113 category = coding_priorities[i];
7114 if (detect_info.found & (1 << category))
7115 {
7116 id = coding_categories[category].id;
7117 val = Fcons (make_number (id), val);
7118 }
7119 }
7120 detect_info.found |= found;
7121 }
7122 }
7123 else if (base_category == coding_category_utf_16_auto)
7124 {
7125 if (detect_coding_utf_16 (&coding, &detect_info))
7126 {
7127 enum coding_category category;
7128 struct coding_system *this;
7129
7130 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7131 this = coding_categories + coding_category_utf_16_le;
7132 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7133 this = coding_categories + coding_category_utf_16_be;
7134 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7135 this = coding_categories + coding_category_utf_16_be_nosig;
7136 else
7137 this = coding_categories + coding_category_utf_16_le_nosig;
7138 val = Fcons (make_number (this->id), Qnil);
7139 }
7140 }
7141 else
7142 {
7143 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7144 val = Fcons (make_number (coding.id), Qnil);
7145 }
7146
7147 /* Then, detect eol-format if necessary. */
7148 {
7149 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7150 Lisp_Object tail;
7151
7152 if (VECTORP (eol_type))
7153 {
7154 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7155 normal_eol = detect_eol (coding.source, src_bytes,
7156 coding_category_raw_text);
7157 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7158 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7159 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7160 coding_category_utf_16_be);
7161 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7162 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7163 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7164 coding_category_utf_16_le);
7165 }
7166 else
7167 {
7168 if (EQ (eol_type, Qunix))
7169 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7170 else if (EQ (eol_type, Qdos))
7171 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7172 else
7173 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7174 }
7175
7176 for (tail = val; CONSP (tail); tail = XCDR (tail))
7177 {
7178 enum coding_category category;
7179 int this_eol;
7180
7181 id = XINT (XCAR (tail));
7182 attrs = CODING_ID_ATTRS (id);
7183 category = XINT (CODING_ATTR_CATEGORY (attrs));
7184 eol_type = CODING_ID_EOL_TYPE (id);
7185 if (VECTORP (eol_type))
7186 {
7187 if (category == coding_category_utf_16_be
7188 || category == coding_category_utf_16_be_nosig)
7189 this_eol = utf_16_be_eol;
7190 else if (category == coding_category_utf_16_le
7191 || category == coding_category_utf_16_le_nosig)
7192 this_eol = utf_16_le_eol;
7193 else
7194 this_eol = normal_eol;
7195
7196 if (this_eol == EOL_SEEN_LF)
7197 XSETCAR (tail, AREF (eol_type, 0));
7198 else if (this_eol == EOL_SEEN_CRLF)
7199 XSETCAR (tail, AREF (eol_type, 1));
7200 else if (this_eol == EOL_SEEN_CR)
7201 XSETCAR (tail, AREF (eol_type, 2));
7202 else
7203 XSETCAR (tail, CODING_ID_NAME (id));
7204 }
7205 else
7206 XSETCAR (tail, CODING_ID_NAME (id));
7207 }
7208 }
7209
7210 return (highest ? XCAR (val) : val);
7211 }
7212
7213
7214 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7215 2, 3, 0,
7216 doc: /* Detect coding system of the text in the region between START and END.
7217 Return a list of possible coding systems ordered by priority.
7218
7219 If only ASCII characters are found, it returns a list of single element
7220 `undecided' or its subsidiary coding system according to a detected
7221 end-of-line format.
7222
7223 If optional argument HIGHEST is non-nil, return the coding system of
7224 highest priority. */)
7225 (start, end, highest)
7226 Lisp_Object start, end, highest;
7227 {
7228 int from, to;
7229 int from_byte, to_byte;
7230
7231 CHECK_NUMBER_COERCE_MARKER (start);
7232 CHECK_NUMBER_COERCE_MARKER (end);
7233
7234 validate_region (&start, &end);
7235 from = XINT (start), to = XINT (end);
7236 from_byte = CHAR_TO_BYTE (from);
7237 to_byte = CHAR_TO_BYTE (to);
7238
7239 if (from < GPT && to >= GPT)
7240 move_gap_both (to, to_byte);
7241
7242 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7243 to - from, to_byte - from_byte,
7244 !NILP (highest),
7245 !NILP (current_buffer
7246 ->enable_multibyte_characters),
7247 Qnil);
7248 }
7249
7250 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7251 1, 2, 0,
7252 doc: /* Detect coding system of the text in STRING.
7253 Return a list of possible coding systems ordered by priority.
7254
7255 If only ASCII characters are found, it returns a list of single element
7256 `undecided' or its subsidiary coding system according to a detected
7257 end-of-line format.
7258
7259 If optional argument HIGHEST is non-nil, return the coding system of
7260 highest priority. */)
7261 (string, highest)
7262 Lisp_Object string, highest;
7263 {
7264 CHECK_STRING (string);
7265
7266 return detect_coding_system (SDATA (string),
7267 SCHARS (string), SBYTES (string),
7268 !NILP (highest), STRING_MULTIBYTE (string),
7269 Qnil);
7270 }
7271
7272
7273 static INLINE int
7274 char_encodable_p (c, attrs)
7275 int c;
7276 Lisp_Object attrs;
7277 {
7278 Lisp_Object tail;
7279 struct charset *charset;
7280 Lisp_Object translation_table;
7281
7282 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7283 if (! NILP (translation_table))
7284 c = translate_char (translation_table, c);
7285 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7286 CONSP (tail); tail = XCDR (tail))
7287 {
7288 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7289 if (CHAR_CHARSET_P (c, charset))
7290 break;
7291 }
7292 return (! NILP (tail));
7293 }
7294
7295
7296 /* Return a list of coding systems that safely encode the text between
7297 START and END. If EXCLUDE is non-nil, it is a list of coding
7298 systems not to check. The returned list doesn't contain any such
7299 coding systems. In any case, if the text contains only ASCII or is
7300 unibyte, return t. */
7301
7302 DEFUN ("find-coding-systems-region-internal",
7303 Ffind_coding_systems_region_internal,
7304 Sfind_coding_systems_region_internal, 2, 3, 0,
7305 doc: /* Internal use only. */)
7306 (start, end, exclude)
7307 Lisp_Object start, end, exclude;
7308 {
7309 Lisp_Object coding_attrs_list, safe_codings;
7310 EMACS_INT start_byte, end_byte;
7311 const unsigned char *p, *pbeg, *pend;
7312 int c;
7313 Lisp_Object tail, elt;
7314
7315 if (STRINGP (start))
7316 {
7317 if (!STRING_MULTIBYTE (start)
7318 || SCHARS (start) == SBYTES (start))
7319 return Qt;
7320 start_byte = 0;
7321 end_byte = SBYTES (start);
7322 }
7323 else
7324 {
7325 CHECK_NUMBER_COERCE_MARKER (start);
7326 CHECK_NUMBER_COERCE_MARKER (end);
7327 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7328 args_out_of_range (start, end);
7329 if (NILP (current_buffer->enable_multibyte_characters))
7330 return Qt;
7331 start_byte = CHAR_TO_BYTE (XINT (start));
7332 end_byte = CHAR_TO_BYTE (XINT (end));
7333 if (XINT (end) - XINT (start) == end_byte - start_byte)
7334 return Qt;
7335
7336 if (XINT (start) < GPT && XINT (end) > GPT)
7337 {
7338 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7339 move_gap_both (XINT (start), start_byte);
7340 else
7341 move_gap_both (XINT (end), end_byte);
7342 }
7343 }
7344
7345 coding_attrs_list = Qnil;
7346 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7347 if (NILP (exclude)
7348 || NILP (Fmemq (XCAR (tail), exclude)))
7349 {
7350 Lisp_Object attrs;
7351
7352 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7353 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7354 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7355 {
7356 ASET (attrs, coding_attr_trans_tbl,
7357 get_translation_table (attrs, 1));
7358 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7359 }
7360 }
7361
7362 if (STRINGP (start))
7363 p = pbeg = SDATA (start);
7364 else
7365 p = pbeg = BYTE_POS_ADDR (start_byte);
7366 pend = p + (end_byte - start_byte);
7367
7368 while (p < pend && ASCII_BYTE_P (*p)) p++;
7369 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7370
7371 while (p < pend)
7372 {
7373 if (ASCII_BYTE_P (*p))
7374 p++;
7375 else
7376 {
7377 c = STRING_CHAR_ADVANCE (p);
7378
7379 charset_map_loaded = 0;
7380 for (tail = coding_attrs_list; CONSP (tail);)
7381 {
7382 elt = XCAR (tail);
7383 if (NILP (elt))
7384 tail = XCDR (tail);
7385 else if (char_encodable_p (c, elt))
7386 tail = XCDR (tail);
7387 else if (CONSP (XCDR (tail)))
7388 {
7389 XSETCAR (tail, XCAR (XCDR (tail)));
7390 XSETCDR (tail, XCDR (XCDR (tail)));
7391 }
7392 else
7393 {
7394 XSETCAR (tail, Qnil);
7395 tail = XCDR (tail);
7396 }
7397 }
7398 if (charset_map_loaded)
7399 {
7400 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7401
7402 if (STRINGP (start))
7403 pbeg = SDATA (start);
7404 else
7405 pbeg = BYTE_POS_ADDR (start_byte);
7406 p = pbeg + p_offset;
7407 pend = pbeg + pend_offset;
7408 }
7409 }
7410 }
7411
7412 safe_codings = Qnil;
7413 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7414 if (! NILP (XCAR (tail)))
7415 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7416
7417 return safe_codings;
7418 }
7419
7420
7421 DEFUN ("unencodable-char-position", Funencodable_char_position,
7422 Sunencodable_char_position, 3, 5, 0,
7423 doc: /*
7424 Return position of first un-encodable character in a region.
7425 START and END specfiy the region and CODING-SYSTEM specifies the
7426 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7427
7428 If optional 4th argument COUNT is non-nil, it specifies at most how
7429 many un-encodable characters to search. In this case, the value is a
7430 list of positions.
7431
7432 If optional 5th argument STRING is non-nil, it is a string to search
7433 for un-encodable characters. In that case, START and END are indexes
7434 to the string. */)
7435 (start, end, coding_system, count, string)
7436 Lisp_Object start, end, coding_system, count, string;
7437 {
7438 int n;
7439 struct coding_system coding;
7440 Lisp_Object attrs, charset_list, translation_table;
7441 Lisp_Object positions;
7442 int from, to;
7443 const unsigned char *p, *stop, *pend;
7444 int ascii_compatible;
7445
7446 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7447 attrs = CODING_ID_ATTRS (coding.id);
7448 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7449 return Qnil;
7450 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7451 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7452 translation_table = get_translation_table (attrs, 1);
7453
7454 if (NILP (string))
7455 {
7456 validate_region (&start, &end);
7457 from = XINT (start);
7458 to = XINT (end);
7459 if (NILP (current_buffer->enable_multibyte_characters)
7460 || (ascii_compatible
7461 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7462 return Qnil;
7463 p = CHAR_POS_ADDR (from);
7464 pend = CHAR_POS_ADDR (to);
7465 if (from < GPT && to >= GPT)
7466 stop = GPT_ADDR;
7467 else
7468 stop = pend;
7469 }
7470 else
7471 {
7472 CHECK_STRING (string);
7473 CHECK_NATNUM (start);
7474 CHECK_NATNUM (end);
7475 from = XINT (start);
7476 to = XINT (end);
7477 if (from > to
7478 || to > SCHARS (string))
7479 args_out_of_range_3 (string, start, end);
7480 if (! STRING_MULTIBYTE (string))
7481 return Qnil;
7482 p = SDATA (string) + string_char_to_byte (string, from);
7483 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7484 if (ascii_compatible && (to - from) == (pend - p))
7485 return Qnil;
7486 }
7487
7488 if (NILP (count))
7489 n = 1;
7490 else
7491 {
7492 CHECK_NATNUM (count);
7493 n = XINT (count);
7494 }
7495
7496 positions = Qnil;
7497 while (1)
7498 {
7499 int c;
7500
7501 if (ascii_compatible)
7502 while (p < stop && ASCII_BYTE_P (*p))
7503 p++, from++;
7504 if (p >= stop)
7505 {
7506 if (p >= pend)
7507 break;
7508 stop = pend;
7509 p = GAP_END_ADDR;
7510 }
7511
7512 c = STRING_CHAR_ADVANCE (p);
7513 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7514 && ! char_charset (translate_char (translation_table, c),
7515 charset_list, NULL))
7516 {
7517 positions = Fcons (make_number (from), positions);
7518 n--;
7519 if (n == 0)
7520 break;
7521 }
7522
7523 from++;
7524 }
7525
7526 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7527 }
7528
7529
7530 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7531 Scheck_coding_systems_region, 3, 3, 0,
7532 doc: /* Check if the region is encodable by coding systems.
7533
7534 START and END are buffer positions specifying the region.
7535 CODING-SYSTEM-LIST is a list of coding systems to check.
7536
7537 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7538 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7539 whole region, POS0, POS1, ... are buffer positions where non-encodable
7540 characters are found.
7541
7542 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7543 value is nil.
7544
7545 START may be a string. In that case, check if the string is
7546 encodable, and the value contains indices to the string instead of
7547 buffer positions. END is ignored. */)
7548 (start, end, coding_system_list)
7549 Lisp_Object start, end, coding_system_list;
7550 {
7551 Lisp_Object list;
7552 EMACS_INT start_byte, end_byte;
7553 int pos;
7554 const unsigned char *p, *pbeg, *pend;
7555 int c;
7556 Lisp_Object tail, elt, attrs;
7557
7558 if (STRINGP (start))
7559 {
7560 if (!STRING_MULTIBYTE (start)
7561 && SCHARS (start) != SBYTES (start))
7562 return Qnil;
7563 start_byte = 0;
7564 end_byte = SBYTES (start);
7565 pos = 0;
7566 }
7567 else
7568 {
7569 CHECK_NUMBER_COERCE_MARKER (start);
7570 CHECK_NUMBER_COERCE_MARKER (end);
7571 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7572 args_out_of_range (start, end);
7573 if (NILP (current_buffer->enable_multibyte_characters))
7574 return Qnil;
7575 start_byte = CHAR_TO_BYTE (XINT (start));
7576 end_byte = CHAR_TO_BYTE (XINT (end));
7577 if (XINT (end) - XINT (start) == end_byte - start_byte)
7578 return Qt;
7579
7580 if (XINT (start) < GPT && XINT (end) > GPT)
7581 {
7582 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7583 move_gap_both (XINT (start), start_byte);
7584 else
7585 move_gap_both (XINT (end), end_byte);
7586 }
7587 pos = XINT (start);
7588 }
7589
7590 list = Qnil;
7591 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7592 {
7593 elt = XCAR (tail);
7594 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7595 ASET (attrs, coding_attr_trans_tbl, get_translation_table (attrs, 1));
7596 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7597 }
7598
7599 if (STRINGP (start))
7600 p = pbeg = SDATA (start);
7601 else
7602 p = pbeg = BYTE_POS_ADDR (start_byte);
7603 pend = p + (end_byte - start_byte);
7604
7605 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7606 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7607
7608 while (p < pend)
7609 {
7610 if (ASCII_BYTE_P (*p))
7611 p++;
7612 else
7613 {
7614 c = STRING_CHAR_ADVANCE (p);
7615
7616 charset_map_loaded = 0;
7617 for (tail = list; CONSP (tail); tail = XCDR (tail))
7618 {
7619 elt = XCDR (XCAR (tail));
7620 if (! char_encodable_p (c, XCAR (elt)))
7621 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7622 }
7623 if (charset_map_loaded)
7624 {
7625 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7626
7627 if (STRINGP (start))
7628 pbeg = SDATA (start);
7629 else
7630 pbeg = BYTE_POS_ADDR (start_byte);
7631 p = pbeg + p_offset;
7632 pend = pbeg + pend_offset;
7633 }
7634 }
7635 pos++;
7636 }
7637
7638 tail = list;
7639 list = Qnil;
7640 for (; CONSP (tail); tail = XCDR (tail))
7641 {
7642 elt = XCAR (tail);
7643 if (CONSP (XCDR (XCDR (elt))))
7644 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7645 list);
7646 }
7647
7648 return list;
7649 }
7650
7651
7652 Lisp_Object
7653 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7654 Lisp_Object start, end, coding_system, dst_object;
7655 int encodep, norecord;
7656 {
7657 struct coding_system coding;
7658 EMACS_INT from, from_byte, to, to_byte;
7659 Lisp_Object src_object;
7660
7661 CHECK_NUMBER_COERCE_MARKER (start);
7662 CHECK_NUMBER_COERCE_MARKER (end);
7663 if (NILP (coding_system))
7664 coding_system = Qno_conversion;
7665 else
7666 CHECK_CODING_SYSTEM (coding_system);
7667 src_object = Fcurrent_buffer ();
7668 if (NILP (dst_object))
7669 dst_object = src_object;
7670 else if (! EQ (dst_object, Qt))
7671 CHECK_BUFFER (dst_object);
7672
7673 validate_region (&start, &end);
7674 from = XFASTINT (start);
7675 from_byte = CHAR_TO_BYTE (from);
7676 to = XFASTINT (end);
7677 to_byte = CHAR_TO_BYTE (to);
7678
7679 setup_coding_system (coding_system, &coding);
7680 coding.mode |= CODING_MODE_LAST_BLOCK;
7681
7682 if (encodep)
7683 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7684 dst_object);
7685 else
7686 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7687 dst_object);
7688 if (! norecord)
7689 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7690
7691 return (BUFFERP (dst_object)
7692 ? make_number (coding.produced_char)
7693 : coding.dst_object);
7694 }
7695
7696
7697 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7698 3, 4, "r\nzCoding system: ",
7699 doc: /* Decode the current region from the specified coding system.
7700 When called from a program, takes four arguments:
7701 START, END, CODING-SYSTEM, and DESTINATION.
7702 START and END are buffer positions.
7703
7704 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7705 If nil, the region between START and END is replace by the decoded text.
7706 If buffer, the decoded text is inserted in the buffer.
7707 If t, the decoded text is returned.
7708
7709 This function sets `last-coding-system-used' to the precise coding system
7710 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7711 not fully specified.)
7712 It returns the length of the decoded text. */)
7713 (start, end, coding_system, destination)
7714 Lisp_Object start, end, coding_system, destination;
7715 {
7716 return code_convert_region (start, end, coding_system, destination, 0, 0);
7717 }
7718
7719 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7720 3, 4, "r\nzCoding system: ",
7721 doc: /* Encode the current region by specified coding system.
7722 When called from a program, takes three arguments:
7723 START, END, and CODING-SYSTEM. START and END are buffer positions.
7724
7725 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7726 If nil, the region between START and END is replace by the encoded text.
7727 If buffer, the encoded text is inserted in the buffer.
7728 If t, the encoded text is returned.
7729
7730 This function sets `last-coding-system-used' to the precise coding system
7731 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7732 not fully specified.)
7733 It returns the length of the encoded text. */)
7734 (start, end, coding_system, destination)
7735 Lisp_Object start, end, coding_system, destination;
7736 {
7737 return code_convert_region (start, end, coding_system, destination, 1, 0);
7738 }
7739
7740 Lisp_Object
7741 code_convert_string (string, coding_system, dst_object,
7742 encodep, nocopy, norecord)
7743 Lisp_Object string, coding_system, dst_object;
7744 int encodep, nocopy, norecord;
7745 {
7746 struct coding_system coding;
7747 EMACS_INT chars, bytes;
7748
7749 CHECK_STRING (string);
7750 if (NILP (coding_system))
7751 {
7752 if (! norecord)
7753 Vlast_coding_system_used = Qno_conversion;
7754 if (NILP (dst_object))
7755 return (nocopy ? Fcopy_sequence (string) : string);
7756 }
7757
7758 if (NILP (coding_system))
7759 coding_system = Qno_conversion;
7760 else
7761 CHECK_CODING_SYSTEM (coding_system);
7762 if (NILP (dst_object))
7763 dst_object = Qt;
7764 else if (! EQ (dst_object, Qt))
7765 CHECK_BUFFER (dst_object);
7766
7767 setup_coding_system (coding_system, &coding);
7768 coding.mode |= CODING_MODE_LAST_BLOCK;
7769 chars = SCHARS (string);
7770 bytes = SBYTES (string);
7771 if (encodep)
7772 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7773 else
7774 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7775 if (! norecord)
7776 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7777
7778 return (BUFFERP (dst_object)
7779 ? make_number (coding.produced_char)
7780 : coding.dst_object);
7781 }
7782
7783
7784 /* Encode or decode STRING according to CODING_SYSTEM.
7785 Do not set Vlast_coding_system_used.
7786
7787 This function is called only from macros DECODE_FILE and
7788 ENCODE_FILE, thus we ignore character composition. */
7789
7790 Lisp_Object
7791 code_convert_string_norecord (string, coding_system, encodep)
7792 Lisp_Object string, coding_system;
7793 int encodep;
7794 {
7795 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7796 }
7797
7798
7799 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7800 2, 4, 0,
7801 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7802
7803 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7804 if the decoding operation is trivial.
7805
7806 Optional fourth arg BUFFER non-nil meant that the decoded text is
7807 inserted in BUFFER instead of returned as a string. In this case,
7808 the return value is BUFFER.
7809
7810 This function sets `last-coding-system-used' to the precise coding system
7811 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7812 not fully specified. */)
7813 (string, coding_system, nocopy, buffer)
7814 Lisp_Object string, coding_system, nocopy, buffer;
7815 {
7816 return code_convert_string (string, coding_system, buffer,
7817 0, ! NILP (nocopy), 0);
7818 }
7819
7820 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7821 2, 4, 0,
7822 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7823
7824 Optional third arg NOCOPY non-nil means it is OK to return STRING
7825 itself if the encoding operation is trivial.
7826
7827 Optional fourth arg BUFFER non-nil meant that the encoded text is
7828 inserted in BUFFER instead of returned as a string. In this case,
7829 the return value is BUFFER.
7830
7831 This function sets `last-coding-system-used' to the precise coding system
7832 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7833 not fully specified.) */)
7834 (string, coding_system, nocopy, buffer)
7835 Lisp_Object string, coding_system, nocopy, buffer;
7836 {
7837 return code_convert_string (string, coding_system, buffer,
7838 1, ! NILP (nocopy), 1);
7839 }
7840
7841 \f
7842 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7843 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7844 Return the corresponding character. */)
7845 (code)
7846 Lisp_Object code;
7847 {
7848 Lisp_Object spec, attrs, val;
7849 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7850 int c;
7851
7852 CHECK_NATNUM (code);
7853 c = XFASTINT (code);
7854 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7855 attrs = AREF (spec, 0);
7856
7857 if (ASCII_BYTE_P (c)
7858 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7859 return code;
7860
7861 val = CODING_ATTR_CHARSET_LIST (attrs);
7862 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7863 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7864 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7865
7866 if (c <= 0x7F)
7867 charset = charset_roman;
7868 else if (c >= 0xA0 && c < 0xDF)
7869 {
7870 charset = charset_kana;
7871 c -= 0x80;
7872 }
7873 else
7874 {
7875 int s1 = c >> 8, s2 = c & 0xFF;
7876
7877 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7878 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7879 error ("Invalid code: %d", code);
7880 SJIS_TO_JIS (c);
7881 charset = charset_kanji;
7882 }
7883 c = DECODE_CHAR (charset, c);
7884 if (c < 0)
7885 error ("Invalid code: %d", code);
7886 return make_number (c);
7887 }
7888
7889
7890 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7891 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7892 Return the corresponding code in SJIS. */)
7893 (ch)
7894 Lisp_Object ch;
7895 {
7896 Lisp_Object spec, attrs, charset_list;
7897 int c;
7898 struct charset *charset;
7899 unsigned code;
7900
7901 CHECK_CHARACTER (ch);
7902 c = XFASTINT (ch);
7903 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7904 attrs = AREF (spec, 0);
7905
7906 if (ASCII_CHAR_P (c)
7907 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7908 return ch;
7909
7910 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7911 charset = char_charset (c, charset_list, &code);
7912 if (code == CHARSET_INVALID_CODE (charset))
7913 error ("Can't encode by shift_jis encoding: %d", c);
7914 JIS_TO_SJIS (code);
7915
7916 return make_number (code);
7917 }
7918
7919 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7920 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7921 Return the corresponding character. */)
7922 (code)
7923 Lisp_Object code;
7924 {
7925 Lisp_Object spec, attrs, val;
7926 struct charset *charset_roman, *charset_big5, *charset;
7927 int c;
7928
7929 CHECK_NATNUM (code);
7930 c = XFASTINT (code);
7931 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7932 attrs = AREF (spec, 0);
7933
7934 if (ASCII_BYTE_P (c)
7935 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7936 return code;
7937
7938 val = CODING_ATTR_CHARSET_LIST (attrs);
7939 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7940 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7941
7942 if (c <= 0x7F)
7943 charset = charset_roman;
7944 else
7945 {
7946 int b1 = c >> 8, b2 = c & 0x7F;
7947 if (b1 < 0xA1 || b1 > 0xFE
7948 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7949 error ("Invalid code: %d", code);
7950 charset = charset_big5;
7951 }
7952 c = DECODE_CHAR (charset, (unsigned )c);
7953 if (c < 0)
7954 error ("Invalid code: %d", code);
7955 return make_number (c);
7956 }
7957
7958 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7959 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7960 Return the corresponding character code in Big5. */)
7961 (ch)
7962 Lisp_Object ch;
7963 {
7964 Lisp_Object spec, attrs, charset_list;
7965 struct charset *charset;
7966 int c;
7967 unsigned code;
7968
7969 CHECK_CHARACTER (ch);
7970 c = XFASTINT (ch);
7971 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7972 attrs = AREF (spec, 0);
7973 if (ASCII_CHAR_P (c)
7974 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7975 return ch;
7976
7977 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7978 charset = char_charset (c, charset_list, &code);
7979 if (code == CHARSET_INVALID_CODE (charset))
7980 error ("Can't encode by Big5 encoding: %d", c);
7981
7982 return make_number (code);
7983 }
7984
7985 \f
7986 DEFUN ("set-terminal-coding-system-internal",
7987 Fset_terminal_coding_system_internal,
7988 Sset_terminal_coding_system_internal, 1, 1, 0,
7989 doc: /* Internal use only. */)
7990 (coding_system)
7991 Lisp_Object coding_system;
7992 {
7993 CHECK_SYMBOL (coding_system);
7994 setup_coding_system (Fcheck_coding_system (coding_system),
7995 &terminal_coding);
7996
7997 /* We had better not send unsafe characters to terminal. */
7998 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7999 /* Characer composition should be disabled. */
8000 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8001 terminal_coding.src_multibyte = 1;
8002 terminal_coding.dst_multibyte = 0;
8003 return Qnil;
8004 }
8005
8006 DEFUN ("set-safe-terminal-coding-system-internal",
8007 Fset_safe_terminal_coding_system_internal,
8008 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8009 doc: /* Internal use only. */)
8010 (coding_system)
8011 Lisp_Object coding_system;
8012 {
8013 CHECK_SYMBOL (coding_system);
8014 setup_coding_system (Fcheck_coding_system (coding_system),
8015 &safe_terminal_coding);
8016 /* Characer composition should be disabled. */
8017 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8018 safe_terminal_coding.src_multibyte = 1;
8019 safe_terminal_coding.dst_multibyte = 0;
8020 return Qnil;
8021 }
8022
8023 DEFUN ("terminal-coding-system",
8024 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8025 doc: /* Return coding system specified for terminal output. */)
8026 ()
8027 {
8028 return CODING_ID_NAME (terminal_coding.id);
8029 }
8030
8031 DEFUN ("set-keyboard-coding-system-internal",
8032 Fset_keyboard_coding_system_internal,
8033 Sset_keyboard_coding_system_internal, 1, 1, 0,
8034 doc: /* Internal use only. */)
8035 (coding_system)
8036 Lisp_Object coding_system;
8037 {
8038 CHECK_SYMBOL (coding_system);
8039 setup_coding_system (Fcheck_coding_system (coding_system),
8040 &keyboard_coding);
8041 /* Characer composition should be disabled. */
8042 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8043 return Qnil;
8044 }
8045
8046 DEFUN ("keyboard-coding-system",
8047 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8048 doc: /* Return coding system specified for decoding keyboard input. */)
8049 ()
8050 {
8051 return CODING_ID_NAME (keyboard_coding.id);
8052 }
8053
8054 \f
8055 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8056 Sfind_operation_coding_system, 1, MANY, 0,
8057 doc: /* Choose a coding system for an operation based on the target name.
8058 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8059 DECODING-SYSTEM is the coding system to use for decoding
8060 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8061 for encoding (in case OPERATION does encoding).
8062
8063 The first argument OPERATION specifies an I/O primitive:
8064 For file I/O, `insert-file-contents' or `write-region'.
8065 For process I/O, `call-process', `call-process-region', or `start-process'.
8066 For network I/O, `open-network-stream'.
8067
8068 The remaining arguments should be the same arguments that were passed
8069 to the primitive. Depending on which primitive, one of those arguments
8070 is selected as the TARGET. For example, if OPERATION does file I/O,
8071 whichever argument specifies the file name is TARGET.
8072
8073 TARGET has a meaning which depends on OPERATION:
8074 For file I/O, TARGET is a file name.
8075 For process I/O, TARGET is a process name.
8076 For network I/O, TARGET is a service name or a port number
8077
8078 This function looks up what specified for TARGET in,
8079 `file-coding-system-alist', `process-coding-system-alist',
8080 or `network-coding-system-alist' depending on OPERATION.
8081 They may specify a coding system, a cons of coding systems,
8082 or a function symbol to call.
8083 In the last case, we call the function with one argument,
8084 which is a list of all the arguments given to this function.
8085
8086 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
8087 (nargs, args)
8088 int nargs;
8089 Lisp_Object *args;
8090 {
8091 Lisp_Object operation, target_idx, target, val;
8092 register Lisp_Object chain;
8093
8094 if (nargs < 2)
8095 error ("Too few arguments");
8096 operation = args[0];
8097 if (!SYMBOLP (operation)
8098 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8099 error ("Invalid first arguement");
8100 if (nargs < 1 + XINT (target_idx))
8101 error ("Too few arguments for operation: %s",
8102 SDATA (SYMBOL_NAME (operation)));
8103 target = args[XINT (target_idx) + 1];
8104 if (!(STRINGP (target)
8105 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8106 error ("Invalid %dth argument", XINT (target_idx) + 1);
8107
8108 chain = ((EQ (operation, Qinsert_file_contents)
8109 || EQ (operation, Qwrite_region))
8110 ? Vfile_coding_system_alist
8111 : (EQ (operation, Qopen_network_stream)
8112 ? Vnetwork_coding_system_alist
8113 : Vprocess_coding_system_alist));
8114 if (NILP (chain))
8115 return Qnil;
8116
8117 for (; CONSP (chain); chain = XCDR (chain))
8118 {
8119 Lisp_Object elt;
8120
8121 elt = XCAR (chain);
8122 if (CONSP (elt)
8123 && ((STRINGP (target)
8124 && STRINGP (XCAR (elt))
8125 && fast_string_match (XCAR (elt), target) >= 0)
8126 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8127 {
8128 val = XCDR (elt);
8129 /* Here, if VAL is both a valid coding system and a valid
8130 function symbol, we return VAL as a coding system. */
8131 if (CONSP (val))
8132 return val;
8133 if (! SYMBOLP (val))
8134 return Qnil;
8135 if (! NILP (Fcoding_system_p (val)))
8136 return Fcons (val, val);
8137 if (! NILP (Ffboundp (val)))
8138 {
8139 val = call1 (val, Flist (nargs, args));
8140 if (CONSP (val))
8141 return val;
8142 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8143 return Fcons (val, val);
8144 }
8145 return Qnil;
8146 }
8147 }
8148 return Qnil;
8149 }
8150
8151 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8152 Sset_coding_system_priority, 0, MANY, 0,
8153 doc: /* Assign higher priority to the coding systems given as arguments.
8154 If multiple coding systems belongs to the same category,
8155 all but the first one are ignored.
8156
8157 usage: (set-coding-system-priority ...) */)
8158 (nargs, args)
8159 int nargs;
8160 Lisp_Object *args;
8161 {
8162 int i, j;
8163 int changed[coding_category_max];
8164 enum coding_category priorities[coding_category_max];
8165
8166 bzero (changed, sizeof changed);
8167
8168 for (i = j = 0; i < nargs; i++)
8169 {
8170 enum coding_category category;
8171 Lisp_Object spec, attrs;
8172
8173 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8174 attrs = AREF (spec, 0);
8175 category = XINT (CODING_ATTR_CATEGORY (attrs));
8176 if (changed[category])
8177 /* Ignore this coding system because a coding system of the
8178 same category already had a higher priority. */
8179 continue;
8180 changed[category] = 1;
8181 priorities[j++] = category;
8182 if (coding_categories[category].id >= 0
8183 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8184 setup_coding_system (args[i], &coding_categories[category]);
8185 Fset (AREF (Vcoding_category_table, category), args[i]);
8186 }
8187
8188 /* Now we have decided top J priorities. Reflect the order of the
8189 original priorities to the remaining priorities. */
8190
8191 for (i = j, j = 0; i < coding_category_max; i++, j++)
8192 {
8193 while (j < coding_category_max
8194 && changed[coding_priorities[j]])
8195 j++;
8196 if (j == coding_category_max)
8197 abort ();
8198 priorities[i] = coding_priorities[j];
8199 }
8200
8201 bcopy (priorities, coding_priorities, sizeof priorities);
8202
8203 /* Update `coding-category-list'. */
8204 Vcoding_category_list = Qnil;
8205 for (i = coding_category_max - 1; i >= 0; i--)
8206 Vcoding_category_list
8207 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8208 Vcoding_category_list);
8209
8210 return Qnil;
8211 }
8212
8213 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8214 Scoding_system_priority_list, 0, 1, 0,
8215 doc: /* Return a list of coding systems ordered by their priorities.
8216 HIGHESTP non-nil means just return the highest priority one. */)
8217 (highestp)
8218 Lisp_Object highestp;
8219 {
8220 int i;
8221 Lisp_Object val;
8222
8223 for (i = 0, val = Qnil; i < coding_category_max; i++)
8224 {
8225 enum coding_category category = coding_priorities[i];
8226 int id = coding_categories[category].id;
8227 Lisp_Object attrs;
8228
8229 if (id < 0)
8230 continue;
8231 attrs = CODING_ID_ATTRS (id);
8232 if (! NILP (highestp))
8233 return CODING_ATTR_BASE_NAME (attrs);
8234 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8235 }
8236 return Fnreverse (val);
8237 }
8238
8239 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8240
8241 static Lisp_Object
8242 make_subsidiaries (base)
8243 Lisp_Object base;
8244 {
8245 Lisp_Object subsidiaries;
8246 int base_name_len = SBYTES (SYMBOL_NAME (base));
8247 char *buf = (char *) alloca (base_name_len + 6);
8248 int i;
8249
8250 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8251 subsidiaries = Fmake_vector (make_number (3), Qnil);
8252 for (i = 0; i < 3; i++)
8253 {
8254 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8255 ASET (subsidiaries, i, intern (buf));
8256 }
8257 return subsidiaries;
8258 }
8259
8260
8261 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8262 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8263 doc: /* For internal use only.
8264 usage: (define-coding-system-internal ...) */)
8265 (nargs, args)
8266 int nargs;
8267 Lisp_Object *args;
8268 {
8269 Lisp_Object name;
8270 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8271 Lisp_Object attrs; /* Vector of attributes. */
8272 Lisp_Object eol_type;
8273 Lisp_Object aliases;
8274 Lisp_Object coding_type, charset_list, safe_charsets;
8275 enum coding_category category;
8276 Lisp_Object tail, val;
8277 int max_charset_id = 0;
8278 int i;
8279
8280 if (nargs < coding_arg_max)
8281 goto short_args;
8282
8283 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8284
8285 name = args[coding_arg_name];
8286 CHECK_SYMBOL (name);
8287 CODING_ATTR_BASE_NAME (attrs) = name;
8288
8289 val = args[coding_arg_mnemonic];
8290 if (! STRINGP (val))
8291 CHECK_CHARACTER (val);
8292 CODING_ATTR_MNEMONIC (attrs) = val;
8293
8294 coding_type = args[coding_arg_coding_type];
8295 CHECK_SYMBOL (coding_type);
8296 CODING_ATTR_TYPE (attrs) = coding_type;
8297
8298 charset_list = args[coding_arg_charset_list];
8299 if (SYMBOLP (charset_list))
8300 {
8301 if (EQ (charset_list, Qiso_2022))
8302 {
8303 if (! EQ (coding_type, Qiso_2022))
8304 error ("Invalid charset-list");
8305 charset_list = Viso_2022_charset_list;
8306 }
8307 else if (EQ (charset_list, Qemacs_mule))
8308 {
8309 if (! EQ (coding_type, Qemacs_mule))
8310 error ("Invalid charset-list");
8311 charset_list = Vemacs_mule_charset_list;
8312 }
8313 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8314 if (max_charset_id < XFASTINT (XCAR (tail)))
8315 max_charset_id = XFASTINT (XCAR (tail));
8316 }
8317 else
8318 {
8319 charset_list = Fcopy_sequence (charset_list);
8320 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8321 {
8322 struct charset *charset;
8323
8324 val = Fcar (tail);
8325 CHECK_CHARSET_GET_CHARSET (val, charset);
8326 if (EQ (coding_type, Qiso_2022)
8327 ? CHARSET_ISO_FINAL (charset) < 0
8328 : EQ (coding_type, Qemacs_mule)
8329 ? CHARSET_EMACS_MULE_ID (charset) < 0
8330 : 0)
8331 error ("Can't handle charset `%s'",
8332 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8333
8334 XSETCAR (tail, make_number (charset->id));
8335 if (max_charset_id < charset->id)
8336 max_charset_id = charset->id;
8337 }
8338 }
8339 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8340
8341 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8342 make_number (255));
8343 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8344 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8345 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8346
8347 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8348
8349 val = args[coding_arg_decode_translation_table];
8350 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8351 CHECK_SYMBOL (val);
8352 CODING_ATTR_DECODE_TBL (attrs) = val;
8353
8354 val = args[coding_arg_encode_translation_table];
8355 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8356 CHECK_SYMBOL (val);
8357 CODING_ATTR_ENCODE_TBL (attrs) = val;
8358
8359 val = args[coding_arg_post_read_conversion];
8360 CHECK_SYMBOL (val);
8361 CODING_ATTR_POST_READ (attrs) = val;
8362
8363 val = args[coding_arg_pre_write_conversion];
8364 CHECK_SYMBOL (val);
8365 CODING_ATTR_PRE_WRITE (attrs) = val;
8366
8367 val = args[coding_arg_default_char];
8368 if (NILP (val))
8369 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8370 else
8371 {
8372 CHECK_CHARACTER (val);
8373 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8374 }
8375
8376 val = args[coding_arg_for_unibyte];
8377 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8378
8379 val = args[coding_arg_plist];
8380 CHECK_LIST (val);
8381 CODING_ATTR_PLIST (attrs) = val;
8382
8383 if (EQ (coding_type, Qcharset))
8384 {
8385 /* Generate a lisp vector of 256 elements. Each element is nil,
8386 integer, or a list of charset IDs.
8387
8388 If Nth element is nil, the byte code N is invalid in this
8389 coding system.
8390
8391 If Nth element is a number NUM, N is the first byte of a
8392 charset whose ID is NUM.
8393
8394 If Nth element is a list of charset IDs, N is the first byte
8395 of one of them. The list is sorted by dimensions of the
8396 charsets. A charset of smaller dimension comes firtst. */
8397 val = Fmake_vector (make_number (256), Qnil);
8398
8399 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8400 {
8401 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8402 int dim = CHARSET_DIMENSION (charset);
8403 int idx = (dim - 1) * 4;
8404
8405 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8406 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8407
8408 for (i = charset->code_space[idx];
8409 i <= charset->code_space[idx + 1]; i++)
8410 {
8411 Lisp_Object tmp, tmp2;
8412 int dim2;
8413
8414 tmp = AREF (val, i);
8415 if (NILP (tmp))
8416 tmp = XCAR (tail);
8417 else if (NUMBERP (tmp))
8418 {
8419 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8420 if (dim < dim2)
8421 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8422 else
8423 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8424 }
8425 else
8426 {
8427 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8428 {
8429 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8430 if (dim < dim2)
8431 break;
8432 }
8433 if (NILP (tmp2))
8434 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8435 else
8436 {
8437 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8438 XSETCAR (tmp2, XCAR (tail));
8439 }
8440 }
8441 ASET (val, i, tmp);
8442 }
8443 }
8444 ASET (attrs, coding_attr_charset_valids, val);
8445 category = coding_category_charset;
8446 }
8447 else if (EQ (coding_type, Qccl))
8448 {
8449 Lisp_Object valids;
8450
8451 if (nargs < coding_arg_ccl_max)
8452 goto short_args;
8453
8454 val = args[coding_arg_ccl_decoder];
8455 CHECK_CCL_PROGRAM (val);
8456 if (VECTORP (val))
8457 val = Fcopy_sequence (val);
8458 ASET (attrs, coding_attr_ccl_decoder, val);
8459
8460 val = args[coding_arg_ccl_encoder];
8461 CHECK_CCL_PROGRAM (val);
8462 if (VECTORP (val))
8463 val = Fcopy_sequence (val);
8464 ASET (attrs, coding_attr_ccl_encoder, val);
8465
8466 val = args[coding_arg_ccl_valids];
8467 valids = Fmake_string (make_number (256), make_number (0));
8468 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8469 {
8470 int from, to;
8471
8472 val = Fcar (tail);
8473 if (INTEGERP (val))
8474 {
8475 from = to = XINT (val);
8476 if (from < 0 || from > 255)
8477 args_out_of_range_3 (val, make_number (0), make_number (255));
8478 }
8479 else
8480 {
8481 CHECK_CONS (val);
8482 CHECK_NATNUM_CAR (val);
8483 CHECK_NATNUM_CDR (val);
8484 from = XINT (XCAR (val));
8485 if (from > 255)
8486 args_out_of_range_3 (XCAR (val),
8487 make_number (0), make_number (255));
8488 to = XINT (XCDR (val));
8489 if (to < from || to > 255)
8490 args_out_of_range_3 (XCDR (val),
8491 XCAR (val), make_number (255));
8492 }
8493 for (i = from; i <= to; i++)
8494 SSET (valids, i, 1);
8495 }
8496 ASET (attrs, coding_attr_ccl_valids, valids);
8497
8498 category = coding_category_ccl;
8499 }
8500 else if (EQ (coding_type, Qutf_16))
8501 {
8502 Lisp_Object bom, endian;
8503
8504 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8505
8506 if (nargs < coding_arg_utf16_max)
8507 goto short_args;
8508
8509 bom = args[coding_arg_utf16_bom];
8510 if (! NILP (bom) && ! EQ (bom, Qt))
8511 {
8512 CHECK_CONS (bom);
8513 val = XCAR (bom);
8514 CHECK_CODING_SYSTEM (val);
8515 val = XCDR (bom);
8516 CHECK_CODING_SYSTEM (val);
8517 }
8518 ASET (attrs, coding_attr_utf_16_bom, bom);
8519
8520 endian = args[coding_arg_utf16_endian];
8521 CHECK_SYMBOL (endian);
8522 if (NILP (endian))
8523 endian = Qbig;
8524 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8525 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8526 ASET (attrs, coding_attr_utf_16_endian, endian);
8527
8528 category = (CONSP (bom)
8529 ? coding_category_utf_16_auto
8530 : NILP (bom)
8531 ? (EQ (endian, Qbig)
8532 ? coding_category_utf_16_be_nosig
8533 : coding_category_utf_16_le_nosig)
8534 : (EQ (endian, Qbig)
8535 ? coding_category_utf_16_be
8536 : coding_category_utf_16_le));
8537 }
8538 else if (EQ (coding_type, Qiso_2022))
8539 {
8540 Lisp_Object initial, reg_usage, request, flags;
8541 int i;
8542
8543 if (nargs < coding_arg_iso2022_max)
8544 goto short_args;
8545
8546 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8547 CHECK_VECTOR (initial);
8548 for (i = 0; i < 4; i++)
8549 {
8550 val = Faref (initial, make_number (i));
8551 if (! NILP (val))
8552 {
8553 struct charset *charset;
8554
8555 CHECK_CHARSET_GET_CHARSET (val, charset);
8556 ASET (initial, i, make_number (CHARSET_ID (charset)));
8557 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8558 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8559 }
8560 else
8561 ASET (initial, i, make_number (-1));
8562 }
8563
8564 reg_usage = args[coding_arg_iso2022_reg_usage];
8565 CHECK_CONS (reg_usage);
8566 CHECK_NUMBER_CAR (reg_usage);
8567 CHECK_NUMBER_CDR (reg_usage);
8568
8569 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8570 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8571 {
8572 int id;
8573 Lisp_Object tmp;
8574
8575 val = Fcar (tail);
8576 CHECK_CONS (val);
8577 tmp = XCAR (val);
8578 CHECK_CHARSET_GET_ID (tmp, id);
8579 CHECK_NATNUM_CDR (val);
8580 if (XINT (XCDR (val)) >= 4)
8581 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8582 XSETCAR (val, make_number (id));
8583 }
8584
8585 flags = args[coding_arg_iso2022_flags];
8586 CHECK_NATNUM (flags);
8587 i = XINT (flags);
8588 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8589 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8590
8591 ASET (attrs, coding_attr_iso_initial, initial);
8592 ASET (attrs, coding_attr_iso_usage, reg_usage);
8593 ASET (attrs, coding_attr_iso_request, request);
8594 ASET (attrs, coding_attr_iso_flags, flags);
8595 setup_iso_safe_charsets (attrs);
8596
8597 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8598 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8599 | CODING_ISO_FLAG_SINGLE_SHIFT))
8600 ? coding_category_iso_7_else
8601 : EQ (args[coding_arg_charset_list], Qiso_2022)
8602 ? coding_category_iso_7
8603 : coding_category_iso_7_tight);
8604 else
8605 {
8606 int id = XINT (AREF (initial, 1));
8607
8608 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8609 || EQ (args[coding_arg_charset_list], Qiso_2022)
8610 || id < 0)
8611 ? coding_category_iso_8_else
8612 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8613 ? coding_category_iso_8_1
8614 : coding_category_iso_8_2);
8615 }
8616 if (category != coding_category_iso_8_1
8617 && category != coding_category_iso_8_2)
8618 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8619 }
8620 else if (EQ (coding_type, Qemacs_mule))
8621 {
8622 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8623 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8624 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8625 category = coding_category_emacs_mule;
8626 }
8627 else if (EQ (coding_type, Qshift_jis))
8628 {
8629
8630 struct charset *charset;
8631
8632 if (XINT (Flength (charset_list)) != 3
8633 && XINT (Flength (charset_list)) != 4)
8634 error ("There should be three or four charsets");
8635
8636 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8637 if (CHARSET_DIMENSION (charset) != 1)
8638 error ("Dimension of charset %s is not one",
8639 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8640 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8641 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8642
8643 charset_list = XCDR (charset_list);
8644 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8645 if (CHARSET_DIMENSION (charset) != 1)
8646 error ("Dimension of charset %s is not one",
8647 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8648
8649 charset_list = XCDR (charset_list);
8650 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8651 if (CHARSET_DIMENSION (charset) != 2)
8652 error ("Dimension of charset %s is not two",
8653 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8654
8655 charset_list = XCDR (charset_list);
8656 if (! NILP (charset_list))
8657 {
8658 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8659 if (CHARSET_DIMENSION (charset) != 2)
8660 error ("Dimension of charset %s is not two",
8661 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8662 }
8663
8664 category = coding_category_sjis;
8665 Vsjis_coding_system = name;
8666 }
8667 else if (EQ (coding_type, Qbig5))
8668 {
8669 struct charset *charset;
8670
8671 if (XINT (Flength (charset_list)) != 2)
8672 error ("There should be just two charsets");
8673
8674 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8675 if (CHARSET_DIMENSION (charset) != 1)
8676 error ("Dimension of charset %s is not one",
8677 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8678 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8679 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8680
8681 charset_list = XCDR (charset_list);
8682 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8683 if (CHARSET_DIMENSION (charset) != 2)
8684 error ("Dimension of charset %s is not two",
8685 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8686
8687 category = coding_category_big5;
8688 Vbig5_coding_system = name;
8689 }
8690 else if (EQ (coding_type, Qraw_text))
8691 {
8692 category = coding_category_raw_text;
8693 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8694 }
8695 else if (EQ (coding_type, Qutf_8))
8696 {
8697 category = coding_category_utf_8;
8698 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8699 }
8700 else if (EQ (coding_type, Qundecided))
8701 category = coding_category_undecided;
8702 else
8703 error ("Invalid coding system type: %s",
8704 SDATA (SYMBOL_NAME (coding_type)));
8705
8706 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8707 CODING_ATTR_PLIST (attrs)
8708 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8709 CODING_ATTR_PLIST (attrs)));
8710
8711 eol_type = args[coding_arg_eol_type];
8712 if (! NILP (eol_type)
8713 && ! EQ (eol_type, Qunix)
8714 && ! EQ (eol_type, Qdos)
8715 && ! EQ (eol_type, Qmac))
8716 error ("Invalid eol-type");
8717
8718 aliases = Fcons (name, Qnil);
8719
8720 if (NILP (eol_type))
8721 {
8722 eol_type = make_subsidiaries (name);
8723 for (i = 0; i < 3; i++)
8724 {
8725 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8726
8727 this_name = AREF (eol_type, i);
8728 this_aliases = Fcons (this_name, Qnil);
8729 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8730 this_spec = Fmake_vector (make_number (3), attrs);
8731 ASET (this_spec, 1, this_aliases);
8732 ASET (this_spec, 2, this_eol_type);
8733 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8734 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8735 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8736 Vcoding_system_alist);
8737 }
8738 }
8739
8740 spec_vec = Fmake_vector (make_number (3), attrs);
8741 ASET (spec_vec, 1, aliases);
8742 ASET (spec_vec, 2, eol_type);
8743
8744 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8745 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8746 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8747 Vcoding_system_alist);
8748
8749 {
8750 int id = coding_categories[category].id;
8751
8752 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8753 setup_coding_system (name, &coding_categories[category]);
8754 }
8755
8756 return Qnil;
8757
8758 short_args:
8759 return Fsignal (Qwrong_number_of_arguments,
8760 Fcons (intern ("define-coding-system-internal"),
8761 make_number (nargs)));
8762 }
8763
8764
8765 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8766 3, 3, 0,
8767 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
8768 (coding_system, prop, val)
8769 Lisp_Object coding_system, prop, val;
8770 {
8771 Lisp_Object spec, attrs, plist;
8772
8773 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8774 attrs = AREF (spec, 0);
8775 if (EQ (prop, QCmnemonic))
8776 {
8777 if (! STRINGP (val))
8778 CHECK_CHARACTER (val);
8779 CODING_ATTR_MNEMONIC (attrs) = val;
8780 }
8781 else if (EQ (prop, QCdefalut_char))
8782 {
8783 if (NILP (val))
8784 val = make_number (' ');
8785 else
8786 CHECK_CHARACTER (val);
8787 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8788 }
8789 else if (EQ (prop, QCdecode_translation_table))
8790 {
8791 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8792 CHECK_SYMBOL (val);
8793 CODING_ATTR_DECODE_TBL (attrs) = val;
8794 }
8795 else if (EQ (prop, QCencode_translation_table))
8796 {
8797 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8798 CHECK_SYMBOL (val);
8799 CODING_ATTR_ENCODE_TBL (attrs) = val;
8800 }
8801 else if (EQ (prop, QCpost_read_conversion))
8802 {
8803 CHECK_SYMBOL (val);
8804 CODING_ATTR_POST_READ (attrs) = val;
8805 }
8806 else if (EQ (prop, QCpre_write_conversion))
8807 {
8808 CHECK_SYMBOL (val);
8809 CODING_ATTR_PRE_WRITE (attrs) = val;
8810 }
8811
8812 CODING_ATTR_PLIST (attrs)
8813 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8814 return val;
8815 }
8816
8817
8818 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8819 Sdefine_coding_system_alias, 2, 2, 0,
8820 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8821 (alias, coding_system)
8822 Lisp_Object alias, coding_system;
8823 {
8824 Lisp_Object spec, aliases, eol_type;
8825
8826 CHECK_SYMBOL (alias);
8827 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8828 aliases = AREF (spec, 1);
8829 /* ALISES should be a list of length more than zero, and the first
8830 element is a base coding system. Append ALIAS at the tail of the
8831 list. */
8832 while (!NILP (XCDR (aliases)))
8833 aliases = XCDR (aliases);
8834 XSETCDR (aliases, Fcons (alias, Qnil));
8835
8836 eol_type = AREF (spec, 2);
8837 if (VECTORP (eol_type))
8838 {
8839 Lisp_Object subsidiaries;
8840 int i;
8841
8842 subsidiaries = make_subsidiaries (alias);
8843 for (i = 0; i < 3; i++)
8844 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8845 AREF (eol_type, i));
8846 }
8847
8848 Fputhash (alias, spec, Vcoding_system_hash_table);
8849 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8850 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8851 Vcoding_system_alist);
8852
8853 return Qnil;
8854 }
8855
8856 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8857 1, 1, 0,
8858 doc: /* Return the base of CODING-SYSTEM.
8859 Any alias or subsidiary coding system is not a base coding system. */)
8860 (coding_system)
8861 Lisp_Object coding_system;
8862 {
8863 Lisp_Object spec, attrs;
8864
8865 if (NILP (coding_system))
8866 return (Qno_conversion);
8867 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8868 attrs = AREF (spec, 0);
8869 return CODING_ATTR_BASE_NAME (attrs);
8870 }
8871
8872 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8873 1, 1, 0,
8874 doc: "Return the property list of CODING-SYSTEM.")
8875 (coding_system)
8876 Lisp_Object coding_system;
8877 {
8878 Lisp_Object spec, attrs;
8879
8880 if (NILP (coding_system))
8881 coding_system = Qno_conversion;
8882 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8883 attrs = AREF (spec, 0);
8884 return CODING_ATTR_PLIST (attrs);
8885 }
8886
8887
8888 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8889 1, 1, 0,
8890 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8891 (coding_system)
8892 Lisp_Object coding_system;
8893 {
8894 Lisp_Object spec;
8895
8896 if (NILP (coding_system))
8897 coding_system = Qno_conversion;
8898 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8899 return AREF (spec, 1);
8900 }
8901
8902 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8903 Scoding_system_eol_type, 1, 1, 0,
8904 doc: /* Return eol-type of CODING-SYSTEM.
8905 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8906
8907 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8908 and CR respectively.
8909
8910 A vector value indicates that a format of end-of-line should be
8911 detected automatically. Nth element of the vector is the subsidiary
8912 coding system whose eol-type is N. */)
8913 (coding_system)
8914 Lisp_Object coding_system;
8915 {
8916 Lisp_Object spec, eol_type;
8917 int n;
8918
8919 if (NILP (coding_system))
8920 coding_system = Qno_conversion;
8921 if (! CODING_SYSTEM_P (coding_system))
8922 return Qnil;
8923 spec = CODING_SYSTEM_SPEC (coding_system);
8924 eol_type = AREF (spec, 2);
8925 if (VECTORP (eol_type))
8926 return Fcopy_sequence (eol_type);
8927 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8928 return make_number (n);
8929 }
8930
8931 #endif /* emacs */
8932
8933 \f
8934 /*** 9. Post-amble ***/
8935
8936 void
8937 init_coding_once ()
8938 {
8939 int i;
8940
8941 for (i = 0; i < coding_category_max; i++)
8942 {
8943 coding_categories[i].id = -1;
8944 coding_priorities[i] = i;
8945 }
8946
8947 /* ISO2022 specific initialize routine. */
8948 for (i = 0; i < 0x20; i++)
8949 iso_code_class[i] = ISO_control_0;
8950 for (i = 0x21; i < 0x7F; i++)
8951 iso_code_class[i] = ISO_graphic_plane_0;
8952 for (i = 0x80; i < 0xA0; i++)
8953 iso_code_class[i] = ISO_control_1;
8954 for (i = 0xA1; i < 0xFF; i++)
8955 iso_code_class[i] = ISO_graphic_plane_1;
8956 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8957 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8958 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8959 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8960 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8961 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8962 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8963 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8964 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8965
8966 for (i = 0; i < 256; i++)
8967 {
8968 emacs_mule_bytes[i] = 1;
8969 }
8970 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8971 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8972 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8973 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8974 }
8975
8976 #ifdef emacs
8977
8978 void
8979 syms_of_coding ()
8980 {
8981 staticpro (&Vcoding_system_hash_table);
8982 {
8983 Lisp_Object args[2];
8984 args[0] = QCtest;
8985 args[1] = Qeq;
8986 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8987 }
8988
8989 staticpro (&Vsjis_coding_system);
8990 Vsjis_coding_system = Qnil;
8991
8992 staticpro (&Vbig5_coding_system);
8993 Vbig5_coding_system = Qnil;
8994
8995 staticpro (&Vcode_conversion_reused_workbuf);
8996 Vcode_conversion_reused_workbuf = Qnil;
8997
8998 staticpro (&Vcode_conversion_workbuf_name);
8999 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9000
9001 reused_workbuf_in_use = 0;
9002
9003 DEFSYM (Qcharset, "charset");
9004 DEFSYM (Qtarget_idx, "target-idx");
9005 DEFSYM (Qcoding_system_history, "coding-system-history");
9006 Fset (Qcoding_system_history, Qnil);
9007
9008 /* Target FILENAME is the first argument. */
9009 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9010 /* Target FILENAME is the third argument. */
9011 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9012
9013 DEFSYM (Qcall_process, "call-process");
9014 /* Target PROGRAM is the first argument. */
9015 Fput (Qcall_process, Qtarget_idx, make_number (0));
9016
9017 DEFSYM (Qcall_process_region, "call-process-region");
9018 /* Target PROGRAM is the third argument. */
9019 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9020
9021 DEFSYM (Qstart_process, "start-process");
9022 /* Target PROGRAM is the third argument. */
9023 Fput (Qstart_process, Qtarget_idx, make_number (2));
9024
9025 DEFSYM (Qopen_network_stream, "open-network-stream");
9026 /* Target SERVICE is the fourth argument. */
9027 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9028
9029 DEFSYM (Qcoding_system, "coding-system");
9030 DEFSYM (Qcoding_aliases, "coding-aliases");
9031
9032 DEFSYM (Qeol_type, "eol-type");
9033 DEFSYM (Qunix, "unix");
9034 DEFSYM (Qdos, "dos");
9035
9036 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9037 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9038 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9039 DEFSYM (Qdefault_char, "default-char");
9040 DEFSYM (Qundecided, "undecided");
9041 DEFSYM (Qno_conversion, "no-conversion");
9042 DEFSYM (Qraw_text, "raw-text");
9043
9044 DEFSYM (Qiso_2022, "iso-2022");
9045
9046 DEFSYM (Qutf_8, "utf-8");
9047 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9048
9049 DEFSYM (Qutf_16, "utf-16");
9050 DEFSYM (Qbig, "big");
9051 DEFSYM (Qlittle, "little");
9052
9053 DEFSYM (Qshift_jis, "shift-jis");
9054 DEFSYM (Qbig5, "big5");
9055
9056 DEFSYM (Qcoding_system_p, "coding-system-p");
9057
9058 DEFSYM (Qcoding_system_error, "coding-system-error");
9059 Fput (Qcoding_system_error, Qerror_conditions,
9060 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9061 Fput (Qcoding_system_error, Qerror_message,
9062 build_string ("Invalid coding system"));
9063
9064 /* Intern this now in case it isn't already done.
9065 Setting this variable twice is harmless.
9066 But don't staticpro it here--that is done in alloc.c. */
9067 Qchar_table_extra_slots = intern ("char-table-extra-slots");
9068
9069 DEFSYM (Qtranslation_table, "translation-table");
9070 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9071 DEFSYM (Qtranslation_table_id, "translation-table-id");
9072 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9073 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9074
9075 DEFSYM (Qvalid_codes, "valid-codes");
9076
9077 DEFSYM (Qemacs_mule, "emacs-mule");
9078
9079 DEFSYM (QCcategory, ":category");
9080 DEFSYM (QCmnemonic, ":mnemonic");
9081 DEFSYM (QCdefalut_char, ":default-char");
9082 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9083 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9084 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9085 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9086
9087 Vcoding_category_table
9088 = Fmake_vector (make_number (coding_category_max), Qnil);
9089 staticpro (&Vcoding_category_table);
9090 /* Followings are target of code detection. */
9091 ASET (Vcoding_category_table, coding_category_iso_7,
9092 intern ("coding-category-iso-7"));
9093 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9094 intern ("coding-category-iso-7-tight"));
9095 ASET (Vcoding_category_table, coding_category_iso_8_1,
9096 intern ("coding-category-iso-8-1"));
9097 ASET (Vcoding_category_table, coding_category_iso_8_2,
9098 intern ("coding-category-iso-8-2"));
9099 ASET (Vcoding_category_table, coding_category_iso_7_else,
9100 intern ("coding-category-iso-7-else"));
9101 ASET (Vcoding_category_table, coding_category_iso_8_else,
9102 intern ("coding-category-iso-8-else"));
9103 ASET (Vcoding_category_table, coding_category_utf_8,
9104 intern ("coding-category-utf-8"));
9105 ASET (Vcoding_category_table, coding_category_utf_16_be,
9106 intern ("coding-category-utf-16-be"));
9107 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9108 intern ("coding-category-utf-16-auto"));
9109 ASET (Vcoding_category_table, coding_category_utf_16_le,
9110 intern ("coding-category-utf-16-le"));
9111 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9112 intern ("coding-category-utf-16-be-nosig"));
9113 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9114 intern ("coding-category-utf-16-le-nosig"));
9115 ASET (Vcoding_category_table, coding_category_charset,
9116 intern ("coding-category-charset"));
9117 ASET (Vcoding_category_table, coding_category_sjis,
9118 intern ("coding-category-sjis"));
9119 ASET (Vcoding_category_table, coding_category_big5,
9120 intern ("coding-category-big5"));
9121 ASET (Vcoding_category_table, coding_category_ccl,
9122 intern ("coding-category-ccl"));
9123 ASET (Vcoding_category_table, coding_category_emacs_mule,
9124 intern ("coding-category-emacs-mule"));
9125 /* Followings are NOT target of code detection. */
9126 ASET (Vcoding_category_table, coding_category_raw_text,
9127 intern ("coding-category-raw-text"));
9128 ASET (Vcoding_category_table, coding_category_undecided,
9129 intern ("coding-category-undecided"));
9130
9131 DEFSYM (Qinsufficient_source, "insufficient-source");
9132 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9133 DEFSYM (Qinvalid_source, "invalid-source");
9134 DEFSYM (Qinterrupted, "interrupted");
9135 DEFSYM (Qinsufficient_memory, "insufficient-memory");
9136
9137 defsubr (&Scoding_system_p);
9138 defsubr (&Sread_coding_system);
9139 defsubr (&Sread_non_nil_coding_system);
9140 defsubr (&Scheck_coding_system);
9141 defsubr (&Sdetect_coding_region);
9142 defsubr (&Sdetect_coding_string);
9143 defsubr (&Sfind_coding_systems_region_internal);
9144 defsubr (&Sunencodable_char_position);
9145 defsubr (&Scheck_coding_systems_region);
9146 defsubr (&Sdecode_coding_region);
9147 defsubr (&Sencode_coding_region);
9148 defsubr (&Sdecode_coding_string);
9149 defsubr (&Sencode_coding_string);
9150 defsubr (&Sdecode_sjis_char);
9151 defsubr (&Sencode_sjis_char);
9152 defsubr (&Sdecode_big5_char);
9153 defsubr (&Sencode_big5_char);
9154 defsubr (&Sset_terminal_coding_system_internal);
9155 defsubr (&Sset_safe_terminal_coding_system_internal);
9156 defsubr (&Sterminal_coding_system);
9157 defsubr (&Sset_keyboard_coding_system_internal);
9158 defsubr (&Skeyboard_coding_system);
9159 defsubr (&Sfind_operation_coding_system);
9160 defsubr (&Sset_coding_system_priority);
9161 defsubr (&Sdefine_coding_system_internal);
9162 defsubr (&Sdefine_coding_system_alias);
9163 defsubr (&Scoding_system_put);
9164 defsubr (&Scoding_system_base);
9165 defsubr (&Scoding_system_plist);
9166 defsubr (&Scoding_system_aliases);
9167 defsubr (&Scoding_system_eol_type);
9168 defsubr (&Scoding_system_priority_list);
9169
9170 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9171 doc: /* List of coding systems.
9172
9173 Do not alter the value of this variable manually. This variable should be
9174 updated by the functions `define-coding-system' and
9175 `define-coding-system-alias'. */);
9176 Vcoding_system_list = Qnil;
9177
9178 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9179 doc: /* Alist of coding system names.
9180 Each element is one element list of coding system name.
9181 This variable is given to `completing-read' as TABLE argument.
9182
9183 Do not alter the value of this variable manually. This variable should be
9184 updated by the functions `make-coding-system' and
9185 `define-coding-system-alias'. */);
9186 Vcoding_system_alist = Qnil;
9187
9188 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9189 doc: /* List of coding-categories (symbols) ordered by priority.
9190
9191 On detecting a coding system, Emacs tries code detection algorithms
9192 associated with each coding-category one by one in this order. When
9193 one algorithm agrees with a byte sequence of source text, the coding
9194 system bound to the corresponding coding-category is selected. */);
9195 {
9196 int i;
9197
9198 Vcoding_category_list = Qnil;
9199 for (i = coding_category_max - 1; i >= 0; i--)
9200 Vcoding_category_list
9201 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9202 Vcoding_category_list);
9203 }
9204
9205 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9206 doc: /* Specify the coding system for read operations.
9207 It is useful to bind this variable with `let', but do not set it globally.
9208 If the value is a coding system, it is used for decoding on read operation.
9209 If not, an appropriate element is used from one of the coding system alists:
9210 There are three such tables, `file-coding-system-alist',
9211 `process-coding-system-alist', and `network-coding-system-alist'. */);
9212 Vcoding_system_for_read = Qnil;
9213
9214 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9215 doc: /* Specify the coding system for write operations.
9216 Programs bind this variable with `let', but you should not set it globally.
9217 If the value is a coding system, it is used for encoding of output,
9218 when writing it to a file and when sending it to a file or subprocess.
9219
9220 If this does not specify a coding system, an appropriate element
9221 is used from one of the coding system alists:
9222 There are three such tables, `file-coding-system-alist',
9223 `process-coding-system-alist', and `network-coding-system-alist'.
9224 For output to files, if the above procedure does not specify a coding system,
9225 the value of `buffer-file-coding-system' is used. */);
9226 Vcoding_system_for_write = Qnil;
9227
9228 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9229 doc: /*
9230 Coding system used in the latest file or process I/O. */);
9231 Vlast_coding_system_used = Qnil;
9232
9233 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9234 doc: /*
9235 Error status of the last code conversion.
9236
9237 When an error was detected in the last code conversion, this variable
9238 is set to one of the following symbols.
9239 `insufficient-source'
9240 `inconsistent-eol'
9241 `invalid-source'
9242 `interrupted'
9243 `insufficient-memory'
9244 When no error was detected, the value doesn't change. So, to check
9245 the error status of a code conversion by this variable, you must
9246 explicitly set this variable to nil before performing code
9247 conversion. */);
9248 Vlast_code_conversion_error = Qnil;
9249
9250 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9251 doc: /*
9252 *Non-nil means always inhibit code conversion of end-of-line format.
9253 See info node `Coding Systems' and info node `Text and Binary' concerning
9254 such conversion. */);
9255 inhibit_eol_conversion = 0;
9256
9257 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9258 doc: /*
9259 Non-nil means process buffer inherits coding system of process output.
9260 Bind it to t if the process output is to be treated as if it were a file
9261 read from some filesystem. */);
9262 inherit_process_coding_system = 0;
9263
9264 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9265 doc: /*
9266 Alist to decide a coding system to use for a file I/O operation.
9267 The format is ((PATTERN . VAL) ...),
9268 where PATTERN is a regular expression matching a file name,
9269 VAL is a coding system, a cons of coding systems, or a function symbol.
9270 If VAL is a coding system, it is used for both decoding and encoding
9271 the file contents.
9272 If VAL is a cons of coding systems, the car part is used for decoding,
9273 and the cdr part is used for encoding.
9274 If VAL is a function symbol, the function must return a coding system
9275 or a cons of coding systems which are used as above. The function gets
9276 the arguments with which `find-operation-coding-systems' was called.
9277
9278 See also the function `find-operation-coding-system'
9279 and the variable `auto-coding-alist'. */);
9280 Vfile_coding_system_alist = Qnil;
9281
9282 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9283 doc: /*
9284 Alist to decide a coding system to use for a process I/O operation.
9285 The format is ((PATTERN . VAL) ...),
9286 where PATTERN is a regular expression matching a program name,
9287 VAL is a coding system, a cons of coding systems, or a function symbol.
9288 If VAL is a coding system, it is used for both decoding what received
9289 from the program and encoding what sent to the program.
9290 If VAL is a cons of coding systems, the car part is used for decoding,
9291 and the cdr part is used for encoding.
9292 If VAL is a function symbol, the function must return a coding system
9293 or a cons of coding systems which are used as above.
9294
9295 See also the function `find-operation-coding-system'. */);
9296 Vprocess_coding_system_alist = Qnil;
9297
9298 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9299 doc: /*
9300 Alist to decide a coding system to use for a network I/O operation.
9301 The format is ((PATTERN . VAL) ...),
9302 where PATTERN is a regular expression matching a network service name
9303 or is a port number to connect to,
9304 VAL is a coding system, a cons of coding systems, or a function symbol.
9305 If VAL is a coding system, it is used for both decoding what received
9306 from the network stream and encoding what sent to the network stream.
9307 If VAL is a cons of coding systems, the car part is used for decoding,
9308 and the cdr part is used for encoding.
9309 If VAL is a function symbol, the function must return a coding system
9310 or a cons of coding systems which are used as above.
9311
9312 See also the function `find-operation-coding-system'. */);
9313 Vnetwork_coding_system_alist = Qnil;
9314
9315 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9316 doc: /* Coding system to use with system messages.
9317 Also used for decoding keyboard input on X Window system. */);
9318 Vlocale_coding_system = Qnil;
9319
9320 /* The eol mnemonics are reset in startup.el system-dependently. */
9321 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9322 doc: /*
9323 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9324 eol_mnemonic_unix = build_string (":");
9325
9326 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9327 doc: /*
9328 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9329 eol_mnemonic_dos = build_string ("\\");
9330
9331 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9332 doc: /*
9333 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9334 eol_mnemonic_mac = build_string ("/");
9335
9336 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9337 doc: /*
9338 *String displayed in mode line when end-of-line format is not yet determined. */);
9339 eol_mnemonic_undecided = build_string (":");
9340
9341 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9342 doc: /*
9343 *Non-nil enables character translation while encoding and decoding. */);
9344 Venable_character_translation = Qt;
9345
9346 DEFVAR_LISP ("standard-translation-table-for-decode",
9347 &Vstandard_translation_table_for_decode,
9348 doc: /* Table for translating characters while decoding. */);
9349 Vstandard_translation_table_for_decode = Qnil;
9350
9351 DEFVAR_LISP ("standard-translation-table-for-encode",
9352 &Vstandard_translation_table_for_encode,
9353 doc: /* Table for translating characters while encoding. */);
9354 Vstandard_translation_table_for_encode = Qnil;
9355
9356 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9357 doc: /* Alist of charsets vs revision numbers.
9358 While encoding, if a charset (car part of an element) is found,
9359 designate it with the escape sequence identifying revision (cdr part
9360 of the element). */);
9361 Vcharset_revision_table = Qnil;
9362
9363 DEFVAR_LISP ("default-process-coding-system",
9364 &Vdefault_process_coding_system,
9365 doc: /* Cons of coding systems used for process I/O by default.
9366 The car part is used for decoding a process output,
9367 the cdr part is used for encoding a text to be sent to a process. */);
9368 Vdefault_process_coding_system = Qnil;
9369
9370 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9371 doc: /*
9372 Table of extra Latin codes in the range 128..159 (inclusive).
9373 This is a vector of length 256.
9374 If Nth element is non-nil, the existence of code N in a file
9375 \(or output of subprocess) doesn't prevent it to be detected as
9376 a coding system of ISO 2022 variant which has a flag
9377 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9378 or reading output of a subprocess.
9379 Only 128th through 159th elements has a meaning. */);
9380 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9381
9382 DEFVAR_LISP ("select-safe-coding-system-function",
9383 &Vselect_safe_coding_system_function,
9384 doc: /*
9385 Function to call to select safe coding system for encoding a text.
9386
9387 If set, this function is called to force a user to select a proper
9388 coding system which can encode the text in the case that a default
9389 coding system used in each operation can't encode the text.
9390
9391 The default value is `select-safe-coding-system' (which see). */);
9392 Vselect_safe_coding_system_function = Qnil;
9393
9394 DEFVAR_BOOL ("coding-system-require-warning",
9395 &coding_system_require_warning,
9396 doc: /* Internal use only.
9397 If non-nil, on writing a file, `select-safe-coding-system-function' is
9398 called even if `coding-system-for-write' is non-nil. The command
9399 `universal-coding-system-argument' binds this variable to t temporarily. */);
9400 coding_system_require_warning = 0;
9401
9402
9403 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9404 &inhibit_iso_escape_detection,
9405 doc: /*
9406 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9407
9408 By default, on reading a file, Emacs tries to detect how the text is
9409 encoded. This code detection is sensitive to escape sequences. If
9410 the sequence is valid as ISO2022, the code is determined as one of
9411 the ISO2022 encodings, and the file is decoded by the corresponding
9412 coding system (e.g. `iso-2022-7bit').
9413
9414 However, there may be a case that you want to read escape sequences in
9415 a file as is. In such a case, you can set this variable to non-nil.
9416 Then, as the code detection ignores any escape sequences, no file is
9417 detected as encoded in some ISO2022 encoding. The result is that all
9418 escape sequences become visible in a buffer.
9419
9420 The default value is nil, and it is strongly recommended not to change
9421 it. That is because many Emacs Lisp source files that contain
9422 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9423 in Emacs's distribution, and they won't be decoded correctly on
9424 reading if you suppress escape sequence detection.
9425
9426 The other way to read escape sequences in a file without decoding is
9427 to explicitly specify some coding system that doesn't use ISO2022's
9428 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9429 inhibit_iso_escape_detection = 0;
9430
9431 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9432 doc: /* Char table for translating self-inserting characters.
9433 This is applied to the result of input methods, not their input. See also
9434 `keyboard-translate-table'. */);
9435 Vtranslation_table_for_input = Qnil;
9436
9437 {
9438 Lisp_Object args[coding_arg_max];
9439 Lisp_Object plist[16];
9440 int i;
9441
9442 for (i = 0; i < coding_arg_max; i++)
9443 args[i] = Qnil;
9444
9445 plist[0] = intern (":name");
9446 plist[1] = args[coding_arg_name] = Qno_conversion;
9447 plist[2] = intern (":mnemonic");
9448 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9449 plist[4] = intern (":coding-type");
9450 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9451 plist[6] = intern (":ascii-compatible-p");
9452 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9453 plist[8] = intern (":default-char");
9454 plist[9] = args[coding_arg_default_char] = make_number (0);
9455 plist[10] = intern (":for-unibyte");
9456 plist[11] = args[coding_arg_for_unibyte] = Qt;
9457 plist[12] = intern (":docstring");
9458 plist[13] = build_string ("Do no conversion.\n\
9459 \n\
9460 When you visit a file with this coding, the file is read into a\n\
9461 unibyte buffer as is, thus each byte of a file is treated as a\n\
9462 character.");
9463 plist[14] = intern (":eol-type");
9464 plist[15] = args[coding_arg_eol_type] = Qunix;
9465 args[coding_arg_plist] = Flist (16, plist);
9466 Fdefine_coding_system_internal (coding_arg_max, args);
9467 }
9468
9469 setup_coding_system (Qno_conversion, &keyboard_coding);
9470 setup_coding_system (Qno_conversion, &terminal_coding);
9471 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9472
9473 {
9474 int i;
9475
9476 for (i = 0; i < coding_category_max; i++)
9477 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9478 }
9479 }
9480
9481 char *
9482 emacs_strerror (error_number)
9483 int error_number;
9484 {
9485 char *str;
9486
9487 synchronize_system_messages_locale ();
9488 str = strerror (error_number);
9489
9490 if (! NILP (Vlocale_coding_system))
9491 {
9492 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9493 Vlocale_coding_system,
9494 0);
9495 str = (char *) SDATA (dec);
9496 }
9497
9498 return str;
9499 }
9500
9501 #endif /* emacs */