(detect_coding_iso_2022): Fix handling of SS2 and SS3.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
317
318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
320 Lisp_Object Qstart_process, Qopen_network_stream;
321 Lisp_Object Qtarget_idx;
322
323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
324 Lisp_Object Qinterrupted, Qinsufficient_memory;
325
326 int coding_system_require_warning;
327
328 Lisp_Object Vselect_safe_coding_system_function;
329
330 /* Mnemonic string for each format of end-of-line. */
331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
332 /* Mnemonic string to indicate format of end-of-line is not yet
333 decided. */
334 Lisp_Object eol_mnemonic_undecided;
335
336 #ifdef emacs
337
338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
339
340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
341
342 /* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344 Lisp_Object Qemacs_mule, Qraw_text;
345 Lisp_Object Qutf_8_emacs;
346
347 /* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */
349 /* Coding-system for reading files and receiving data from process. */
350 Lisp_Object Vcoding_system_for_read;
351 /* Coding-system for writing files and sending data to process. */
352 Lisp_Object Vcoding_system_for_write;
353 /* Coding-system actually used in the latest I/O. */
354 Lisp_Object Vlast_coding_system_used;
355 /* Set to non-nil when an error is detected while code conversion. */
356 Lisp_Object Vlast_code_conversion_error;
357 /* A vector of length 256 which contains information about special
358 Latin codes (especially for dealing with Microsoft codes). */
359 Lisp_Object Vlatin_extra_code_table;
360
361 /* Flag to inhibit code conversion of end-of-line format. */
362 int inhibit_eol_conversion;
363
364 /* Flag to inhibit ISO2022 escape sequence detection. */
365 int inhibit_iso_escape_detection;
366
367 /* Flag to make buffer-file-coding-system inherit from process-coding. */
368 int inherit_process_coding_system;
369
370 /* Coding system to be used to encode text for terminal display. */
371 struct coding_system terminal_coding;
372
373 /* Coding system to be used to encode text for terminal display when
374 terminal coding system is nil. */
375 struct coding_system safe_terminal_coding;
376
377 /* Coding system of what is sent from terminal keyboard. */
378 struct coding_system keyboard_coding;
379
380 Lisp_Object Vfile_coding_system_alist;
381 Lisp_Object Vprocess_coding_system_alist;
382 Lisp_Object Vnetwork_coding_system_alist;
383
384 Lisp_Object Vlocale_coding_system;
385
386 #endif /* emacs */
387
388 /* Flag to tell if we look up translation table on character code
389 conversion. */
390 Lisp_Object Venable_character_translation;
391 /* Standard translation table to look up on decoding (reading). */
392 Lisp_Object Vstandard_translation_table_for_decode;
393 /* Standard translation table to look up on encoding (writing). */
394 Lisp_Object Vstandard_translation_table_for_encode;
395
396 Lisp_Object Qtranslation_table;
397 Lisp_Object Qtranslation_table_id;
398 Lisp_Object Qtranslation_table_for_decode;
399 Lisp_Object Qtranslation_table_for_encode;
400
401 /* Alist of charsets vs revision number. */
402 static Lisp_Object Vcharset_revision_table;
403
404 /* Default coding systems used for process I/O. */
405 Lisp_Object Vdefault_process_coding_system;
406
407 /* Char table for translating Quail and self-inserting input. */
408 Lisp_Object Vtranslation_table_for_input;
409
410 /* Two special coding systems. */
411 Lisp_Object Vsjis_coding_system;
412 Lisp_Object Vbig5_coding_system;
413
414 static void record_conversion_result (struct coding_system *coding,
415 enum coding_result_code result);
416 static int detect_coding_utf_8 P_ ((struct coding_system *,
417 struct coding_detection_info *info));
418 static void decode_coding_utf_8 P_ ((struct coding_system *));
419 static int encode_coding_utf_8 P_ ((struct coding_system *));
420
421 static int detect_coding_utf_16 P_ ((struct coding_system *,
422 struct coding_detection_info *info));
423 static void decode_coding_utf_16 P_ ((struct coding_system *));
424 static int encode_coding_utf_16 P_ ((struct coding_system *));
425
426 static int detect_coding_iso_2022 P_ ((struct coding_system *,
427 struct coding_detection_info *info));
428 static void decode_coding_iso_2022 P_ ((struct coding_system *));
429 static int encode_coding_iso_2022 P_ ((struct coding_system *));
430
431 static int detect_coding_emacs_mule P_ ((struct coding_system *,
432 struct coding_detection_info *info));
433 static void decode_coding_emacs_mule P_ ((struct coding_system *));
434 static int encode_coding_emacs_mule P_ ((struct coding_system *));
435
436 static int detect_coding_sjis P_ ((struct coding_system *,
437 struct coding_detection_info *info));
438 static void decode_coding_sjis P_ ((struct coding_system *));
439 static int encode_coding_sjis P_ ((struct coding_system *));
440
441 static int detect_coding_big5 P_ ((struct coding_system *,
442 struct coding_detection_info *info));
443 static void decode_coding_big5 P_ ((struct coding_system *));
444 static int encode_coding_big5 P_ ((struct coding_system *));
445
446 static int detect_coding_ccl P_ ((struct coding_system *,
447 struct coding_detection_info *info));
448 static void decode_coding_ccl P_ ((struct coding_system *));
449 static int encode_coding_ccl P_ ((struct coding_system *));
450
451 static void decode_coding_raw_text P_ ((struct coding_system *));
452 static int encode_coding_raw_text P_ ((struct coding_system *));
453
454
455 /* ISO2022 section */
456
457 #define CODING_ISO_INITIAL(coding, reg) \
458 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
459 coding_attr_iso_initial), \
460 reg)))
461
462
463 #define CODING_ISO_REQUEST(coding, charset_id) \
464 ((charset_id <= (coding)->max_charset_id \
465 ? (coding)->safe_charsets[charset_id] \
466 : -1))
467
468
469 #define CODING_ISO_FLAGS(coding) \
470 ((coding)->spec.iso_2022.flags)
471 #define CODING_ISO_DESIGNATION(coding, reg) \
472 ((coding)->spec.iso_2022.current_designation[reg])
473 #define CODING_ISO_INVOCATION(coding, plane) \
474 ((coding)->spec.iso_2022.current_invocation[plane])
475 #define CODING_ISO_SINGLE_SHIFTING(coding) \
476 ((coding)->spec.iso_2022.single_shifting)
477 #define CODING_ISO_BOL(coding) \
478 ((coding)->spec.iso_2022.bol)
479 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
480 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
481
482 /* Control characters of ISO2022. */
483 /* code */ /* function */
484 #define ISO_CODE_LF 0x0A /* line-feed */
485 #define ISO_CODE_CR 0x0D /* carriage-return */
486 #define ISO_CODE_SO 0x0E /* shift-out */
487 #define ISO_CODE_SI 0x0F /* shift-in */
488 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
489 #define ISO_CODE_ESC 0x1B /* escape */
490 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
491 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
492 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
493
494 /* All code (1-byte) of ISO2022 is classified into one of the
495 followings. */
496 enum iso_code_class_type
497 {
498 ISO_control_0, /* Control codes in the range
499 0x00..0x1F and 0x7F, except for the
500 following 5 codes. */
501 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
502 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
503 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
504 ISO_escape, /* ISO_CODE_SO (0x1B) */
505 ISO_control_1, /* Control codes in the range
506 0x80..0x9F, except for the
507 following 3 codes. */
508 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
509 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
510 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
511 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
512 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
513 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
514 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
515 };
516
517 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
518 `iso-flags' attribute of an iso2022 coding system. */
519
520 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
521 instead of the correct short-form sequence (e.g. ESC $ A). */
522 #define CODING_ISO_FLAG_LONG_FORM 0x0001
523
524 /* If set, reset graphic planes and registers at end-of-line to the
525 initial state. */
526 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
527
528 /* If set, reset graphic planes and registers before any control
529 characters to the initial state. */
530 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
531
532 /* If set, encode by 7-bit environment. */
533 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
534
535 /* If set, use locking-shift function. */
536 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
537
538 /* If set, use single-shift function. Overwrite
539 CODING_ISO_FLAG_LOCKING_SHIFT. */
540 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
541
542 /* If set, use designation escape sequence. */
543 #define CODING_ISO_FLAG_DESIGNATION 0x0040
544
545 /* If set, produce revision number sequence. */
546 #define CODING_ISO_FLAG_REVISION 0x0080
547
548 /* If set, produce ISO6429's direction specifying sequence. */
549 #define CODING_ISO_FLAG_DIRECTION 0x0100
550
551 /* If set, assume designation states are reset at beginning of line on
552 output. */
553 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
554
555 /* If set, designation sequence should be placed at beginning of line
556 on output. */
557 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
558
559 /* If set, do not encode unsafe charactes on output. */
560 #define CODING_ISO_FLAG_SAFE 0x0800
561
562 /* If set, extra latin codes (128..159) are accepted as a valid code
563 on input. */
564 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
565
566 #define CODING_ISO_FLAG_COMPOSITION 0x2000
567
568 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
569
570 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
571
572 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
573
574 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
575
576 /* A character to be produced on output if encoding of the original
577 character is prohibited by CODING_ISO_FLAG_SAFE. */
578 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
579
580
581 /* UTF-16 section */
582 #define CODING_UTF_16_BOM(coding) \
583 ((coding)->spec.utf_16.bom)
584
585 #define CODING_UTF_16_ENDIAN(coding) \
586 ((coding)->spec.utf_16.endian)
587
588 #define CODING_UTF_16_SURROGATE(coding) \
589 ((coding)->spec.utf_16.surrogate)
590
591
592 /* CCL section */
593 #define CODING_CCL_DECODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
595 #define CODING_CCL_ENCODER(coding) \
596 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
597 #define CODING_CCL_VALIDS(coding) \
598 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
599
600 /* Index for each coding category in `coding_categories' */
601
602 enum coding_category
603 {
604 coding_category_iso_7,
605 coding_category_iso_7_tight,
606 coding_category_iso_8_1,
607 coding_category_iso_8_2,
608 coding_category_iso_7_else,
609 coding_category_iso_8_else,
610 coding_category_utf_8,
611 coding_category_utf_16_auto,
612 coding_category_utf_16_be,
613 coding_category_utf_16_le,
614 coding_category_utf_16_be_nosig,
615 coding_category_utf_16_le_nosig,
616 coding_category_charset,
617 coding_category_sjis,
618 coding_category_big5,
619 coding_category_ccl,
620 coding_category_emacs_mule,
621 /* All above are targets of code detection. */
622 coding_category_raw_text,
623 coding_category_undecided,
624 coding_category_max
625 };
626
627 /* Definitions of flag bits used in detect_coding_XXXX. */
628 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
629 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
630 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
631 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
632 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
633 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
634 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
635 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
636 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
637 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
638 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
639 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
640 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
641 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
642 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
643 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
644 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
645 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
646
647 /* This value is returned if detect_coding_mask () find nothing other
648 than ASCII characters. */
649 #define CATEGORY_MASK_ANY \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_8_1 \
653 | CATEGORY_MASK_ISO_8_2 \
654 | CATEGORY_MASK_ISO_7_ELSE \
655 | CATEGORY_MASK_ISO_8_ELSE \
656 | CATEGORY_MASK_UTF_8 \
657 | CATEGORY_MASK_UTF_16_BE \
658 | CATEGORY_MASK_UTF_16_LE \
659 | CATEGORY_MASK_UTF_16_BE_NOSIG \
660 | CATEGORY_MASK_UTF_16_LE_NOSIG \
661 | CATEGORY_MASK_CHARSET \
662 | CATEGORY_MASK_SJIS \
663 | CATEGORY_MASK_BIG5 \
664 | CATEGORY_MASK_CCL \
665 | CATEGORY_MASK_EMACS_MULE)
666
667
668 #define CATEGORY_MASK_ISO_7BIT \
669 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
670
671 #define CATEGORY_MASK_ISO_8BIT \
672 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
673
674 #define CATEGORY_MASK_ISO_ELSE \
675 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
676
677 #define CATEGORY_MASK_ISO_ESCAPE \
678 (CATEGORY_MASK_ISO_7 \
679 | CATEGORY_MASK_ISO_7_TIGHT \
680 | CATEGORY_MASK_ISO_7_ELSE \
681 | CATEGORY_MASK_ISO_8_ELSE)
682
683 #define CATEGORY_MASK_ISO \
684 ( CATEGORY_MASK_ISO_7BIT \
685 | CATEGORY_MASK_ISO_8BIT \
686 | CATEGORY_MASK_ISO_ELSE)
687
688 #define CATEGORY_MASK_UTF_16 \
689 (CATEGORY_MASK_UTF_16_BE \
690 | CATEGORY_MASK_UTF_16_LE \
691 | CATEGORY_MASK_UTF_16_BE_NOSIG \
692 | CATEGORY_MASK_UTF_16_LE_NOSIG)
693
694
695 /* List of symbols `coding-category-xxx' ordered by priority. This
696 variable is exposed to Emacs Lisp. */
697 static Lisp_Object Vcoding_category_list;
698
699 /* Table of coding categories (Lisp symbols). This variable is for
700 internal use oly. */
701 static Lisp_Object Vcoding_category_table;
702
703 /* Table of coding-categories ordered by priority. */
704 static enum coding_category coding_priorities[coding_category_max];
705
706 /* Nth element is a coding context for the coding system bound to the
707 Nth coding category. */
708 static struct coding_system coding_categories[coding_category_max];
709
710 /*** Commonly used macros and functions ***/
711
712 #ifndef min
713 #define min(a, b) ((a) < (b) ? (a) : (b))
714 #endif
715 #ifndef max
716 #define max(a, b) ((a) > (b) ? (a) : (b))
717 #endif
718
719 #define CODING_GET_INFO(coding, attrs, charset_list) \
720 do { \
721 (attrs) = CODING_ID_ATTRS ((coding)->id); \
722 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
723 } while (0)
724
725
726 /* Safely get one byte from the source text pointed by SRC which ends
727 at SRC_END, and set C to that byte. If there are not enough bytes
728 in the source, it jumps to `no_more_source'. If multibytep is
729 nonzero, and a multibyte character is found at SRC, set C to the
730 negative value of the character code. The caller should declare
731 and set these variables appropriately in advance:
732 src, src_end, multibytep */
733
734 #define ONE_MORE_BYTE(c) \
735 do { \
736 if (src == src_end) \
737 { \
738 if (src_base < src) \
739 record_conversion_result \
740 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
741 goto no_more_source; \
742 } \
743 c = *src++; \
744 if (multibytep && (c & 0x80)) \
745 { \
746 if ((c & 0xFE) == 0xC0) \
747 c = ((c & 1) << 6) | *src++; \
748 else \
749 { \
750 c = - string_char (--src, &src, NULL); \
751 record_conversion_result \
752 (coding, CODING_RESULT_INVALID_SRC); \
753 } \
754 } \
755 consumed_chars++; \
756 } while (0)
757
758
759 #define ONE_MORE_BYTE_NO_CHECK(c) \
760 do { \
761 c = *src++; \
762 if (multibytep && (c & 0x80)) \
763 { \
764 if ((c & 0xFE) == 0xC0) \
765 c = ((c & 1) << 6) | *src++; \
766 else \
767 { \
768 c = - string_char (--src, &src, NULL); \
769 record_conversion_result \
770 (coding, CODING_RESULT_INVALID_SRC); \
771 } \
772 } \
773 consumed_chars++; \
774 } while (0)
775
776
777 /* Store a byte C in the place pointed by DST and increment DST to the
778 next free point, and increment PRODUCED_CHARS. The caller should
779 assure that C is 0..127, and declare and set the variable `dst'
780 appropriately in advance.
781 */
782
783
784 #define EMIT_ONE_ASCII_BYTE(c) \
785 do { \
786 produced_chars++; \
787 *dst++ = (c); \
788 } while (0)
789
790
791 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
792
793 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
794 do { \
795 produced_chars += 2; \
796 *dst++ = (c1), *dst++ = (c2); \
797 } while (0)
798
799
800 /* Store a byte C in the place pointed by DST and increment DST to the
801 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
802 nonzero, store in an appropriate multibyte from. The caller should
803 declare and set the variables `dst' and `multibytep' appropriately
804 in advance. */
805
806 #define EMIT_ONE_BYTE(c) \
807 do { \
808 produced_chars++; \
809 if (multibytep) \
810 { \
811 int ch = (c); \
812 if (ch >= 0x80) \
813 ch = BYTE8_TO_CHAR (ch); \
814 CHAR_STRING_ADVANCE (ch, dst); \
815 } \
816 else \
817 *dst++ = (c); \
818 } while (0)
819
820
821 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
822
823 #define EMIT_TWO_BYTES(c1, c2) \
824 do { \
825 produced_chars += 2; \
826 if (multibytep) \
827 { \
828 int ch; \
829 \
830 ch = (c1); \
831 if (ch >= 0x80) \
832 ch = BYTE8_TO_CHAR (ch); \
833 CHAR_STRING_ADVANCE (ch, dst); \
834 ch = (c2); \
835 if (ch >= 0x80) \
836 ch = BYTE8_TO_CHAR (ch); \
837 CHAR_STRING_ADVANCE (ch, dst); \
838 } \
839 else \
840 { \
841 *dst++ = (c1); \
842 *dst++ = (c2); \
843 } \
844 } while (0)
845
846
847 #define EMIT_THREE_BYTES(c1, c2, c3) \
848 do { \
849 EMIT_ONE_BYTE (c1); \
850 EMIT_TWO_BYTES (c2, c3); \
851 } while (0)
852
853
854 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
855 do { \
856 EMIT_TWO_BYTES (c1, c2); \
857 EMIT_TWO_BYTES (c3, c4); \
858 } while (0)
859
860
861 static void
862 record_conversion_result (struct coding_system *coding,
863 enum coding_result_code result)
864 {
865 coding->result = result;
866 switch (result)
867 {
868 case CODING_RESULT_INSUFFICIENT_SRC:
869 Vlast_code_conversion_error = Qinsufficient_source;
870 break;
871 case CODING_RESULT_INCONSISTENT_EOL:
872 Vlast_code_conversion_error = Qinconsistent_eol;
873 break;
874 case CODING_RESULT_INVALID_SRC:
875 Vlast_code_conversion_error = Qinvalid_source;
876 break;
877 case CODING_RESULT_INTERRUPT:
878 Vlast_code_conversion_error = Qinterrupted;
879 break;
880 case CODING_RESULT_INSUFFICIENT_MEM:
881 Vlast_code_conversion_error = Qinsufficient_memory;
882 break;
883 }
884 }
885
886 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
887 do { \
888 charset_map_loaded = 0; \
889 c = DECODE_CHAR (charset, code); \
890 if (charset_map_loaded) \
891 { \
892 const unsigned char *orig = coding->source; \
893 EMACS_INT offset; \
894 \
895 coding_set_source (coding); \
896 offset = coding->source - orig; \
897 src += offset; \
898 src_base += offset; \
899 src_end += offset; \
900 } \
901 } while (0)
902
903
904 #define ASSURE_DESTINATION(bytes) \
905 do { \
906 if (dst + (bytes) >= dst_end) \
907 { \
908 int more_bytes = charbuf_end - charbuf + (bytes); \
909 \
910 dst = alloc_destination (coding, more_bytes, dst); \
911 dst_end = coding->destination + coding->dst_bytes; \
912 } \
913 } while (0)
914
915
916
917 static void
918 coding_set_source (coding)
919 struct coding_system *coding;
920 {
921 if (BUFFERP (coding->src_object))
922 {
923 struct buffer *buf = XBUFFER (coding->src_object);
924
925 if (coding->src_pos < 0)
926 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
927 else
928 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
929 }
930 else if (STRINGP (coding->src_object))
931 {
932 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
933 }
934 else
935 /* Otherwise, the source is C string and is never relocated
936 automatically. Thus we don't have to update anything. */
937 ;
938 }
939
940 static void
941 coding_set_destination (coding)
942 struct coding_system *coding;
943 {
944 if (BUFFERP (coding->dst_object))
945 {
946 if (coding->src_pos < 0)
947 {
948 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
949 coding->dst_bytes = (GAP_END_ADDR
950 - (coding->src_bytes - coding->consumed)
951 - coding->destination);
952 }
953 else
954 {
955 /* We are sure that coding->dst_pos_byte is before the gap
956 of the buffer. */
957 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
958 + coding->dst_pos_byte - 1);
959 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
960 - coding->destination);
961 }
962 }
963 else
964 /* Otherwise, the destination is C string and is never relocated
965 automatically. Thus we don't have to update anything. */
966 ;
967 }
968
969
970 static void
971 coding_alloc_by_realloc (coding, bytes)
972 struct coding_system *coding;
973 EMACS_INT bytes;
974 {
975 coding->destination = (unsigned char *) xrealloc (coding->destination,
976 coding->dst_bytes + bytes);
977 coding->dst_bytes += bytes;
978 }
979
980 static void
981 coding_alloc_by_making_gap (coding, bytes)
982 struct coding_system *coding;
983 EMACS_INT bytes;
984 {
985 if (BUFFERP (coding->dst_object)
986 && EQ (coding->src_object, coding->dst_object))
987 {
988 EMACS_INT add = coding->src_bytes - coding->consumed;
989
990 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
991 make_gap (bytes);
992 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
993 }
994 else
995 {
996 Lisp_Object this_buffer;
997
998 this_buffer = Fcurrent_buffer ();
999 set_buffer_internal (XBUFFER (coding->dst_object));
1000 make_gap (bytes);
1001 set_buffer_internal (XBUFFER (this_buffer));
1002 }
1003 }
1004
1005
1006 static unsigned char *
1007 alloc_destination (coding, nbytes, dst)
1008 struct coding_system *coding;
1009 EMACS_INT nbytes;
1010 unsigned char *dst;
1011 {
1012 EMACS_INT offset = dst - coding->destination;
1013
1014 if (BUFFERP (coding->dst_object))
1015 coding_alloc_by_making_gap (coding, nbytes);
1016 else
1017 coding_alloc_by_realloc (coding, nbytes);
1018 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1019 coding_set_destination (coding);
1020 dst = coding->destination + offset;
1021 return dst;
1022 }
1023
1024 /** Macros for annotations. */
1025
1026 /* Maximum length of annotation data (sum of annotations for
1027 composition and charset). */
1028 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1029
1030 /* An annotation data is stored in the array coding->charbuf in this
1031 format:
1032 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1033 LENGTH is the number of elements in the annotation.
1034 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1035 FROM and TO specify the range of text annotated. They are relative
1036 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1037
1038 The format of the following elements depend on ANNOTATION_MASK.
1039
1040 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1041 follows:
1042 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1043 METHOD is one of enum composition_method.
1044 Optionnal COMPOSITION-COMPONENTS are characters and composition
1045 rules.
1046
1047 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1048 follows. */
1049
1050 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1051 do { \
1052 *(buf)++ = -(len); \
1053 *(buf)++ = (mask); \
1054 *(buf)++ = (from); \
1055 *(buf)++ = (to); \
1056 coding->annotated = 1; \
1057 } while (0);
1058
1059 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1060 do { \
1061 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1062 *buf++ = method; \
1063 } while (0)
1064
1065
1066 #define ADD_CHARSET_DATA(buf, from, to, id) \
1067 do { \
1068 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1069 *buf++ = id; \
1070 } while (0)
1071
1072 \f
1073 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1074
1075
1076
1077 \f
1078 /*** 3. UTF-8 ***/
1079
1080 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1081 Check if a text is encoded in UTF-8. If it is, return 1, else
1082 return 0. */
1083
1084 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1085 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1086 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1087 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1088 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1089 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1090
1091 static int
1092 detect_coding_utf_8 (coding, detect_info)
1093 struct coding_system *coding;
1094 struct coding_detection_info *detect_info;
1095 {
1096 const unsigned char *src = coding->source, *src_base;
1097 const unsigned char *src_end = coding->source + coding->src_bytes;
1098 int multibytep = coding->src_multibyte;
1099 int consumed_chars = 0;
1100 int found = 0;
1101
1102 detect_info->checked |= CATEGORY_MASK_UTF_8;
1103 /* A coding system of this category is always ASCII compatible. */
1104 src += coding->head_ascii;
1105
1106 while (1)
1107 {
1108 int c, c1, c2, c3, c4;
1109
1110 src_base = src;
1111 ONE_MORE_BYTE (c);
1112 if (c < 0 || UTF_8_1_OCTET_P (c))
1113 continue;
1114 ONE_MORE_BYTE (c1);
1115 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1116 break;
1117 if (UTF_8_2_OCTET_LEADING_P (c))
1118 {
1119 found = CATEGORY_MASK_UTF_8;
1120 continue;
1121 }
1122 ONE_MORE_BYTE (c2);
1123 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1124 break;
1125 if (UTF_8_3_OCTET_LEADING_P (c))
1126 {
1127 found = CATEGORY_MASK_UTF_8;
1128 continue;
1129 }
1130 ONE_MORE_BYTE (c3);
1131 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1132 break;
1133 if (UTF_8_4_OCTET_LEADING_P (c))
1134 {
1135 found = CATEGORY_MASK_UTF_8;
1136 continue;
1137 }
1138 ONE_MORE_BYTE (c4);
1139 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1140 break;
1141 if (UTF_8_5_OCTET_LEADING_P (c))
1142 {
1143 found = CATEGORY_MASK_UTF_8;
1144 continue;
1145 }
1146 break;
1147 }
1148 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1149 return 0;
1150
1151 no_more_source:
1152 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1153 {
1154 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1155 return 0;
1156 }
1157 detect_info->found |= found;
1158 return 1;
1159 }
1160
1161
1162 static void
1163 decode_coding_utf_8 (coding)
1164 struct coding_system *coding;
1165 {
1166 const unsigned char *src = coding->source + coding->consumed;
1167 const unsigned char *src_end = coding->source + coding->src_bytes;
1168 const unsigned char *src_base;
1169 int *charbuf = coding->charbuf;
1170 int *charbuf_end = charbuf + coding->charbuf_size;
1171 int consumed_chars = 0, consumed_chars_base;
1172 int multibytep = coding->src_multibyte;
1173 Lisp_Object attr, charset_list;
1174
1175 CODING_GET_INFO (coding, attr, charset_list);
1176
1177 while (1)
1178 {
1179 int c, c1, c2, c3, c4, c5;
1180
1181 src_base = src;
1182 consumed_chars_base = consumed_chars;
1183
1184 if (charbuf >= charbuf_end)
1185 break;
1186
1187 ONE_MORE_BYTE (c1);
1188 if (c1 < 0)
1189 {
1190 c = - c1;
1191 }
1192 else if (UTF_8_1_OCTET_P(c1))
1193 {
1194 c = c1;
1195 }
1196 else
1197 {
1198 ONE_MORE_BYTE (c2);
1199 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1200 goto invalid_code;
1201 if (UTF_8_2_OCTET_LEADING_P (c1))
1202 {
1203 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1204 /* Reject overlong sequences here and below. Encoders
1205 producing them are incorrect, they can be misleading,
1206 and they mess up read/write invariance. */
1207 if (c < 128)
1208 goto invalid_code;
1209 }
1210 else
1211 {
1212 ONE_MORE_BYTE (c3);
1213 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1214 goto invalid_code;
1215 if (UTF_8_3_OCTET_LEADING_P (c1))
1216 {
1217 c = (((c1 & 0xF) << 12)
1218 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1219 if (c < 0x800
1220 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1221 goto invalid_code;
1222 }
1223 else
1224 {
1225 ONE_MORE_BYTE (c4);
1226 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1227 goto invalid_code;
1228 if (UTF_8_4_OCTET_LEADING_P (c1))
1229 {
1230 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1231 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1232 if (c < 0x10000)
1233 goto invalid_code;
1234 }
1235 else
1236 {
1237 ONE_MORE_BYTE (c5);
1238 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1239 goto invalid_code;
1240 if (UTF_8_5_OCTET_LEADING_P (c1))
1241 {
1242 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1243 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1244 | (c5 & 0x3F));
1245 if ((c > MAX_CHAR) || (c < 0x200000))
1246 goto invalid_code;
1247 }
1248 else
1249 goto invalid_code;
1250 }
1251 }
1252 }
1253 }
1254
1255 *charbuf++ = c;
1256 continue;
1257
1258 invalid_code:
1259 src = src_base;
1260 consumed_chars = consumed_chars_base;
1261 ONE_MORE_BYTE (c);
1262 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1263 coding->errors++;
1264 }
1265
1266 no_more_source:
1267 coding->consumed_char += consumed_chars_base;
1268 coding->consumed = src_base - coding->source;
1269 coding->charbuf_used = charbuf - coding->charbuf;
1270 }
1271
1272
1273 static int
1274 encode_coding_utf_8 (coding)
1275 struct coding_system *coding;
1276 {
1277 int multibytep = coding->dst_multibyte;
1278 int *charbuf = coding->charbuf;
1279 int *charbuf_end = charbuf + coding->charbuf_used;
1280 unsigned char *dst = coding->destination + coding->produced;
1281 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1282 int produced_chars = 0;
1283 int c;
1284
1285 if (multibytep)
1286 {
1287 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1288
1289 while (charbuf < charbuf_end)
1290 {
1291 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1292
1293 ASSURE_DESTINATION (safe_room);
1294 c = *charbuf++;
1295 if (CHAR_BYTE8_P (c))
1296 {
1297 c = CHAR_TO_BYTE8 (c);
1298 EMIT_ONE_BYTE (c);
1299 }
1300 else
1301 {
1302 CHAR_STRING_ADVANCE (c, pend);
1303 for (p = str; p < pend; p++)
1304 EMIT_ONE_BYTE (*p);
1305 }
1306 }
1307 }
1308 else
1309 {
1310 int safe_room = MAX_MULTIBYTE_LENGTH;
1311
1312 while (charbuf < charbuf_end)
1313 {
1314 ASSURE_DESTINATION (safe_room);
1315 c = *charbuf++;
1316 dst += CHAR_STRING (c, dst);
1317 produced_chars++;
1318 }
1319 }
1320 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1321 coding->produced_char += produced_chars;
1322 coding->produced = dst - coding->destination;
1323 return 0;
1324 }
1325
1326
1327 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1328 Check if a text is encoded in one of UTF-16 based coding systems.
1329 If it is, return 1, else return 0. */
1330
1331 #define UTF_16_HIGH_SURROGATE_P(val) \
1332 (((val) & 0xFC00) == 0xD800)
1333
1334 #define UTF_16_LOW_SURROGATE_P(val) \
1335 (((val) & 0xFC00) == 0xDC00)
1336
1337 #define UTF_16_INVALID_P(val) \
1338 (((val) == 0xFFFE) \
1339 || ((val) == 0xFFFF) \
1340 || UTF_16_LOW_SURROGATE_P (val))
1341
1342
1343 static int
1344 detect_coding_utf_16 (coding, detect_info)
1345 struct coding_system *coding;
1346 struct coding_detection_info *detect_info;
1347 {
1348 const unsigned char *src = coding->source, *src_base = src;
1349 const unsigned char *src_end = coding->source + coding->src_bytes;
1350 int multibytep = coding->src_multibyte;
1351 int consumed_chars = 0;
1352 int c1, c2;
1353
1354 detect_info->checked |= CATEGORY_MASK_UTF_16;
1355 if (coding->mode & CODING_MODE_LAST_BLOCK
1356 && (coding->src_chars & 1))
1357 {
1358 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1359 return 0;
1360 }
1361
1362 ONE_MORE_BYTE (c1);
1363 ONE_MORE_BYTE (c2);
1364 if ((c1 == 0xFF) && (c2 == 0xFE))
1365 {
1366 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1367 | CATEGORY_MASK_UTF_16_AUTO);
1368 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1369 | CATEGORY_MASK_UTF_16_BE_NOSIG
1370 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1371 }
1372 else if ((c1 == 0xFE) && (c2 == 0xFF))
1373 {
1374 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1375 | CATEGORY_MASK_UTF_16_AUTO);
1376 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1377 | CATEGORY_MASK_UTF_16_BE_NOSIG
1378 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1379 }
1380 else if (c1 >= 0 && c2 >= 0)
1381 {
1382 unsigned char b1[256], b2[256];
1383 int b1_variants = 1, b2_variants = 1;
1384 int n;
1385
1386 bzero (b1, 256), bzero (b2, 256);
1387 b1[c1]++, b2[c2]++;
1388 for (n = 0; n < 256 && src < src_end; n++)
1389 {
1390 src_base = src;
1391 ONE_MORE_BYTE (c1);
1392 ONE_MORE_BYTE (c2);
1393 if (c1 < 0 || c2 < 0)
1394 break;
1395 if (! b1[c1++]) b1_variants++;
1396 if (! b2[c2++]) b2_variants++;
1397 }
1398 if (b1_variants < b2_variants)
1399 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1400 else
1401 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1402 detect_info->rejected
1403 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1404 }
1405 no_more_source:
1406 return 1;
1407 }
1408
1409 static void
1410 decode_coding_utf_16 (coding)
1411 struct coding_system *coding;
1412 {
1413 const unsigned char *src = coding->source + coding->consumed;
1414 const unsigned char *src_end = coding->source + coding->src_bytes;
1415 const unsigned char *src_base;
1416 int *charbuf = coding->charbuf;
1417 int *charbuf_end = charbuf + coding->charbuf_size;
1418 int consumed_chars = 0, consumed_chars_base;
1419 int multibytep = coding->src_multibyte;
1420 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1421 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1422 int surrogate = CODING_UTF_16_SURROGATE (coding);
1423 Lisp_Object attr, charset_list;
1424
1425 CODING_GET_INFO (coding, attr, charset_list);
1426
1427 if (bom == utf_16_with_bom)
1428 {
1429 int c, c1, c2;
1430
1431 src_base = src;
1432 ONE_MORE_BYTE (c1);
1433 ONE_MORE_BYTE (c2);
1434 c = (c1 << 8) | c2;
1435
1436 if (endian == utf_16_big_endian
1437 ? c != 0xFEFF : c != 0xFFFE)
1438 {
1439 /* The first two bytes are not BOM. Treat them as bytes
1440 for a normal character. */
1441 src = src_base;
1442 coding->errors++;
1443 }
1444 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1445 }
1446 else if (bom == utf_16_detect_bom)
1447 {
1448 /* We have already tried to detect BOM and failed in
1449 detect_coding. */
1450 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1451 }
1452
1453 while (1)
1454 {
1455 int c, c1, c2;
1456
1457 src_base = src;
1458 consumed_chars_base = consumed_chars;
1459
1460 if (charbuf + 2 >= charbuf_end)
1461 break;
1462
1463 ONE_MORE_BYTE (c1);
1464 if (c1 < 0)
1465 {
1466 *charbuf++ = -c1;
1467 continue;
1468 }
1469 ONE_MORE_BYTE (c2);
1470 if (c2 < 0)
1471 {
1472 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1473 *charbuf++ = -c2;
1474 continue;
1475 }
1476 c = (endian == utf_16_big_endian
1477 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1478 if (surrogate)
1479 {
1480 if (! UTF_16_LOW_SURROGATE_P (c))
1481 {
1482 if (endian == utf_16_big_endian)
1483 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1484 else
1485 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1486 *charbuf++ = c1;
1487 *charbuf++ = c2;
1488 coding->errors++;
1489 if (UTF_16_HIGH_SURROGATE_P (c))
1490 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1491 else
1492 *charbuf++ = c;
1493 }
1494 else
1495 {
1496 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1497 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1498 *charbuf++ = c;
1499 }
1500 }
1501 else
1502 {
1503 if (UTF_16_HIGH_SURROGATE_P (c))
1504 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1505 else
1506 *charbuf++ = c;
1507 }
1508 }
1509
1510 no_more_source:
1511 coding->consumed_char += consumed_chars_base;
1512 coding->consumed = src_base - coding->source;
1513 coding->charbuf_used = charbuf - coding->charbuf;
1514 }
1515
1516 static int
1517 encode_coding_utf_16 (coding)
1518 struct coding_system *coding;
1519 {
1520 int multibytep = coding->dst_multibyte;
1521 int *charbuf = coding->charbuf;
1522 int *charbuf_end = charbuf + coding->charbuf_used;
1523 unsigned char *dst = coding->destination + coding->produced;
1524 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1525 int safe_room = 8;
1526 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1527 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1528 int produced_chars = 0;
1529 Lisp_Object attrs, charset_list;
1530 int c;
1531
1532 CODING_GET_INFO (coding, attrs, charset_list);
1533
1534 if (bom != utf_16_without_bom)
1535 {
1536 ASSURE_DESTINATION (safe_room);
1537 if (big_endian)
1538 EMIT_TWO_BYTES (0xFE, 0xFF);
1539 else
1540 EMIT_TWO_BYTES (0xFF, 0xFE);
1541 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1542 }
1543
1544 while (charbuf < charbuf_end)
1545 {
1546 ASSURE_DESTINATION (safe_room);
1547 c = *charbuf++;
1548 if (c >= MAX_UNICODE_CHAR)
1549 c = coding->default_char;
1550
1551 if (c < 0x10000)
1552 {
1553 if (big_endian)
1554 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1555 else
1556 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1557 }
1558 else
1559 {
1560 int c1, c2;
1561
1562 c -= 0x10000;
1563 c1 = (c >> 10) + 0xD800;
1564 c2 = (c & 0x3FF) + 0xDC00;
1565 if (big_endian)
1566 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1567 else
1568 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1569 }
1570 }
1571 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1572 coding->produced = dst - coding->destination;
1573 coding->produced_char += produced_chars;
1574 return 0;
1575 }
1576
1577 \f
1578 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1579
1580 /* Emacs' internal format for representation of multiple character
1581 sets is a kind of multi-byte encoding, i.e. characters are
1582 represented by variable-length sequences of one-byte codes.
1583
1584 ASCII characters and control characters (e.g. `tab', `newline') are
1585 represented by one-byte sequences which are their ASCII codes, in
1586 the range 0x00 through 0x7F.
1587
1588 8-bit characters of the range 0x80..0x9F are represented by
1589 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1590 code + 0x20).
1591
1592 8-bit characters of the range 0xA0..0xFF are represented by
1593 one-byte sequences which are their 8-bit code.
1594
1595 The other characters are represented by a sequence of `base
1596 leading-code', optional `extended leading-code', and one or two
1597 `position-code's. The length of the sequence is determined by the
1598 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1599 whereas extended leading-code and position-code take the range 0xA0
1600 through 0xFF. See `charset.h' for more details about leading-code
1601 and position-code.
1602
1603 --- CODE RANGE of Emacs' internal format ---
1604 character set range
1605 ------------- -----
1606 ascii 0x00..0x7F
1607 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1608 eight-bit-graphic 0xA0..0xBF
1609 ELSE 0x81..0x9D + [0xA0..0xFF]+
1610 ---------------------------------------------
1611
1612 As this is the internal character representation, the format is
1613 usually not used externally (i.e. in a file or in a data sent to a
1614 process). But, it is possible to have a text externally in this
1615 format (i.e. by encoding by the coding system `emacs-mule').
1616
1617 In that case, a sequence of one-byte codes has a slightly different
1618 form.
1619
1620 At first, all characters in eight-bit-control are represented by
1621 one-byte sequences which are their 8-bit code.
1622
1623 Next, character composition data are represented by the byte
1624 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1625 where,
1626 METHOD is 0xF0 plus one of composition method (enum
1627 composition_method),
1628
1629 BYTES is 0xA0 plus a byte length of this composition data,
1630
1631 CHARS is 0x20 plus a number of characters composed by this
1632 data,
1633
1634 COMPONENTs are characters of multibye form or composition
1635 rules encoded by two-byte of ASCII codes.
1636
1637 In addition, for backward compatibility, the following formats are
1638 also recognized as composition data on decoding.
1639
1640 0x80 MSEQ ...
1641 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1642
1643 Here,
1644 MSEQ is a multibyte form but in these special format:
1645 ASCII: 0xA0 ASCII_CODE+0x80,
1646 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1647 RULE is a one byte code of the range 0xA0..0xF0 that
1648 represents a composition rule.
1649 */
1650
1651 char emacs_mule_bytes[256];
1652
1653 int
1654 emacs_mule_char (coding, src, nbytes, nchars, id)
1655 struct coding_system *coding;
1656 const unsigned char *src;
1657 int *nbytes, *nchars, *id;
1658 {
1659 const unsigned char *src_end = coding->source + coding->src_bytes;
1660 const unsigned char *src_base = src;
1661 int multibytep = coding->src_multibyte;
1662 struct charset *charset;
1663 unsigned code;
1664 int c;
1665 int consumed_chars = 0;
1666
1667 ONE_MORE_BYTE (c);
1668 if (c < 0)
1669 {
1670 c = -c;
1671 charset = emacs_mule_charset[0];
1672 }
1673 else
1674 {
1675 switch (emacs_mule_bytes[c])
1676 {
1677 case 2:
1678 if (! (charset = emacs_mule_charset[c]))
1679 goto invalid_code;
1680 ONE_MORE_BYTE (c);
1681 if (c < 0)
1682 goto invalid_code;
1683 code = c & 0x7F;
1684 break;
1685
1686 case 3:
1687 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1688 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1689 {
1690 ONE_MORE_BYTE (c);
1691 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1692 goto invalid_code;
1693 ONE_MORE_BYTE (c);
1694 if (c < 0)
1695 goto invalid_code;
1696 code = c & 0x7F;
1697 }
1698 else
1699 {
1700 if (! (charset = emacs_mule_charset[c]))
1701 goto invalid_code;
1702 ONE_MORE_BYTE (c);
1703 if (c < 0)
1704 goto invalid_code;
1705 code = (c & 0x7F) << 8;
1706 ONE_MORE_BYTE (c);
1707 if (c < 0)
1708 goto invalid_code;
1709 code |= c & 0x7F;
1710 }
1711 break;
1712
1713 case 4:
1714 ONE_MORE_BYTE (c);
1715 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1716 goto invalid_code;
1717 ONE_MORE_BYTE (c);
1718 if (c < 0)
1719 goto invalid_code;
1720 code = (c & 0x7F) << 8;
1721 ONE_MORE_BYTE (c);
1722 if (c < 0)
1723 goto invalid_code;
1724 code |= c & 0x7F;
1725 break;
1726
1727 case 1:
1728 code = c;
1729 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1730 ? charset_ascii : charset_eight_bit);
1731 break;
1732
1733 default:
1734 abort ();
1735 }
1736 c = DECODE_CHAR (charset, code);
1737 if (c < 0)
1738 goto invalid_code;
1739 }
1740 *nbytes = src - src_base;
1741 *nchars = consumed_chars;
1742 if (id)
1743 *id = charset->id;
1744 return c;
1745
1746 no_more_source:
1747 return -2;
1748
1749 invalid_code:
1750 return -1;
1751 }
1752
1753
1754 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1755 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1756 else return 0. */
1757
1758 static int
1759 detect_coding_emacs_mule (coding, detect_info)
1760 struct coding_system *coding;
1761 struct coding_detection_info *detect_info;
1762 {
1763 const unsigned char *src = coding->source, *src_base;
1764 const unsigned char *src_end = coding->source + coding->src_bytes;
1765 int multibytep = coding->src_multibyte;
1766 int consumed_chars = 0;
1767 int c;
1768 int found = 0;
1769
1770 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1771 /* A coding system of this category is always ASCII compatible. */
1772 src += coding->head_ascii;
1773
1774 while (1)
1775 {
1776 src_base = src;
1777 ONE_MORE_BYTE (c);
1778 if (c < 0)
1779 continue;
1780 if (c == 0x80)
1781 {
1782 /* Perhaps the start of composite character. We simple skip
1783 it because analyzing it is too heavy for detecting. But,
1784 at least, we check that the composite character
1785 constitues of more than 4 bytes. */
1786 const unsigned char *src_base;
1787
1788 repeat:
1789 src_base = src;
1790 do
1791 {
1792 ONE_MORE_BYTE (c);
1793 }
1794 while (c >= 0xA0);
1795
1796 if (src - src_base <= 4)
1797 break;
1798 found = CATEGORY_MASK_EMACS_MULE;
1799 if (c == 0x80)
1800 goto repeat;
1801 }
1802
1803 if (c < 0x80)
1804 {
1805 if (c < 0x20
1806 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1807 break;
1808 }
1809 else
1810 {
1811 const unsigned char *src_base = src - 1;
1812
1813 do
1814 {
1815 ONE_MORE_BYTE (c);
1816 }
1817 while (c >= 0xA0);
1818 if (src - src_base != emacs_mule_bytes[*src_base])
1819 break;
1820 found = CATEGORY_MASK_EMACS_MULE;
1821 }
1822 }
1823 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1824 return 0;
1825
1826 no_more_source:
1827 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1828 {
1829 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1830 return 0;
1831 }
1832 detect_info->found |= found;
1833 return 1;
1834 }
1835
1836
1837 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1838
1839 /* Decode a character represented as a component of composition
1840 sequence of Emacs 20/21 style at SRC. Set C to that character and
1841 update SRC to the head of next character (or an encoded composition
1842 rule). If SRC doesn't points a composition component, set C to -1.
1843 If SRC points an invalid byte sequence, global exit by a return
1844 value 0. */
1845
1846 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1847 if (1) \
1848 { \
1849 int c; \
1850 int nbytes, nchars; \
1851 \
1852 if (src == src_end) \
1853 break; \
1854 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1855 if (c < 0) \
1856 { \
1857 if (c == -2) \
1858 break; \
1859 goto invalid_code; \
1860 } \
1861 *buf++ = c; \
1862 src += nbytes; \
1863 consumed_chars += nchars; \
1864 } \
1865 else
1866
1867
1868 /* Decode a composition rule represented as a component of composition
1869 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1870 and increment BUF. If SRC points an invalid byte sequence, set C
1871 to -1. */
1872
1873 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1874 do { \
1875 int c, gref, nref; \
1876 \
1877 if (src >= src_end) \
1878 goto invalid_code; \
1879 ONE_MORE_BYTE_NO_CHECK (c); \
1880 c -= 0x20; \
1881 if (c < 0 || c >= 81) \
1882 goto invalid_code; \
1883 \
1884 gref = c / 9, nref = c % 9; \
1885 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1886 } while (0)
1887
1888
1889 /* Decode a composition rule represented as a component of composition
1890 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1891 and increment BUF. If SRC points an invalid byte sequence, set C
1892 to -1. */
1893
1894 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1895 do { \
1896 int gref, nref; \
1897 \
1898 if (src + 1>= src_end) \
1899 goto invalid_code; \
1900 ONE_MORE_BYTE_NO_CHECK (gref); \
1901 gref -= 0x20; \
1902 ONE_MORE_BYTE_NO_CHECK (nref); \
1903 nref -= 0x20; \
1904 if (gref < 0 || gref >= 81 \
1905 || nref < 0 || nref >= 81) \
1906 goto invalid_code; \
1907 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1908 } while (0)
1909
1910
1911 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1912 do { \
1913 /* Emacs 21 style format. The first three bytes at SRC are \
1914 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1915 the byte length of this composition information, CHARS is the \
1916 number of characters composed by this composition. */ \
1917 enum composition_method method = c - 0xF2; \
1918 int *charbuf_base = charbuf; \
1919 int from, to; \
1920 int consumed_chars_limit; \
1921 int nbytes, nchars; \
1922 \
1923 ONE_MORE_BYTE (c); \
1924 if (c < 0) \
1925 goto invalid_code; \
1926 nbytes = c - 0xA0; \
1927 if (nbytes < 3) \
1928 goto invalid_code; \
1929 ONE_MORE_BYTE (c); \
1930 if (c < 0) \
1931 goto invalid_code; \
1932 nchars = c - 0xA0; \
1933 from = coding->produced + char_offset; \
1934 to = from + nchars; \
1935 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1936 consumed_chars_limit = consumed_chars_base + nbytes; \
1937 if (method != COMPOSITION_RELATIVE) \
1938 { \
1939 int i = 0; \
1940 while (consumed_chars < consumed_chars_limit) \
1941 { \
1942 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1943 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1944 else \
1945 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1946 i++; \
1947 } \
1948 if (consumed_chars < consumed_chars_limit) \
1949 goto invalid_code; \
1950 charbuf_base[0] -= i; \
1951 } \
1952 } while (0)
1953
1954
1955 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1956 do { \
1957 /* Emacs 20 style format for relative composition. */ \
1958 /* Store multibyte form of characters to be composed. */ \
1959 enum composition_method method = COMPOSITION_RELATIVE; \
1960 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1961 int *buf = components; \
1962 int i, j; \
1963 int from, to; \
1964 \
1965 src = src_base; \
1966 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1967 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1968 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1969 if (i < 2) \
1970 goto invalid_code; \
1971 from = coding->produced_char + char_offset; \
1972 to = from + i; \
1973 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1974 for (j = 0; j < i; j++) \
1975 *charbuf++ = components[j]; \
1976 } while (0)
1977
1978
1979 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1980 do { \
1981 /* Emacs 20 style format for rule-base composition. */ \
1982 /* Store multibyte form of characters to be composed. */ \
1983 enum composition_method method = COMPOSITION_WITH_RULE; \
1984 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1985 int *buf = components; \
1986 int i, j; \
1987 int from, to; \
1988 \
1989 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1990 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1991 { \
1992 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1993 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1994 } \
1995 if (i < 1 || (buf - components) % 2 == 0) \
1996 goto invalid_code; \
1997 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1998 goto no_more_source; \
1999 from = coding->produced_char + char_offset; \
2000 to = from + i; \
2001 ADD_COMPOSITION_DATA (buf, from, to, method); \
2002 for (j = 0; j < i; j++) \
2003 *charbuf++ = components[j]; \
2004 for (j = 0; j < i; j += 2) \
2005 *charbuf++ = components[j]; \
2006 } while (0)
2007
2008
2009 static void
2010 decode_coding_emacs_mule (coding)
2011 struct coding_system *coding;
2012 {
2013 const unsigned char *src = coding->source + coding->consumed;
2014 const unsigned char *src_end = coding->source + coding->src_bytes;
2015 const unsigned char *src_base;
2016 int *charbuf = coding->charbuf;
2017 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2018 int consumed_chars = 0, consumed_chars_base;
2019 int multibytep = coding->src_multibyte;
2020 Lisp_Object attrs, charset_list;
2021 int char_offset = coding->produced_char;
2022 int last_offset = char_offset;
2023 int last_id = charset_ascii;
2024
2025 CODING_GET_INFO (coding, attrs, charset_list);
2026
2027 while (1)
2028 {
2029 int c;
2030
2031 src_base = src;
2032 consumed_chars_base = consumed_chars;
2033
2034 if (charbuf >= charbuf_end)
2035 break;
2036
2037 ONE_MORE_BYTE (c);
2038 if (c < 0)
2039 {
2040 *charbuf++ = -c;
2041 char_offset++;
2042 }
2043 else if (c < 0x80)
2044 {
2045 *charbuf++ = c;
2046 char_offset++;
2047 }
2048 else if (c == 0x80)
2049 {
2050 ONE_MORE_BYTE (c);
2051 if (c < 0)
2052 goto invalid_code;
2053 if (c - 0xF2 >= COMPOSITION_RELATIVE
2054 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2055 DECODE_EMACS_MULE_21_COMPOSITION (c);
2056 else if (c < 0xC0)
2057 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2058 else if (c == 0xFF)
2059 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2060 else
2061 goto invalid_code;
2062 }
2063 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2064 {
2065 int nbytes, nchars;
2066 int id;
2067
2068 src = src_base;
2069 consumed_chars = consumed_chars_base;
2070 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2071 if (c < 0)
2072 {
2073 if (c == -2)
2074 break;
2075 goto invalid_code;
2076 }
2077 if (last_id != id)
2078 {
2079 if (last_id != charset_ascii)
2080 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2081 last_id = id;
2082 last_offset = char_offset;
2083 }
2084 *charbuf++ = c;
2085 src += nbytes;
2086 consumed_chars += nchars;
2087 char_offset++;
2088 }
2089 continue;
2090
2091 invalid_code:
2092 src = src_base;
2093 consumed_chars = consumed_chars_base;
2094 ONE_MORE_BYTE (c);
2095 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2096 char_offset++;
2097 coding->errors++;
2098 }
2099
2100 no_more_source:
2101 if (last_id != charset_ascii)
2102 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2103 coding->consumed_char += consumed_chars_base;
2104 coding->consumed = src_base - coding->source;
2105 coding->charbuf_used = charbuf - coding->charbuf;
2106 }
2107
2108
2109 #define EMACS_MULE_LEADING_CODES(id, codes) \
2110 do { \
2111 if (id < 0xA0) \
2112 codes[0] = id, codes[1] = 0; \
2113 else if (id < 0xE0) \
2114 codes[0] = 0x9A, codes[1] = id; \
2115 else if (id < 0xF0) \
2116 codes[0] = 0x9B, codes[1] = id; \
2117 else if (id < 0xF5) \
2118 codes[0] = 0x9C, codes[1] = id; \
2119 else \
2120 codes[0] = 0x9D, codes[1] = id; \
2121 } while (0);
2122
2123
2124 static int
2125 encode_coding_emacs_mule (coding)
2126 struct coding_system *coding;
2127 {
2128 int multibytep = coding->dst_multibyte;
2129 int *charbuf = coding->charbuf;
2130 int *charbuf_end = charbuf + coding->charbuf_used;
2131 unsigned char *dst = coding->destination + coding->produced;
2132 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2133 int safe_room = 8;
2134 int produced_chars = 0;
2135 Lisp_Object attrs, charset_list;
2136 int c;
2137 int preferred_charset_id = -1;
2138
2139 CODING_GET_INFO (coding, attrs, charset_list);
2140 if (! EQ (charset_list, Vemacs_mule_charset_list))
2141 {
2142 CODING_ATTR_CHARSET_LIST (attrs)
2143 = charset_list = Vemacs_mule_charset_list;
2144 }
2145
2146 while (charbuf < charbuf_end)
2147 {
2148 ASSURE_DESTINATION (safe_room);
2149 c = *charbuf++;
2150
2151 if (c < 0)
2152 {
2153 /* Handle an annotation. */
2154 switch (*charbuf)
2155 {
2156 case CODING_ANNOTATE_COMPOSITION_MASK:
2157 /* Not yet implemented. */
2158 break;
2159 case CODING_ANNOTATE_CHARSET_MASK:
2160 preferred_charset_id = charbuf[3];
2161 if (preferred_charset_id >= 0
2162 && NILP (Fmemq (make_number (preferred_charset_id),
2163 charset_list)))
2164 preferred_charset_id = -1;
2165 break;
2166 default:
2167 abort ();
2168 }
2169 charbuf += -c - 1;
2170 continue;
2171 }
2172
2173 if (ASCII_CHAR_P (c))
2174 EMIT_ONE_ASCII_BYTE (c);
2175 else if (CHAR_BYTE8_P (c))
2176 {
2177 c = CHAR_TO_BYTE8 (c);
2178 EMIT_ONE_BYTE (c);
2179 }
2180 else
2181 {
2182 struct charset *charset;
2183 unsigned code;
2184 int dimension;
2185 int emacs_mule_id;
2186 unsigned char leading_codes[2];
2187
2188 if (preferred_charset_id >= 0)
2189 {
2190 charset = CHARSET_FROM_ID (preferred_charset_id);
2191 if (! CHAR_CHARSET_P (c, charset))
2192 charset = char_charset (c, charset_list, NULL);
2193 }
2194 else
2195 charset = char_charset (c, charset_list, &code);
2196 if (! charset)
2197 {
2198 c = coding->default_char;
2199 if (ASCII_CHAR_P (c))
2200 {
2201 EMIT_ONE_ASCII_BYTE (c);
2202 continue;
2203 }
2204 charset = char_charset (c, charset_list, &code);
2205 }
2206 dimension = CHARSET_DIMENSION (charset);
2207 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2208 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2209 EMIT_ONE_BYTE (leading_codes[0]);
2210 if (leading_codes[1])
2211 EMIT_ONE_BYTE (leading_codes[1]);
2212 if (dimension == 1)
2213 EMIT_ONE_BYTE (code | 0x80);
2214 else
2215 {
2216 code |= 0x8080;
2217 EMIT_ONE_BYTE (code >> 8);
2218 EMIT_ONE_BYTE (code & 0xFF);
2219 }
2220 }
2221 }
2222 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2223 coding->produced_char += produced_chars;
2224 coding->produced = dst - coding->destination;
2225 return 0;
2226 }
2227
2228 \f
2229 /*** 7. ISO2022 handlers ***/
2230
2231 /* The following note describes the coding system ISO2022 briefly.
2232 Since the intention of this note is to help understand the
2233 functions in this file, some parts are NOT ACCURATE or are OVERLY
2234 SIMPLIFIED. For thorough understanding, please refer to the
2235 original document of ISO2022. This is equivalent to the standard
2236 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2237
2238 ISO2022 provides many mechanisms to encode several character sets
2239 in 7-bit and 8-bit environments. For 7-bit environments, all text
2240 is encoded using bytes less than 128. This may make the encoded
2241 text a little bit longer, but the text passes more easily through
2242 several types of gateway, some of which strip off the MSB (Most
2243 Significant Bit).
2244
2245 There are two kinds of character sets: control character sets and
2246 graphic character sets. The former contain control characters such
2247 as `newline' and `escape' to provide control functions (control
2248 functions are also provided by escape sequences). The latter
2249 contain graphic characters such as 'A' and '-'. Emacs recognizes
2250 two control character sets and many graphic character sets.
2251
2252 Graphic character sets are classified into one of the following
2253 four classes, according to the number of bytes (DIMENSION) and
2254 number of characters in one dimension (CHARS) of the set:
2255 - DIMENSION1_CHARS94
2256 - DIMENSION1_CHARS96
2257 - DIMENSION2_CHARS94
2258 - DIMENSION2_CHARS96
2259
2260 In addition, each character set is assigned an identification tag,
2261 unique for each set, called the "final character" (denoted as <F>
2262 hereafter). The <F> of each character set is decided by ECMA(*)
2263 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2264 (0x30..0x3F are for private use only).
2265
2266 Note (*): ECMA = European Computer Manufacturers Association
2267
2268 Here are examples of graphic character sets [NAME(<F>)]:
2269 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2270 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2271 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2272 o DIMENSION2_CHARS96 -- none for the moment
2273
2274 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2275 C0 [0x00..0x1F] -- control character plane 0
2276 GL [0x20..0x7F] -- graphic character plane 0
2277 C1 [0x80..0x9F] -- control character plane 1
2278 GR [0xA0..0xFF] -- graphic character plane 1
2279
2280 A control character set is directly designated and invoked to C0 or
2281 C1 by an escape sequence. The most common case is that:
2282 - ISO646's control character set is designated/invoked to C0, and
2283 - ISO6429's control character set is designated/invoked to C1,
2284 and usually these designations/invocations are omitted in encoded
2285 text. In a 7-bit environment, only C0 can be used, and a control
2286 character for C1 is encoded by an appropriate escape sequence to
2287 fit into the environment. All control characters for C1 are
2288 defined to have corresponding escape sequences.
2289
2290 A graphic character set is at first designated to one of four
2291 graphic registers (G0 through G3), then these graphic registers are
2292 invoked to GL or GR. These designations and invocations can be
2293 done independently. The most common case is that G0 is invoked to
2294 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2295 these invocations and designations are omitted in encoded text.
2296 In a 7-bit environment, only GL can be used.
2297
2298 When a graphic character set of CHARS94 is invoked to GL, codes
2299 0x20 and 0x7F of the GL area work as control characters SPACE and
2300 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2301 be used.
2302
2303 There are two ways of invocation: locking-shift and single-shift.
2304 With locking-shift, the invocation lasts until the next different
2305 invocation, whereas with single-shift, the invocation affects the
2306 following character only and doesn't affect the locking-shift
2307 state. Invocations are done by the following control characters or
2308 escape sequences:
2309
2310 ----------------------------------------------------------------------
2311 abbrev function cntrl escape seq description
2312 ----------------------------------------------------------------------
2313 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2314 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2315 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2316 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2317 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2318 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2319 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2320 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2321 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2322 ----------------------------------------------------------------------
2323 (*) These are not used by any known coding system.
2324
2325 Control characters for these functions are defined by macros
2326 ISO_CODE_XXX in `coding.h'.
2327
2328 Designations are done by the following escape sequences:
2329 ----------------------------------------------------------------------
2330 escape sequence description
2331 ----------------------------------------------------------------------
2332 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2333 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2334 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2335 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2336 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2337 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2338 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2339 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2340 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2341 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2342 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2343 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2344 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2345 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2346 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2347 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2348 ----------------------------------------------------------------------
2349
2350 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2351 of dimension 1, chars 94, and final character <F>, etc...
2352
2353 Note (*): Although these designations are not allowed in ISO2022,
2354 Emacs accepts them on decoding, and produces them on encoding
2355 CHARS96 character sets in a coding system which is characterized as
2356 7-bit environment, non-locking-shift, and non-single-shift.
2357
2358 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2359 '(' must be omitted. We refer to this as "short-form" hereafter.
2360
2361 Now you may notice that there are a lot of ways of encoding the
2362 same multilingual text in ISO2022. Actually, there exist many
2363 coding systems such as Compound Text (used in X11's inter client
2364 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2365 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2366 localized platforms), and all of these are variants of ISO2022.
2367
2368 In addition to the above, Emacs handles two more kinds of escape
2369 sequences: ISO6429's direction specification and Emacs' private
2370 sequence for specifying character composition.
2371
2372 ISO6429's direction specification takes the following form:
2373 o CSI ']' -- end of the current direction
2374 o CSI '0' ']' -- end of the current direction
2375 o CSI '1' ']' -- start of left-to-right text
2376 o CSI '2' ']' -- start of right-to-left text
2377 The control character CSI (0x9B: control sequence introducer) is
2378 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2379
2380 Character composition specification takes the following form:
2381 o ESC '0' -- start relative composition
2382 o ESC '1' -- end composition
2383 o ESC '2' -- start rule-base composition (*)
2384 o ESC '3' -- start relative composition with alternate chars (**)
2385 o ESC '4' -- start rule-base composition with alternate chars (**)
2386 Since these are not standard escape sequences of any ISO standard,
2387 the use of them with these meanings is restricted to Emacs only.
2388
2389 (*) This form is used only in Emacs 20.7 and older versions,
2390 but newer versions can safely decode it.
2391 (**) This form is used only in Emacs 21.1 and newer versions,
2392 and older versions can't decode it.
2393
2394 Here's a list of example usages of these composition escape
2395 sequences (categorized by `enum composition_method').
2396
2397 COMPOSITION_RELATIVE:
2398 ESC 0 CHAR [ CHAR ] ESC 1
2399 COMPOSITION_WITH_RULE:
2400 ESC 2 CHAR [ RULE CHAR ] ESC 1
2401 COMPOSITION_WITH_ALTCHARS:
2402 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2403 COMPOSITION_WITH_RULE_ALTCHARS:
2404 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2405
2406 enum iso_code_class_type iso_code_class[256];
2407
2408 #define SAFE_CHARSET_P(coding, id) \
2409 ((id) <= (coding)->max_charset_id \
2410 && (coding)->safe_charsets[id] >= 0)
2411
2412
2413 #define SHIFT_OUT_OK(category) \
2414 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2415
2416 static void
2417 setup_iso_safe_charsets (attrs)
2418 Lisp_Object attrs;
2419 {
2420 Lisp_Object charset_list, safe_charsets;
2421 Lisp_Object request;
2422 Lisp_Object reg_usage;
2423 Lisp_Object tail;
2424 int reg94, reg96;
2425 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2426 int max_charset_id;
2427
2428 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2429 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2430 && ! EQ (charset_list, Viso_2022_charset_list))
2431 {
2432 CODING_ATTR_CHARSET_LIST (attrs)
2433 = charset_list = Viso_2022_charset_list;
2434 ASET (attrs, coding_attr_safe_charsets, Qnil);
2435 }
2436
2437 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2438 return;
2439
2440 max_charset_id = 0;
2441 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2442 {
2443 int id = XINT (XCAR (tail));
2444 if (max_charset_id < id)
2445 max_charset_id = id;
2446 }
2447
2448 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2449 make_number (255));
2450 request = AREF (attrs, coding_attr_iso_request);
2451 reg_usage = AREF (attrs, coding_attr_iso_usage);
2452 reg94 = XINT (XCAR (reg_usage));
2453 reg96 = XINT (XCDR (reg_usage));
2454
2455 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2456 {
2457 Lisp_Object id;
2458 Lisp_Object reg;
2459 struct charset *charset;
2460
2461 id = XCAR (tail);
2462 charset = CHARSET_FROM_ID (XINT (id));
2463 reg = Fcdr (Fassq (id, request));
2464 if (! NILP (reg))
2465 SSET (safe_charsets, XINT (id), XINT (reg));
2466 else if (charset->iso_chars_96)
2467 {
2468 if (reg96 < 4)
2469 SSET (safe_charsets, XINT (id), reg96);
2470 }
2471 else
2472 {
2473 if (reg94 < 4)
2474 SSET (safe_charsets, XINT (id), reg94);
2475 }
2476 }
2477 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2478 }
2479
2480
2481 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2482 Check if a text is encoded in one of ISO-2022 based codig systems.
2483 If it is, return 1, else return 0. */
2484
2485 static int
2486 detect_coding_iso_2022 (coding, detect_info)
2487 struct coding_system *coding;
2488 struct coding_detection_info *detect_info;
2489 {
2490 const unsigned char *src = coding->source, *src_base = src;
2491 const unsigned char *src_end = coding->source + coding->src_bytes;
2492 int multibytep = coding->src_multibyte;
2493 int single_shifting = 0;
2494 int id;
2495 int c, c1;
2496 int consumed_chars = 0;
2497 int i;
2498 int rejected = 0;
2499 int found = 0;
2500
2501 detect_info->checked |= CATEGORY_MASK_ISO;
2502
2503 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2504 {
2505 struct coding_system *this = &(coding_categories[i]);
2506 Lisp_Object attrs, val;
2507
2508 attrs = CODING_ID_ATTRS (this->id);
2509 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2510 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2511 setup_iso_safe_charsets (attrs);
2512 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2513 this->max_charset_id = SCHARS (val) - 1;
2514 this->safe_charsets = (char *) SDATA (val);
2515 }
2516
2517 /* A coding system of this category is always ASCII compatible. */
2518 src += coding->head_ascii;
2519
2520 while (rejected != CATEGORY_MASK_ISO)
2521 {
2522 src_base = src;
2523 ONE_MORE_BYTE (c);
2524 switch (c)
2525 {
2526 case ISO_CODE_ESC:
2527 if (inhibit_iso_escape_detection)
2528 break;
2529 single_shifting = 0;
2530 ONE_MORE_BYTE (c);
2531 if (c >= '(' && c <= '/')
2532 {
2533 /* Designation sequence for a charset of dimension 1. */
2534 ONE_MORE_BYTE (c1);
2535 if (c1 < ' ' || c1 >= 0x80
2536 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2537 /* Invalid designation sequence. Just ignore. */
2538 break;
2539 }
2540 else if (c == '$')
2541 {
2542 /* Designation sequence for a charset of dimension 2. */
2543 ONE_MORE_BYTE (c);
2544 if (c >= '@' && c <= 'B')
2545 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2546 id = iso_charset_table[1][0][c];
2547 else if (c >= '(' && c <= '/')
2548 {
2549 ONE_MORE_BYTE (c1);
2550 if (c1 < ' ' || c1 >= 0x80
2551 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2552 /* Invalid designation sequence. Just ignore. */
2553 break;
2554 }
2555 else
2556 /* Invalid designation sequence. Just ignore it. */
2557 break;
2558 }
2559 else if (c == 'N' || c == 'O')
2560 {
2561 /* ESC <Fe> for SS2 or SS3. */
2562 single_shifting = 1;
2563 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2564 break;
2565 }
2566 else if (c >= '0' && c <= '4')
2567 {
2568 /* ESC <Fp> for start/end composition. */
2569 found |= CATEGORY_MASK_ISO;
2570 break;
2571 }
2572 else
2573 {
2574 /* Invalid escape sequence. Just ignore it. */
2575 break;
2576 }
2577
2578 /* We found a valid designation sequence for CHARSET. */
2579 rejected |= CATEGORY_MASK_ISO_8BIT;
2580 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2581 id))
2582 found |= CATEGORY_MASK_ISO_7;
2583 else
2584 rejected |= CATEGORY_MASK_ISO_7;
2585 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2586 id))
2587 found |= CATEGORY_MASK_ISO_7_TIGHT;
2588 else
2589 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2590 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2591 id))
2592 found |= CATEGORY_MASK_ISO_7_ELSE;
2593 else
2594 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2595 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2596 id))
2597 found |= CATEGORY_MASK_ISO_8_ELSE;
2598 else
2599 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2600 break;
2601
2602 case ISO_CODE_SO:
2603 case ISO_CODE_SI:
2604 /* Locking shift out/in. */
2605 if (inhibit_iso_escape_detection)
2606 break;
2607 single_shifting = 0;
2608 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2609 found |= CATEGORY_MASK_ISO_ELSE;
2610 break;
2611
2612 case ISO_CODE_CSI:
2613 /* Control sequence introducer. */
2614 single_shifting = 0;
2615 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2616 found |= CATEGORY_MASK_ISO_8_ELSE;
2617 goto check_extra_latin;
2618
2619 case ISO_CODE_SS2:
2620 case ISO_CODE_SS3:
2621 /* Single shift. */
2622 if (inhibit_iso_escape_detection)
2623 break;
2624 single_shifting = 0;
2625 rejected |= CATEGORY_MASK_ISO_7BIT;
2626 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2627 & CODING_ISO_FLAG_SINGLE_SHIFT)
2628 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2629 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2630 & CODING_ISO_FLAG_SINGLE_SHIFT)
2631 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2632 if (single_shifting)
2633 break;
2634 goto check_extra_latin;
2635
2636 default:
2637 if (c < 0)
2638 continue;
2639 if (c < 0x80)
2640 {
2641 single_shifting = 0;
2642 break;
2643 }
2644 if (c >= 0xA0)
2645 {
2646 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2647 found |= CATEGORY_MASK_ISO_8_1;
2648 /* Check the length of succeeding codes of the range
2649 0xA0..0FF. If the byte length is even, we include
2650 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2651 only when we are not single shifting. */
2652 if (! single_shifting
2653 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2654 {
2655 int i = 1;
2656 while (src < src_end)
2657 {
2658 ONE_MORE_BYTE (c);
2659 if (c < 0xA0)
2660 break;
2661 i++;
2662 }
2663
2664 if (i & 1 && src < src_end)
2665 rejected |= CATEGORY_MASK_ISO_8_2;
2666 else
2667 found |= CATEGORY_MASK_ISO_8_2;
2668 }
2669 break;
2670 }
2671 check_extra_latin:
2672 single_shifting = 0;
2673 if (! VECTORP (Vlatin_extra_code_table)
2674 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2675 {
2676 rejected = CATEGORY_MASK_ISO;
2677 break;
2678 }
2679 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2680 & CODING_ISO_FLAG_LATIN_EXTRA)
2681 found |= CATEGORY_MASK_ISO_8_1;
2682 else
2683 rejected |= CATEGORY_MASK_ISO_8_1;
2684 rejected |= CATEGORY_MASK_ISO_8_2;
2685 }
2686 }
2687 detect_info->rejected |= CATEGORY_MASK_ISO;
2688 return 0;
2689
2690 no_more_source:
2691 detect_info->rejected |= rejected;
2692 detect_info->found |= (found & ~rejected);
2693 return 1;
2694 }
2695
2696
2697 /* Set designation state into CODING. */
2698 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2699 do { \
2700 int id, prev; \
2701 \
2702 if (final < '0' || final >= 128 \
2703 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2704 || !SAFE_CHARSET_P (coding, id)) \
2705 { \
2706 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2707 goto invalid_code; \
2708 } \
2709 prev = CODING_ISO_DESIGNATION (coding, reg); \
2710 if (id == charset_jisx0201_roman) \
2711 { \
2712 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2713 id = charset_ascii; \
2714 } \
2715 else if (id == charset_jisx0208_1978) \
2716 { \
2717 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2718 id = charset_jisx0208; \
2719 } \
2720 CODING_ISO_DESIGNATION (coding, reg) = id; \
2721 /* If there was an invalid designation to REG previously, and this \
2722 designation is ASCII to REG, we should keep this designation \
2723 sequence. */ \
2724 if (prev == -2 && id == charset_ascii) \
2725 goto invalid_code; \
2726 } while (0)
2727
2728
2729 #define MAYBE_FINISH_COMPOSITION() \
2730 do { \
2731 int i; \
2732 if (composition_state == COMPOSING_NO) \
2733 break; \
2734 /* It is assured that we have enough room for producing \
2735 characters stored in the table `components'. */ \
2736 if (charbuf + component_idx > charbuf_end) \
2737 goto no_more_source; \
2738 composition_state = COMPOSING_NO; \
2739 if (method == COMPOSITION_RELATIVE \
2740 || method == COMPOSITION_WITH_ALTCHARS) \
2741 { \
2742 for (i = 0; i < component_idx; i++) \
2743 *charbuf++ = components[i]; \
2744 char_offset += component_idx; \
2745 } \
2746 else \
2747 { \
2748 for (i = 0; i < component_idx; i += 2) \
2749 *charbuf++ = components[i]; \
2750 char_offset += (component_idx / 2) + 1; \
2751 } \
2752 } while (0)
2753
2754
2755 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2756 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2757 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2758 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2759 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2760 */
2761
2762 #define DECODE_COMPOSITION_START(c1) \
2763 do { \
2764 if (c1 == '0' \
2765 && composition_state == COMPOSING_COMPONENT_RULE) \
2766 { \
2767 component_len = component_idx; \
2768 composition_state = COMPOSING_CHAR; \
2769 } \
2770 else \
2771 { \
2772 const unsigned char *p; \
2773 \
2774 MAYBE_FINISH_COMPOSITION (); \
2775 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2776 goto no_more_source; \
2777 for (p = src; p < src_end - 1; p++) \
2778 if (*p == ISO_CODE_ESC && p[1] == '1') \
2779 break; \
2780 if (p == src_end - 1) \
2781 { \
2782 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2783 goto invalid_code; \
2784 goto no_more_source; \
2785 } \
2786 \
2787 /* This is surely the start of a composition. */ \
2788 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2789 : c1 == '2' ? COMPOSITION_WITH_RULE \
2790 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2791 : COMPOSITION_WITH_RULE_ALTCHARS); \
2792 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2793 : COMPOSING_COMPONENT_CHAR); \
2794 component_idx = component_len = 0; \
2795 } \
2796 } while (0)
2797
2798
2799 /* Handle compositoin end sequence ESC 1. */
2800
2801 #define DECODE_COMPOSITION_END() \
2802 do { \
2803 int nchars = (component_len > 0 ? component_idx - component_len \
2804 : method == COMPOSITION_RELATIVE ? component_idx \
2805 : (component_idx + 1) / 2); \
2806 int i; \
2807 int *saved_charbuf = charbuf; \
2808 int from = char_offset; \
2809 int to = from + nchars; \
2810 \
2811 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2812 if (method != COMPOSITION_RELATIVE) \
2813 { \
2814 if (component_len == 0) \
2815 for (i = 0; i < component_idx; i++) \
2816 *charbuf++ = components[i]; \
2817 else \
2818 for (i = 0; i < component_len; i++) \
2819 *charbuf++ = components[i]; \
2820 *saved_charbuf = saved_charbuf - charbuf; \
2821 } \
2822 if (method == COMPOSITION_WITH_RULE) \
2823 for (i = 0; i < component_idx; i += 2, char_offset++) \
2824 *charbuf++ = components[i]; \
2825 else \
2826 for (i = component_len; i < component_idx; i++, char_offset++) \
2827 *charbuf++ = components[i]; \
2828 coding->annotated = 1; \
2829 composition_state = COMPOSING_NO; \
2830 } while (0)
2831
2832
2833 /* Decode a composition rule from the byte C1 (and maybe one more byte
2834 from SRC) and store one encoded composition rule in
2835 coding->cmp_data. */
2836
2837 #define DECODE_COMPOSITION_RULE(c1) \
2838 do { \
2839 (c1) -= 32; \
2840 if (c1 < 81) /* old format (before ver.21) */ \
2841 { \
2842 int gref = (c1) / 9; \
2843 int nref = (c1) % 9; \
2844 if (gref == 4) gref = 10; \
2845 if (nref == 4) nref = 10; \
2846 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2847 } \
2848 else if (c1 < 93) /* new format (after ver.21) */ \
2849 { \
2850 ONE_MORE_BYTE (c2); \
2851 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2852 } \
2853 else \
2854 c1 = 0; \
2855 } while (0)
2856
2857
2858 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2859
2860 static void
2861 decode_coding_iso_2022 (coding)
2862 struct coding_system *coding;
2863 {
2864 const unsigned char *src = coding->source + coding->consumed;
2865 const unsigned char *src_end = coding->source + coding->src_bytes;
2866 const unsigned char *src_base;
2867 int *charbuf = coding->charbuf;
2868 int *charbuf_end
2869 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2870 int consumed_chars = 0, consumed_chars_base;
2871 int multibytep = coding->src_multibyte;
2872 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2873 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2874 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2875 struct charset *charset;
2876 int c;
2877 /* For handling composition sequence. */
2878 #define COMPOSING_NO 0
2879 #define COMPOSING_CHAR 1
2880 #define COMPOSING_RULE 2
2881 #define COMPOSING_COMPONENT_CHAR 3
2882 #define COMPOSING_COMPONENT_RULE 4
2883
2884 int composition_state = COMPOSING_NO;
2885 enum composition_method method;
2886 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2887 int component_idx;
2888 int component_len;
2889 Lisp_Object attrs, charset_list;
2890 int char_offset = coding->produced_char;
2891 int last_offset = char_offset;
2892 int last_id = charset_ascii;
2893
2894 CODING_GET_INFO (coding, attrs, charset_list);
2895 setup_iso_safe_charsets (attrs);
2896
2897 while (1)
2898 {
2899 int c1, c2;
2900
2901 src_base = src;
2902 consumed_chars_base = consumed_chars;
2903
2904 if (charbuf >= charbuf_end)
2905 break;
2906
2907 ONE_MORE_BYTE (c1);
2908 if (c1 < 0)
2909 goto invalid_code;
2910
2911 /* We produce at most one character. */
2912 switch (iso_code_class [c1])
2913 {
2914 case ISO_0x20_or_0x7F:
2915 if (composition_state != COMPOSING_NO)
2916 {
2917 if (composition_state == COMPOSING_RULE
2918 || composition_state == COMPOSING_COMPONENT_RULE)
2919 {
2920 DECODE_COMPOSITION_RULE (c1);
2921 components[component_idx++] = c1;
2922 composition_state--;
2923 continue;
2924 }
2925 }
2926 if (charset_id_0 < 0
2927 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2928 /* This is SPACE or DEL. */
2929 charset = CHARSET_FROM_ID (charset_ascii);
2930 else
2931 charset = CHARSET_FROM_ID (charset_id_0);
2932 break;
2933
2934 case ISO_graphic_plane_0:
2935 if (composition_state != COMPOSING_NO)
2936 {
2937 if (composition_state == COMPOSING_RULE
2938 || composition_state == COMPOSING_COMPONENT_RULE)
2939 {
2940 DECODE_COMPOSITION_RULE (c1);
2941 components[component_idx++] = c1;
2942 composition_state--;
2943 continue;
2944 }
2945 }
2946 charset = CHARSET_FROM_ID (charset_id_0);
2947 break;
2948
2949 case ISO_0xA0_or_0xFF:
2950 if (charset_id_1 < 0
2951 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2952 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2953 goto invalid_code;
2954 /* This is a graphic character, we fall down ... */
2955
2956 case ISO_graphic_plane_1:
2957 if (charset_id_1 < 0)
2958 goto invalid_code;
2959 charset = CHARSET_FROM_ID (charset_id_1);
2960 break;
2961
2962 case ISO_control_0:
2963 MAYBE_FINISH_COMPOSITION ();
2964 charset = CHARSET_FROM_ID (charset_ascii);
2965 break;
2966
2967 case ISO_control_1:
2968 MAYBE_FINISH_COMPOSITION ();
2969 goto invalid_code;
2970
2971 case ISO_shift_out:
2972 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2973 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2974 goto invalid_code;
2975 CODING_ISO_INVOCATION (coding, 0) = 1;
2976 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2977 continue;
2978
2979 case ISO_shift_in:
2980 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2981 goto invalid_code;
2982 CODING_ISO_INVOCATION (coding, 0) = 0;
2983 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2984 continue;
2985
2986 case ISO_single_shift_2_7:
2987 case ISO_single_shift_2:
2988 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2989 goto invalid_code;
2990 /* SS2 is handled as an escape sequence of ESC 'N' */
2991 c1 = 'N';
2992 goto label_escape_sequence;
2993
2994 case ISO_single_shift_3:
2995 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2996 goto invalid_code;
2997 /* SS2 is handled as an escape sequence of ESC 'O' */
2998 c1 = 'O';
2999 goto label_escape_sequence;
3000
3001 case ISO_control_sequence_introducer:
3002 /* CSI is handled as an escape sequence of ESC '[' ... */
3003 c1 = '[';
3004 goto label_escape_sequence;
3005
3006 case ISO_escape:
3007 ONE_MORE_BYTE (c1);
3008 label_escape_sequence:
3009 /* Escape sequences handled here are invocation,
3010 designation, direction specification, and character
3011 composition specification. */
3012 switch (c1)
3013 {
3014 case '&': /* revision of following character set */
3015 ONE_MORE_BYTE (c1);
3016 if (!(c1 >= '@' && c1 <= '~'))
3017 goto invalid_code;
3018 ONE_MORE_BYTE (c1);
3019 if (c1 != ISO_CODE_ESC)
3020 goto invalid_code;
3021 ONE_MORE_BYTE (c1);
3022 goto label_escape_sequence;
3023
3024 case '$': /* designation of 2-byte character set */
3025 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3026 goto invalid_code;
3027 ONE_MORE_BYTE (c1);
3028 if (c1 >= '@' && c1 <= 'B')
3029 { /* designation of JISX0208.1978, GB2312.1980,
3030 or JISX0208.1980 */
3031 DECODE_DESIGNATION (0, 2, 0, c1);
3032 }
3033 else if (c1 >= 0x28 && c1 <= 0x2B)
3034 { /* designation of DIMENSION2_CHARS94 character set */
3035 ONE_MORE_BYTE (c2);
3036 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3037 }
3038 else if (c1 >= 0x2C && c1 <= 0x2F)
3039 { /* designation of DIMENSION2_CHARS96 character set */
3040 ONE_MORE_BYTE (c2);
3041 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3042 }
3043 else
3044 goto invalid_code;
3045 /* We must update these variables now. */
3046 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3047 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3048 continue;
3049
3050 case 'n': /* invocation of locking-shift-2 */
3051 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3052 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3053 goto invalid_code;
3054 CODING_ISO_INVOCATION (coding, 0) = 2;
3055 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3056 continue;
3057
3058 case 'o': /* invocation of locking-shift-3 */
3059 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3060 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3061 goto invalid_code;
3062 CODING_ISO_INVOCATION (coding, 0) = 3;
3063 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3064 continue;
3065
3066 case 'N': /* invocation of single-shift-2 */
3067 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3068 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3069 goto invalid_code;
3070 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3071 ONE_MORE_BYTE (c1);
3072 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3073 goto invalid_code;
3074 break;
3075
3076 case 'O': /* invocation of single-shift-3 */
3077 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3078 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3079 goto invalid_code;
3080 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3081 ONE_MORE_BYTE (c1);
3082 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3083 goto invalid_code;
3084 break;
3085
3086 case '0': case '2': case '3': case '4': /* start composition */
3087 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3088 goto invalid_code;
3089 DECODE_COMPOSITION_START (c1);
3090 continue;
3091
3092 case '1': /* end composition */
3093 if (composition_state == COMPOSING_NO)
3094 goto invalid_code;
3095 DECODE_COMPOSITION_END ();
3096 continue;
3097
3098 case '[': /* specification of direction */
3099 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3100 goto invalid_code;
3101 /* For the moment, nested direction is not supported.
3102 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3103 left-to-right, and nozero means right-to-left. */
3104 ONE_MORE_BYTE (c1);
3105 switch (c1)
3106 {
3107 case ']': /* end of the current direction */
3108 coding->mode &= ~CODING_MODE_DIRECTION;
3109
3110 case '0': /* end of the current direction */
3111 case '1': /* start of left-to-right direction */
3112 ONE_MORE_BYTE (c1);
3113 if (c1 == ']')
3114 coding->mode &= ~CODING_MODE_DIRECTION;
3115 else
3116 goto invalid_code;
3117 break;
3118
3119 case '2': /* start of right-to-left direction */
3120 ONE_MORE_BYTE (c1);
3121 if (c1 == ']')
3122 coding->mode |= CODING_MODE_DIRECTION;
3123 else
3124 goto invalid_code;
3125 break;
3126
3127 default:
3128 goto invalid_code;
3129 }
3130 continue;
3131
3132 case '%':
3133 ONE_MORE_BYTE (c1);
3134 if (c1 == '/')
3135 {
3136 /* CTEXT extended segment:
3137 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3138 We keep these bytes as is for the moment.
3139 They may be decoded by post-read-conversion. */
3140 int dim, M, L;
3141 int size;
3142
3143 ONE_MORE_BYTE (dim);
3144 ONE_MORE_BYTE (M);
3145 ONE_MORE_BYTE (L);
3146 size = ((M - 128) * 128) + (L - 128);
3147 if (charbuf + 8 + size > charbuf_end)
3148 goto break_loop;
3149 *charbuf++ = ISO_CODE_ESC;
3150 *charbuf++ = '%';
3151 *charbuf++ = '/';
3152 *charbuf++ = dim;
3153 *charbuf++ = BYTE8_TO_CHAR (M);
3154 *charbuf++ = BYTE8_TO_CHAR (L);
3155 while (size-- > 0)
3156 {
3157 ONE_MORE_BYTE (c1);
3158 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3159 }
3160 }
3161 else if (c1 == 'G')
3162 {
3163 /* XFree86 extension for embedding UTF-8 in CTEXT:
3164 ESC % G --UTF-8-BYTES-- ESC % @
3165 We keep these bytes as is for the moment.
3166 They may be decoded by post-read-conversion. */
3167 int *p = charbuf;
3168
3169 if (p + 6 > charbuf_end)
3170 goto break_loop;
3171 *p++ = ISO_CODE_ESC;
3172 *p++ = '%';
3173 *p++ = 'G';
3174 while (p < charbuf_end)
3175 {
3176 ONE_MORE_BYTE (c1);
3177 if (c1 == ISO_CODE_ESC
3178 && src + 1 < src_end
3179 && src[0] == '%'
3180 && src[1] == '@')
3181 break;
3182 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3183 }
3184 if (p + 3 > charbuf_end)
3185 goto break_loop;
3186 *p++ = ISO_CODE_ESC;
3187 *p++ = '%';
3188 *p++ = '@';
3189 charbuf = p;
3190 }
3191 else
3192 goto invalid_code;
3193 continue;
3194 break;
3195
3196 default:
3197 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3198 goto invalid_code;
3199 if (c1 >= 0x28 && c1 <= 0x2B)
3200 { /* designation of DIMENSION1_CHARS94 character set */
3201 ONE_MORE_BYTE (c2);
3202 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3203 }
3204 else if (c1 >= 0x2C && c1 <= 0x2F)
3205 { /* designation of DIMENSION1_CHARS96 character set */
3206 ONE_MORE_BYTE (c2);
3207 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3208 }
3209 else
3210 goto invalid_code;
3211 /* We must update these variables now. */
3212 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3213 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3214 continue;
3215 }
3216 }
3217
3218 if (charset->id != charset_ascii
3219 && last_id != charset->id)
3220 {
3221 if (last_id != charset_ascii)
3222 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3223 last_id = charset->id;
3224 last_offset = char_offset;
3225 }
3226
3227 /* Now we know CHARSET and 1st position code C1 of a character.
3228 Produce a decoded character while getting 2nd position code
3229 C2 if necessary. */
3230 c1 &= 0x7F;
3231 if (CHARSET_DIMENSION (charset) > 1)
3232 {
3233 ONE_MORE_BYTE (c2);
3234 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3235 /* C2 is not in a valid range. */
3236 goto invalid_code;
3237 c1 = (c1 << 8) | (c2 & 0x7F);
3238 if (CHARSET_DIMENSION (charset) > 2)
3239 {
3240 ONE_MORE_BYTE (c2);
3241 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3242 /* C2 is not in a valid range. */
3243 goto invalid_code;
3244 c1 = (c1 << 8) | (c2 & 0x7F);
3245 }
3246 }
3247
3248 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3249 if (c < 0)
3250 {
3251 MAYBE_FINISH_COMPOSITION ();
3252 for (; src_base < src; src_base++, char_offset++)
3253 {
3254 if (ASCII_BYTE_P (*src_base))
3255 *charbuf++ = *src_base;
3256 else
3257 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3258 }
3259 }
3260 else if (composition_state == COMPOSING_NO)
3261 {
3262 *charbuf++ = c;
3263 char_offset++;
3264 }
3265 else
3266 {
3267 components[component_idx++] = c;
3268 if (method == COMPOSITION_WITH_RULE
3269 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3270 && composition_state == COMPOSING_COMPONENT_CHAR))
3271 composition_state++;
3272 }
3273 continue;
3274
3275 invalid_code:
3276 MAYBE_FINISH_COMPOSITION ();
3277 src = src_base;
3278 consumed_chars = consumed_chars_base;
3279 ONE_MORE_BYTE (c);
3280 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3281 char_offset++;
3282 coding->errors++;
3283 continue;
3284
3285 break_loop:
3286 break;
3287 }
3288
3289 no_more_source:
3290 if (last_id != charset_ascii)
3291 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3292 coding->consumed_char += consumed_chars_base;
3293 coding->consumed = src_base - coding->source;
3294 coding->charbuf_used = charbuf - coding->charbuf;
3295 }
3296
3297
3298 /* ISO2022 encoding stuff. */
3299
3300 /*
3301 It is not enough to say just "ISO2022" on encoding, we have to
3302 specify more details. In Emacs, each coding system of ISO2022
3303 variant has the following specifications:
3304 1. Initial designation to G0 thru G3.
3305 2. Allows short-form designation?
3306 3. ASCII should be designated to G0 before control characters?
3307 4. ASCII should be designated to G0 at end of line?
3308 5. 7-bit environment or 8-bit environment?
3309 6. Use locking-shift?
3310 7. Use Single-shift?
3311 And the following two are only for Japanese:
3312 8. Use ASCII in place of JIS0201-1976-Roman?
3313 9. Use JISX0208-1983 in place of JISX0208-1978?
3314 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3315 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3316 details.
3317 */
3318
3319 /* Produce codes (escape sequence) for designating CHARSET to graphic
3320 register REG at DST, and increment DST. If <final-char> of CHARSET is
3321 '@', 'A', or 'B' and the coding system CODING allows, produce
3322 designation sequence of short-form. */
3323
3324 #define ENCODE_DESIGNATION(charset, reg, coding) \
3325 do { \
3326 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3327 char *intermediate_char_94 = "()*+"; \
3328 char *intermediate_char_96 = ",-./"; \
3329 int revision = -1; \
3330 int c; \
3331 \
3332 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3333 revision = CHARSET_ISO_REVISION (charset); \
3334 \
3335 if (revision >= 0) \
3336 { \
3337 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3338 EMIT_ONE_BYTE ('@' + revision); \
3339 } \
3340 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3341 if (CHARSET_DIMENSION (charset) == 1) \
3342 { \
3343 if (! CHARSET_ISO_CHARS_96 (charset)) \
3344 c = intermediate_char_94[reg]; \
3345 else \
3346 c = intermediate_char_96[reg]; \
3347 EMIT_ONE_ASCII_BYTE (c); \
3348 } \
3349 else \
3350 { \
3351 EMIT_ONE_ASCII_BYTE ('$'); \
3352 if (! CHARSET_ISO_CHARS_96 (charset)) \
3353 { \
3354 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3355 || reg != 0 \
3356 || final_char < '@' || final_char > 'B') \
3357 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3358 } \
3359 else \
3360 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3361 } \
3362 EMIT_ONE_ASCII_BYTE (final_char); \
3363 \
3364 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3365 } while (0)
3366
3367
3368 /* The following two macros produce codes (control character or escape
3369 sequence) for ISO2022 single-shift functions (single-shift-2 and
3370 single-shift-3). */
3371
3372 #define ENCODE_SINGLE_SHIFT_2 \
3373 do { \
3374 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3375 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3376 else \
3377 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3378 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3379 } while (0)
3380
3381
3382 #define ENCODE_SINGLE_SHIFT_3 \
3383 do { \
3384 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3385 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3386 else \
3387 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3388 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3389 } while (0)
3390
3391
3392 /* The following four macros produce codes (control character or
3393 escape sequence) for ISO2022 locking-shift functions (shift-in,
3394 shift-out, locking-shift-2, and locking-shift-3). */
3395
3396 #define ENCODE_SHIFT_IN \
3397 do { \
3398 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3399 CODING_ISO_INVOCATION (coding, 0) = 0; \
3400 } while (0)
3401
3402
3403 #define ENCODE_SHIFT_OUT \
3404 do { \
3405 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3406 CODING_ISO_INVOCATION (coding, 0) = 1; \
3407 } while (0)
3408
3409
3410 #define ENCODE_LOCKING_SHIFT_2 \
3411 do { \
3412 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3413 CODING_ISO_INVOCATION (coding, 0) = 2; \
3414 } while (0)
3415
3416
3417 #define ENCODE_LOCKING_SHIFT_3 \
3418 do { \
3419 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3420 CODING_ISO_INVOCATION (coding, 0) = 3; \
3421 } while (0)
3422
3423
3424 /* Produce codes for a DIMENSION1 character whose character set is
3425 CHARSET and whose position-code is C1. Designation and invocation
3426 sequences are also produced in advance if necessary. */
3427
3428 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3429 do { \
3430 int id = CHARSET_ID (charset); \
3431 \
3432 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3433 && id == charset_ascii) \
3434 { \
3435 id = charset_jisx0201_roman; \
3436 charset = CHARSET_FROM_ID (id); \
3437 } \
3438 \
3439 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3440 { \
3441 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3442 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3443 else \
3444 EMIT_ONE_BYTE (c1 | 0x80); \
3445 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3446 break; \
3447 } \
3448 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3449 { \
3450 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3451 break; \
3452 } \
3453 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3454 { \
3455 EMIT_ONE_BYTE (c1 | 0x80); \
3456 break; \
3457 } \
3458 else \
3459 /* Since CHARSET is not yet invoked to any graphic planes, we \
3460 must invoke it, or, at first, designate it to some graphic \
3461 register. Then repeat the loop to actually produce the \
3462 character. */ \
3463 dst = encode_invocation_designation (charset, coding, dst, \
3464 &produced_chars); \
3465 } while (1)
3466
3467
3468 /* Produce codes for a DIMENSION2 character whose character set is
3469 CHARSET and whose position-codes are C1 and C2. Designation and
3470 invocation codes are also produced in advance if necessary. */
3471
3472 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3473 do { \
3474 int id = CHARSET_ID (charset); \
3475 \
3476 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3477 && id == charset_jisx0208) \
3478 { \
3479 id = charset_jisx0208_1978; \
3480 charset = CHARSET_FROM_ID (id); \
3481 } \
3482 \
3483 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3484 { \
3485 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3486 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3487 else \
3488 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3489 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3490 break; \
3491 } \
3492 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3493 { \
3494 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3495 break; \
3496 } \
3497 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3498 { \
3499 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3500 break; \
3501 } \
3502 else \
3503 /* Since CHARSET is not yet invoked to any graphic planes, we \
3504 must invoke it, or, at first, designate it to some graphic \
3505 register. Then repeat the loop to actually produce the \
3506 character. */ \
3507 dst = encode_invocation_designation (charset, coding, dst, \
3508 &produced_chars); \
3509 } while (1)
3510
3511
3512 #define ENCODE_ISO_CHARACTER(charset, c) \
3513 do { \
3514 int code = ENCODE_CHAR ((charset),(c)); \
3515 \
3516 if (CHARSET_DIMENSION (charset) == 1) \
3517 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3518 else \
3519 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3520 } while (0)
3521
3522
3523 /* Produce designation and invocation codes at a place pointed by DST
3524 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3525 Return new DST. */
3526
3527 unsigned char *
3528 encode_invocation_designation (charset, coding, dst, p_nchars)
3529 struct charset *charset;
3530 struct coding_system *coding;
3531 unsigned char *dst;
3532 int *p_nchars;
3533 {
3534 int multibytep = coding->dst_multibyte;
3535 int produced_chars = *p_nchars;
3536 int reg; /* graphic register number */
3537 int id = CHARSET_ID (charset);
3538
3539 /* At first, check designations. */
3540 for (reg = 0; reg < 4; reg++)
3541 if (id == CODING_ISO_DESIGNATION (coding, reg))
3542 break;
3543
3544 if (reg >= 4)
3545 {
3546 /* CHARSET is not yet designated to any graphic registers. */
3547 /* At first check the requested designation. */
3548 reg = CODING_ISO_REQUEST (coding, id);
3549 if (reg < 0)
3550 /* Since CHARSET requests no special designation, designate it
3551 to graphic register 0. */
3552 reg = 0;
3553
3554 ENCODE_DESIGNATION (charset, reg, coding);
3555 }
3556
3557 if (CODING_ISO_INVOCATION (coding, 0) != reg
3558 && CODING_ISO_INVOCATION (coding, 1) != reg)
3559 {
3560 /* Since the graphic register REG is not invoked to any graphic
3561 planes, invoke it to graphic plane 0. */
3562 switch (reg)
3563 {
3564 case 0: /* graphic register 0 */
3565 ENCODE_SHIFT_IN;
3566 break;
3567
3568 case 1: /* graphic register 1 */
3569 ENCODE_SHIFT_OUT;
3570 break;
3571
3572 case 2: /* graphic register 2 */
3573 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3574 ENCODE_SINGLE_SHIFT_2;
3575 else
3576 ENCODE_LOCKING_SHIFT_2;
3577 break;
3578
3579 case 3: /* graphic register 3 */
3580 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3581 ENCODE_SINGLE_SHIFT_3;
3582 else
3583 ENCODE_LOCKING_SHIFT_3;
3584 break;
3585 }
3586 }
3587
3588 *p_nchars = produced_chars;
3589 return dst;
3590 }
3591
3592 /* The following three macros produce codes for indicating direction
3593 of text. */
3594 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3595 do { \
3596 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3597 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3598 else \
3599 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3600 } while (0)
3601
3602
3603 #define ENCODE_DIRECTION_R2L() \
3604 do { \
3605 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3606 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3607 } while (0)
3608
3609
3610 #define ENCODE_DIRECTION_L2R() \
3611 do { \
3612 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3613 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3614 } while (0)
3615
3616
3617 /* Produce codes for designation and invocation to reset the graphic
3618 planes and registers to initial state. */
3619 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3620 do { \
3621 int reg; \
3622 struct charset *charset; \
3623 \
3624 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3625 ENCODE_SHIFT_IN; \
3626 for (reg = 0; reg < 4; reg++) \
3627 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3628 && (CODING_ISO_DESIGNATION (coding, reg) \
3629 != CODING_ISO_INITIAL (coding, reg))) \
3630 { \
3631 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3632 ENCODE_DESIGNATION (charset, reg, coding); \
3633 } \
3634 } while (0)
3635
3636
3637 /* Produce designation sequences of charsets in the line started from
3638 SRC to a place pointed by DST, and return updated DST.
3639
3640 If the current block ends before any end-of-line, we may fail to
3641 find all the necessary designations. */
3642
3643 static unsigned char *
3644 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3645 struct coding_system *coding;
3646 int *charbuf, *charbuf_end;
3647 unsigned char *dst;
3648 {
3649 struct charset *charset;
3650 /* Table of charsets to be designated to each graphic register. */
3651 int r[4];
3652 int c, found = 0, reg;
3653 int produced_chars = 0;
3654 int multibytep = coding->dst_multibyte;
3655 Lisp_Object attrs;
3656 Lisp_Object charset_list;
3657
3658 attrs = CODING_ID_ATTRS (coding->id);
3659 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3660 if (EQ (charset_list, Qiso_2022))
3661 charset_list = Viso_2022_charset_list;
3662
3663 for (reg = 0; reg < 4; reg++)
3664 r[reg] = -1;
3665
3666 while (found < 4)
3667 {
3668 int id;
3669
3670 c = *charbuf++;
3671 if (c == '\n')
3672 break;
3673 charset = char_charset (c, charset_list, NULL);
3674 id = CHARSET_ID (charset);
3675 reg = CODING_ISO_REQUEST (coding, id);
3676 if (reg >= 0 && r[reg] < 0)
3677 {
3678 found++;
3679 r[reg] = id;
3680 }
3681 }
3682
3683 if (found)
3684 {
3685 for (reg = 0; reg < 4; reg++)
3686 if (r[reg] >= 0
3687 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3688 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3689 }
3690
3691 return dst;
3692 }
3693
3694 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3695
3696 static int
3697 encode_coding_iso_2022 (coding)
3698 struct coding_system *coding;
3699 {
3700 int multibytep = coding->dst_multibyte;
3701 int *charbuf = coding->charbuf;
3702 int *charbuf_end = charbuf + coding->charbuf_used;
3703 unsigned char *dst = coding->destination + coding->produced;
3704 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3705 int safe_room = 16;
3706 int bol_designation
3707 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3708 && CODING_ISO_BOL (coding));
3709 int produced_chars = 0;
3710 Lisp_Object attrs, eol_type, charset_list;
3711 int ascii_compatible;
3712 int c;
3713 int preferred_charset_id = -1;
3714
3715 CODING_GET_INFO (coding, attrs, charset_list);
3716 eol_type = CODING_ID_EOL_TYPE (coding->id);
3717 if (VECTORP (eol_type))
3718 eol_type = Qunix;
3719
3720 setup_iso_safe_charsets (attrs);
3721 /* Charset list may have been changed. */
3722 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3723 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3724
3725 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3726
3727 while (charbuf < charbuf_end)
3728 {
3729 ASSURE_DESTINATION (safe_room);
3730
3731 if (bol_designation)
3732 {
3733 unsigned char *dst_prev = dst;
3734
3735 /* We have to produce designation sequences if any now. */
3736 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3737 bol_designation = 0;
3738 /* We are sure that designation sequences are all ASCII bytes. */
3739 produced_chars += dst - dst_prev;
3740 }
3741
3742 c = *charbuf++;
3743
3744 if (c < 0)
3745 {
3746 /* Handle an annotation. */
3747 switch (*charbuf)
3748 {
3749 case CODING_ANNOTATE_COMPOSITION_MASK:
3750 /* Not yet implemented. */
3751 break;
3752 case CODING_ANNOTATE_CHARSET_MASK:
3753 preferred_charset_id = charbuf[3];
3754 if (preferred_charset_id >= 0
3755 && NILP (Fmemq (make_number (preferred_charset_id),
3756 charset_list)))
3757 preferred_charset_id = -1;
3758 break;
3759 default:
3760 abort ();
3761 }
3762 charbuf += -c - 1;
3763 continue;
3764 }
3765
3766 /* Now encode the character C. */
3767 if (c < 0x20 || c == 0x7F)
3768 {
3769 if (c == '\n'
3770 || (c == '\r' && EQ (eol_type, Qmac)))
3771 {
3772 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3773 ENCODE_RESET_PLANE_AND_REGISTER ();
3774 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3775 {
3776 int i;
3777
3778 for (i = 0; i < 4; i++)
3779 CODING_ISO_DESIGNATION (coding, i)
3780 = CODING_ISO_INITIAL (coding, i);
3781 }
3782 bol_designation
3783 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3784 }
3785 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3786 ENCODE_RESET_PLANE_AND_REGISTER ();
3787 EMIT_ONE_ASCII_BYTE (c);
3788 }
3789 else if (ASCII_CHAR_P (c))
3790 {
3791 if (ascii_compatible)
3792 EMIT_ONE_ASCII_BYTE (c);
3793 else
3794 {
3795 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3796 ENCODE_ISO_CHARACTER (charset, c);
3797 }
3798 }
3799 else if (CHAR_BYTE8_P (c))
3800 {
3801 c = CHAR_TO_BYTE8 (c);
3802 EMIT_ONE_BYTE (c);
3803 }
3804 else
3805 {
3806 struct charset *charset;
3807
3808 if (preferred_charset_id >= 0)
3809 {
3810 charset = CHARSET_FROM_ID (preferred_charset_id);
3811 if (! CHAR_CHARSET_P (c, charset))
3812 charset = char_charset (c, charset_list, NULL);
3813 }
3814 else
3815 charset = char_charset (c, charset_list, NULL);
3816 if (!charset)
3817 {
3818 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3819 {
3820 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3821 charset = CHARSET_FROM_ID (charset_ascii);
3822 }
3823 else
3824 {
3825 c = coding->default_char;
3826 charset = char_charset (c, charset_list, NULL);
3827 }
3828 }
3829 ENCODE_ISO_CHARACTER (charset, c);
3830 }
3831 }
3832
3833 if (coding->mode & CODING_MODE_LAST_BLOCK
3834 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3835 {
3836 ASSURE_DESTINATION (safe_room);
3837 ENCODE_RESET_PLANE_AND_REGISTER ();
3838 }
3839 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3840 CODING_ISO_BOL (coding) = bol_designation;
3841 coding->produced_char += produced_chars;
3842 coding->produced = dst - coding->destination;
3843 return 0;
3844 }
3845
3846 \f
3847 /*** 8,9. SJIS and BIG5 handlers ***/
3848
3849 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3850 quite widely. So, for the moment, Emacs supports them in the bare
3851 C code. But, in the future, they may be supported only by CCL. */
3852
3853 /* SJIS is a coding system encoding three character sets: ASCII, right
3854 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3855 as is. A character of charset katakana-jisx0201 is encoded by
3856 "position-code + 0x80". A character of charset japanese-jisx0208
3857 is encoded in 2-byte but two position-codes are divided and shifted
3858 so that it fit in the range below.
3859
3860 --- CODE RANGE of SJIS ---
3861 (character set) (range)
3862 ASCII 0x00 .. 0x7F
3863 KATAKANA-JISX0201 0xA0 .. 0xDF
3864 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3865 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3866 -------------------------------
3867
3868 */
3869
3870 /* BIG5 is a coding system encoding two character sets: ASCII and
3871 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3872 character set and is encoded in two-byte.
3873
3874 --- CODE RANGE of BIG5 ---
3875 (character set) (range)
3876 ASCII 0x00 .. 0x7F
3877 Big5 (1st byte) 0xA1 .. 0xFE
3878 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3879 --------------------------
3880
3881 */
3882
3883 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3884 Check if a text is encoded in SJIS. If it is, return
3885 CATEGORY_MASK_SJIS, else return 0. */
3886
3887 static int
3888 detect_coding_sjis (coding, detect_info)
3889 struct coding_system *coding;
3890 struct coding_detection_info *detect_info;
3891 {
3892 const unsigned char *src = coding->source, *src_base;
3893 const unsigned char *src_end = coding->source + coding->src_bytes;
3894 int multibytep = coding->src_multibyte;
3895 int consumed_chars = 0;
3896 int found = 0;
3897 int c;
3898
3899 detect_info->checked |= CATEGORY_MASK_SJIS;
3900 /* A coding system of this category is always ASCII compatible. */
3901 src += coding->head_ascii;
3902
3903 while (1)
3904 {
3905 src_base = src;
3906 ONE_MORE_BYTE (c);
3907 if (c < 0x80)
3908 continue;
3909 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3910 {
3911 ONE_MORE_BYTE (c);
3912 if (c < 0x40 || c == 0x7F || c > 0xFC)
3913 break;
3914 found = CATEGORY_MASK_SJIS;
3915 }
3916 else if (c >= 0xA0 && c < 0xE0)
3917 found = CATEGORY_MASK_SJIS;
3918 else
3919 break;
3920 }
3921 detect_info->rejected |= CATEGORY_MASK_SJIS;
3922 return 0;
3923
3924 no_more_source:
3925 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3926 {
3927 detect_info->rejected |= CATEGORY_MASK_SJIS;
3928 return 0;
3929 }
3930 detect_info->found |= found;
3931 return 1;
3932 }
3933
3934 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3935 Check if a text is encoded in BIG5. If it is, return
3936 CATEGORY_MASK_BIG5, else return 0. */
3937
3938 static int
3939 detect_coding_big5 (coding, detect_info)
3940 struct coding_system *coding;
3941 struct coding_detection_info *detect_info;
3942 {
3943 const unsigned char *src = coding->source, *src_base;
3944 const unsigned char *src_end = coding->source + coding->src_bytes;
3945 int multibytep = coding->src_multibyte;
3946 int consumed_chars = 0;
3947 int found = 0;
3948 int c;
3949
3950 detect_info->checked |= CATEGORY_MASK_BIG5;
3951 /* A coding system of this category is always ASCII compatible. */
3952 src += coding->head_ascii;
3953
3954 while (1)
3955 {
3956 src_base = src;
3957 ONE_MORE_BYTE (c);
3958 if (c < 0x80)
3959 continue;
3960 if (c >= 0xA1)
3961 {
3962 ONE_MORE_BYTE (c);
3963 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3964 return 0;
3965 found = CATEGORY_MASK_BIG5;
3966 }
3967 else
3968 break;
3969 }
3970 detect_info->rejected |= CATEGORY_MASK_BIG5;
3971 return 0;
3972
3973 no_more_source:
3974 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3975 {
3976 detect_info->rejected |= CATEGORY_MASK_BIG5;
3977 return 0;
3978 }
3979 detect_info->found |= found;
3980 return 1;
3981 }
3982
3983 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3984 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3985
3986 static void
3987 decode_coding_sjis (coding)
3988 struct coding_system *coding;
3989 {
3990 const unsigned char *src = coding->source + coding->consumed;
3991 const unsigned char *src_end = coding->source + coding->src_bytes;
3992 const unsigned char *src_base;
3993 int *charbuf = coding->charbuf;
3994 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3995 int consumed_chars = 0, consumed_chars_base;
3996 int multibytep = coding->src_multibyte;
3997 struct charset *charset_roman, *charset_kanji, *charset_kana;
3998 struct charset *charset_kanji2;
3999 Lisp_Object attrs, charset_list, val;
4000 int char_offset = coding->produced_char;
4001 int last_offset = char_offset;
4002 int last_id = charset_ascii;
4003
4004 CODING_GET_INFO (coding, attrs, charset_list);
4005
4006 val = charset_list;
4007 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4008 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4009 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4010 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4011
4012 while (1)
4013 {
4014 int c, c1;
4015 struct charset *charset;
4016
4017 src_base = src;
4018 consumed_chars_base = consumed_chars;
4019
4020 if (charbuf >= charbuf_end)
4021 break;
4022
4023 ONE_MORE_BYTE (c);
4024 if (c < 0)
4025 goto invalid_code;
4026 if (c < 0x80)
4027 charset = charset_roman;
4028 else if (c == 0x80 || c == 0xA0)
4029 goto invalid_code;
4030 else if (c >= 0xA1 && c <= 0xDF)
4031 {
4032 /* SJIS -> JISX0201-Kana */
4033 c &= 0x7F;
4034 charset = charset_kana;
4035 }
4036 else if (c <= 0xEF)
4037 {
4038 /* SJIS -> JISX0208 */
4039 ONE_MORE_BYTE (c1);
4040 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4041 goto invalid_code;
4042 c = (c << 8) | c1;
4043 SJIS_TO_JIS (c);
4044 charset = charset_kanji;
4045 }
4046 else if (c <= 0xFC && charset_kanji2)
4047 {
4048 /* SJIS -> JISX0213-2 */
4049 ONE_MORE_BYTE (c1);
4050 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4051 goto invalid_code;
4052 c = (c << 8) | c1;
4053 SJIS_TO_JIS2 (c);
4054 charset = charset_kanji2;
4055 }
4056 else
4057 goto invalid_code;
4058 if (charset->id != charset_ascii
4059 && last_id != charset->id)
4060 {
4061 if (last_id != charset_ascii)
4062 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4063 last_id = charset->id;
4064 last_offset = char_offset;
4065 }
4066 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4067 *charbuf++ = c;
4068 char_offset++;
4069 continue;
4070
4071 invalid_code:
4072 src = src_base;
4073 consumed_chars = consumed_chars_base;
4074 ONE_MORE_BYTE (c);
4075 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4076 char_offset++;
4077 coding->errors++;
4078 }
4079
4080 no_more_source:
4081 if (last_id != charset_ascii)
4082 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4083 coding->consumed_char += consumed_chars_base;
4084 coding->consumed = src_base - coding->source;
4085 coding->charbuf_used = charbuf - coding->charbuf;
4086 }
4087
4088 static void
4089 decode_coding_big5 (coding)
4090 struct coding_system *coding;
4091 {
4092 const unsigned char *src = coding->source + coding->consumed;
4093 const unsigned char *src_end = coding->source + coding->src_bytes;
4094 const unsigned char *src_base;
4095 int *charbuf = coding->charbuf;
4096 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4097 int consumed_chars = 0, consumed_chars_base;
4098 int multibytep = coding->src_multibyte;
4099 struct charset *charset_roman, *charset_big5;
4100 Lisp_Object attrs, charset_list, val;
4101 int char_offset = coding->produced_char;
4102 int last_offset = char_offset;
4103 int last_id = charset_ascii;
4104
4105 CODING_GET_INFO (coding, attrs, charset_list);
4106 val = charset_list;
4107 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4108 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4109
4110 while (1)
4111 {
4112 int c, c1;
4113 struct charset *charset;
4114
4115 src_base = src;
4116 consumed_chars_base = consumed_chars;
4117
4118 if (charbuf >= charbuf_end)
4119 break;
4120
4121 ONE_MORE_BYTE (c);
4122
4123 if (c < 0)
4124 goto invalid_code;
4125 if (c < 0x80)
4126 charset = charset_roman;
4127 else
4128 {
4129 /* BIG5 -> Big5 */
4130 if (c < 0xA1 || c > 0xFE)
4131 goto invalid_code;
4132 ONE_MORE_BYTE (c1);
4133 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4134 goto invalid_code;
4135 c = c << 8 | c1;
4136 charset = charset_big5;
4137 }
4138 if (charset->id != charset_ascii
4139 && last_id != charset->id)
4140 {
4141 if (last_id != charset_ascii)
4142 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4143 last_id = charset->id;
4144 last_offset = char_offset;
4145 }
4146 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4147 *charbuf++ = c;
4148 char_offset++;
4149 continue;
4150
4151 invalid_code:
4152 src = src_base;
4153 consumed_chars = consumed_chars_base;
4154 ONE_MORE_BYTE (c);
4155 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4156 char_offset++;
4157 coding->errors++;
4158 }
4159
4160 no_more_source:
4161 if (last_id != charset_ascii)
4162 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4163 coding->consumed_char += consumed_chars_base;
4164 coding->consumed = src_base - coding->source;
4165 coding->charbuf_used = charbuf - coding->charbuf;
4166 }
4167
4168 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4169 This function can encode charsets `ascii', `katakana-jisx0201',
4170 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4171 are sure that all these charsets are registered as official charset
4172 (i.e. do not have extended leading-codes). Characters of other
4173 charsets are produced without any encoding. If SJIS_P is 1, encode
4174 SJIS text, else encode BIG5 text. */
4175
4176 static int
4177 encode_coding_sjis (coding)
4178 struct coding_system *coding;
4179 {
4180 int multibytep = coding->dst_multibyte;
4181 int *charbuf = coding->charbuf;
4182 int *charbuf_end = charbuf + coding->charbuf_used;
4183 unsigned char *dst = coding->destination + coding->produced;
4184 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4185 int safe_room = 4;
4186 int produced_chars = 0;
4187 Lisp_Object attrs, charset_list, val;
4188 int ascii_compatible;
4189 struct charset *charset_roman, *charset_kanji, *charset_kana;
4190 struct charset *charset_kanji2;
4191 int c;
4192
4193 CODING_GET_INFO (coding, attrs, charset_list);
4194 val = charset_list;
4195 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4196 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4197 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4198 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4199
4200 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4201
4202 while (charbuf < charbuf_end)
4203 {
4204 ASSURE_DESTINATION (safe_room);
4205 c = *charbuf++;
4206 /* Now encode the character C. */
4207 if (ASCII_CHAR_P (c) && ascii_compatible)
4208 EMIT_ONE_ASCII_BYTE (c);
4209 else if (CHAR_BYTE8_P (c))
4210 {
4211 c = CHAR_TO_BYTE8 (c);
4212 EMIT_ONE_BYTE (c);
4213 }
4214 else
4215 {
4216 unsigned code;
4217 struct charset *charset = char_charset (c, charset_list, &code);
4218
4219 if (!charset)
4220 {
4221 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4222 {
4223 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4224 charset = CHARSET_FROM_ID (charset_ascii);
4225 }
4226 else
4227 {
4228 c = coding->default_char;
4229 charset = char_charset (c, charset_list, &code);
4230 }
4231 }
4232 if (code == CHARSET_INVALID_CODE (charset))
4233 abort ();
4234 if (charset == charset_kanji)
4235 {
4236 int c1, c2;
4237 JIS_TO_SJIS (code);
4238 c1 = code >> 8, c2 = code & 0xFF;
4239 EMIT_TWO_BYTES (c1, c2);
4240 }
4241 else if (charset == charset_kana)
4242 EMIT_ONE_BYTE (code | 0x80);
4243 else if (charset_kanji2 && charset == charset_kanji2)
4244 {
4245 int c1, c2;
4246
4247 c1 = code >> 8;
4248 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4249 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4250 {
4251 JIS_TO_SJIS2 (code);
4252 c1 = code >> 8, c2 = code & 0xFF;
4253 EMIT_TWO_BYTES (c1, c2);
4254 }
4255 else
4256 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4257 }
4258 else
4259 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4260 }
4261 }
4262 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4263 coding->produced_char += produced_chars;
4264 coding->produced = dst - coding->destination;
4265 return 0;
4266 }
4267
4268 static int
4269 encode_coding_big5 (coding)
4270 struct coding_system *coding;
4271 {
4272 int multibytep = coding->dst_multibyte;
4273 int *charbuf = coding->charbuf;
4274 int *charbuf_end = charbuf + coding->charbuf_used;
4275 unsigned char *dst = coding->destination + coding->produced;
4276 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4277 int safe_room = 4;
4278 int produced_chars = 0;
4279 Lisp_Object attrs, charset_list, val;
4280 int ascii_compatible;
4281 struct charset *charset_roman, *charset_big5;
4282 int c;
4283
4284 CODING_GET_INFO (coding, attrs, charset_list);
4285 val = charset_list;
4286 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4287 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4288 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4289
4290 while (charbuf < charbuf_end)
4291 {
4292 ASSURE_DESTINATION (safe_room);
4293 c = *charbuf++;
4294 /* Now encode the character C. */
4295 if (ASCII_CHAR_P (c) && ascii_compatible)
4296 EMIT_ONE_ASCII_BYTE (c);
4297 else if (CHAR_BYTE8_P (c))
4298 {
4299 c = CHAR_TO_BYTE8 (c);
4300 EMIT_ONE_BYTE (c);
4301 }
4302 else
4303 {
4304 unsigned code;
4305 struct charset *charset = char_charset (c, charset_list, &code);
4306
4307 if (! charset)
4308 {
4309 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4310 {
4311 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4312 charset = CHARSET_FROM_ID (charset_ascii);
4313 }
4314 else
4315 {
4316 c = coding->default_char;
4317 charset = char_charset (c, charset_list, &code);
4318 }
4319 }
4320 if (code == CHARSET_INVALID_CODE (charset))
4321 abort ();
4322 if (charset == charset_big5)
4323 {
4324 int c1, c2;
4325
4326 c1 = code >> 8, c2 = code & 0xFF;
4327 EMIT_TWO_BYTES (c1, c2);
4328 }
4329 else
4330 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4331 }
4332 }
4333 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4334 coding->produced_char += produced_chars;
4335 coding->produced = dst - coding->destination;
4336 return 0;
4337 }
4338
4339 \f
4340 /*** 10. CCL handlers ***/
4341
4342 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4343 Check if a text is encoded in a coding system of which
4344 encoder/decoder are written in CCL program. If it is, return
4345 CATEGORY_MASK_CCL, else return 0. */
4346
4347 static int
4348 detect_coding_ccl (coding, detect_info)
4349 struct coding_system *coding;
4350 struct coding_detection_info *detect_info;
4351 {
4352 const unsigned char *src = coding->source, *src_base;
4353 const unsigned char *src_end = coding->source + coding->src_bytes;
4354 int multibytep = coding->src_multibyte;
4355 int consumed_chars = 0;
4356 int found = 0;
4357 unsigned char *valids = CODING_CCL_VALIDS (coding);
4358 int head_ascii = coding->head_ascii;
4359 Lisp_Object attrs;
4360
4361 detect_info->checked |= CATEGORY_MASK_CCL;
4362
4363 coding = &coding_categories[coding_category_ccl];
4364 attrs = CODING_ID_ATTRS (coding->id);
4365 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4366 src += head_ascii;
4367
4368 while (1)
4369 {
4370 int c;
4371
4372 src_base = src;
4373 ONE_MORE_BYTE (c);
4374 if (c < 0 || ! valids[c])
4375 break;
4376 if ((valids[c] > 1))
4377 found = CATEGORY_MASK_CCL;
4378 }
4379 detect_info->rejected |= CATEGORY_MASK_CCL;
4380 return 0;
4381
4382 no_more_source:
4383 detect_info->found |= found;
4384 return 1;
4385 }
4386
4387 static void
4388 decode_coding_ccl (coding)
4389 struct coding_system *coding;
4390 {
4391 const unsigned char *src = coding->source + coding->consumed;
4392 const unsigned char *src_end = coding->source + coding->src_bytes;
4393 int *charbuf = coding->charbuf;
4394 int *charbuf_end = charbuf + coding->charbuf_size;
4395 int consumed_chars = 0;
4396 int multibytep = coding->src_multibyte;
4397 struct ccl_program ccl;
4398 int source_charbuf[1024];
4399 int source_byteidx[1024];
4400 Lisp_Object attrs, charset_list;
4401
4402 CODING_GET_INFO (coding, attrs, charset_list);
4403 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4404
4405 while (src < src_end)
4406 {
4407 const unsigned char *p = src;
4408 int *source, *source_end;
4409 int i = 0;
4410
4411 if (multibytep)
4412 while (i < 1024 && p < src_end)
4413 {
4414 source_byteidx[i] = p - src;
4415 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4416 }
4417 else
4418 while (i < 1024 && p < src_end)
4419 source_charbuf[i++] = *p++;
4420
4421 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4422 ccl.last_block = 1;
4423
4424 source = source_charbuf;
4425 source_end = source + i;
4426 while (source < source_end)
4427 {
4428 ccl_driver (&ccl, source, charbuf,
4429 source_end - source, charbuf_end - charbuf,
4430 charset_list);
4431 source += ccl.consumed;
4432 charbuf += ccl.produced;
4433 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4434 break;
4435 }
4436 if (source < source_end)
4437 src += source_byteidx[source - source_charbuf];
4438 else
4439 src = p;
4440 consumed_chars += source - source_charbuf;
4441
4442 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4443 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4444 break;
4445 }
4446
4447 switch (ccl.status)
4448 {
4449 case CCL_STAT_SUSPEND_BY_SRC:
4450 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4451 break;
4452 case CCL_STAT_SUSPEND_BY_DST:
4453 break;
4454 case CCL_STAT_QUIT:
4455 case CCL_STAT_INVALID_CMD:
4456 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4457 break;
4458 default:
4459 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4460 break;
4461 }
4462 coding->consumed_char += consumed_chars;
4463 coding->consumed = src - coding->source;
4464 coding->charbuf_used = charbuf - coding->charbuf;
4465 }
4466
4467 static int
4468 encode_coding_ccl (coding)
4469 struct coding_system *coding;
4470 {
4471 struct ccl_program ccl;
4472 int multibytep = coding->dst_multibyte;
4473 int *charbuf = coding->charbuf;
4474 int *charbuf_end = charbuf + coding->charbuf_used;
4475 unsigned char *dst = coding->destination + coding->produced;
4476 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4477 unsigned char *adjusted_dst_end = dst_end - 1;
4478 int destination_charbuf[1024];
4479 int i, produced_chars = 0;
4480 Lisp_Object attrs, charset_list;
4481
4482 CODING_GET_INFO (coding, attrs, charset_list);
4483 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4484
4485 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4486 ccl.dst_multibyte = coding->dst_multibyte;
4487
4488 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4489 {
4490 int dst_bytes = dst_end - dst;
4491 if (dst_bytes > 1024)
4492 dst_bytes = 1024;
4493
4494 ccl_driver (&ccl, charbuf, destination_charbuf,
4495 charbuf_end - charbuf, dst_bytes, charset_list);
4496 charbuf += ccl.consumed;
4497 if (multibytep)
4498 for (i = 0; i < ccl.produced; i++)
4499 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4500 else
4501 {
4502 for (i = 0; i < ccl.produced; i++)
4503 *dst++ = destination_charbuf[i] & 0xFF;
4504 produced_chars += ccl.produced;
4505 }
4506 }
4507
4508 switch (ccl.status)
4509 {
4510 case CCL_STAT_SUSPEND_BY_SRC:
4511 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4512 break;
4513 case CCL_STAT_SUSPEND_BY_DST:
4514 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4515 break;
4516 case CCL_STAT_QUIT:
4517 case CCL_STAT_INVALID_CMD:
4518 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4519 break;
4520 default:
4521 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4522 break;
4523 }
4524
4525 coding->produced_char += produced_chars;
4526 coding->produced = dst - coding->destination;
4527 return 0;
4528 }
4529
4530
4531 \f
4532 /*** 10, 11. no-conversion handlers ***/
4533
4534 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4535
4536 static void
4537 decode_coding_raw_text (coding)
4538 struct coding_system *coding;
4539 {
4540 coding->chars_at_source = 1;
4541 coding->consumed_char = 0;
4542 coding->consumed = 0;
4543 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4544 }
4545
4546 static int
4547 encode_coding_raw_text (coding)
4548 struct coding_system *coding;
4549 {
4550 int multibytep = coding->dst_multibyte;
4551 int *charbuf = coding->charbuf;
4552 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4553 unsigned char *dst = coding->destination + coding->produced;
4554 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4555 int produced_chars = 0;
4556 int c;
4557
4558 if (multibytep)
4559 {
4560 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4561
4562 if (coding->src_multibyte)
4563 while (charbuf < charbuf_end)
4564 {
4565 ASSURE_DESTINATION (safe_room);
4566 c = *charbuf++;
4567 if (ASCII_CHAR_P (c))
4568 EMIT_ONE_ASCII_BYTE (c);
4569 else if (CHAR_BYTE8_P (c))
4570 {
4571 c = CHAR_TO_BYTE8 (c);
4572 EMIT_ONE_BYTE (c);
4573 }
4574 else
4575 {
4576 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4577
4578 CHAR_STRING_ADVANCE (c, p1);
4579 while (p0 < p1)
4580 {
4581 EMIT_ONE_BYTE (*p0);
4582 p0++;
4583 }
4584 }
4585 }
4586 else
4587 while (charbuf < charbuf_end)
4588 {
4589 ASSURE_DESTINATION (safe_room);
4590 c = *charbuf++;
4591 EMIT_ONE_BYTE (c);
4592 }
4593 }
4594 else
4595 {
4596 if (coding->src_multibyte)
4597 {
4598 int safe_room = MAX_MULTIBYTE_LENGTH;
4599
4600 while (charbuf < charbuf_end)
4601 {
4602 ASSURE_DESTINATION (safe_room);
4603 c = *charbuf++;
4604 if (ASCII_CHAR_P (c))
4605 *dst++ = c;
4606 else if (CHAR_BYTE8_P (c))
4607 *dst++ = CHAR_TO_BYTE8 (c);
4608 else
4609 CHAR_STRING_ADVANCE (c, dst);
4610 produced_chars++;
4611 }
4612 }
4613 else
4614 {
4615 ASSURE_DESTINATION (charbuf_end - charbuf);
4616 while (charbuf < charbuf_end && dst < dst_end)
4617 *dst++ = *charbuf++;
4618 produced_chars = dst - (coding->destination + coding->dst_bytes);
4619 }
4620 }
4621 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4622 coding->produced_char += produced_chars;
4623 coding->produced = dst - coding->destination;
4624 return 0;
4625 }
4626
4627 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4628 Check if a text is encoded in a charset-based coding system. If it
4629 is, return 1, else return 0. */
4630
4631 static int
4632 detect_coding_charset (coding, detect_info)
4633 struct coding_system *coding;
4634 struct coding_detection_info *detect_info;
4635 {
4636 const unsigned char *src = coding->source, *src_base;
4637 const unsigned char *src_end = coding->source + coding->src_bytes;
4638 int multibytep = coding->src_multibyte;
4639 int consumed_chars = 0;
4640 Lisp_Object attrs, valids;
4641 int found = 0;
4642
4643 detect_info->checked |= CATEGORY_MASK_CHARSET;
4644
4645 coding = &coding_categories[coding_category_charset];
4646 attrs = CODING_ID_ATTRS (coding->id);
4647 valids = AREF (attrs, coding_attr_charset_valids);
4648
4649 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4650 src += coding->head_ascii;
4651
4652 while (1)
4653 {
4654 int c;
4655
4656 src_base = src;
4657 ONE_MORE_BYTE (c);
4658 if (c < 0)
4659 continue;
4660 if (NILP (AREF (valids, c)))
4661 break;
4662 if (c >= 0x80)
4663 found = CATEGORY_MASK_CHARSET;
4664 }
4665 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4666 return 0;
4667
4668 no_more_source:
4669 detect_info->found |= found;
4670 return 1;
4671 }
4672
4673 static void
4674 decode_coding_charset (coding)
4675 struct coding_system *coding;
4676 {
4677 const unsigned char *src = coding->source + coding->consumed;
4678 const unsigned char *src_end = coding->source + coding->src_bytes;
4679 const unsigned char *src_base;
4680 int *charbuf = coding->charbuf;
4681 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4682 int consumed_chars = 0, consumed_chars_base;
4683 int multibytep = coding->src_multibyte;
4684 Lisp_Object attrs, charset_list, valids;
4685 int char_offset = coding->produced_char;
4686 int last_offset = char_offset;
4687 int last_id = charset_ascii;
4688
4689 CODING_GET_INFO (coding, attrs, charset_list);
4690 valids = AREF (attrs, coding_attr_charset_valids);
4691
4692 while (1)
4693 {
4694 int c;
4695 Lisp_Object val;
4696 struct charset *charset;
4697 int dim;
4698 int len = 1;
4699 unsigned code;
4700
4701 src_base = src;
4702 consumed_chars_base = consumed_chars;
4703
4704 if (charbuf >= charbuf_end)
4705 break;
4706
4707 ONE_MORE_BYTE (c);
4708 if (c < 0)
4709 goto invalid_code;
4710 code = c;
4711
4712 val = AREF (valids, c);
4713 if (NILP (val))
4714 goto invalid_code;
4715 if (INTEGERP (val))
4716 {
4717 charset = CHARSET_FROM_ID (XFASTINT (val));
4718 dim = CHARSET_DIMENSION (charset);
4719 while (len < dim)
4720 {
4721 ONE_MORE_BYTE (c);
4722 code = (code << 8) | c;
4723 len++;
4724 }
4725 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4726 charset, code, c);
4727 }
4728 else
4729 {
4730 /* VAL is a list of charset IDs. It is assured that the
4731 list is sorted by charset dimensions (smaller one
4732 comes first). */
4733 while (CONSP (val))
4734 {
4735 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4736 dim = CHARSET_DIMENSION (charset);
4737 while (len < dim)
4738 {
4739 ONE_MORE_BYTE (c);
4740 code = (code << 8) | c;
4741 len++;
4742 }
4743 CODING_DECODE_CHAR (coding, src, src_base,
4744 src_end, charset, code, c);
4745 if (c >= 0)
4746 break;
4747 val = XCDR (val);
4748 }
4749 }
4750 if (c < 0)
4751 goto invalid_code;
4752 if (charset->id != charset_ascii
4753 && last_id != charset->id)
4754 {
4755 if (last_id != charset_ascii)
4756 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4757 last_id = charset->id;
4758 last_offset = char_offset;
4759 }
4760
4761 *charbuf++ = c;
4762 char_offset++;
4763 continue;
4764
4765 invalid_code:
4766 src = src_base;
4767 consumed_chars = consumed_chars_base;
4768 ONE_MORE_BYTE (c);
4769 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4770 char_offset++;
4771 coding->errors++;
4772 }
4773
4774 no_more_source:
4775 if (last_id != charset_ascii)
4776 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4777 coding->consumed_char += consumed_chars_base;
4778 coding->consumed = src_base - coding->source;
4779 coding->charbuf_used = charbuf - coding->charbuf;
4780 }
4781
4782 static int
4783 encode_coding_charset (coding)
4784 struct coding_system *coding;
4785 {
4786 int multibytep = coding->dst_multibyte;
4787 int *charbuf = coding->charbuf;
4788 int *charbuf_end = charbuf + coding->charbuf_used;
4789 unsigned char *dst = coding->destination + coding->produced;
4790 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4791 int safe_room = MAX_MULTIBYTE_LENGTH;
4792 int produced_chars = 0;
4793 Lisp_Object attrs, charset_list;
4794 int ascii_compatible;
4795 int c;
4796
4797 CODING_GET_INFO (coding, attrs, charset_list);
4798 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4799
4800 while (charbuf < charbuf_end)
4801 {
4802 struct charset *charset;
4803 unsigned code;
4804
4805 ASSURE_DESTINATION (safe_room);
4806 c = *charbuf++;
4807 if (ascii_compatible && ASCII_CHAR_P (c))
4808 EMIT_ONE_ASCII_BYTE (c);
4809 else if (CHAR_BYTE8_P (c))
4810 {
4811 c = CHAR_TO_BYTE8 (c);
4812 EMIT_ONE_BYTE (c);
4813 }
4814 else
4815 {
4816 charset = char_charset (c, charset_list, &code);
4817 if (charset)
4818 {
4819 if (CHARSET_DIMENSION (charset) == 1)
4820 EMIT_ONE_BYTE (code);
4821 else if (CHARSET_DIMENSION (charset) == 2)
4822 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4823 else if (CHARSET_DIMENSION (charset) == 3)
4824 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4825 else
4826 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4827 (code >> 8) & 0xFF, code & 0xFF);
4828 }
4829 else
4830 {
4831 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4832 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4833 else
4834 c = coding->default_char;
4835 EMIT_ONE_BYTE (c);
4836 }
4837 }
4838 }
4839
4840 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4841 coding->produced_char += produced_chars;
4842 coding->produced = dst - coding->destination;
4843 return 0;
4844 }
4845
4846 \f
4847 /*** 7. C library functions ***/
4848
4849 /* Setup coding context CODING from information about CODING_SYSTEM.
4850 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4851 CODING_SYSTEM is invalid, signal an error. */
4852
4853 void
4854 setup_coding_system (coding_system, coding)
4855 Lisp_Object coding_system;
4856 struct coding_system *coding;
4857 {
4858 Lisp_Object attrs;
4859 Lisp_Object eol_type;
4860 Lisp_Object coding_type;
4861 Lisp_Object val;
4862
4863 if (NILP (coding_system))
4864 coding_system = Qno_conversion;
4865
4866 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4867
4868 attrs = CODING_ID_ATTRS (coding->id);
4869 eol_type = CODING_ID_EOL_TYPE (coding->id);
4870
4871 coding->mode = 0;
4872 coding->head_ascii = -1;
4873 coding->common_flags
4874 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4875 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4876 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4877 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4878 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4879 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4880 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4881
4882 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4883 coding->max_charset_id = SCHARS (val) - 1;
4884 coding->safe_charsets = (char *) SDATA (val);
4885 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4886
4887 coding_type = CODING_ATTR_TYPE (attrs);
4888 if (EQ (coding_type, Qundecided))
4889 {
4890 coding->detector = NULL;
4891 coding->decoder = decode_coding_raw_text;
4892 coding->encoder = encode_coding_raw_text;
4893 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4894 }
4895 else if (EQ (coding_type, Qiso_2022))
4896 {
4897 int i;
4898 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4899
4900 /* Invoke graphic register 0 to plane 0. */
4901 CODING_ISO_INVOCATION (coding, 0) = 0;
4902 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4903 CODING_ISO_INVOCATION (coding, 1)
4904 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4905 /* Setup the initial status of designation. */
4906 for (i = 0; i < 4; i++)
4907 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4908 /* Not single shifting initially. */
4909 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4910 /* Beginning of buffer should also be regarded as bol. */
4911 CODING_ISO_BOL (coding) = 1;
4912 coding->detector = detect_coding_iso_2022;
4913 coding->decoder = decode_coding_iso_2022;
4914 coding->encoder = encode_coding_iso_2022;
4915 if (flags & CODING_ISO_FLAG_SAFE)
4916 coding->mode |= CODING_MODE_SAFE_ENCODING;
4917 coding->common_flags
4918 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4919 | CODING_REQUIRE_FLUSHING_MASK);
4920 if (flags & CODING_ISO_FLAG_COMPOSITION)
4921 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4922 if (flags & CODING_ISO_FLAG_DESIGNATION)
4923 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4924 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4925 {
4926 setup_iso_safe_charsets (attrs);
4927 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4928 coding->max_charset_id = SCHARS (val) - 1;
4929 coding->safe_charsets = (char *) SDATA (val);
4930 }
4931 CODING_ISO_FLAGS (coding) = flags;
4932 }
4933 else if (EQ (coding_type, Qcharset))
4934 {
4935 coding->detector = detect_coding_charset;
4936 coding->decoder = decode_coding_charset;
4937 coding->encoder = encode_coding_charset;
4938 coding->common_flags
4939 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4940 }
4941 else if (EQ (coding_type, Qutf_8))
4942 {
4943 coding->detector = detect_coding_utf_8;
4944 coding->decoder = decode_coding_utf_8;
4945 coding->encoder = encode_coding_utf_8;
4946 coding->common_flags
4947 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4948 }
4949 else if (EQ (coding_type, Qutf_16))
4950 {
4951 val = AREF (attrs, coding_attr_utf_16_bom);
4952 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4953 : EQ (val, Qt) ? utf_16_with_bom
4954 : utf_16_without_bom);
4955 val = AREF (attrs, coding_attr_utf_16_endian);
4956 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4957 : utf_16_little_endian);
4958 CODING_UTF_16_SURROGATE (coding) = 0;
4959 coding->detector = detect_coding_utf_16;
4960 coding->decoder = decode_coding_utf_16;
4961 coding->encoder = encode_coding_utf_16;
4962 coding->common_flags
4963 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4964 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4965 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4966 }
4967 else if (EQ (coding_type, Qccl))
4968 {
4969 coding->detector = detect_coding_ccl;
4970 coding->decoder = decode_coding_ccl;
4971 coding->encoder = encode_coding_ccl;
4972 coding->common_flags
4973 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4974 | CODING_REQUIRE_FLUSHING_MASK);
4975 }
4976 else if (EQ (coding_type, Qemacs_mule))
4977 {
4978 coding->detector = detect_coding_emacs_mule;
4979 coding->decoder = decode_coding_emacs_mule;
4980 coding->encoder = encode_coding_emacs_mule;
4981 coding->common_flags
4982 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4983 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4984 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4985 {
4986 Lisp_Object tail, safe_charsets;
4987 int max_charset_id = 0;
4988
4989 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4990 tail = XCDR (tail))
4991 if (max_charset_id < XFASTINT (XCAR (tail)))
4992 max_charset_id = XFASTINT (XCAR (tail));
4993 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4994 make_number (255));
4995 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4996 tail = XCDR (tail))
4997 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4998 coding->max_charset_id = max_charset_id;
4999 coding->safe_charsets = (char *) SDATA (safe_charsets);
5000 }
5001 }
5002 else if (EQ (coding_type, Qshift_jis))
5003 {
5004 coding->detector = detect_coding_sjis;
5005 coding->decoder = decode_coding_sjis;
5006 coding->encoder = encode_coding_sjis;
5007 coding->common_flags
5008 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5009 }
5010 else if (EQ (coding_type, Qbig5))
5011 {
5012 coding->detector = detect_coding_big5;
5013 coding->decoder = decode_coding_big5;
5014 coding->encoder = encode_coding_big5;
5015 coding->common_flags
5016 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5017 }
5018 else /* EQ (coding_type, Qraw_text) */
5019 {
5020 coding->detector = NULL;
5021 coding->decoder = decode_coding_raw_text;
5022 coding->encoder = encode_coding_raw_text;
5023 }
5024
5025 return;
5026 }
5027
5028 /* Return raw-text or one of its subsidiaries that has the same
5029 eol_type as CODING-SYSTEM. */
5030
5031 Lisp_Object
5032 raw_text_coding_system (coding_system)
5033 Lisp_Object coding_system;
5034 {
5035 Lisp_Object spec, attrs;
5036 Lisp_Object eol_type, raw_text_eol_type;
5037
5038 if (NILP (coding_system))
5039 return Qraw_text;
5040 spec = CODING_SYSTEM_SPEC (coding_system);
5041 attrs = AREF (spec, 0);
5042
5043 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5044 return coding_system;
5045
5046 eol_type = AREF (spec, 2);
5047 if (VECTORP (eol_type))
5048 return Qraw_text;
5049 spec = CODING_SYSTEM_SPEC (Qraw_text);
5050 raw_text_eol_type = AREF (spec, 2);
5051 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5052 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5053 : AREF (raw_text_eol_type, 2));
5054 }
5055
5056
5057 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5058 does, return one of the subsidiary that has the same eol-spec as
5059 PARENT. Otherwise, return CODING_SYSTEM. */
5060
5061 Lisp_Object
5062 coding_inherit_eol_type (coding_system, parent)
5063 Lisp_Object coding_system, parent;
5064 {
5065 Lisp_Object spec, eol_type;
5066
5067 if (NILP (coding_system))
5068 coding_system = Qraw_text;
5069 spec = CODING_SYSTEM_SPEC (coding_system);
5070 eol_type = AREF (spec, 2);
5071 if (VECTORP (eol_type)
5072 && ! NILP (parent))
5073 {
5074 Lisp_Object parent_spec;
5075 Lisp_Object parent_eol_type;
5076
5077 parent_spec
5078 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5079 parent_eol_type = AREF (parent_spec, 2);
5080 if (EQ (parent_eol_type, Qunix))
5081 coding_system = AREF (eol_type, 0);
5082 else if (EQ (parent_eol_type, Qdos))
5083 coding_system = AREF (eol_type, 1);
5084 else if (EQ (parent_eol_type, Qmac))
5085 coding_system = AREF (eol_type, 2);
5086 }
5087 return coding_system;
5088 }
5089
5090 /* Emacs has a mechanism to automatically detect a coding system if it
5091 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5092 it's impossible to distinguish some coding systems accurately
5093 because they use the same range of codes. So, at first, coding
5094 systems are categorized into 7, those are:
5095
5096 o coding-category-emacs-mule
5097
5098 The category for a coding system which has the same code range
5099 as Emacs' internal format. Assigned the coding-system (Lisp
5100 symbol) `emacs-mule' by default.
5101
5102 o coding-category-sjis
5103
5104 The category for a coding system which has the same code range
5105 as SJIS. Assigned the coding-system (Lisp
5106 symbol) `japanese-shift-jis' by default.
5107
5108 o coding-category-iso-7
5109
5110 The category for a coding system which has the same code range
5111 as ISO2022 of 7-bit environment. This doesn't use any locking
5112 shift and single shift functions. This can encode/decode all
5113 charsets. Assigned the coding-system (Lisp symbol)
5114 `iso-2022-7bit' by default.
5115
5116 o coding-category-iso-7-tight
5117
5118 Same as coding-category-iso-7 except that this can
5119 encode/decode only the specified charsets.
5120
5121 o coding-category-iso-8-1
5122
5123 The category for a coding system which has the same code range
5124 as ISO2022 of 8-bit environment and graphic plane 1 used only
5125 for DIMENSION1 charset. This doesn't use any locking shift
5126 and single shift functions. Assigned the coding-system (Lisp
5127 symbol) `iso-latin-1' by default.
5128
5129 o coding-category-iso-8-2
5130
5131 The category for a coding system which has the same code range
5132 as ISO2022 of 8-bit environment and graphic plane 1 used only
5133 for DIMENSION2 charset. This doesn't use any locking shift
5134 and single shift functions. Assigned the coding-system (Lisp
5135 symbol) `japanese-iso-8bit' by default.
5136
5137 o coding-category-iso-7-else
5138
5139 The category for a coding system which has the same code range
5140 as ISO2022 of 7-bit environemnt but uses locking shift or
5141 single shift functions. Assigned the coding-system (Lisp
5142 symbol) `iso-2022-7bit-lock' by default.
5143
5144 o coding-category-iso-8-else
5145
5146 The category for a coding system which has the same code range
5147 as ISO2022 of 8-bit environemnt but uses locking shift or
5148 single shift functions. Assigned the coding-system (Lisp
5149 symbol) `iso-2022-8bit-ss2' by default.
5150
5151 o coding-category-big5
5152
5153 The category for a coding system which has the same code range
5154 as BIG5. Assigned the coding-system (Lisp symbol)
5155 `cn-big5' by default.
5156
5157 o coding-category-utf-8
5158
5159 The category for a coding system which has the same code range
5160 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5161 symbol) `utf-8' by default.
5162
5163 o coding-category-utf-16-be
5164
5165 The category for a coding system in which a text has an
5166 Unicode signature (cf. Unicode Standard) in the order of BIG
5167 endian at the head. Assigned the coding-system (Lisp symbol)
5168 `utf-16-be' by default.
5169
5170 o coding-category-utf-16-le
5171
5172 The category for a coding system in which a text has an
5173 Unicode signature (cf. Unicode Standard) in the order of
5174 LITTLE endian at the head. Assigned the coding-system (Lisp
5175 symbol) `utf-16-le' by default.
5176
5177 o coding-category-ccl
5178
5179 The category for a coding system of which encoder/decoder is
5180 written in CCL programs. The default value is nil, i.e., no
5181 coding system is assigned.
5182
5183 o coding-category-binary
5184
5185 The category for a coding system not categorized in any of the
5186 above. Assigned the coding-system (Lisp symbol)
5187 `no-conversion' by default.
5188
5189 Each of them is a Lisp symbol and the value is an actual
5190 `coding-system's (this is also a Lisp symbol) assigned by a user.
5191 What Emacs does actually is to detect a category of coding system.
5192 Then, it uses a `coding-system' assigned to it. If Emacs can't
5193 decide only one possible category, it selects a category of the
5194 highest priority. Priorities of categories are also specified by a
5195 user in a Lisp variable `coding-category-list'.
5196
5197 */
5198
5199 #define EOL_SEEN_NONE 0
5200 #define EOL_SEEN_LF 1
5201 #define EOL_SEEN_CR 2
5202 #define EOL_SEEN_CRLF 4
5203
5204 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5205 SOURCE is encoded. If CATEGORY is one of
5206 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5207 two-byte, else they are encoded by one-byte.
5208
5209 Return one of EOL_SEEN_XXX. */
5210
5211 #define MAX_EOL_CHECK_COUNT 3
5212
5213 static int
5214 detect_eol (source, src_bytes, category)
5215 unsigned char *source;
5216 EMACS_INT src_bytes;
5217 enum coding_category category;
5218 {
5219 unsigned char *src = source, *src_end = src + src_bytes;
5220 unsigned char c;
5221 int total = 0;
5222 int eol_seen = EOL_SEEN_NONE;
5223
5224 if ((1 << category) & CATEGORY_MASK_UTF_16)
5225 {
5226 int msb, lsb;
5227
5228 msb = category == (coding_category_utf_16_le
5229 | coding_category_utf_16_le_nosig);
5230 lsb = 1 - msb;
5231
5232 while (src + 1 < src_end)
5233 {
5234 c = src[lsb];
5235 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5236 {
5237 int this_eol;
5238
5239 if (c == '\n')
5240 this_eol = EOL_SEEN_LF;
5241 else if (src + 3 >= src_end
5242 || src[msb + 2] != 0
5243 || src[lsb + 2] != '\n')
5244 this_eol = EOL_SEEN_CR;
5245 else
5246 this_eol = EOL_SEEN_CRLF;
5247
5248 if (eol_seen == EOL_SEEN_NONE)
5249 /* This is the first end-of-line. */
5250 eol_seen = this_eol;
5251 else if (eol_seen != this_eol)
5252 {
5253 /* The found type is different from what found before. */
5254 eol_seen = EOL_SEEN_LF;
5255 break;
5256 }
5257 if (++total == MAX_EOL_CHECK_COUNT)
5258 break;
5259 }
5260 src += 2;
5261 }
5262 }
5263 else
5264 {
5265 while (src < src_end)
5266 {
5267 c = *src++;
5268 if (c == '\n' || c == '\r')
5269 {
5270 int this_eol;
5271
5272 if (c == '\n')
5273 this_eol = EOL_SEEN_LF;
5274 else if (src >= src_end || *src != '\n')
5275 this_eol = EOL_SEEN_CR;
5276 else
5277 this_eol = EOL_SEEN_CRLF, src++;
5278
5279 if (eol_seen == EOL_SEEN_NONE)
5280 /* This is the first end-of-line. */
5281 eol_seen = this_eol;
5282 else if (eol_seen != this_eol)
5283 {
5284 /* The found type is different from what found before. */
5285 eol_seen = EOL_SEEN_LF;
5286 break;
5287 }
5288 if (++total == MAX_EOL_CHECK_COUNT)
5289 break;
5290 }
5291 }
5292 }
5293 return eol_seen;
5294 }
5295
5296
5297 static Lisp_Object
5298 adjust_coding_eol_type (coding, eol_seen)
5299 struct coding_system *coding;
5300 int eol_seen;
5301 {
5302 Lisp_Object eol_type;
5303
5304 eol_type = CODING_ID_EOL_TYPE (coding->id);
5305 if (eol_seen & EOL_SEEN_LF)
5306 {
5307 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5308 eol_type = Qunix;
5309 }
5310 else if (eol_seen & EOL_SEEN_CRLF)
5311 {
5312 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5313 eol_type = Qdos;
5314 }
5315 else if (eol_seen & EOL_SEEN_CR)
5316 {
5317 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5318 eol_type = Qmac;
5319 }
5320 return eol_type;
5321 }
5322
5323 /* Detect how a text specified in CODING is encoded. If a coding
5324 system is detected, update fields of CODING by the detected coding
5325 system. */
5326
5327 void
5328 detect_coding (coding)
5329 struct coding_system *coding;
5330 {
5331 const unsigned char *src, *src_end;
5332 Lisp_Object attrs, coding_type;
5333
5334 coding->consumed = coding->consumed_char = 0;
5335 coding->produced = coding->produced_char = 0;
5336 coding_set_source (coding);
5337
5338 src_end = coding->source + coding->src_bytes;
5339
5340 /* If we have not yet decided the text encoding type, detect it
5341 now. */
5342 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5343 {
5344 int c, i;
5345
5346 for (i = 0, src = coding->source; src < src_end; i++, src++)
5347 {
5348 c = *src;
5349 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5350 || c == ISO_CODE_SI
5351 || c == ISO_CODE_SO)))
5352 break;
5353 }
5354 coding->head_ascii = src - (coding->source + coding->consumed);
5355
5356 if (coding->head_ascii < coding->src_bytes)
5357 {
5358 struct coding_detection_info detect_info;
5359 enum coding_category category;
5360 struct coding_system *this;
5361
5362 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5363 for (i = 0; i < coding_category_raw_text; i++)
5364 {
5365 category = coding_priorities[i];
5366 this = coding_categories + category;
5367 if (this->id < 0)
5368 {
5369 /* No coding system of this category is defined. */
5370 detect_info.rejected |= (1 << category);
5371 }
5372 else if (category >= coding_category_raw_text)
5373 continue;
5374 else if (detect_info.checked & (1 << category))
5375 {
5376 if (detect_info.found & (1 << category))
5377 break;
5378 }
5379 else if ((*(this->detector)) (coding, &detect_info)
5380 && detect_info.found & (1 << category))
5381 {
5382 if (category == coding_category_utf_16_auto)
5383 {
5384 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5385 category = coding_category_utf_16_le;
5386 else
5387 category = coding_category_utf_16_be;
5388 }
5389 break;
5390 }
5391 }
5392 if (i < coding_category_raw_text)
5393 setup_coding_system (CODING_ID_NAME (this->id), coding);
5394 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5395 setup_coding_system (Qraw_text, coding);
5396 else if (detect_info.rejected)
5397 for (i = 0; i < coding_category_raw_text; i++)
5398 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5399 {
5400 this = coding_categories + coding_priorities[i];
5401 setup_coding_system (CODING_ID_NAME (this->id), coding);
5402 break;
5403 }
5404 }
5405 }
5406 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5407 == coding_category_utf_16_auto)
5408 {
5409 Lisp_Object coding_systems;
5410 struct coding_detection_info detect_info;
5411
5412 coding_systems
5413 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5414 detect_info.found = detect_info.rejected = 0;
5415 if (CONSP (coding_systems)
5416 && detect_coding_utf_16 (coding, &detect_info))
5417 {
5418 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5419 setup_coding_system (XCAR (coding_systems), coding);
5420 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5421 setup_coding_system (XCDR (coding_systems), coding);
5422 }
5423 }
5424 }
5425
5426
5427 static void
5428 decode_eol (coding)
5429 struct coding_system *coding;
5430 {
5431 Lisp_Object eol_type;
5432 unsigned char *p, *pbeg, *pend;
5433
5434 eol_type = CODING_ID_EOL_TYPE (coding->id);
5435 if (EQ (eol_type, Qunix))
5436 return;
5437
5438 if (NILP (coding->dst_object))
5439 pbeg = coding->destination;
5440 else
5441 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5442 pend = pbeg + coding->produced;
5443
5444 if (VECTORP (eol_type))
5445 {
5446 int eol_seen = EOL_SEEN_NONE;
5447
5448 for (p = pbeg; p < pend; p++)
5449 {
5450 if (*p == '\n')
5451 eol_seen |= EOL_SEEN_LF;
5452 else if (*p == '\r')
5453 {
5454 if (p + 1 < pend && *(p + 1) == '\n')
5455 {
5456 eol_seen |= EOL_SEEN_CRLF;
5457 p++;
5458 }
5459 else
5460 eol_seen |= EOL_SEEN_CR;
5461 }
5462 }
5463 if (eol_seen != EOL_SEEN_NONE
5464 && eol_seen != EOL_SEEN_LF
5465 && eol_seen != EOL_SEEN_CRLF
5466 && eol_seen != EOL_SEEN_CR)
5467 eol_seen = EOL_SEEN_LF;
5468 if (eol_seen != EOL_SEEN_NONE)
5469 eol_type = adjust_coding_eol_type (coding, eol_seen);
5470 }
5471
5472 if (EQ (eol_type, Qmac))
5473 {
5474 for (p = pbeg; p < pend; p++)
5475 if (*p == '\r')
5476 *p = '\n';
5477 }
5478 else if (EQ (eol_type, Qdos))
5479 {
5480 int n = 0;
5481
5482 if (NILP (coding->dst_object))
5483 {
5484 for (p = pend - 2; p >= pbeg; p--)
5485 if (*p == '\r')
5486 {
5487 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5488 n++;
5489 }
5490 }
5491 else
5492 {
5493 for (p = pend - 2; p >= pbeg; p--)
5494 if (*p == '\r')
5495 {
5496 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5497 int pos = BYTE_TO_CHAR (pos_byte);
5498
5499 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5500 n++;
5501 }
5502 }
5503 coding->produced -= n;
5504 coding->produced_char -= n;
5505 }
5506 }
5507
5508
5509 /* Return a translation table (or list of them) from coding system
5510 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5511 decoding (ENCODEP is zero). */
5512
5513 static INLINE
5514 get_translation_table (attrs, encodep)
5515 {
5516 Lisp_Object standard, translation_table;
5517
5518 if (encodep)
5519 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5520 standard = Vstandard_translation_table_for_encode;
5521 else
5522 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5523 standard = Vstandard_translation_table_for_decode;
5524 if (NILP (translation_table))
5525 return standard;
5526 if (SYMBOLP (translation_table))
5527 translation_table = Fget (translation_table, Qtranslation_table);
5528 else if (CONSP (translation_table))
5529 {
5530 Lisp_Object val;
5531
5532 translation_table = Fcopy_sequence (translation_table);
5533 for (val = translation_table; CONSP (val); val = XCDR (val))
5534 if (SYMBOLP (XCAR (val)))
5535 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5536 }
5537 if (! NILP (standard))
5538 {
5539 if (CONSP (translation_table))
5540 translation_table = nconc2 (translation_table, Fcons (standard, Qnil));
5541 else
5542 translation_table = Fcons (translation_table, Fcons (standard, Qnil));
5543 }
5544 return translation_table;
5545 }
5546
5547
5548 static void
5549 translate_chars (coding, table)
5550 struct coding_system *coding;
5551 Lisp_Object table;
5552 {
5553 int *charbuf = coding->charbuf;
5554 int *charbuf_end = charbuf + coding->charbuf_used;
5555 int c;
5556
5557 if (coding->chars_at_source)
5558 return;
5559
5560 while (charbuf < charbuf_end)
5561 {
5562 c = *charbuf;
5563 if (c < 0)
5564 charbuf += -c;
5565 else
5566 *charbuf++ = translate_char (table, c);
5567 }
5568 }
5569
5570 static int
5571 produce_chars (coding)
5572 struct coding_system *coding;
5573 {
5574 unsigned char *dst = coding->destination + coding->produced;
5575 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5576 int produced;
5577 int produced_chars = 0;
5578
5579 if (! coding->chars_at_source)
5580 {
5581 /* Characters are in coding->charbuf. */
5582 int *buf = coding->charbuf;
5583 int *buf_end = buf + coding->charbuf_used;
5584 unsigned char *adjusted_dst_end;
5585
5586 if (BUFFERP (coding->src_object)
5587 && EQ (coding->src_object, coding->dst_object))
5588 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5589 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5590
5591 while (buf < buf_end)
5592 {
5593 int c = *buf++;
5594
5595 if (dst >= adjusted_dst_end)
5596 {
5597 dst = alloc_destination (coding,
5598 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5599 dst);
5600 dst_end = coding->destination + coding->dst_bytes;
5601 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5602 }
5603 if (c >= 0)
5604 {
5605 if (coding->dst_multibyte
5606 || ! CHAR_BYTE8_P (c))
5607 CHAR_STRING_ADVANCE (c, dst);
5608 else
5609 *dst++ = CHAR_TO_BYTE8 (c);
5610 produced_chars++;
5611 }
5612 else
5613 /* This is an annotation datum. (-C) is the length of
5614 it. */
5615 buf += -c - 1;
5616 }
5617 }
5618 else
5619 {
5620 const unsigned char *src = coding->source;
5621 const unsigned char *src_end = src + coding->src_bytes;
5622 Lisp_Object eol_type;
5623
5624 eol_type = CODING_ID_EOL_TYPE (coding->id);
5625
5626 if (coding->src_multibyte != coding->dst_multibyte)
5627 {
5628 if (coding->src_multibyte)
5629 {
5630 int multibytep = 1;
5631 int consumed_chars;
5632
5633 while (1)
5634 {
5635 const unsigned char *src_base = src;
5636 int c;
5637
5638 ONE_MORE_BYTE (c);
5639 if (c == '\r')
5640 {
5641 if (EQ (eol_type, Qdos))
5642 {
5643 if (src == src_end)
5644 {
5645 record_conversion_result
5646 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5647 goto no_more_source;
5648 }
5649 if (*src == '\n')
5650 c = *src++;
5651 }
5652 else if (EQ (eol_type, Qmac))
5653 c = '\n';
5654 }
5655 if (dst == dst_end)
5656 {
5657 coding->consumed = src - coding->source;
5658
5659 if (EQ (coding->src_object, coding->dst_object))
5660 dst_end = (unsigned char *) src;
5661 if (dst == dst_end)
5662 {
5663 dst = alloc_destination (coding, src_end - src + 1,
5664 dst);
5665 dst_end = coding->destination + coding->dst_bytes;
5666 coding_set_source (coding);
5667 src = coding->source + coding->consumed;
5668 src_end = coding->source + coding->src_bytes;
5669 }
5670 }
5671 *dst++ = c;
5672 produced_chars++;
5673 }
5674 no_more_source:
5675 ;
5676 }
5677 else
5678 while (src < src_end)
5679 {
5680 int multibytep = 1;
5681 int c = *src++;
5682
5683 if (c == '\r')
5684 {
5685 if (EQ (eol_type, Qdos))
5686 {
5687 if (src < src_end
5688 && *src == '\n')
5689 c = *src++;
5690 }
5691 else if (EQ (eol_type, Qmac))
5692 c = '\n';
5693 }
5694 if (dst >= dst_end - 1)
5695 {
5696 coding->consumed = src - coding->source;
5697
5698 if (EQ (coding->src_object, coding->dst_object))
5699 dst_end = (unsigned char *) src;
5700 if (dst >= dst_end - 1)
5701 {
5702 dst = alloc_destination (coding, src_end - src + 2,
5703 dst);
5704 dst_end = coding->destination + coding->dst_bytes;
5705 coding_set_source (coding);
5706 src = coding->source + coding->consumed;
5707 src_end = coding->source + coding->src_bytes;
5708 }
5709 }
5710 EMIT_ONE_BYTE (c);
5711 }
5712 }
5713 else
5714 {
5715 if (!EQ (coding->src_object, coding->dst_object))
5716 {
5717 int require = coding->src_bytes - coding->dst_bytes;
5718
5719 if (require > 0)
5720 {
5721 EMACS_INT offset = src - coding->source;
5722
5723 dst = alloc_destination (coding, require, dst);
5724 coding_set_source (coding);
5725 src = coding->source + offset;
5726 src_end = coding->source + coding->src_bytes;
5727 }
5728 }
5729 produced_chars = coding->src_chars;
5730 while (src < src_end)
5731 {
5732 int c = *src++;
5733
5734 if (c == '\r')
5735 {
5736 if (EQ (eol_type, Qdos))
5737 {
5738 if (src < src_end
5739 && *src == '\n')
5740 c = *src++;
5741 produced_chars--;
5742 }
5743 else if (EQ (eol_type, Qmac))
5744 c = '\n';
5745 }
5746 *dst++ = c;
5747 }
5748 }
5749 coding->consumed = coding->src_bytes;
5750 coding->consumed_char = coding->src_chars;
5751 }
5752
5753 produced = dst - (coding->destination + coding->produced);
5754 if (BUFFERP (coding->dst_object))
5755 insert_from_gap (produced_chars, produced);
5756 coding->produced += produced;
5757 coding->produced_char += produced_chars;
5758 return produced_chars;
5759 }
5760
5761 /* Compose text in CODING->object according to the annotation data at
5762 CHARBUF. CHARBUF is an array:
5763 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5764 */
5765
5766 static INLINE void
5767 produce_composition (coding, charbuf)
5768 struct coding_system *coding;
5769 int *charbuf;
5770 {
5771 int len;
5772 EMACS_INT from, to;
5773 enum composition_method method;
5774 Lisp_Object components;
5775
5776 len = -charbuf[0];
5777 from = coding->dst_pos + charbuf[2];
5778 to = coding->dst_pos + charbuf[3];
5779 method = (enum composition_method) (charbuf[4]);
5780
5781 if (method == COMPOSITION_RELATIVE)
5782 components = Qnil;
5783 else
5784 {
5785 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5786 int i;
5787
5788 len -= 5;
5789 charbuf += 5;
5790 for (i = 0; i < len; i++)
5791 args[i] = make_number (charbuf[i]);
5792 components = (method == COMPOSITION_WITH_ALTCHARS
5793 ? Fstring (len, args) : Fvector (len, args));
5794 }
5795 compose_text (from, to, components, Qnil, coding->dst_object);
5796 }
5797
5798
5799 /* Put `charset' property on text in CODING->object according to
5800 the annotation data at CHARBUF. CHARBUF is an array:
5801 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5802 */
5803
5804 static INLINE void
5805 produce_charset (coding, charbuf)
5806 struct coding_system *coding;
5807 int *charbuf;
5808 {
5809 EMACS_INT from = coding->dst_pos + charbuf[2];
5810 EMACS_INT to = coding->dst_pos + charbuf[3];
5811 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5812
5813 Fput_text_property (make_number (from), make_number (to),
5814 Qcharset, CHARSET_NAME (charset),
5815 coding->dst_object);
5816 }
5817
5818
5819 #define CHARBUF_SIZE 0x4000
5820
5821 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5822 do { \
5823 int size = CHARBUF_SIZE;; \
5824 \
5825 coding->charbuf = NULL; \
5826 while (size > 1024) \
5827 { \
5828 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5829 if (coding->charbuf) \
5830 break; \
5831 size >>= 1; \
5832 } \
5833 if (! coding->charbuf) \
5834 { \
5835 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5836 return coding->result; \
5837 } \
5838 coding->charbuf_size = size; \
5839 } while (0)
5840
5841
5842 static void
5843 produce_annotation (coding)
5844 struct coding_system *coding;
5845 {
5846 int *charbuf = coding->charbuf;
5847 int *charbuf_end = charbuf + coding->charbuf_used;
5848
5849 if (NILP (coding->dst_object))
5850 return;
5851
5852 while (charbuf < charbuf_end)
5853 {
5854 if (*charbuf >= 0)
5855 charbuf++;
5856 else
5857 {
5858 int len = -*charbuf;
5859 switch (charbuf[1])
5860 {
5861 case CODING_ANNOTATE_COMPOSITION_MASK:
5862 produce_composition (coding, charbuf);
5863 break;
5864 case CODING_ANNOTATE_CHARSET_MASK:
5865 produce_charset (coding, charbuf);
5866 break;
5867 default:
5868 abort ();
5869 }
5870 charbuf += len;
5871 }
5872 }
5873 }
5874
5875 /* Decode the data at CODING->src_object into CODING->dst_object.
5876 CODING->src_object is a buffer, a string, or nil.
5877 CODING->dst_object is a buffer.
5878
5879 If CODING->src_object is a buffer, it must be the current buffer.
5880 In this case, if CODING->src_pos is positive, it is a position of
5881 the source text in the buffer, otherwise, the source text is in the
5882 gap area of the buffer, and CODING->src_pos specifies the offset of
5883 the text from GPT (which must be the same as PT). If this is the
5884 same buffer as CODING->dst_object, CODING->src_pos must be
5885 negative.
5886
5887 If CODING->src_object is a string, CODING->src_pos in an index to
5888 that string.
5889
5890 If CODING->src_object is nil, CODING->source must already point to
5891 the non-relocatable memory area. In this case, CODING->src_pos is
5892 an offset from CODING->source.
5893
5894 The decoded data is inserted at the current point of the buffer
5895 CODING->dst_object.
5896 */
5897
5898 static int
5899 decode_coding (coding)
5900 struct coding_system *coding;
5901 {
5902 Lisp_Object attrs;
5903 Lisp_Object undo_list;
5904 Lisp_Object translation_table;
5905
5906 if (BUFFERP (coding->src_object)
5907 && coding->src_pos > 0
5908 && coding->src_pos < GPT
5909 && coding->src_pos + coding->src_chars > GPT)
5910 move_gap_both (coding->src_pos, coding->src_pos_byte);
5911
5912 undo_list = Qt;
5913 if (BUFFERP (coding->dst_object))
5914 {
5915 if (current_buffer != XBUFFER (coding->dst_object))
5916 set_buffer_internal (XBUFFER (coding->dst_object));
5917 if (GPT != PT)
5918 move_gap_both (PT, PT_BYTE);
5919 undo_list = current_buffer->undo_list;
5920 current_buffer->undo_list = Qt;
5921 }
5922
5923 coding->consumed = coding->consumed_char = 0;
5924 coding->produced = coding->produced_char = 0;
5925 coding->chars_at_source = 0;
5926 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5927 coding->errors = 0;
5928
5929 ALLOC_CONVERSION_WORK_AREA (coding);
5930
5931 attrs = CODING_ID_ATTRS (coding->id);
5932 translation_table = get_translation_table (attrs, 0);
5933
5934 do
5935 {
5936 coding_set_source (coding);
5937 coding->annotated = 0;
5938 (*(coding->decoder)) (coding);
5939 if (!NILP (translation_table))
5940 translate_chars (coding, translation_table);
5941 coding_set_destination (coding);
5942 produce_chars (coding);
5943 if (coding->annotated)
5944 produce_annotation (coding);
5945 }
5946 while (coding->consumed < coding->src_bytes
5947 && ! coding->result);
5948
5949 coding->carryover_bytes = 0;
5950 if (coding->consumed < coding->src_bytes)
5951 {
5952 int nbytes = coding->src_bytes - coding->consumed;
5953 const unsigned char *src;
5954
5955 coding_set_source (coding);
5956 coding_set_destination (coding);
5957 src = coding->source + coding->consumed;
5958
5959 if (coding->mode & CODING_MODE_LAST_BLOCK)
5960 {
5961 /* Flush out unprocessed data as binary chars. We are sure
5962 that the number of data is less than the size of
5963 coding->charbuf. */
5964 coding->charbuf_used = 0;
5965 while (nbytes-- > 0)
5966 {
5967 int c = *src++;
5968
5969 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5970 }
5971 produce_chars (coding);
5972 }
5973 else
5974 {
5975 /* Record unprocessed bytes in coding->carryover. We are
5976 sure that the number of data is less than the size of
5977 coding->carryover. */
5978 unsigned char *p = coding->carryover;
5979
5980 coding->carryover_bytes = nbytes;
5981 while (nbytes-- > 0)
5982 *p++ = *src++;
5983 }
5984 coding->consumed = coding->src_bytes;
5985 }
5986
5987 if (BUFFERP (coding->dst_object))
5988 {
5989 current_buffer->undo_list = undo_list;
5990 record_insert (coding->dst_pos, coding->produced_char);
5991 }
5992 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5993 decode_eol (coding);
5994 return coding->result;
5995 }
5996
5997
5998 /* Extract an annotation datum from a composition starting at POS and
5999 ending before LIMIT of CODING->src_object (buffer or string), store
6000 the data in BUF, set *STOP to a starting position of the next
6001 composition (if any) or to LIMIT, and return the address of the
6002 next element of BUF.
6003
6004 If such an annotation is not found, set *STOP to a starting
6005 position of a composition after POS (if any) or to LIMIT, and
6006 return BUF. */
6007
6008 static INLINE int *
6009 handle_composition_annotation (pos, limit, coding, buf, stop)
6010 EMACS_INT pos, limit;
6011 struct coding_system *coding;
6012 int *buf;
6013 EMACS_INT *stop;
6014 {
6015 EMACS_INT start, end;
6016 Lisp_Object prop;
6017
6018 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6019 || end > limit)
6020 *stop = limit;
6021 else if (start > pos)
6022 *stop = start;
6023 else
6024 {
6025 if (start == pos)
6026 {
6027 /* We found a composition. Store the corresponding
6028 annotation data in BUF. */
6029 int *head = buf;
6030 enum composition_method method = COMPOSITION_METHOD (prop);
6031 int nchars = COMPOSITION_LENGTH (prop);
6032
6033 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
6034 if (method != COMPOSITION_RELATIVE)
6035 {
6036 Lisp_Object components;
6037 int len, i, i_byte;
6038
6039 components = COMPOSITION_COMPONENTS (prop);
6040 if (VECTORP (components))
6041 {
6042 len = XVECTOR (components)->size;
6043 for (i = 0; i < len; i++)
6044 *buf++ = XINT (AREF (components, i));
6045 }
6046 else if (STRINGP (components))
6047 {
6048 len = SCHARS (components);
6049 i = i_byte = 0;
6050 while (i < len)
6051 {
6052 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6053 buf++;
6054 }
6055 }
6056 else if (INTEGERP (components))
6057 {
6058 len = 1;
6059 *buf++ = XINT (components);
6060 }
6061 else if (CONSP (components))
6062 {
6063 for (len = 0; CONSP (components);
6064 len++, components = XCDR (components))
6065 *buf++ = XINT (XCAR (components));
6066 }
6067 else
6068 abort ();
6069 *head -= len;
6070 }
6071 }
6072
6073 if (find_composition (end, limit, &start, &end, &prop,
6074 coding->src_object)
6075 && end <= limit)
6076 *stop = start;
6077 else
6078 *stop = limit;
6079 }
6080 return buf;
6081 }
6082
6083
6084 /* Extract an annotation datum from a text property `charset' at POS of
6085 CODING->src_object (buffer of string), store the data in BUF, set
6086 *STOP to the position where the value of `charset' property changes
6087 (limiting by LIMIT), and return the address of the next element of
6088 BUF.
6089
6090 If the property value is nil, set *STOP to the position where the
6091 property value is non-nil (limiting by LIMIT), and return BUF. */
6092
6093 static INLINE int *
6094 handle_charset_annotation (pos, limit, coding, buf, stop)
6095 EMACS_INT pos, limit;
6096 struct coding_system *coding;
6097 int *buf;
6098 EMACS_INT *stop;
6099 {
6100 Lisp_Object val, next;
6101 int id;
6102
6103 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6104 if (! NILP (val) && CHARSETP (val))
6105 id = XINT (CHARSET_SYMBOL_ID (val));
6106 else
6107 id = -1;
6108 ADD_CHARSET_DATA (buf, 0, 0, id);
6109 next = Fnext_single_property_change (make_number (pos), Qcharset,
6110 coding->src_object,
6111 make_number (limit));
6112 *stop = XINT (next);
6113 return buf;
6114 }
6115
6116
6117 static void
6118 consume_chars (coding)
6119 struct coding_system *coding;
6120 {
6121 int *buf = coding->charbuf;
6122 int *buf_end = coding->charbuf + coding->charbuf_size;
6123 const unsigned char *src = coding->source + coding->consumed;
6124 const unsigned char *src_end = coding->source + coding->src_bytes;
6125 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6126 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6127 int multibytep = coding->src_multibyte;
6128 Lisp_Object eol_type;
6129 int c;
6130 EMACS_INT stop, stop_composition, stop_charset;
6131
6132 eol_type = CODING_ID_EOL_TYPE (coding->id);
6133 if (VECTORP (eol_type))
6134 eol_type = Qunix;
6135
6136 /* Note: composition handling is not yet implemented. */
6137 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6138
6139 if (NILP (coding->src_object))
6140 stop = stop_composition = stop_charset = end_pos;
6141 else
6142 {
6143 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6144 stop = stop_composition = pos;
6145 else
6146 stop = stop_composition = end_pos;
6147 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6148 stop = stop_charset = pos;
6149 else
6150 stop_charset = end_pos;
6151 }
6152
6153 /* Compensate for CRLF and conversion. */
6154 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6155 while (buf < buf_end)
6156 {
6157 if (pos == stop)
6158 {
6159 if (pos == end_pos)
6160 break;
6161 if (pos == stop_composition)
6162 buf = handle_composition_annotation (pos, end_pos, coding,
6163 buf, &stop_composition);
6164 if (pos == stop_charset)
6165 buf = handle_charset_annotation (pos, end_pos, coding,
6166 buf, &stop_charset);
6167 stop = (stop_composition < stop_charset
6168 ? stop_composition : stop_charset);
6169 }
6170
6171 if (! multibytep)
6172 {
6173 EMACS_INT bytes;
6174
6175 if (! CODING_FOR_UNIBYTE (coding)
6176 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6177 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6178 else
6179 c = *src++, pos++;
6180 }
6181 else
6182 c = STRING_CHAR_ADVANCE (src), pos++;
6183 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6184 c = '\n';
6185 if (! EQ (eol_type, Qunix))
6186 {
6187 if (c == '\n')
6188 {
6189 if (EQ (eol_type, Qdos))
6190 *buf++ = '\r';
6191 else
6192 c = '\r';
6193 }
6194 }
6195 *buf++ = c;
6196 }
6197
6198 coding->consumed = src - coding->source;
6199 coding->consumed_char = pos - coding->src_pos;
6200 coding->charbuf_used = buf - coding->charbuf;
6201 coding->chars_at_source = 0;
6202 }
6203
6204
6205 /* Encode the text at CODING->src_object into CODING->dst_object.
6206 CODING->src_object is a buffer or a string.
6207 CODING->dst_object is a buffer or nil.
6208
6209 If CODING->src_object is a buffer, it must be the current buffer.
6210 In this case, if CODING->src_pos is positive, it is a position of
6211 the source text in the buffer, otherwise. the source text is in the
6212 gap area of the buffer, and coding->src_pos specifies the offset of
6213 the text from GPT (which must be the same as PT). If this is the
6214 same buffer as CODING->dst_object, CODING->src_pos must be
6215 negative and CODING should not have `pre-write-conversion'.
6216
6217 If CODING->src_object is a string, CODING should not have
6218 `pre-write-conversion'.
6219
6220 If CODING->dst_object is a buffer, the encoded data is inserted at
6221 the current point of that buffer.
6222
6223 If CODING->dst_object is nil, the encoded data is placed at the
6224 memory area specified by CODING->destination. */
6225
6226 static int
6227 encode_coding (coding)
6228 struct coding_system *coding;
6229 {
6230 Lisp_Object attrs;
6231 Lisp_Object translation_table;
6232
6233 attrs = CODING_ID_ATTRS (coding->id);
6234 translation_table = get_translation_table (attrs, 1);
6235
6236 if (BUFFERP (coding->dst_object))
6237 {
6238 set_buffer_internal (XBUFFER (coding->dst_object));
6239 coding->dst_multibyte
6240 = ! NILP (current_buffer->enable_multibyte_characters);
6241 }
6242
6243 coding->consumed = coding->consumed_char = 0;
6244 coding->produced = coding->produced_char = 0;
6245 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6246 coding->errors = 0;
6247
6248 ALLOC_CONVERSION_WORK_AREA (coding);
6249
6250 do {
6251 coding_set_source (coding);
6252 consume_chars (coding);
6253
6254 if (!NILP (translation_table))
6255 translate_chars (coding, translation_table);
6256
6257 coding_set_destination (coding);
6258 (*(coding->encoder)) (coding);
6259 } while (coding->consumed_char < coding->src_chars);
6260
6261 if (BUFFERP (coding->dst_object))
6262 insert_from_gap (coding->produced_char, coding->produced);
6263
6264 return (coding->result);
6265 }
6266
6267
6268 /* Name (or base name) of work buffer for code conversion. */
6269 static Lisp_Object Vcode_conversion_workbuf_name;
6270
6271 /* A working buffer used by the top level conversion. Once it is
6272 created, it is never destroyed. It has the name
6273 Vcode_conversion_workbuf_name. The other working buffers are
6274 destroyed after the use is finished, and their names are modified
6275 versions of Vcode_conversion_workbuf_name. */
6276 static Lisp_Object Vcode_conversion_reused_workbuf;
6277
6278 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6279 static int reused_workbuf_in_use;
6280
6281
6282 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6283 multibyteness of returning buffer. */
6284
6285 Lisp_Object
6286 make_conversion_work_buffer (multibyte)
6287 {
6288 Lisp_Object name, workbuf;
6289 struct buffer *current;
6290
6291 if (reused_workbuf_in_use++)
6292 {
6293 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6294 workbuf = Fget_buffer_create (name);
6295 }
6296 else
6297 {
6298 name = Vcode_conversion_workbuf_name;
6299 workbuf = Fget_buffer_create (name);
6300 if (NILP (Vcode_conversion_reused_workbuf))
6301 Vcode_conversion_reused_workbuf = workbuf;
6302 }
6303 current = current_buffer;
6304 set_buffer_internal (XBUFFER (workbuf));
6305 Ferase_buffer ();
6306 current_buffer->undo_list = Qt;
6307 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6308 set_buffer_internal (current);
6309 return workbuf;
6310 }
6311
6312
6313 static Lisp_Object
6314 code_conversion_restore (arg)
6315 Lisp_Object arg;
6316 {
6317 Lisp_Object current, workbuf;
6318
6319 current = XCAR (arg);
6320 workbuf = XCDR (arg);
6321 if (! NILP (workbuf))
6322 {
6323 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6324 reused_workbuf_in_use = 0;
6325 else if (! NILP (Fbuffer_live_p (workbuf)))
6326 Fkill_buffer (workbuf);
6327 }
6328 set_buffer_internal (XBUFFER (current));
6329 return Qnil;
6330 }
6331
6332 Lisp_Object
6333 code_conversion_save (with_work_buf, multibyte)
6334 int with_work_buf, multibyte;
6335 {
6336 Lisp_Object workbuf = Qnil;
6337
6338 if (with_work_buf)
6339 workbuf = make_conversion_work_buffer (multibyte);
6340 record_unwind_protect (code_conversion_restore,
6341 Fcons (Fcurrent_buffer (), workbuf));
6342 return workbuf;
6343 }
6344
6345 int
6346 decode_coding_gap (coding, chars, bytes)
6347 struct coding_system *coding;
6348 EMACS_INT chars, bytes;
6349 {
6350 int count = specpdl_ptr - specpdl;
6351 Lisp_Object attrs;
6352
6353 code_conversion_save (0, 0);
6354
6355 coding->src_object = Fcurrent_buffer ();
6356 coding->src_chars = chars;
6357 coding->src_bytes = bytes;
6358 coding->src_pos = -chars;
6359 coding->src_pos_byte = -bytes;
6360 coding->src_multibyte = chars < bytes;
6361 coding->dst_object = coding->src_object;
6362 coding->dst_pos = PT;
6363 coding->dst_pos_byte = PT_BYTE;
6364 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6365 coding->mode |= CODING_MODE_LAST_BLOCK;
6366
6367 if (CODING_REQUIRE_DETECTION (coding))
6368 detect_coding (coding);
6369
6370 decode_coding (coding);
6371
6372 attrs = CODING_ID_ATTRS (coding->id);
6373 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6374 {
6375 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6376 Lisp_Object val;
6377
6378 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6379 val = call1 (CODING_ATTR_POST_READ (attrs),
6380 make_number (coding->produced_char));
6381 CHECK_NATNUM (val);
6382 coding->produced_char += Z - prev_Z;
6383 coding->produced += Z_BYTE - prev_Z_BYTE;
6384 }
6385
6386 unbind_to (count, Qnil);
6387 return coding->result;
6388 }
6389
6390 int
6391 encode_coding_gap (coding, chars, bytes)
6392 struct coding_system *coding;
6393 EMACS_INT chars, bytes;
6394 {
6395 int count = specpdl_ptr - specpdl;
6396
6397 code_conversion_save (0, 0);
6398
6399 coding->src_object = Fcurrent_buffer ();
6400 coding->src_chars = chars;
6401 coding->src_bytes = bytes;
6402 coding->src_pos = -chars;
6403 coding->src_pos_byte = -bytes;
6404 coding->src_multibyte = chars < bytes;
6405 coding->dst_object = coding->src_object;
6406 coding->dst_pos = PT;
6407 coding->dst_pos_byte = PT_BYTE;
6408
6409 encode_coding (coding);
6410
6411 unbind_to (count, Qnil);
6412 return coding->result;
6413 }
6414
6415
6416 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6417 SRC_OBJECT into DST_OBJECT by coding context CODING.
6418
6419 SRC_OBJECT is a buffer, a string, or Qnil.
6420
6421 If it is a buffer, the text is at point of the buffer. FROM and TO
6422 are positions in the buffer.
6423
6424 If it is a string, the text is at the beginning of the string.
6425 FROM and TO are indices to the string.
6426
6427 If it is nil, the text is at coding->source. FROM and TO are
6428 indices to coding->source.
6429
6430 DST_OBJECT is a buffer, Qt, or Qnil.
6431
6432 If it is a buffer, the decoded text is inserted at point of the
6433 buffer. If the buffer is the same as SRC_OBJECT, the source text
6434 is deleted.
6435
6436 If it is Qt, a string is made from the decoded text, and
6437 set in CODING->dst_object.
6438
6439 If it is Qnil, the decoded text is stored at CODING->destination.
6440 The caller must allocate CODING->dst_bytes bytes at
6441 CODING->destination by xmalloc. If the decoded text is longer than
6442 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6443 */
6444
6445 void
6446 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6447 dst_object)
6448 struct coding_system *coding;
6449 Lisp_Object src_object;
6450 EMACS_INT from, from_byte, to, to_byte;
6451 Lisp_Object dst_object;
6452 {
6453 int count = specpdl_ptr - specpdl;
6454 unsigned char *destination;
6455 EMACS_INT dst_bytes;
6456 EMACS_INT chars = to - from;
6457 EMACS_INT bytes = to_byte - from_byte;
6458 Lisp_Object attrs;
6459 Lisp_Object buffer;
6460 int saved_pt = -1, saved_pt_byte;
6461
6462 buffer = Fcurrent_buffer ();
6463
6464 if (NILP (dst_object))
6465 {
6466 destination = coding->destination;
6467 dst_bytes = coding->dst_bytes;
6468 }
6469
6470 coding->src_object = src_object;
6471 coding->src_chars = chars;
6472 coding->src_bytes = bytes;
6473 coding->src_multibyte = chars < bytes;
6474
6475 if (STRINGP (src_object))
6476 {
6477 coding->src_pos = from;
6478 coding->src_pos_byte = from_byte;
6479 }
6480 else if (BUFFERP (src_object))
6481 {
6482 set_buffer_internal (XBUFFER (src_object));
6483 if (from != GPT)
6484 move_gap_both (from, from_byte);
6485 if (EQ (src_object, dst_object))
6486 {
6487 saved_pt = PT, saved_pt_byte = PT_BYTE;
6488 TEMP_SET_PT_BOTH (from, from_byte);
6489 del_range_both (from, from_byte, to, to_byte, 1);
6490 coding->src_pos = -chars;
6491 coding->src_pos_byte = -bytes;
6492 }
6493 else
6494 {
6495 coding->src_pos = from;
6496 coding->src_pos_byte = from_byte;
6497 }
6498 }
6499
6500 if (CODING_REQUIRE_DETECTION (coding))
6501 detect_coding (coding);
6502 attrs = CODING_ID_ATTRS (coding->id);
6503
6504 if (EQ (dst_object, Qt)
6505 || (! NILP (CODING_ATTR_POST_READ (attrs))
6506 && NILP (dst_object)))
6507 {
6508 coding->dst_object = code_conversion_save (1, 1);
6509 coding->dst_pos = BEG;
6510 coding->dst_pos_byte = BEG_BYTE;
6511 coding->dst_multibyte = 1;
6512 }
6513 else if (BUFFERP (dst_object))
6514 {
6515 code_conversion_save (0, 0);
6516 coding->dst_object = dst_object;
6517 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6518 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6519 coding->dst_multibyte
6520 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6521 }
6522 else
6523 {
6524 code_conversion_save (0, 0);
6525 coding->dst_object = Qnil;
6526 coding->dst_multibyte = 1;
6527 }
6528
6529 decode_coding (coding);
6530
6531 if (BUFFERP (coding->dst_object))
6532 set_buffer_internal (XBUFFER (coding->dst_object));
6533
6534 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6535 {
6536 struct gcpro gcpro1, gcpro2;
6537 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6538 Lisp_Object val;
6539
6540 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6541 GCPRO2 (coding->src_object, coding->dst_object);
6542 val = call1 (CODING_ATTR_POST_READ (attrs),
6543 make_number (coding->produced_char));
6544 UNGCPRO;
6545 CHECK_NATNUM (val);
6546 coding->produced_char += Z - prev_Z;
6547 coding->produced += Z_BYTE - prev_Z_BYTE;
6548 }
6549
6550 if (EQ (dst_object, Qt))
6551 {
6552 coding->dst_object = Fbuffer_string ();
6553 }
6554 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6555 {
6556 set_buffer_internal (XBUFFER (coding->dst_object));
6557 if (dst_bytes < coding->produced)
6558 {
6559 destination
6560 = (unsigned char *) xrealloc (destination, coding->produced);
6561 if (! destination)
6562 {
6563 record_conversion_result (coding,
6564 CODING_RESULT_INSUFFICIENT_DST);
6565 unbind_to (count, Qnil);
6566 return;
6567 }
6568 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6569 move_gap_both (BEGV, BEGV_BYTE);
6570 bcopy (BEGV_ADDR, destination, coding->produced);
6571 coding->destination = destination;
6572 }
6573 }
6574
6575 if (saved_pt >= 0)
6576 {
6577 /* This is the case of:
6578 (BUFFERP (src_object) && EQ (src_object, dst_object))
6579 As we have moved PT while replacing the original buffer
6580 contents, we must recover it now. */
6581 set_buffer_internal (XBUFFER (src_object));
6582 if (saved_pt < from)
6583 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6584 else if (saved_pt < from + chars)
6585 TEMP_SET_PT_BOTH (from, from_byte);
6586 else if (! NILP (current_buffer->enable_multibyte_characters))
6587 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6588 saved_pt_byte + (coding->produced - bytes));
6589 else
6590 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6591 saved_pt_byte + (coding->produced - bytes));
6592 }
6593
6594 unbind_to (count, coding->dst_object);
6595 }
6596
6597
6598 void
6599 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6600 dst_object)
6601 struct coding_system *coding;
6602 Lisp_Object src_object;
6603 EMACS_INT from, from_byte, to, to_byte;
6604 Lisp_Object dst_object;
6605 {
6606 int count = specpdl_ptr - specpdl;
6607 EMACS_INT chars = to - from;
6608 EMACS_INT bytes = to_byte - from_byte;
6609 Lisp_Object attrs;
6610 Lisp_Object buffer;
6611 int saved_pt = -1, saved_pt_byte;
6612
6613 buffer = Fcurrent_buffer ();
6614
6615 coding->src_object = src_object;
6616 coding->src_chars = chars;
6617 coding->src_bytes = bytes;
6618 coding->src_multibyte = chars < bytes;
6619
6620 attrs = CODING_ID_ATTRS (coding->id);
6621
6622 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6623 {
6624 coding->src_object = code_conversion_save (1, coding->src_multibyte);
6625 set_buffer_internal (XBUFFER (coding->src_object));
6626 if (STRINGP (src_object))
6627 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6628 else if (BUFFERP (src_object))
6629 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6630 else
6631 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6632
6633 if (EQ (src_object, dst_object))
6634 {
6635 set_buffer_internal (XBUFFER (src_object));
6636 saved_pt = PT, saved_pt_byte = PT_BYTE;
6637 del_range_both (from, from_byte, to, to_byte, 1);
6638 set_buffer_internal (XBUFFER (coding->src_object));
6639 }
6640
6641 call2 (CODING_ATTR_PRE_WRITE (attrs),
6642 make_number (BEG), make_number (Z));
6643 coding->src_object = Fcurrent_buffer ();
6644 if (BEG != GPT)
6645 move_gap_both (BEG, BEG_BYTE);
6646 coding->src_chars = Z - BEG;
6647 coding->src_bytes = Z_BYTE - BEG_BYTE;
6648 coding->src_pos = BEG;
6649 coding->src_pos_byte = BEG_BYTE;
6650 coding->src_multibyte = Z < Z_BYTE;
6651 }
6652 else if (STRINGP (src_object))
6653 {
6654 code_conversion_save (0, 0);
6655 coding->src_pos = from;
6656 coding->src_pos_byte = from_byte;
6657 }
6658 else if (BUFFERP (src_object))
6659 {
6660 code_conversion_save (0, 0);
6661 set_buffer_internal (XBUFFER (src_object));
6662 if (EQ (src_object, dst_object))
6663 {
6664 saved_pt = PT, saved_pt_byte = PT_BYTE;
6665 coding->src_object = del_range_1 (from, to, 1, 1);
6666 coding->src_pos = 0;
6667 coding->src_pos_byte = 0;
6668 }
6669 else
6670 {
6671 if (from < GPT && to >= GPT)
6672 move_gap_both (from, from_byte);
6673 coding->src_pos = from;
6674 coding->src_pos_byte = from_byte;
6675 }
6676 }
6677 else
6678 code_conversion_save (0, 0);
6679
6680 if (BUFFERP (dst_object))
6681 {
6682 coding->dst_object = dst_object;
6683 if (EQ (src_object, dst_object))
6684 {
6685 coding->dst_pos = from;
6686 coding->dst_pos_byte = from_byte;
6687 }
6688 else
6689 {
6690 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6691 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6692 }
6693 coding->dst_multibyte
6694 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6695 }
6696 else if (EQ (dst_object, Qt))
6697 {
6698 coding->dst_object = Qnil;
6699 coding->dst_bytes = coding->src_chars;
6700 if (coding->dst_bytes == 0)
6701 coding->dst_bytes = 1;
6702 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6703 coding->dst_multibyte = 0;
6704 }
6705 else
6706 {
6707 coding->dst_object = Qnil;
6708 coding->dst_multibyte = 0;
6709 }
6710
6711 encode_coding (coding);
6712
6713 if (EQ (dst_object, Qt))
6714 {
6715 if (BUFFERP (coding->dst_object))
6716 coding->dst_object = Fbuffer_string ();
6717 else
6718 {
6719 coding->dst_object
6720 = make_unibyte_string ((char *) coding->destination,
6721 coding->produced);
6722 xfree (coding->destination);
6723 }
6724 }
6725
6726 if (saved_pt >= 0)
6727 {
6728 /* This is the case of:
6729 (BUFFERP (src_object) && EQ (src_object, dst_object))
6730 As we have moved PT while replacing the original buffer
6731 contents, we must recover it now. */
6732 set_buffer_internal (XBUFFER (src_object));
6733 if (saved_pt < from)
6734 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6735 else if (saved_pt < from + chars)
6736 TEMP_SET_PT_BOTH (from, from_byte);
6737 else if (! NILP (current_buffer->enable_multibyte_characters))
6738 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6739 saved_pt_byte + (coding->produced - bytes));
6740 else
6741 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6742 saved_pt_byte + (coding->produced - bytes));
6743 }
6744
6745 unbind_to (count, Qnil);
6746 }
6747
6748
6749 Lisp_Object
6750 preferred_coding_system ()
6751 {
6752 int id = coding_categories[coding_priorities[0]].id;
6753
6754 return CODING_ID_NAME (id);
6755 }
6756
6757 \f
6758 #ifdef emacs
6759 /*** 8. Emacs Lisp library functions ***/
6760
6761 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6762 doc: /* Return t if OBJECT is nil or a coding-system.
6763 See the documentation of `define-coding-system' for information
6764 about coding-system objects. */)
6765 (obj)
6766 Lisp_Object obj;
6767 {
6768 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6769 }
6770
6771 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6772 Sread_non_nil_coding_system, 1, 1, 0,
6773 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6774 (prompt)
6775 Lisp_Object prompt;
6776 {
6777 Lisp_Object val;
6778 do
6779 {
6780 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6781 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6782 }
6783 while (SCHARS (val) == 0);
6784 return (Fintern (val, Qnil));
6785 }
6786
6787 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6788 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6789 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6790 (prompt, default_coding_system)
6791 Lisp_Object prompt, default_coding_system;
6792 {
6793 Lisp_Object val;
6794 if (SYMBOLP (default_coding_system))
6795 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6796 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6797 Qt, Qnil, Qcoding_system_history,
6798 default_coding_system, Qnil);
6799 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6800 }
6801
6802 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6803 1, 1, 0,
6804 doc: /* Check validity of CODING-SYSTEM.
6805 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6806 (coding_system)
6807 Lisp_Object coding_system;
6808 {
6809 CHECK_SYMBOL (coding_system);
6810 if (!NILP (Fcoding_system_p (coding_system)))
6811 return coding_system;
6812 while (1)
6813 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6814 }
6815
6816 \f
6817 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6818 HIGHEST is nonzero, return the coding system of the highest
6819 priority among the detected coding systems. Otherwize return a
6820 list of detected coding systems sorted by their priorities. If
6821 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6822 multibyte form but contains only ASCII and eight-bit chars.
6823 Otherwise, the bytes are raw bytes.
6824
6825 CODING-SYSTEM controls the detection as below:
6826
6827 If it is nil, detect both text-format and eol-format. If the
6828 text-format part of CODING-SYSTEM is already specified
6829 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6830 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6831 detect only text-format. */
6832
6833 Lisp_Object
6834 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6835 coding_system)
6836 const unsigned char *src;
6837 int src_chars, src_bytes, highest;
6838 int multibytep;
6839 Lisp_Object coding_system;
6840 {
6841 const unsigned char *src_end = src + src_bytes;
6842 Lisp_Object attrs, eol_type;
6843 Lisp_Object val;
6844 struct coding_system coding;
6845 int id;
6846 struct coding_detection_info detect_info;
6847 enum coding_category base_category;
6848
6849 if (NILP (coding_system))
6850 coding_system = Qundecided;
6851 setup_coding_system (coding_system, &coding);
6852 attrs = CODING_ID_ATTRS (coding.id);
6853 eol_type = CODING_ID_EOL_TYPE (coding.id);
6854 coding_system = CODING_ATTR_BASE_NAME (attrs);
6855
6856 coding.source = src;
6857 coding.src_chars = src_chars;
6858 coding.src_bytes = src_bytes;
6859 coding.src_multibyte = multibytep;
6860 coding.consumed = 0;
6861 coding.mode |= CODING_MODE_LAST_BLOCK;
6862
6863 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6864
6865 /* At first, detect text-format if necessary. */
6866 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6867 if (base_category == coding_category_undecided)
6868 {
6869 enum coding_category category;
6870 struct coding_system *this;
6871 int c, i;
6872
6873 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6874 for (i = 0; src < src_end; i++, src++)
6875 {
6876 c = *src;
6877 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6878 || c == ISO_CODE_SI
6879 || c == ISO_CODE_SO)))
6880 break;
6881 }
6882 coding.head_ascii = src - coding.source;
6883
6884 if (src < src_end)
6885 for (i = 0; i < coding_category_raw_text; i++)
6886 {
6887 category = coding_priorities[i];
6888 this = coding_categories + category;
6889
6890 if (this->id < 0)
6891 {
6892 /* No coding system of this category is defined. */
6893 detect_info.rejected |= (1 << category);
6894 }
6895 else if (category >= coding_category_raw_text)
6896 continue;
6897 else if (detect_info.checked & (1 << category))
6898 {
6899 if (highest
6900 && (detect_info.found & (1 << category)))
6901 break;
6902 }
6903 else
6904 {
6905 if ((*(this->detector)) (&coding, &detect_info)
6906 && highest
6907 && (detect_info.found & (1 << category)))
6908 {
6909 if (category == coding_category_utf_16_auto)
6910 {
6911 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6912 category = coding_category_utf_16_le;
6913 else
6914 category = coding_category_utf_16_be;
6915 }
6916 break;
6917 }
6918 }
6919 }
6920
6921 if (detect_info.rejected == CATEGORY_MASK_ANY)
6922 {
6923 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6924 id = coding_categories[coding_category_raw_text].id;
6925 val = Fcons (make_number (id), Qnil);
6926 }
6927 else if (! detect_info.rejected && ! detect_info.found)
6928 {
6929 detect_info.found = CATEGORY_MASK_ANY;
6930 id = coding_categories[coding_category_undecided].id;
6931 val = Fcons (make_number (id), Qnil);
6932 }
6933 else if (highest)
6934 {
6935 if (detect_info.found)
6936 {
6937 detect_info.found = 1 << category;
6938 val = Fcons (make_number (this->id), Qnil);
6939 }
6940 else
6941 for (i = 0; i < coding_category_raw_text; i++)
6942 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6943 {
6944 detect_info.found = 1 << coding_priorities[i];
6945 id = coding_categories[coding_priorities[i]].id;
6946 val = Fcons (make_number (id), Qnil);
6947 break;
6948 }
6949 }
6950 else
6951 {
6952 int mask = detect_info.rejected | detect_info.found;
6953 int found = 0;
6954 val = Qnil;
6955
6956 for (i = coding_category_raw_text - 1; i >= 0; i--)
6957 {
6958 category = coding_priorities[i];
6959 if (! (mask & (1 << category)))
6960 {
6961 found |= 1 << category;
6962 id = coding_categories[category].id;
6963 val = Fcons (make_number (id), val);
6964 }
6965 }
6966 for (i = coding_category_raw_text - 1; i >= 0; i--)
6967 {
6968 category = coding_priorities[i];
6969 if (detect_info.found & (1 << category))
6970 {
6971 id = coding_categories[category].id;
6972 val = Fcons (make_number (id), val);
6973 }
6974 }
6975 detect_info.found |= found;
6976 }
6977 }
6978 else if (base_category == coding_category_utf_16_auto)
6979 {
6980 if (detect_coding_utf_16 (&coding, &detect_info))
6981 {
6982 enum coding_category category;
6983 struct coding_system *this;
6984
6985 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6986 this = coding_categories + coding_category_utf_16_le;
6987 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6988 this = coding_categories + coding_category_utf_16_be;
6989 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6990 this = coding_categories + coding_category_utf_16_be_nosig;
6991 else
6992 this = coding_categories + coding_category_utf_16_le_nosig;
6993 val = Fcons (make_number (this->id), Qnil);
6994 }
6995 }
6996 else
6997 {
6998 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6999 val = Fcons (make_number (coding.id), Qnil);
7000 }
7001
7002 /* Then, detect eol-format if necessary. */
7003 {
7004 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7005 Lisp_Object tail;
7006
7007 if (VECTORP (eol_type))
7008 {
7009 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7010 normal_eol = detect_eol (coding.source, src_bytes,
7011 coding_category_raw_text);
7012 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7013 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7014 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7015 coding_category_utf_16_be);
7016 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7017 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7018 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7019 coding_category_utf_16_le);
7020 }
7021 else
7022 {
7023 if (EQ (eol_type, Qunix))
7024 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7025 else if (EQ (eol_type, Qdos))
7026 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7027 else
7028 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7029 }
7030
7031 for (tail = val; CONSP (tail); tail = XCDR (tail))
7032 {
7033 enum coding_category category;
7034 int this_eol;
7035
7036 id = XINT (XCAR (tail));
7037 attrs = CODING_ID_ATTRS (id);
7038 category = XINT (CODING_ATTR_CATEGORY (attrs));
7039 eol_type = CODING_ID_EOL_TYPE (id);
7040 if (VECTORP (eol_type))
7041 {
7042 if (category == coding_category_utf_16_be
7043 || category == coding_category_utf_16_be_nosig)
7044 this_eol = utf_16_be_eol;
7045 else if (category == coding_category_utf_16_le
7046 || category == coding_category_utf_16_le_nosig)
7047 this_eol = utf_16_le_eol;
7048 else
7049 this_eol = normal_eol;
7050
7051 if (this_eol == EOL_SEEN_LF)
7052 XSETCAR (tail, AREF (eol_type, 0));
7053 else if (this_eol == EOL_SEEN_CRLF)
7054 XSETCAR (tail, AREF (eol_type, 1));
7055 else if (this_eol == EOL_SEEN_CR)
7056 XSETCAR (tail, AREF (eol_type, 2));
7057 else
7058 XSETCAR (tail, CODING_ID_NAME (id));
7059 }
7060 else
7061 XSETCAR (tail, CODING_ID_NAME (id));
7062 }
7063 }
7064
7065 return (highest ? XCAR (val) : val);
7066 }
7067
7068
7069 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7070 2, 3, 0,
7071 doc: /* Detect coding system of the text in the region between START and END.
7072 Return a list of possible coding systems ordered by priority.
7073
7074 If only ASCII characters are found, it returns a list of single element
7075 `undecided' or its subsidiary coding system according to a detected
7076 end-of-line format.
7077
7078 If optional argument HIGHEST is non-nil, return the coding system of
7079 highest priority. */)
7080 (start, end, highest)
7081 Lisp_Object start, end, highest;
7082 {
7083 int from, to;
7084 int from_byte, to_byte;
7085
7086 CHECK_NUMBER_COERCE_MARKER (start);
7087 CHECK_NUMBER_COERCE_MARKER (end);
7088
7089 validate_region (&start, &end);
7090 from = XINT (start), to = XINT (end);
7091 from_byte = CHAR_TO_BYTE (from);
7092 to_byte = CHAR_TO_BYTE (to);
7093
7094 if (from < GPT && to >= GPT)
7095 move_gap_both (to, to_byte);
7096
7097 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7098 to - from, to_byte - from_byte,
7099 !NILP (highest),
7100 !NILP (current_buffer
7101 ->enable_multibyte_characters),
7102 Qnil);
7103 }
7104
7105 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7106 1, 2, 0,
7107 doc: /* Detect coding system of the text in STRING.
7108 Return a list of possible coding systems ordered by priority.
7109
7110 If only ASCII characters are found, it returns a list of single element
7111 `undecided' or its subsidiary coding system according to a detected
7112 end-of-line format.
7113
7114 If optional argument HIGHEST is non-nil, return the coding system of
7115 highest priority. */)
7116 (string, highest)
7117 Lisp_Object string, highest;
7118 {
7119 CHECK_STRING (string);
7120
7121 return detect_coding_system (SDATA (string),
7122 SCHARS (string), SBYTES (string),
7123 !NILP (highest), STRING_MULTIBYTE (string),
7124 Qnil);
7125 }
7126
7127
7128 static INLINE int
7129 char_encodable_p (c, attrs)
7130 int c;
7131 Lisp_Object attrs;
7132 {
7133 Lisp_Object tail;
7134 struct charset *charset;
7135 Lisp_Object translation_table;
7136
7137 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7138 if (! NILP (translation_table))
7139 c = translate_char (translation_table, c);
7140 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7141 CONSP (tail); tail = XCDR (tail))
7142 {
7143 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7144 if (CHAR_CHARSET_P (c, charset))
7145 break;
7146 }
7147 return (! NILP (tail));
7148 }
7149
7150
7151 /* Return a list of coding systems that safely encode the text between
7152 START and END. If EXCLUDE is non-nil, it is a list of coding
7153 systems not to check. The returned list doesn't contain any such
7154 coding systems. In any case, if the text contains only ASCII or is
7155 unibyte, return t. */
7156
7157 DEFUN ("find-coding-systems-region-internal",
7158 Ffind_coding_systems_region_internal,
7159 Sfind_coding_systems_region_internal, 2, 3, 0,
7160 doc: /* Internal use only. */)
7161 (start, end, exclude)
7162 Lisp_Object start, end, exclude;
7163 {
7164 Lisp_Object coding_attrs_list, safe_codings;
7165 EMACS_INT start_byte, end_byte;
7166 const unsigned char *p, *pbeg, *pend;
7167 int c;
7168 Lisp_Object tail, elt;
7169
7170 if (STRINGP (start))
7171 {
7172 if (!STRING_MULTIBYTE (start)
7173 || SCHARS (start) == SBYTES (start))
7174 return Qt;
7175 start_byte = 0;
7176 end_byte = SBYTES (start);
7177 }
7178 else
7179 {
7180 CHECK_NUMBER_COERCE_MARKER (start);
7181 CHECK_NUMBER_COERCE_MARKER (end);
7182 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7183 args_out_of_range (start, end);
7184 if (NILP (current_buffer->enable_multibyte_characters))
7185 return Qt;
7186 start_byte = CHAR_TO_BYTE (XINT (start));
7187 end_byte = CHAR_TO_BYTE (XINT (end));
7188 if (XINT (end) - XINT (start) == end_byte - start_byte)
7189 return Qt;
7190
7191 if (XINT (start) < GPT && XINT (end) > GPT)
7192 {
7193 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7194 move_gap_both (XINT (start), start_byte);
7195 else
7196 move_gap_both (XINT (end), end_byte);
7197 }
7198 }
7199
7200 coding_attrs_list = Qnil;
7201 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7202 if (NILP (exclude)
7203 || NILP (Fmemq (XCAR (tail), exclude)))
7204 {
7205 Lisp_Object attrs;
7206
7207 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7208 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7209 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7210 {
7211 ASET (attrs, coding_attr_trans_tbl,
7212 get_translation_table (attrs, 1));
7213 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7214 }
7215 }
7216
7217 if (STRINGP (start))
7218 p = pbeg = SDATA (start);
7219 else
7220 p = pbeg = BYTE_POS_ADDR (start_byte);
7221 pend = p + (end_byte - start_byte);
7222
7223 while (p < pend && ASCII_BYTE_P (*p)) p++;
7224 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7225
7226 while (p < pend)
7227 {
7228 if (ASCII_BYTE_P (*p))
7229 p++;
7230 else
7231 {
7232 c = STRING_CHAR_ADVANCE (p);
7233
7234 charset_map_loaded = 0;
7235 for (tail = coding_attrs_list; CONSP (tail);)
7236 {
7237 elt = XCAR (tail);
7238 if (NILP (elt))
7239 tail = XCDR (tail);
7240 else if (char_encodable_p (c, elt))
7241 tail = XCDR (tail);
7242 else if (CONSP (XCDR (tail)))
7243 {
7244 XSETCAR (tail, XCAR (XCDR (tail)));
7245 XSETCDR (tail, XCDR (XCDR (tail)));
7246 }
7247 else
7248 {
7249 XSETCAR (tail, Qnil);
7250 tail = XCDR (tail);
7251 }
7252 }
7253 if (charset_map_loaded)
7254 {
7255 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7256
7257 if (STRINGP (start))
7258 pbeg = SDATA (start);
7259 else
7260 pbeg = BYTE_POS_ADDR (start_byte);
7261 p = pbeg + p_offset;
7262 pend = pbeg + pend_offset;
7263 }
7264 }
7265 }
7266
7267 safe_codings = Qnil;
7268 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7269 if (! NILP (XCAR (tail)))
7270 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7271
7272 return safe_codings;
7273 }
7274
7275
7276 DEFUN ("unencodable-char-position", Funencodable_char_position,
7277 Sunencodable_char_position, 3, 5, 0,
7278 doc: /*
7279 Return position of first un-encodable character in a region.
7280 START and END specfiy the region and CODING-SYSTEM specifies the
7281 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7282
7283 If optional 4th argument COUNT is non-nil, it specifies at most how
7284 many un-encodable characters to search. In this case, the value is a
7285 list of positions.
7286
7287 If optional 5th argument STRING is non-nil, it is a string to search
7288 for un-encodable characters. In that case, START and END are indexes
7289 to the string. */)
7290 (start, end, coding_system, count, string)
7291 Lisp_Object start, end, coding_system, count, string;
7292 {
7293 int n;
7294 struct coding_system coding;
7295 Lisp_Object attrs, charset_list, translation_table;
7296 Lisp_Object positions;
7297 int from, to;
7298 const unsigned char *p, *stop, *pend;
7299 int ascii_compatible;
7300
7301 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7302 attrs = CODING_ID_ATTRS (coding.id);
7303 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7304 return Qnil;
7305 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7306 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7307 translation_table = get_translation_table (attrs, 1);
7308
7309 if (NILP (string))
7310 {
7311 validate_region (&start, &end);
7312 from = XINT (start);
7313 to = XINT (end);
7314 if (NILP (current_buffer->enable_multibyte_characters)
7315 || (ascii_compatible
7316 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7317 return Qnil;
7318 p = CHAR_POS_ADDR (from);
7319 pend = CHAR_POS_ADDR (to);
7320 if (from < GPT && to >= GPT)
7321 stop = GPT_ADDR;
7322 else
7323 stop = pend;
7324 }
7325 else
7326 {
7327 CHECK_STRING (string);
7328 CHECK_NATNUM (start);
7329 CHECK_NATNUM (end);
7330 from = XINT (start);
7331 to = XINT (end);
7332 if (from > to
7333 || to > SCHARS (string))
7334 args_out_of_range_3 (string, start, end);
7335 if (! STRING_MULTIBYTE (string))
7336 return Qnil;
7337 p = SDATA (string) + string_char_to_byte (string, from);
7338 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7339 if (ascii_compatible && (to - from) == (pend - p))
7340 return Qnil;
7341 }
7342
7343 if (NILP (count))
7344 n = 1;
7345 else
7346 {
7347 CHECK_NATNUM (count);
7348 n = XINT (count);
7349 }
7350
7351 positions = Qnil;
7352 while (1)
7353 {
7354 int c;
7355
7356 if (ascii_compatible)
7357 while (p < stop && ASCII_BYTE_P (*p))
7358 p++, from++;
7359 if (p >= stop)
7360 {
7361 if (p >= pend)
7362 break;
7363 stop = pend;
7364 p = GAP_END_ADDR;
7365 }
7366
7367 c = STRING_CHAR_ADVANCE (p);
7368 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7369 && ! char_charset (translate_char (translation_table, c),
7370 charset_list, NULL))
7371 {
7372 positions = Fcons (make_number (from), positions);
7373 n--;
7374 if (n == 0)
7375 break;
7376 }
7377
7378 from++;
7379 }
7380
7381 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7382 }
7383
7384
7385 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7386 Scheck_coding_systems_region, 3, 3, 0,
7387 doc: /* Check if the region is encodable by coding systems.
7388
7389 START and END are buffer positions specifying the region.
7390 CODING-SYSTEM-LIST is a list of coding systems to check.
7391
7392 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7393 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7394 whole region, POS0, POS1, ... are buffer positions where non-encodable
7395 characters are found.
7396
7397 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7398 value is nil.
7399
7400 START may be a string. In that case, check if the string is
7401 encodable, and the value contains indices to the string instead of
7402 buffer positions. END is ignored. */)
7403 (start, end, coding_system_list)
7404 Lisp_Object start, end, coding_system_list;
7405 {
7406 Lisp_Object list;
7407 EMACS_INT start_byte, end_byte;
7408 int pos;
7409 const unsigned char *p, *pbeg, *pend;
7410 int c;
7411 Lisp_Object tail, elt, attrs;
7412
7413 if (STRINGP (start))
7414 {
7415 if (!STRING_MULTIBYTE (start)
7416 && SCHARS (start) != SBYTES (start))
7417 return Qnil;
7418 start_byte = 0;
7419 end_byte = SBYTES (start);
7420 pos = 0;
7421 }
7422 else
7423 {
7424 CHECK_NUMBER_COERCE_MARKER (start);
7425 CHECK_NUMBER_COERCE_MARKER (end);
7426 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7427 args_out_of_range (start, end);
7428 if (NILP (current_buffer->enable_multibyte_characters))
7429 return Qnil;
7430 start_byte = CHAR_TO_BYTE (XINT (start));
7431 end_byte = CHAR_TO_BYTE (XINT (end));
7432 if (XINT (end) - XINT (start) == end_byte - start_byte)
7433 return Qt;
7434
7435 if (XINT (start) < GPT && XINT (end) > GPT)
7436 {
7437 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7438 move_gap_both (XINT (start), start_byte);
7439 else
7440 move_gap_both (XINT (end), end_byte);
7441 }
7442 pos = XINT (start);
7443 }
7444
7445 list = Qnil;
7446 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7447 {
7448 elt = XCAR (tail);
7449 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7450 ASET (attrs, coding_attr_trans_tbl, get_translation_table (attrs, 1));
7451 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7452 }
7453
7454 if (STRINGP (start))
7455 p = pbeg = SDATA (start);
7456 else
7457 p = pbeg = BYTE_POS_ADDR (start_byte);
7458 pend = p + (end_byte - start_byte);
7459
7460 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7461 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7462
7463 while (p < pend)
7464 {
7465 if (ASCII_BYTE_P (*p))
7466 p++;
7467 else
7468 {
7469 c = STRING_CHAR_ADVANCE (p);
7470
7471 charset_map_loaded = 0;
7472 for (tail = list; CONSP (tail); tail = XCDR (tail))
7473 {
7474 elt = XCDR (XCAR (tail));
7475 if (! char_encodable_p (c, XCAR (elt)))
7476 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7477 }
7478 if (charset_map_loaded)
7479 {
7480 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7481
7482 if (STRINGP (start))
7483 pbeg = SDATA (start);
7484 else
7485 pbeg = BYTE_POS_ADDR (start_byte);
7486 p = pbeg + p_offset;
7487 pend = pbeg + pend_offset;
7488 }
7489 }
7490 pos++;
7491 }
7492
7493 tail = list;
7494 list = Qnil;
7495 for (; CONSP (tail); tail = XCDR (tail))
7496 {
7497 elt = XCAR (tail);
7498 if (CONSP (XCDR (XCDR (elt))))
7499 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7500 list);
7501 }
7502
7503 return list;
7504 }
7505
7506
7507 Lisp_Object
7508 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7509 Lisp_Object start, end, coding_system, dst_object;
7510 int encodep, norecord;
7511 {
7512 struct coding_system coding;
7513 EMACS_INT from, from_byte, to, to_byte;
7514 Lisp_Object src_object;
7515
7516 CHECK_NUMBER_COERCE_MARKER (start);
7517 CHECK_NUMBER_COERCE_MARKER (end);
7518 if (NILP (coding_system))
7519 coding_system = Qno_conversion;
7520 else
7521 CHECK_CODING_SYSTEM (coding_system);
7522 src_object = Fcurrent_buffer ();
7523 if (NILP (dst_object))
7524 dst_object = src_object;
7525 else if (! EQ (dst_object, Qt))
7526 CHECK_BUFFER (dst_object);
7527
7528 validate_region (&start, &end);
7529 from = XFASTINT (start);
7530 from_byte = CHAR_TO_BYTE (from);
7531 to = XFASTINT (end);
7532 to_byte = CHAR_TO_BYTE (to);
7533
7534 setup_coding_system (coding_system, &coding);
7535 coding.mode |= CODING_MODE_LAST_BLOCK;
7536
7537 if (encodep)
7538 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7539 dst_object);
7540 else
7541 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7542 dst_object);
7543 if (! norecord)
7544 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7545
7546 return (BUFFERP (dst_object)
7547 ? make_number (coding.produced_char)
7548 : coding.dst_object);
7549 }
7550
7551
7552 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7553 3, 4, "r\nzCoding system: ",
7554 doc: /* Decode the current region from the specified coding system.
7555 When called from a program, takes four arguments:
7556 START, END, CODING-SYSTEM, and DESTINATION.
7557 START and END are buffer positions.
7558
7559 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7560 If nil, the region between START and END is replace by the decoded text.
7561 If buffer, the decoded text is inserted in the buffer.
7562 If t, the decoded text is returned.
7563
7564 This function sets `last-coding-system-used' to the precise coding system
7565 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7566 not fully specified.)
7567 It returns the length of the decoded text. */)
7568 (start, end, coding_system, destination)
7569 Lisp_Object start, end, coding_system, destination;
7570 {
7571 return code_convert_region (start, end, coding_system, destination, 0, 0);
7572 }
7573
7574 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7575 3, 4, "r\nzCoding system: ",
7576 doc: /* Encode the current region by specified coding system.
7577 When called from a program, takes three arguments:
7578 START, END, and CODING-SYSTEM. START and END are buffer positions.
7579
7580 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7581 If nil, the region between START and END is replace by the encoded text.
7582 If buffer, the encoded text is inserted in the buffer.
7583 If t, the encoded text is returned.
7584
7585 This function sets `last-coding-system-used' to the precise coding system
7586 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7587 not fully specified.)
7588 It returns the length of the encoded text. */)
7589 (start, end, coding_system, destination)
7590 Lisp_Object start, end, coding_system, destination;
7591 {
7592 return code_convert_region (start, end, coding_system, destination, 1, 0);
7593 }
7594
7595 Lisp_Object
7596 code_convert_string (string, coding_system, dst_object,
7597 encodep, nocopy, norecord)
7598 Lisp_Object string, coding_system, dst_object;
7599 int encodep, nocopy, norecord;
7600 {
7601 struct coding_system coding;
7602 EMACS_INT chars, bytes;
7603
7604 CHECK_STRING (string);
7605 if (NILP (coding_system))
7606 {
7607 if (! norecord)
7608 Vlast_coding_system_used = Qno_conversion;
7609 if (NILP (dst_object))
7610 return (nocopy ? Fcopy_sequence (string) : string);
7611 }
7612
7613 if (NILP (coding_system))
7614 coding_system = Qno_conversion;
7615 else
7616 CHECK_CODING_SYSTEM (coding_system);
7617 if (NILP (dst_object))
7618 dst_object = Qt;
7619 else if (! EQ (dst_object, Qt))
7620 CHECK_BUFFER (dst_object);
7621
7622 setup_coding_system (coding_system, &coding);
7623 coding.mode |= CODING_MODE_LAST_BLOCK;
7624 chars = SCHARS (string);
7625 bytes = SBYTES (string);
7626 if (encodep)
7627 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7628 else
7629 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7630 if (! norecord)
7631 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7632
7633 return (BUFFERP (dst_object)
7634 ? make_number (coding.produced_char)
7635 : coding.dst_object);
7636 }
7637
7638
7639 /* Encode or decode STRING according to CODING_SYSTEM.
7640 Do not set Vlast_coding_system_used.
7641
7642 This function is called only from macros DECODE_FILE and
7643 ENCODE_FILE, thus we ignore character composition. */
7644
7645 Lisp_Object
7646 code_convert_string_norecord (string, coding_system, encodep)
7647 Lisp_Object string, coding_system;
7648 int encodep;
7649 {
7650 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7651 }
7652
7653
7654 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7655 2, 4, 0,
7656 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7657
7658 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7659 if the decoding operation is trivial.
7660
7661 Optional fourth arg BUFFER non-nil meant that the decoded text is
7662 inserted in BUFFER instead of returned as a string. In this case,
7663 the return value is BUFFER.
7664
7665 This function sets `last-coding-system-used' to the precise coding system
7666 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7667 not fully specified. */)
7668 (string, coding_system, nocopy, buffer)
7669 Lisp_Object string, coding_system, nocopy, buffer;
7670 {
7671 return code_convert_string (string, coding_system, buffer,
7672 0, ! NILP (nocopy), 0);
7673 }
7674
7675 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7676 2, 4, 0,
7677 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7678
7679 Optional third arg NOCOPY non-nil means it is OK to return STRING
7680 itself if the encoding operation is trivial.
7681
7682 Optional fourth arg BUFFER non-nil meant that the encoded text is
7683 inserted in BUFFER instead of returned as a string. In this case,
7684 the return value is BUFFER.
7685
7686 This function sets `last-coding-system-used' to the precise coding system
7687 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7688 not fully specified.) */)
7689 (string, coding_system, nocopy, buffer)
7690 Lisp_Object string, coding_system, nocopy, buffer;
7691 {
7692 return code_convert_string (string, coding_system, buffer,
7693 1, ! NILP (nocopy), 1);
7694 }
7695
7696 \f
7697 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7698 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7699 Return the corresponding character. */)
7700 (code)
7701 Lisp_Object code;
7702 {
7703 Lisp_Object spec, attrs, val;
7704 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7705 int c;
7706
7707 CHECK_NATNUM (code);
7708 c = XFASTINT (code);
7709 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7710 attrs = AREF (spec, 0);
7711
7712 if (ASCII_BYTE_P (c)
7713 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7714 return code;
7715
7716 val = CODING_ATTR_CHARSET_LIST (attrs);
7717 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7718 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7719 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7720
7721 if (c <= 0x7F)
7722 charset = charset_roman;
7723 else if (c >= 0xA0 && c < 0xDF)
7724 {
7725 charset = charset_kana;
7726 c -= 0x80;
7727 }
7728 else
7729 {
7730 int s1 = c >> 8, s2 = c & 0xFF;
7731
7732 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7733 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7734 error ("Invalid code: %d", code);
7735 SJIS_TO_JIS (c);
7736 charset = charset_kanji;
7737 }
7738 c = DECODE_CHAR (charset, c);
7739 if (c < 0)
7740 error ("Invalid code: %d", code);
7741 return make_number (c);
7742 }
7743
7744
7745 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7746 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7747 Return the corresponding code in SJIS. */)
7748 (ch)
7749 Lisp_Object ch;
7750 {
7751 Lisp_Object spec, attrs, charset_list;
7752 int c;
7753 struct charset *charset;
7754 unsigned code;
7755
7756 CHECK_CHARACTER (ch);
7757 c = XFASTINT (ch);
7758 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7759 attrs = AREF (spec, 0);
7760
7761 if (ASCII_CHAR_P (c)
7762 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7763 return ch;
7764
7765 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7766 charset = char_charset (c, charset_list, &code);
7767 if (code == CHARSET_INVALID_CODE (charset))
7768 error ("Can't encode by shift_jis encoding: %d", c);
7769 JIS_TO_SJIS (code);
7770
7771 return make_number (code);
7772 }
7773
7774 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7775 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7776 Return the corresponding character. */)
7777 (code)
7778 Lisp_Object code;
7779 {
7780 Lisp_Object spec, attrs, val;
7781 struct charset *charset_roman, *charset_big5, *charset;
7782 int c;
7783
7784 CHECK_NATNUM (code);
7785 c = XFASTINT (code);
7786 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7787 attrs = AREF (spec, 0);
7788
7789 if (ASCII_BYTE_P (c)
7790 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7791 return code;
7792
7793 val = CODING_ATTR_CHARSET_LIST (attrs);
7794 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7795 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7796
7797 if (c <= 0x7F)
7798 charset = charset_roman;
7799 else
7800 {
7801 int b1 = c >> 8, b2 = c & 0x7F;
7802 if (b1 < 0xA1 || b1 > 0xFE
7803 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7804 error ("Invalid code: %d", code);
7805 charset = charset_big5;
7806 }
7807 c = DECODE_CHAR (charset, (unsigned )c);
7808 if (c < 0)
7809 error ("Invalid code: %d", code);
7810 return make_number (c);
7811 }
7812
7813 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7814 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7815 Return the corresponding character code in Big5. */)
7816 (ch)
7817 Lisp_Object ch;
7818 {
7819 Lisp_Object spec, attrs, charset_list;
7820 struct charset *charset;
7821 int c;
7822 unsigned code;
7823
7824 CHECK_CHARACTER (ch);
7825 c = XFASTINT (ch);
7826 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7827 attrs = AREF (spec, 0);
7828 if (ASCII_CHAR_P (c)
7829 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7830 return ch;
7831
7832 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7833 charset = char_charset (c, charset_list, &code);
7834 if (code == CHARSET_INVALID_CODE (charset))
7835 error ("Can't encode by Big5 encoding: %d", c);
7836
7837 return make_number (code);
7838 }
7839
7840 \f
7841 DEFUN ("set-terminal-coding-system-internal",
7842 Fset_terminal_coding_system_internal,
7843 Sset_terminal_coding_system_internal, 1, 1, 0,
7844 doc: /* Internal use only. */)
7845 (coding_system)
7846 Lisp_Object coding_system;
7847 {
7848 CHECK_SYMBOL (coding_system);
7849 setup_coding_system (Fcheck_coding_system (coding_system),
7850 &terminal_coding);
7851
7852 /* We had better not send unsafe characters to terminal. */
7853 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7854 /* Characer composition should be disabled. */
7855 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7856 terminal_coding.src_multibyte = 1;
7857 terminal_coding.dst_multibyte = 0;
7858 return Qnil;
7859 }
7860
7861 DEFUN ("set-safe-terminal-coding-system-internal",
7862 Fset_safe_terminal_coding_system_internal,
7863 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7864 doc: /* Internal use only. */)
7865 (coding_system)
7866 Lisp_Object coding_system;
7867 {
7868 CHECK_SYMBOL (coding_system);
7869 setup_coding_system (Fcheck_coding_system (coding_system),
7870 &safe_terminal_coding);
7871 /* Characer composition should be disabled. */
7872 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7873 safe_terminal_coding.src_multibyte = 1;
7874 safe_terminal_coding.dst_multibyte = 0;
7875 return Qnil;
7876 }
7877
7878 DEFUN ("terminal-coding-system",
7879 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7880 doc: /* Return coding system specified for terminal output. */)
7881 ()
7882 {
7883 return CODING_ID_NAME (terminal_coding.id);
7884 }
7885
7886 DEFUN ("set-keyboard-coding-system-internal",
7887 Fset_keyboard_coding_system_internal,
7888 Sset_keyboard_coding_system_internal, 1, 1, 0,
7889 doc: /* Internal use only. */)
7890 (coding_system)
7891 Lisp_Object coding_system;
7892 {
7893 CHECK_SYMBOL (coding_system);
7894 setup_coding_system (Fcheck_coding_system (coding_system),
7895 &keyboard_coding);
7896 /* Characer composition should be disabled. */
7897 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7898 return Qnil;
7899 }
7900
7901 DEFUN ("keyboard-coding-system",
7902 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7903 doc: /* Return coding system specified for decoding keyboard input. */)
7904 ()
7905 {
7906 return CODING_ID_NAME (keyboard_coding.id);
7907 }
7908
7909 \f
7910 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7911 Sfind_operation_coding_system, 1, MANY, 0,
7912 doc: /* Choose a coding system for an operation based on the target name.
7913 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7914 DECODING-SYSTEM is the coding system to use for decoding
7915 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7916 for encoding (in case OPERATION does encoding).
7917
7918 The first argument OPERATION specifies an I/O primitive:
7919 For file I/O, `insert-file-contents' or `write-region'.
7920 For process I/O, `call-process', `call-process-region', or `start-process'.
7921 For network I/O, `open-network-stream'.
7922
7923 The remaining arguments should be the same arguments that were passed
7924 to the primitive. Depending on which primitive, one of those arguments
7925 is selected as the TARGET. For example, if OPERATION does file I/O,
7926 whichever argument specifies the file name is TARGET.
7927
7928 TARGET has a meaning which depends on OPERATION:
7929 For file I/O, TARGET is a file name.
7930 For process I/O, TARGET is a process name.
7931 For network I/O, TARGET is a service name or a port number
7932
7933 This function looks up what specified for TARGET in,
7934 `file-coding-system-alist', `process-coding-system-alist',
7935 or `network-coding-system-alist' depending on OPERATION.
7936 They may specify a coding system, a cons of coding systems,
7937 or a function symbol to call.
7938 In the last case, we call the function with one argument,
7939 which is a list of all the arguments given to this function.
7940
7941 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7942 (nargs, args)
7943 int nargs;
7944 Lisp_Object *args;
7945 {
7946 Lisp_Object operation, target_idx, target, val;
7947 register Lisp_Object chain;
7948
7949 if (nargs < 2)
7950 error ("Too few arguments");
7951 operation = args[0];
7952 if (!SYMBOLP (operation)
7953 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7954 error ("Invalid first arguement");
7955 if (nargs < 1 + XINT (target_idx))
7956 error ("Too few arguments for operation: %s",
7957 SDATA (SYMBOL_NAME (operation)));
7958 target = args[XINT (target_idx) + 1];
7959 if (!(STRINGP (target)
7960 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7961 error ("Invalid %dth argument", XINT (target_idx) + 1);
7962
7963 chain = ((EQ (operation, Qinsert_file_contents)
7964 || EQ (operation, Qwrite_region))
7965 ? Vfile_coding_system_alist
7966 : (EQ (operation, Qopen_network_stream)
7967 ? Vnetwork_coding_system_alist
7968 : Vprocess_coding_system_alist));
7969 if (NILP (chain))
7970 return Qnil;
7971
7972 for (; CONSP (chain); chain = XCDR (chain))
7973 {
7974 Lisp_Object elt;
7975
7976 elt = XCAR (chain);
7977 if (CONSP (elt)
7978 && ((STRINGP (target)
7979 && STRINGP (XCAR (elt))
7980 && fast_string_match (XCAR (elt), target) >= 0)
7981 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7982 {
7983 val = XCDR (elt);
7984 /* Here, if VAL is both a valid coding system and a valid
7985 function symbol, we return VAL as a coding system. */
7986 if (CONSP (val))
7987 return val;
7988 if (! SYMBOLP (val))
7989 return Qnil;
7990 if (! NILP (Fcoding_system_p (val)))
7991 return Fcons (val, val);
7992 if (! NILP (Ffboundp (val)))
7993 {
7994 val = call1 (val, Flist (nargs, args));
7995 if (CONSP (val))
7996 return val;
7997 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7998 return Fcons (val, val);
7999 }
8000 return Qnil;
8001 }
8002 }
8003 return Qnil;
8004 }
8005
8006 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8007 Sset_coding_system_priority, 0, MANY, 0,
8008 doc: /* Assign higher priority to the coding systems given as arguments.
8009 If multiple coding systems belongs to the same category,
8010 all but the first one are ignored.
8011
8012 usage: (set-coding-system-priority ...) */)
8013 (nargs, args)
8014 int nargs;
8015 Lisp_Object *args;
8016 {
8017 int i, j;
8018 int changed[coding_category_max];
8019 enum coding_category priorities[coding_category_max];
8020
8021 bzero (changed, sizeof changed);
8022
8023 for (i = j = 0; i < nargs; i++)
8024 {
8025 enum coding_category category;
8026 Lisp_Object spec, attrs;
8027
8028 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8029 attrs = AREF (spec, 0);
8030 category = XINT (CODING_ATTR_CATEGORY (attrs));
8031 if (changed[category])
8032 /* Ignore this coding system because a coding system of the
8033 same category already had a higher priority. */
8034 continue;
8035 changed[category] = 1;
8036 priorities[j++] = category;
8037 if (coding_categories[category].id >= 0
8038 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8039 setup_coding_system (args[i], &coding_categories[category]);
8040 Fset (AREF (Vcoding_category_table, category), args[i]);
8041 }
8042
8043 /* Now we have decided top J priorities. Reflect the order of the
8044 original priorities to the remaining priorities. */
8045
8046 for (i = j, j = 0; i < coding_category_max; i++, j++)
8047 {
8048 while (j < coding_category_max
8049 && changed[coding_priorities[j]])
8050 j++;
8051 if (j == coding_category_max)
8052 abort ();
8053 priorities[i] = coding_priorities[j];
8054 }
8055
8056 bcopy (priorities, coding_priorities, sizeof priorities);
8057
8058 /* Update `coding-category-list'. */
8059 Vcoding_category_list = Qnil;
8060 for (i = coding_category_max - 1; i >= 0; i--)
8061 Vcoding_category_list
8062 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8063 Vcoding_category_list);
8064
8065 return Qnil;
8066 }
8067
8068 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8069 Scoding_system_priority_list, 0, 1, 0,
8070 doc: /* Return a list of coding systems ordered by their priorities.
8071 HIGHESTP non-nil means just return the highest priority one. */)
8072 (highestp)
8073 Lisp_Object highestp;
8074 {
8075 int i;
8076 Lisp_Object val;
8077
8078 for (i = 0, val = Qnil; i < coding_category_max; i++)
8079 {
8080 enum coding_category category = coding_priorities[i];
8081 int id = coding_categories[category].id;
8082 Lisp_Object attrs;
8083
8084 if (id < 0)
8085 continue;
8086 attrs = CODING_ID_ATTRS (id);
8087 if (! NILP (highestp))
8088 return CODING_ATTR_BASE_NAME (attrs);
8089 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8090 }
8091 return Fnreverse (val);
8092 }
8093
8094 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8095
8096 static Lisp_Object
8097 make_subsidiaries (base)
8098 Lisp_Object base;
8099 {
8100 Lisp_Object subsidiaries;
8101 int base_name_len = SBYTES (SYMBOL_NAME (base));
8102 char *buf = (char *) alloca (base_name_len + 6);
8103 int i;
8104
8105 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8106 subsidiaries = Fmake_vector (make_number (3), Qnil);
8107 for (i = 0; i < 3; i++)
8108 {
8109 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8110 ASET (subsidiaries, i, intern (buf));
8111 }
8112 return subsidiaries;
8113 }
8114
8115
8116 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8117 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8118 doc: /* For internal use only.
8119 usage: (define-coding-system-internal ...) */)
8120 (nargs, args)
8121 int nargs;
8122 Lisp_Object *args;
8123 {
8124 Lisp_Object name;
8125 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8126 Lisp_Object attrs; /* Vector of attributes. */
8127 Lisp_Object eol_type;
8128 Lisp_Object aliases;
8129 Lisp_Object coding_type, charset_list, safe_charsets;
8130 enum coding_category category;
8131 Lisp_Object tail, val;
8132 int max_charset_id = 0;
8133 int i;
8134
8135 if (nargs < coding_arg_max)
8136 goto short_args;
8137
8138 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8139
8140 name = args[coding_arg_name];
8141 CHECK_SYMBOL (name);
8142 CODING_ATTR_BASE_NAME (attrs) = name;
8143
8144 val = args[coding_arg_mnemonic];
8145 if (! STRINGP (val))
8146 CHECK_CHARACTER (val);
8147 CODING_ATTR_MNEMONIC (attrs) = val;
8148
8149 coding_type = args[coding_arg_coding_type];
8150 CHECK_SYMBOL (coding_type);
8151 CODING_ATTR_TYPE (attrs) = coding_type;
8152
8153 charset_list = args[coding_arg_charset_list];
8154 if (SYMBOLP (charset_list))
8155 {
8156 if (EQ (charset_list, Qiso_2022))
8157 {
8158 if (! EQ (coding_type, Qiso_2022))
8159 error ("Invalid charset-list");
8160 charset_list = Viso_2022_charset_list;
8161 }
8162 else if (EQ (charset_list, Qemacs_mule))
8163 {
8164 if (! EQ (coding_type, Qemacs_mule))
8165 error ("Invalid charset-list");
8166 charset_list = Vemacs_mule_charset_list;
8167 }
8168 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8169 if (max_charset_id < XFASTINT (XCAR (tail)))
8170 max_charset_id = XFASTINT (XCAR (tail));
8171 }
8172 else
8173 {
8174 charset_list = Fcopy_sequence (charset_list);
8175 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8176 {
8177 struct charset *charset;
8178
8179 val = Fcar (tail);
8180 CHECK_CHARSET_GET_CHARSET (val, charset);
8181 if (EQ (coding_type, Qiso_2022)
8182 ? CHARSET_ISO_FINAL (charset) < 0
8183 : EQ (coding_type, Qemacs_mule)
8184 ? CHARSET_EMACS_MULE_ID (charset) < 0
8185 : 0)
8186 error ("Can't handle charset `%s'",
8187 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8188
8189 XSETCAR (tail, make_number (charset->id));
8190 if (max_charset_id < charset->id)
8191 max_charset_id = charset->id;
8192 }
8193 }
8194 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8195
8196 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8197 make_number (255));
8198 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8199 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8200 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8201
8202 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8203
8204 val = args[coding_arg_decode_translation_table];
8205 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8206 CHECK_SYMBOL (val);
8207 CODING_ATTR_DECODE_TBL (attrs) = val;
8208
8209 val = args[coding_arg_encode_translation_table];
8210 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8211 CHECK_SYMBOL (val);
8212 CODING_ATTR_ENCODE_TBL (attrs) = val;
8213
8214 val = args[coding_arg_post_read_conversion];
8215 CHECK_SYMBOL (val);
8216 CODING_ATTR_POST_READ (attrs) = val;
8217
8218 val = args[coding_arg_pre_write_conversion];
8219 CHECK_SYMBOL (val);
8220 CODING_ATTR_PRE_WRITE (attrs) = val;
8221
8222 val = args[coding_arg_default_char];
8223 if (NILP (val))
8224 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8225 else
8226 {
8227 CHECK_CHARACTER (val);
8228 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8229 }
8230
8231 val = args[coding_arg_for_unibyte];
8232 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8233
8234 val = args[coding_arg_plist];
8235 CHECK_LIST (val);
8236 CODING_ATTR_PLIST (attrs) = val;
8237
8238 if (EQ (coding_type, Qcharset))
8239 {
8240 /* Generate a lisp vector of 256 elements. Each element is nil,
8241 integer, or a list of charset IDs.
8242
8243 If Nth element is nil, the byte code N is invalid in this
8244 coding system.
8245
8246 If Nth element is a number NUM, N is the first byte of a
8247 charset whose ID is NUM.
8248
8249 If Nth element is a list of charset IDs, N is the first byte
8250 of one of them. The list is sorted by dimensions of the
8251 charsets. A charset of smaller dimension comes firtst. */
8252 val = Fmake_vector (make_number (256), Qnil);
8253
8254 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8255 {
8256 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8257 int dim = CHARSET_DIMENSION (charset);
8258 int idx = (dim - 1) * 4;
8259
8260 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8261 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8262
8263 for (i = charset->code_space[idx];
8264 i <= charset->code_space[idx + 1]; i++)
8265 {
8266 Lisp_Object tmp, tmp2;
8267 int dim2;
8268
8269 tmp = AREF (val, i);
8270 if (NILP (tmp))
8271 tmp = XCAR (tail);
8272 else if (NUMBERP (tmp))
8273 {
8274 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8275 if (dim < dim2)
8276 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8277 else
8278 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8279 }
8280 else
8281 {
8282 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8283 {
8284 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8285 if (dim < dim2)
8286 break;
8287 }
8288 if (NILP (tmp2))
8289 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8290 else
8291 {
8292 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8293 XSETCAR (tmp2, XCAR (tail));
8294 }
8295 }
8296 ASET (val, i, tmp);
8297 }
8298 }
8299 ASET (attrs, coding_attr_charset_valids, val);
8300 category = coding_category_charset;
8301 }
8302 else if (EQ (coding_type, Qccl))
8303 {
8304 Lisp_Object valids;
8305
8306 if (nargs < coding_arg_ccl_max)
8307 goto short_args;
8308
8309 val = args[coding_arg_ccl_decoder];
8310 CHECK_CCL_PROGRAM (val);
8311 if (VECTORP (val))
8312 val = Fcopy_sequence (val);
8313 ASET (attrs, coding_attr_ccl_decoder, val);
8314
8315 val = args[coding_arg_ccl_encoder];
8316 CHECK_CCL_PROGRAM (val);
8317 if (VECTORP (val))
8318 val = Fcopy_sequence (val);
8319 ASET (attrs, coding_attr_ccl_encoder, val);
8320
8321 val = args[coding_arg_ccl_valids];
8322 valids = Fmake_string (make_number (256), make_number (0));
8323 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8324 {
8325 int from, to;
8326
8327 val = Fcar (tail);
8328 if (INTEGERP (val))
8329 {
8330 from = to = XINT (val);
8331 if (from < 0 || from > 255)
8332 args_out_of_range_3 (val, make_number (0), make_number (255));
8333 }
8334 else
8335 {
8336 CHECK_CONS (val);
8337 CHECK_NATNUM_CAR (val);
8338 CHECK_NATNUM_CDR (val);
8339 from = XINT (XCAR (val));
8340 if (from > 255)
8341 args_out_of_range_3 (XCAR (val),
8342 make_number (0), make_number (255));
8343 to = XINT (XCDR (val));
8344 if (to < from || to > 255)
8345 args_out_of_range_3 (XCDR (val),
8346 XCAR (val), make_number (255));
8347 }
8348 for (i = from; i <= to; i++)
8349 SSET (valids, i, 1);
8350 }
8351 ASET (attrs, coding_attr_ccl_valids, valids);
8352
8353 category = coding_category_ccl;
8354 }
8355 else if (EQ (coding_type, Qutf_16))
8356 {
8357 Lisp_Object bom, endian;
8358
8359 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8360
8361 if (nargs < coding_arg_utf16_max)
8362 goto short_args;
8363
8364 bom = args[coding_arg_utf16_bom];
8365 if (! NILP (bom) && ! EQ (bom, Qt))
8366 {
8367 CHECK_CONS (bom);
8368 val = XCAR (bom);
8369 CHECK_CODING_SYSTEM (val);
8370 val = XCDR (bom);
8371 CHECK_CODING_SYSTEM (val);
8372 }
8373 ASET (attrs, coding_attr_utf_16_bom, bom);
8374
8375 endian = args[coding_arg_utf16_endian];
8376 CHECK_SYMBOL (endian);
8377 if (NILP (endian))
8378 endian = Qbig;
8379 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8380 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8381 ASET (attrs, coding_attr_utf_16_endian, endian);
8382
8383 category = (CONSP (bom)
8384 ? coding_category_utf_16_auto
8385 : NILP (bom)
8386 ? (EQ (endian, Qbig)
8387 ? coding_category_utf_16_be_nosig
8388 : coding_category_utf_16_le_nosig)
8389 : (EQ (endian, Qbig)
8390 ? coding_category_utf_16_be
8391 : coding_category_utf_16_le));
8392 }
8393 else if (EQ (coding_type, Qiso_2022))
8394 {
8395 Lisp_Object initial, reg_usage, request, flags;
8396 int i;
8397
8398 if (nargs < coding_arg_iso2022_max)
8399 goto short_args;
8400
8401 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8402 CHECK_VECTOR (initial);
8403 for (i = 0; i < 4; i++)
8404 {
8405 val = Faref (initial, make_number (i));
8406 if (! NILP (val))
8407 {
8408 struct charset *charset;
8409
8410 CHECK_CHARSET_GET_CHARSET (val, charset);
8411 ASET (initial, i, make_number (CHARSET_ID (charset)));
8412 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8413 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8414 }
8415 else
8416 ASET (initial, i, make_number (-1));
8417 }
8418
8419 reg_usage = args[coding_arg_iso2022_reg_usage];
8420 CHECK_CONS (reg_usage);
8421 CHECK_NUMBER_CAR (reg_usage);
8422 CHECK_NUMBER_CDR (reg_usage);
8423
8424 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8425 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8426 {
8427 int id;
8428 Lisp_Object tmp;
8429
8430 val = Fcar (tail);
8431 CHECK_CONS (val);
8432 tmp = XCAR (val);
8433 CHECK_CHARSET_GET_ID (tmp, id);
8434 CHECK_NATNUM_CDR (val);
8435 if (XINT (XCDR (val)) >= 4)
8436 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8437 XSETCAR (val, make_number (id));
8438 }
8439
8440 flags = args[coding_arg_iso2022_flags];
8441 CHECK_NATNUM (flags);
8442 i = XINT (flags);
8443 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8444 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8445
8446 ASET (attrs, coding_attr_iso_initial, initial);
8447 ASET (attrs, coding_attr_iso_usage, reg_usage);
8448 ASET (attrs, coding_attr_iso_request, request);
8449 ASET (attrs, coding_attr_iso_flags, flags);
8450 setup_iso_safe_charsets (attrs);
8451
8452 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8453 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8454 | CODING_ISO_FLAG_SINGLE_SHIFT))
8455 ? coding_category_iso_7_else
8456 : EQ (args[coding_arg_charset_list], Qiso_2022)
8457 ? coding_category_iso_7
8458 : coding_category_iso_7_tight);
8459 else
8460 {
8461 int id = XINT (AREF (initial, 1));
8462
8463 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8464 || EQ (args[coding_arg_charset_list], Qiso_2022)
8465 || id < 0)
8466 ? coding_category_iso_8_else
8467 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8468 ? coding_category_iso_8_1
8469 : coding_category_iso_8_2);
8470 }
8471 if (category != coding_category_iso_8_1
8472 && category != coding_category_iso_8_2)
8473 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8474 }
8475 else if (EQ (coding_type, Qemacs_mule))
8476 {
8477 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8478 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8479 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8480 category = coding_category_emacs_mule;
8481 }
8482 else if (EQ (coding_type, Qshift_jis))
8483 {
8484
8485 struct charset *charset;
8486
8487 if (XINT (Flength (charset_list)) != 3
8488 && XINT (Flength (charset_list)) != 4)
8489 error ("There should be three or four charsets");
8490
8491 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8492 if (CHARSET_DIMENSION (charset) != 1)
8493 error ("Dimension of charset %s is not one",
8494 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8495 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8496 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8497
8498 charset_list = XCDR (charset_list);
8499 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8500 if (CHARSET_DIMENSION (charset) != 1)
8501 error ("Dimension of charset %s is not one",
8502 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8503
8504 charset_list = XCDR (charset_list);
8505 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8506 if (CHARSET_DIMENSION (charset) != 2)
8507 error ("Dimension of charset %s is not two",
8508 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8509
8510 charset_list = XCDR (charset_list);
8511 if (! NILP (charset_list))
8512 {
8513 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8514 if (CHARSET_DIMENSION (charset) != 2)
8515 error ("Dimension of charset %s is not two",
8516 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8517 }
8518
8519 category = coding_category_sjis;
8520 Vsjis_coding_system = name;
8521 }
8522 else if (EQ (coding_type, Qbig5))
8523 {
8524 struct charset *charset;
8525
8526 if (XINT (Flength (charset_list)) != 2)
8527 error ("There should be just two charsets");
8528
8529 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8530 if (CHARSET_DIMENSION (charset) != 1)
8531 error ("Dimension of charset %s is not one",
8532 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8533 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8534 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8535
8536 charset_list = XCDR (charset_list);
8537 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8538 if (CHARSET_DIMENSION (charset) != 2)
8539 error ("Dimension of charset %s is not two",
8540 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8541
8542 category = coding_category_big5;
8543 Vbig5_coding_system = name;
8544 }
8545 else if (EQ (coding_type, Qraw_text))
8546 {
8547 category = coding_category_raw_text;
8548 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8549 }
8550 else if (EQ (coding_type, Qutf_8))
8551 {
8552 category = coding_category_utf_8;
8553 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8554 }
8555 else if (EQ (coding_type, Qundecided))
8556 category = coding_category_undecided;
8557 else
8558 error ("Invalid coding system type: %s",
8559 SDATA (SYMBOL_NAME (coding_type)));
8560
8561 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8562 CODING_ATTR_PLIST (attrs)
8563 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8564 CODING_ATTR_PLIST (attrs)));
8565
8566 eol_type = args[coding_arg_eol_type];
8567 if (! NILP (eol_type)
8568 && ! EQ (eol_type, Qunix)
8569 && ! EQ (eol_type, Qdos)
8570 && ! EQ (eol_type, Qmac))
8571 error ("Invalid eol-type");
8572
8573 aliases = Fcons (name, Qnil);
8574
8575 if (NILP (eol_type))
8576 {
8577 eol_type = make_subsidiaries (name);
8578 for (i = 0; i < 3; i++)
8579 {
8580 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8581
8582 this_name = AREF (eol_type, i);
8583 this_aliases = Fcons (this_name, Qnil);
8584 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8585 this_spec = Fmake_vector (make_number (3), attrs);
8586 ASET (this_spec, 1, this_aliases);
8587 ASET (this_spec, 2, this_eol_type);
8588 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8589 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8590 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8591 Vcoding_system_alist);
8592 }
8593 }
8594
8595 spec_vec = Fmake_vector (make_number (3), attrs);
8596 ASET (spec_vec, 1, aliases);
8597 ASET (spec_vec, 2, eol_type);
8598
8599 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8600 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8601 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8602 Vcoding_system_alist);
8603
8604 {
8605 int id = coding_categories[category].id;
8606
8607 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8608 setup_coding_system (name, &coding_categories[category]);
8609 }
8610
8611 return Qnil;
8612
8613 short_args:
8614 return Fsignal (Qwrong_number_of_arguments,
8615 Fcons (intern ("define-coding-system-internal"),
8616 make_number (nargs)));
8617 }
8618
8619
8620 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8621 3, 3, 0,
8622 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
8623 (coding_system, prop, val)
8624 Lisp_Object coding_system, prop, val;
8625 {
8626 Lisp_Object spec, attrs, plist;
8627
8628 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8629 attrs = AREF (spec, 0);
8630 if (EQ (prop, QCmnemonic))
8631 {
8632 if (! STRINGP (val))
8633 CHECK_CHARACTER (val);
8634 CODING_ATTR_MNEMONIC (attrs) = val;
8635 }
8636 else if (EQ (prop, QCdefalut_char))
8637 {
8638 if (NILP (val))
8639 val = make_number (' ');
8640 else
8641 CHECK_CHARACTER (val);
8642 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8643 }
8644 else if (EQ (prop, QCdecode_translation_table))
8645 {
8646 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8647 CHECK_SYMBOL (val);
8648 CODING_ATTR_DECODE_TBL (attrs) = val;
8649 }
8650 else if (EQ (prop, QCencode_translation_table))
8651 {
8652 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8653 CHECK_SYMBOL (val);
8654 CODING_ATTR_ENCODE_TBL (attrs) = val;
8655 }
8656 else if (EQ (prop, QCpost_read_conversion))
8657 {
8658 CHECK_SYMBOL (val);
8659 CODING_ATTR_POST_READ (attrs) = val;
8660 }
8661 else if (EQ (prop, QCpre_write_conversion))
8662 {
8663 CHECK_SYMBOL (val);
8664 CODING_ATTR_PRE_WRITE (attrs) = val;
8665 }
8666
8667 CODING_ATTR_PLIST (attrs)
8668 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8669 return val;
8670 }
8671
8672
8673 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8674 Sdefine_coding_system_alias, 2, 2, 0,
8675 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8676 (alias, coding_system)
8677 Lisp_Object alias, coding_system;
8678 {
8679 Lisp_Object spec, aliases, eol_type;
8680
8681 CHECK_SYMBOL (alias);
8682 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8683 aliases = AREF (spec, 1);
8684 /* ALISES should be a list of length more than zero, and the first
8685 element is a base coding system. Append ALIAS at the tail of the
8686 list. */
8687 while (!NILP (XCDR (aliases)))
8688 aliases = XCDR (aliases);
8689 XSETCDR (aliases, Fcons (alias, Qnil));
8690
8691 eol_type = AREF (spec, 2);
8692 if (VECTORP (eol_type))
8693 {
8694 Lisp_Object subsidiaries;
8695 int i;
8696
8697 subsidiaries = make_subsidiaries (alias);
8698 for (i = 0; i < 3; i++)
8699 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8700 AREF (eol_type, i));
8701 }
8702
8703 Fputhash (alias, spec, Vcoding_system_hash_table);
8704 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8705 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8706 Vcoding_system_alist);
8707
8708 return Qnil;
8709 }
8710
8711 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8712 1, 1, 0,
8713 doc: /* Return the base of CODING-SYSTEM.
8714 Any alias or subsidiary coding system is not a base coding system. */)
8715 (coding_system)
8716 Lisp_Object coding_system;
8717 {
8718 Lisp_Object spec, attrs;
8719
8720 if (NILP (coding_system))
8721 return (Qno_conversion);
8722 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8723 attrs = AREF (spec, 0);
8724 return CODING_ATTR_BASE_NAME (attrs);
8725 }
8726
8727 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8728 1, 1, 0,
8729 doc: "Return the property list of CODING-SYSTEM.")
8730 (coding_system)
8731 Lisp_Object coding_system;
8732 {
8733 Lisp_Object spec, attrs;
8734
8735 if (NILP (coding_system))
8736 coding_system = Qno_conversion;
8737 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8738 attrs = AREF (spec, 0);
8739 return CODING_ATTR_PLIST (attrs);
8740 }
8741
8742
8743 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8744 1, 1, 0,
8745 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8746 (coding_system)
8747 Lisp_Object coding_system;
8748 {
8749 Lisp_Object spec;
8750
8751 if (NILP (coding_system))
8752 coding_system = Qno_conversion;
8753 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8754 return AREF (spec, 1);
8755 }
8756
8757 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8758 Scoding_system_eol_type, 1, 1, 0,
8759 doc: /* Return eol-type of CODING-SYSTEM.
8760 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8761
8762 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8763 and CR respectively.
8764
8765 A vector value indicates that a format of end-of-line should be
8766 detected automatically. Nth element of the vector is the subsidiary
8767 coding system whose eol-type is N. */)
8768 (coding_system)
8769 Lisp_Object coding_system;
8770 {
8771 Lisp_Object spec, eol_type;
8772 int n;
8773
8774 if (NILP (coding_system))
8775 coding_system = Qno_conversion;
8776 if (! CODING_SYSTEM_P (coding_system))
8777 return Qnil;
8778 spec = CODING_SYSTEM_SPEC (coding_system);
8779 eol_type = AREF (spec, 2);
8780 if (VECTORP (eol_type))
8781 return Fcopy_sequence (eol_type);
8782 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8783 return make_number (n);
8784 }
8785
8786 #endif /* emacs */
8787
8788 \f
8789 /*** 9. Post-amble ***/
8790
8791 void
8792 init_coding_once ()
8793 {
8794 int i;
8795
8796 for (i = 0; i < coding_category_max; i++)
8797 {
8798 coding_categories[i].id = -1;
8799 coding_priorities[i] = i;
8800 }
8801
8802 /* ISO2022 specific initialize routine. */
8803 for (i = 0; i < 0x20; i++)
8804 iso_code_class[i] = ISO_control_0;
8805 for (i = 0x21; i < 0x7F; i++)
8806 iso_code_class[i] = ISO_graphic_plane_0;
8807 for (i = 0x80; i < 0xA0; i++)
8808 iso_code_class[i] = ISO_control_1;
8809 for (i = 0xA1; i < 0xFF; i++)
8810 iso_code_class[i] = ISO_graphic_plane_1;
8811 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8812 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8813 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8814 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8815 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8816 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8817 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8818 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8819 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8820
8821 for (i = 0; i < 256; i++)
8822 {
8823 emacs_mule_bytes[i] = 1;
8824 }
8825 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8826 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8827 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8828 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8829 }
8830
8831 #ifdef emacs
8832
8833 void
8834 syms_of_coding ()
8835 {
8836 staticpro (&Vcoding_system_hash_table);
8837 {
8838 Lisp_Object args[2];
8839 args[0] = QCtest;
8840 args[1] = Qeq;
8841 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8842 }
8843
8844 staticpro (&Vsjis_coding_system);
8845 Vsjis_coding_system = Qnil;
8846
8847 staticpro (&Vbig5_coding_system);
8848 Vbig5_coding_system = Qnil;
8849
8850 staticpro (&Vcode_conversion_reused_workbuf);
8851 Vcode_conversion_reused_workbuf = Qnil;
8852
8853 staticpro (&Vcode_conversion_workbuf_name);
8854 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
8855
8856 reused_workbuf_in_use = 0;
8857
8858 DEFSYM (Qcharset, "charset");
8859 DEFSYM (Qtarget_idx, "target-idx");
8860 DEFSYM (Qcoding_system_history, "coding-system-history");
8861 Fset (Qcoding_system_history, Qnil);
8862
8863 /* Target FILENAME is the first argument. */
8864 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8865 /* Target FILENAME is the third argument. */
8866 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8867
8868 DEFSYM (Qcall_process, "call-process");
8869 /* Target PROGRAM is the first argument. */
8870 Fput (Qcall_process, Qtarget_idx, make_number (0));
8871
8872 DEFSYM (Qcall_process_region, "call-process-region");
8873 /* Target PROGRAM is the third argument. */
8874 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8875
8876 DEFSYM (Qstart_process, "start-process");
8877 /* Target PROGRAM is the third argument. */
8878 Fput (Qstart_process, Qtarget_idx, make_number (2));
8879
8880 DEFSYM (Qopen_network_stream, "open-network-stream");
8881 /* Target SERVICE is the fourth argument. */
8882 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8883
8884 DEFSYM (Qcoding_system, "coding-system");
8885 DEFSYM (Qcoding_aliases, "coding-aliases");
8886
8887 DEFSYM (Qeol_type, "eol-type");
8888 DEFSYM (Qunix, "unix");
8889 DEFSYM (Qdos, "dos");
8890
8891 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8892 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8893 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8894 DEFSYM (Qdefault_char, "default-char");
8895 DEFSYM (Qundecided, "undecided");
8896 DEFSYM (Qno_conversion, "no-conversion");
8897 DEFSYM (Qraw_text, "raw-text");
8898
8899 DEFSYM (Qiso_2022, "iso-2022");
8900
8901 DEFSYM (Qutf_8, "utf-8");
8902 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8903
8904 DEFSYM (Qutf_16, "utf-16");
8905 DEFSYM (Qbig, "big");
8906 DEFSYM (Qlittle, "little");
8907
8908 DEFSYM (Qshift_jis, "shift-jis");
8909 DEFSYM (Qbig5, "big5");
8910
8911 DEFSYM (Qcoding_system_p, "coding-system-p");
8912
8913 DEFSYM (Qcoding_system_error, "coding-system-error");
8914 Fput (Qcoding_system_error, Qerror_conditions,
8915 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8916 Fput (Qcoding_system_error, Qerror_message,
8917 build_string ("Invalid coding system"));
8918
8919 /* Intern this now in case it isn't already done.
8920 Setting this variable twice is harmless.
8921 But don't staticpro it here--that is done in alloc.c. */
8922 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8923
8924 DEFSYM (Qtranslation_table, "translation-table");
8925 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8926 DEFSYM (Qtranslation_table_id, "translation-table-id");
8927 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8928 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8929
8930 DEFSYM (Qvalid_codes, "valid-codes");
8931
8932 DEFSYM (Qemacs_mule, "emacs-mule");
8933
8934 DEFSYM (QCcategory, ":category");
8935 DEFSYM (QCmnemonic, ":mnemonic");
8936 DEFSYM (QCdefalut_char, ":default-char");
8937 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
8938 DEFSYM (QCencode_translation_table, ":encode-translation-table");
8939 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
8940 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
8941
8942 Vcoding_category_table
8943 = Fmake_vector (make_number (coding_category_max), Qnil);
8944 staticpro (&Vcoding_category_table);
8945 /* Followings are target of code detection. */
8946 ASET (Vcoding_category_table, coding_category_iso_7,
8947 intern ("coding-category-iso-7"));
8948 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8949 intern ("coding-category-iso-7-tight"));
8950 ASET (Vcoding_category_table, coding_category_iso_8_1,
8951 intern ("coding-category-iso-8-1"));
8952 ASET (Vcoding_category_table, coding_category_iso_8_2,
8953 intern ("coding-category-iso-8-2"));
8954 ASET (Vcoding_category_table, coding_category_iso_7_else,
8955 intern ("coding-category-iso-7-else"));
8956 ASET (Vcoding_category_table, coding_category_iso_8_else,
8957 intern ("coding-category-iso-8-else"));
8958 ASET (Vcoding_category_table, coding_category_utf_8,
8959 intern ("coding-category-utf-8"));
8960 ASET (Vcoding_category_table, coding_category_utf_16_be,
8961 intern ("coding-category-utf-16-be"));
8962 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8963 intern ("coding-category-utf-16-auto"));
8964 ASET (Vcoding_category_table, coding_category_utf_16_le,
8965 intern ("coding-category-utf-16-le"));
8966 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8967 intern ("coding-category-utf-16-be-nosig"));
8968 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8969 intern ("coding-category-utf-16-le-nosig"));
8970 ASET (Vcoding_category_table, coding_category_charset,
8971 intern ("coding-category-charset"));
8972 ASET (Vcoding_category_table, coding_category_sjis,
8973 intern ("coding-category-sjis"));
8974 ASET (Vcoding_category_table, coding_category_big5,
8975 intern ("coding-category-big5"));
8976 ASET (Vcoding_category_table, coding_category_ccl,
8977 intern ("coding-category-ccl"));
8978 ASET (Vcoding_category_table, coding_category_emacs_mule,
8979 intern ("coding-category-emacs-mule"));
8980 /* Followings are NOT target of code detection. */
8981 ASET (Vcoding_category_table, coding_category_raw_text,
8982 intern ("coding-category-raw-text"));
8983 ASET (Vcoding_category_table, coding_category_undecided,
8984 intern ("coding-category-undecided"));
8985
8986 DEFSYM (Qinsufficient_source, "insufficient-source");
8987 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
8988 DEFSYM (Qinvalid_source, "invalid-source");
8989 DEFSYM (Qinterrupted, "interrupted");
8990 DEFSYM (Qinsufficient_memory, "insufficient-memory");
8991
8992 defsubr (&Scoding_system_p);
8993 defsubr (&Sread_coding_system);
8994 defsubr (&Sread_non_nil_coding_system);
8995 defsubr (&Scheck_coding_system);
8996 defsubr (&Sdetect_coding_region);
8997 defsubr (&Sdetect_coding_string);
8998 defsubr (&Sfind_coding_systems_region_internal);
8999 defsubr (&Sunencodable_char_position);
9000 defsubr (&Scheck_coding_systems_region);
9001 defsubr (&Sdecode_coding_region);
9002 defsubr (&Sencode_coding_region);
9003 defsubr (&Sdecode_coding_string);
9004 defsubr (&Sencode_coding_string);
9005 defsubr (&Sdecode_sjis_char);
9006 defsubr (&Sencode_sjis_char);
9007 defsubr (&Sdecode_big5_char);
9008 defsubr (&Sencode_big5_char);
9009 defsubr (&Sset_terminal_coding_system_internal);
9010 defsubr (&Sset_safe_terminal_coding_system_internal);
9011 defsubr (&Sterminal_coding_system);
9012 defsubr (&Sset_keyboard_coding_system_internal);
9013 defsubr (&Skeyboard_coding_system);
9014 defsubr (&Sfind_operation_coding_system);
9015 defsubr (&Sset_coding_system_priority);
9016 defsubr (&Sdefine_coding_system_internal);
9017 defsubr (&Sdefine_coding_system_alias);
9018 defsubr (&Scoding_system_put);
9019 defsubr (&Scoding_system_base);
9020 defsubr (&Scoding_system_plist);
9021 defsubr (&Scoding_system_aliases);
9022 defsubr (&Scoding_system_eol_type);
9023 defsubr (&Scoding_system_priority_list);
9024
9025 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9026 doc: /* List of coding systems.
9027
9028 Do not alter the value of this variable manually. This variable should be
9029 updated by the functions `define-coding-system' and
9030 `define-coding-system-alias'. */);
9031 Vcoding_system_list = Qnil;
9032
9033 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9034 doc: /* Alist of coding system names.
9035 Each element is one element list of coding system name.
9036 This variable is given to `completing-read' as TABLE argument.
9037
9038 Do not alter the value of this variable manually. This variable should be
9039 updated by the functions `make-coding-system' and
9040 `define-coding-system-alias'. */);
9041 Vcoding_system_alist = Qnil;
9042
9043 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9044 doc: /* List of coding-categories (symbols) ordered by priority.
9045
9046 On detecting a coding system, Emacs tries code detection algorithms
9047 associated with each coding-category one by one in this order. When
9048 one algorithm agrees with a byte sequence of source text, the coding
9049 system bound to the corresponding coding-category is selected. */);
9050 {
9051 int i;
9052
9053 Vcoding_category_list = Qnil;
9054 for (i = coding_category_max - 1; i >= 0; i--)
9055 Vcoding_category_list
9056 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9057 Vcoding_category_list);
9058 }
9059
9060 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9061 doc: /* Specify the coding system for read operations.
9062 It is useful to bind this variable with `let', but do not set it globally.
9063 If the value is a coding system, it is used for decoding on read operation.
9064 If not, an appropriate element is used from one of the coding system alists:
9065 There are three such tables, `file-coding-system-alist',
9066 `process-coding-system-alist', and `network-coding-system-alist'. */);
9067 Vcoding_system_for_read = Qnil;
9068
9069 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9070 doc: /* Specify the coding system for write operations.
9071 Programs bind this variable with `let', but you should not set it globally.
9072 If the value is a coding system, it is used for encoding of output,
9073 when writing it to a file and when sending it to a file or subprocess.
9074
9075 If this does not specify a coding system, an appropriate element
9076 is used from one of the coding system alists:
9077 There are three such tables, `file-coding-system-alist',
9078 `process-coding-system-alist', and `network-coding-system-alist'.
9079 For output to files, if the above procedure does not specify a coding system,
9080 the value of `buffer-file-coding-system' is used. */);
9081 Vcoding_system_for_write = Qnil;
9082
9083 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9084 doc: /*
9085 Coding system used in the latest file or process I/O. */);
9086 Vlast_coding_system_used = Qnil;
9087
9088 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9089 doc: /*
9090 Error status of the last code conversion.
9091
9092 When an error was detected in the last code conversion, this variable
9093 is set to one of the following symbols.
9094 `insufficient-source'
9095 `inconsistent-eol'
9096 `invalid-source'
9097 `interrupted'
9098 `insufficient-memory'
9099 When no error was detected, the value doesn't change. So, to check
9100 the error status of a code conversion by this variable, you must
9101 explicitly set this variable to nil before performing code
9102 conversion. */);
9103 Vlast_code_conversion_error = Qnil;
9104
9105 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9106 doc: /*
9107 *Non-nil means always inhibit code conversion of end-of-line format.
9108 See info node `Coding Systems' and info node `Text and Binary' concerning
9109 such conversion. */);
9110 inhibit_eol_conversion = 0;
9111
9112 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9113 doc: /*
9114 Non-nil means process buffer inherits coding system of process output.
9115 Bind it to t if the process output is to be treated as if it were a file
9116 read from some filesystem. */);
9117 inherit_process_coding_system = 0;
9118
9119 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9120 doc: /*
9121 Alist to decide a coding system to use for a file I/O operation.
9122 The format is ((PATTERN . VAL) ...),
9123 where PATTERN is a regular expression matching a file name,
9124 VAL is a coding system, a cons of coding systems, or a function symbol.
9125 If VAL is a coding system, it is used for both decoding and encoding
9126 the file contents.
9127 If VAL is a cons of coding systems, the car part is used for decoding,
9128 and the cdr part is used for encoding.
9129 If VAL is a function symbol, the function must return a coding system
9130 or a cons of coding systems which are used as above. The function gets
9131 the arguments with which `find-operation-coding-systems' was called.
9132
9133 See also the function `find-operation-coding-system'
9134 and the variable `auto-coding-alist'. */);
9135 Vfile_coding_system_alist = Qnil;
9136
9137 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9138 doc: /*
9139 Alist to decide a coding system to use for a process I/O operation.
9140 The format is ((PATTERN . VAL) ...),
9141 where PATTERN is a regular expression matching a program name,
9142 VAL is a coding system, a cons of coding systems, or a function symbol.
9143 If VAL is a coding system, it is used for both decoding what received
9144 from the program and encoding what sent to the program.
9145 If VAL is a cons of coding systems, the car part is used for decoding,
9146 and the cdr part is used for encoding.
9147 If VAL is a function symbol, the function must return a coding system
9148 or a cons of coding systems which are used as above.
9149
9150 See also the function `find-operation-coding-system'. */);
9151 Vprocess_coding_system_alist = Qnil;
9152
9153 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9154 doc: /*
9155 Alist to decide a coding system to use for a network I/O operation.
9156 The format is ((PATTERN . VAL) ...),
9157 where PATTERN is a regular expression matching a network service name
9158 or is a port number to connect to,
9159 VAL is a coding system, a cons of coding systems, or a function symbol.
9160 If VAL is a coding system, it is used for both decoding what received
9161 from the network stream and encoding what sent to the network stream.
9162 If VAL is a cons of coding systems, the car part is used for decoding,
9163 and the cdr part is used for encoding.
9164 If VAL is a function symbol, the function must return a coding system
9165 or a cons of coding systems which are used as above.
9166
9167 See also the function `find-operation-coding-system'. */);
9168 Vnetwork_coding_system_alist = Qnil;
9169
9170 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9171 doc: /* Coding system to use with system messages.
9172 Also used for decoding keyboard input on X Window system. */);
9173 Vlocale_coding_system = Qnil;
9174
9175 /* The eol mnemonics are reset in startup.el system-dependently. */
9176 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9177 doc: /*
9178 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9179 eol_mnemonic_unix = build_string (":");
9180
9181 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9182 doc: /*
9183 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9184 eol_mnemonic_dos = build_string ("\\");
9185
9186 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9187 doc: /*
9188 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9189 eol_mnemonic_mac = build_string ("/");
9190
9191 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9192 doc: /*
9193 *String displayed in mode line when end-of-line format is not yet determined. */);
9194 eol_mnemonic_undecided = build_string (":");
9195
9196 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9197 doc: /*
9198 *Non-nil enables character translation while encoding and decoding. */);
9199 Venable_character_translation = Qt;
9200
9201 DEFVAR_LISP ("standard-translation-table-for-decode",
9202 &Vstandard_translation_table_for_decode,
9203 doc: /* Table for translating characters while decoding. */);
9204 Vstandard_translation_table_for_decode = Qnil;
9205
9206 DEFVAR_LISP ("standard-translation-table-for-encode",
9207 &Vstandard_translation_table_for_encode,
9208 doc: /* Table for translating characters while encoding. */);
9209 Vstandard_translation_table_for_encode = Qnil;
9210
9211 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9212 doc: /* Alist of charsets vs revision numbers.
9213 While encoding, if a charset (car part of an element) is found,
9214 designate it with the escape sequence identifying revision (cdr part
9215 of the element). */);
9216 Vcharset_revision_table = Qnil;
9217
9218 DEFVAR_LISP ("default-process-coding-system",
9219 &Vdefault_process_coding_system,
9220 doc: /* Cons of coding systems used for process I/O by default.
9221 The car part is used for decoding a process output,
9222 the cdr part is used for encoding a text to be sent to a process. */);
9223 Vdefault_process_coding_system = Qnil;
9224
9225 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9226 doc: /*
9227 Table of extra Latin codes in the range 128..159 (inclusive).
9228 This is a vector of length 256.
9229 If Nth element is non-nil, the existence of code N in a file
9230 \(or output of subprocess) doesn't prevent it to be detected as
9231 a coding system of ISO 2022 variant which has a flag
9232 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9233 or reading output of a subprocess.
9234 Only 128th through 159th elements has a meaning. */);
9235 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9236
9237 DEFVAR_LISP ("select-safe-coding-system-function",
9238 &Vselect_safe_coding_system_function,
9239 doc: /*
9240 Function to call to select safe coding system for encoding a text.
9241
9242 If set, this function is called to force a user to select a proper
9243 coding system which can encode the text in the case that a default
9244 coding system used in each operation can't encode the text.
9245
9246 The default value is `select-safe-coding-system' (which see). */);
9247 Vselect_safe_coding_system_function = Qnil;
9248
9249 DEFVAR_BOOL ("coding-system-require-warning",
9250 &coding_system_require_warning,
9251 doc: /* Internal use only.
9252 If non-nil, on writing a file, `select-safe-coding-system-function' is
9253 called even if `coding-system-for-write' is non-nil. The command
9254 `universal-coding-system-argument' binds this variable to t temporarily. */);
9255 coding_system_require_warning = 0;
9256
9257
9258 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9259 &inhibit_iso_escape_detection,
9260 doc: /*
9261 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9262
9263 By default, on reading a file, Emacs tries to detect how the text is
9264 encoded. This code detection is sensitive to escape sequences. If
9265 the sequence is valid as ISO2022, the code is determined as one of
9266 the ISO2022 encodings, and the file is decoded by the corresponding
9267 coding system (e.g. `iso-2022-7bit').
9268
9269 However, there may be a case that you want to read escape sequences in
9270 a file as is. In such a case, you can set this variable to non-nil.
9271 Then, as the code detection ignores any escape sequences, no file is
9272 detected as encoded in some ISO2022 encoding. The result is that all
9273 escape sequences become visible in a buffer.
9274
9275 The default value is nil, and it is strongly recommended not to change
9276 it. That is because many Emacs Lisp source files that contain
9277 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9278 in Emacs's distribution, and they won't be decoded correctly on
9279 reading if you suppress escape sequence detection.
9280
9281 The other way to read escape sequences in a file without decoding is
9282 to explicitly specify some coding system that doesn't use ISO2022's
9283 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9284 inhibit_iso_escape_detection = 0;
9285
9286 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9287 doc: /* Char table for translating self-inserting characters.
9288 This is applied to the result of input methods, not their input. See also
9289 `keyboard-translate-table'. */);
9290 Vtranslation_table_for_input = Qnil;
9291
9292 {
9293 Lisp_Object args[coding_arg_max];
9294 Lisp_Object plist[16];
9295 int i;
9296
9297 for (i = 0; i < coding_arg_max; i++)
9298 args[i] = Qnil;
9299
9300 plist[0] = intern (":name");
9301 plist[1] = args[coding_arg_name] = Qno_conversion;
9302 plist[2] = intern (":mnemonic");
9303 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9304 plist[4] = intern (":coding-type");
9305 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9306 plist[6] = intern (":ascii-compatible-p");
9307 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9308 plist[8] = intern (":default-char");
9309 plist[9] = args[coding_arg_default_char] = make_number (0);
9310 plist[10] = intern (":for-unibyte");
9311 plist[11] = args[coding_arg_for_unibyte] = Qt;
9312 plist[12] = intern (":docstring");
9313 plist[13] = build_string ("Do no conversion.\n\
9314 \n\
9315 When you visit a file with this coding, the file is read into a\n\
9316 unibyte buffer as is, thus each byte of a file is treated as a\n\
9317 character.");
9318 plist[14] = intern (":eol-type");
9319 plist[15] = args[coding_arg_eol_type] = Qunix;
9320 args[coding_arg_plist] = Flist (16, plist);
9321 Fdefine_coding_system_internal (coding_arg_max, args);
9322 }
9323
9324 setup_coding_system (Qno_conversion, &keyboard_coding);
9325 setup_coding_system (Qno_conversion, &terminal_coding);
9326 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9327
9328 {
9329 int i;
9330
9331 for (i = 0; i < coding_category_max; i++)
9332 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9333 }
9334 }
9335
9336 char *
9337 emacs_strerror (error_number)
9338 int error_number;
9339 {
9340 char *str;
9341
9342 synchronize_system_messages_locale ();
9343 str = strerror (error_number);
9344
9345 if (! NILP (Vlocale_coding_system))
9346 {
9347 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9348 Vlocale_coding_system,
9349 0);
9350 str = (char *) SDATA (dec);
9351 }
9352
9353 return str;
9354 }
9355
9356 #endif /* emacs */