(Qinsufficient_source, Qinconsistent_eol)
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory;
315
316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318 Lisp_Object Qstart_process, Qopen_network_stream;
319 Lisp_Object Qtarget_idx;
320
321 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
322 Lisp_Object Qinterrupted, Qinsufficient_memory;
323
324 int coding_system_require_warning;
325
326 Lisp_Object Vselect_safe_coding_system_function;
327
328 /* Mnemonic string for each format of end-of-line. */
329 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
330 /* Mnemonic string to indicate format of end-of-line is not yet
331 decided. */
332 Lisp_Object eol_mnemonic_undecided;
333
334 #ifdef emacs
335
336 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
337
338 Lisp_Object Qcoding_system_p, Qcoding_system_error;
339
340 /* Coding system emacs-mule and raw-text are for converting only
341 end-of-line format. */
342 Lisp_Object Qemacs_mule, Qraw_text;
343 Lisp_Object Qutf_8_emacs;
344
345 /* Coding-systems are handed between Emacs Lisp programs and C internal
346 routines by the following three variables. */
347 /* Coding-system for reading files and receiving data from process. */
348 Lisp_Object Vcoding_system_for_read;
349 /* Coding-system for writing files and sending data to process. */
350 Lisp_Object Vcoding_system_for_write;
351 /* Coding-system actually used in the latest I/O. */
352 Lisp_Object Vlast_coding_system_used;
353 /* Set to non-nil when an error is detected while code conversion. */
354 Lisp_Object Vlast_code_conversion_error;
355 /* A vector of length 256 which contains information about special
356 Latin codes (especially for dealing with Microsoft codes). */
357 Lisp_Object Vlatin_extra_code_table;
358
359 /* Flag to inhibit code conversion of end-of-line format. */
360 int inhibit_eol_conversion;
361
362 /* Flag to inhibit ISO2022 escape sequence detection. */
363 int inhibit_iso_escape_detection;
364
365 /* Flag to make buffer-file-coding-system inherit from process-coding. */
366 int inherit_process_coding_system;
367
368 /* Coding system to be used to encode text for terminal display. */
369 struct coding_system terminal_coding;
370
371 /* Coding system to be used to encode text for terminal display when
372 terminal coding system is nil. */
373 struct coding_system safe_terminal_coding;
374
375 /* Coding system of what is sent from terminal keyboard. */
376 struct coding_system keyboard_coding;
377
378 Lisp_Object Vfile_coding_system_alist;
379 Lisp_Object Vprocess_coding_system_alist;
380 Lisp_Object Vnetwork_coding_system_alist;
381
382 Lisp_Object Vlocale_coding_system;
383
384 #endif /* emacs */
385
386 /* Flag to tell if we look up translation table on character code
387 conversion. */
388 Lisp_Object Venable_character_translation;
389 /* Standard translation table to look up on decoding (reading). */
390 Lisp_Object Vstandard_translation_table_for_decode;
391 /* Standard translation table to look up on encoding (writing). */
392 Lisp_Object Vstandard_translation_table_for_encode;
393
394 Lisp_Object Qtranslation_table;
395 Lisp_Object Qtranslation_table_id;
396 Lisp_Object Qtranslation_table_for_decode;
397 Lisp_Object Qtranslation_table_for_encode;
398
399 /* Alist of charsets vs revision number. */
400 static Lisp_Object Vcharset_revision_table;
401
402 /* Default coding systems used for process I/O. */
403 Lisp_Object Vdefault_process_coding_system;
404
405 /* Char table for translating Quail and self-inserting input. */
406 Lisp_Object Vtranslation_table_for_input;
407
408 /* Two special coding systems. */
409 Lisp_Object Vsjis_coding_system;
410 Lisp_Object Vbig5_coding_system;
411
412
413 static void record_conversion_result (struct coding_system *coding,
414 enum coding_result_code result);
415 static int detect_coding_utf_8 P_ ((struct coding_system *,
416 struct coding_detection_info *info));
417 static void decode_coding_utf_8 P_ ((struct coding_system *));
418 static int encode_coding_utf_8 P_ ((struct coding_system *));
419
420 static int detect_coding_utf_16 P_ ((struct coding_system *,
421 struct coding_detection_info *info));
422 static void decode_coding_utf_16 P_ ((struct coding_system *));
423 static int encode_coding_utf_16 P_ ((struct coding_system *));
424
425 static int detect_coding_iso_2022 P_ ((struct coding_system *,
426 struct coding_detection_info *info));
427 static void decode_coding_iso_2022 P_ ((struct coding_system *));
428 static int encode_coding_iso_2022 P_ ((struct coding_system *));
429
430 static int detect_coding_emacs_mule P_ ((struct coding_system *,
431 struct coding_detection_info *info));
432 static void decode_coding_emacs_mule P_ ((struct coding_system *));
433 static int encode_coding_emacs_mule P_ ((struct coding_system *));
434
435 static int detect_coding_sjis P_ ((struct coding_system *,
436 struct coding_detection_info *info));
437 static void decode_coding_sjis P_ ((struct coding_system *));
438 static int encode_coding_sjis P_ ((struct coding_system *));
439
440 static int detect_coding_big5 P_ ((struct coding_system *,
441 struct coding_detection_info *info));
442 static void decode_coding_big5 P_ ((struct coding_system *));
443 static int encode_coding_big5 P_ ((struct coding_system *));
444
445 static int detect_coding_ccl P_ ((struct coding_system *,
446 struct coding_detection_info *info));
447 static void decode_coding_ccl P_ ((struct coding_system *));
448 static int encode_coding_ccl P_ ((struct coding_system *));
449
450 static void decode_coding_raw_text P_ ((struct coding_system *));
451 static int encode_coding_raw_text P_ ((struct coding_system *));
452
453
454 /* ISO2022 section */
455
456 #define CODING_ISO_INITIAL(coding, reg) \
457 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
458 coding_attr_iso_initial), \
459 reg)))
460
461
462 #define CODING_ISO_REQUEST(coding, charset_id) \
463 ((charset_id <= (coding)->max_charset_id \
464 ? (coding)->safe_charsets[charset_id] \
465 : -1))
466
467
468 #define CODING_ISO_FLAGS(coding) \
469 ((coding)->spec.iso_2022.flags)
470 #define CODING_ISO_DESIGNATION(coding, reg) \
471 ((coding)->spec.iso_2022.current_designation[reg])
472 #define CODING_ISO_INVOCATION(coding, plane) \
473 ((coding)->spec.iso_2022.current_invocation[plane])
474 #define CODING_ISO_SINGLE_SHIFTING(coding) \
475 ((coding)->spec.iso_2022.single_shifting)
476 #define CODING_ISO_BOL(coding) \
477 ((coding)->spec.iso_2022.bol)
478 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
479 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
480
481 /* Control characters of ISO2022. */
482 /* code */ /* function */
483 #define ISO_CODE_LF 0x0A /* line-feed */
484 #define ISO_CODE_CR 0x0D /* carriage-return */
485 #define ISO_CODE_SO 0x0E /* shift-out */
486 #define ISO_CODE_SI 0x0F /* shift-in */
487 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
488 #define ISO_CODE_ESC 0x1B /* escape */
489 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
490 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
491 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
492
493 /* All code (1-byte) of ISO2022 is classified into one of the
494 followings. */
495 enum iso_code_class_type
496 {
497 ISO_control_0, /* Control codes in the range
498 0x00..0x1F and 0x7F, except for the
499 following 5 codes. */
500 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
501 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
502 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
503 ISO_escape, /* ISO_CODE_SO (0x1B) */
504 ISO_control_1, /* Control codes in the range
505 0x80..0x9F, except for the
506 following 3 codes. */
507 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
508 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
509 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
510 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
511 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
512 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
513 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
514 };
515
516 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
517 `iso-flags' attribute of an iso2022 coding system. */
518
519 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
520 instead of the correct short-form sequence (e.g. ESC $ A). */
521 #define CODING_ISO_FLAG_LONG_FORM 0x0001
522
523 /* If set, reset graphic planes and registers at end-of-line to the
524 initial state. */
525 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
526
527 /* If set, reset graphic planes and registers before any control
528 characters to the initial state. */
529 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
530
531 /* If set, encode by 7-bit environment. */
532 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
533
534 /* If set, use locking-shift function. */
535 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
536
537 /* If set, use single-shift function. Overwrite
538 CODING_ISO_FLAG_LOCKING_SHIFT. */
539 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
540
541 /* If set, use designation escape sequence. */
542 #define CODING_ISO_FLAG_DESIGNATION 0x0040
543
544 /* If set, produce revision number sequence. */
545 #define CODING_ISO_FLAG_REVISION 0x0080
546
547 /* If set, produce ISO6429's direction specifying sequence. */
548 #define CODING_ISO_FLAG_DIRECTION 0x0100
549
550 /* If set, assume designation states are reset at beginning of line on
551 output. */
552 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
553
554 /* If set, designation sequence should be placed at beginning of line
555 on output. */
556 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
557
558 /* If set, do not encode unsafe charactes on output. */
559 #define CODING_ISO_FLAG_SAFE 0x0800
560
561 /* If set, extra latin codes (128..159) are accepted as a valid code
562 on input. */
563 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
564
565 #define CODING_ISO_FLAG_COMPOSITION 0x2000
566
567 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
568
569 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
570
571 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
572
573 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
574
575 /* A character to be produced on output if encoding of the original
576 character is prohibited by CODING_ISO_FLAG_SAFE. */
577 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
578
579
580 /* UTF-16 section */
581 #define CODING_UTF_16_BOM(coding) \
582 ((coding)->spec.utf_16.bom)
583
584 #define CODING_UTF_16_ENDIAN(coding) \
585 ((coding)->spec.utf_16.endian)
586
587 #define CODING_UTF_16_SURROGATE(coding) \
588 ((coding)->spec.utf_16.surrogate)
589
590
591 /* CCL section */
592 #define CODING_CCL_DECODER(coding) \
593 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
594 #define CODING_CCL_ENCODER(coding) \
595 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
596 #define CODING_CCL_VALIDS(coding) \
597 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
598
599 /* Index for each coding category in `coding_categories' */
600
601 enum coding_category
602 {
603 coding_category_iso_7,
604 coding_category_iso_7_tight,
605 coding_category_iso_8_1,
606 coding_category_iso_8_2,
607 coding_category_iso_7_else,
608 coding_category_iso_8_else,
609 coding_category_utf_8,
610 coding_category_utf_16_auto,
611 coding_category_utf_16_be,
612 coding_category_utf_16_le,
613 coding_category_utf_16_be_nosig,
614 coding_category_utf_16_le_nosig,
615 coding_category_charset,
616 coding_category_sjis,
617 coding_category_big5,
618 coding_category_ccl,
619 coding_category_emacs_mule,
620 /* All above are targets of code detection. */
621 coding_category_raw_text,
622 coding_category_undecided,
623 coding_category_max
624 };
625
626 /* Definitions of flag bits used in detect_coding_XXXX. */
627 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
628 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
629 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
630 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
631 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
632 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
633 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
634 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
635 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
636 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
637 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
638 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
639 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
640 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
641 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
642 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
643 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
644 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
645
646 /* This value is returned if detect_coding_mask () find nothing other
647 than ASCII characters. */
648 #define CATEGORY_MASK_ANY \
649 (CATEGORY_MASK_ISO_7 \
650 | CATEGORY_MASK_ISO_7_TIGHT \
651 | CATEGORY_MASK_ISO_8_1 \
652 | CATEGORY_MASK_ISO_8_2 \
653 | CATEGORY_MASK_ISO_7_ELSE \
654 | CATEGORY_MASK_ISO_8_ELSE \
655 | CATEGORY_MASK_UTF_8 \
656 | CATEGORY_MASK_UTF_16_BE \
657 | CATEGORY_MASK_UTF_16_LE \
658 | CATEGORY_MASK_UTF_16_BE_NOSIG \
659 | CATEGORY_MASK_UTF_16_LE_NOSIG \
660 | CATEGORY_MASK_CHARSET \
661 | CATEGORY_MASK_SJIS \
662 | CATEGORY_MASK_BIG5 \
663 | CATEGORY_MASK_CCL \
664 | CATEGORY_MASK_EMACS_MULE)
665
666
667 #define CATEGORY_MASK_ISO_7BIT \
668 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
669
670 #define CATEGORY_MASK_ISO_8BIT \
671 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
672
673 #define CATEGORY_MASK_ISO_ELSE \
674 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
675
676 #define CATEGORY_MASK_ISO_ESCAPE \
677 (CATEGORY_MASK_ISO_7 \
678 | CATEGORY_MASK_ISO_7_TIGHT \
679 | CATEGORY_MASK_ISO_7_ELSE \
680 | CATEGORY_MASK_ISO_8_ELSE)
681
682 #define CATEGORY_MASK_ISO \
683 ( CATEGORY_MASK_ISO_7BIT \
684 | CATEGORY_MASK_ISO_8BIT \
685 | CATEGORY_MASK_ISO_ELSE)
686
687 #define CATEGORY_MASK_UTF_16 \
688 (CATEGORY_MASK_UTF_16_BE \
689 | CATEGORY_MASK_UTF_16_LE \
690 | CATEGORY_MASK_UTF_16_BE_NOSIG \
691 | CATEGORY_MASK_UTF_16_LE_NOSIG)
692
693
694 /* List of symbols `coding-category-xxx' ordered by priority. This
695 variable is exposed to Emacs Lisp. */
696 static Lisp_Object Vcoding_category_list;
697
698 /* Table of coding categories (Lisp symbols). This variable is for
699 internal use oly. */
700 static Lisp_Object Vcoding_category_table;
701
702 /* Table of coding-categories ordered by priority. */
703 static enum coding_category coding_priorities[coding_category_max];
704
705 /* Nth element is a coding context for the coding system bound to the
706 Nth coding category. */
707 static struct coding_system coding_categories[coding_category_max];
708
709 /*** Commonly used macros and functions ***/
710
711 #ifndef min
712 #define min(a, b) ((a) < (b) ? (a) : (b))
713 #endif
714 #ifndef max
715 #define max(a, b) ((a) > (b) ? (a) : (b))
716 #endif
717
718 #define CODING_GET_INFO(coding, attrs, charset_list) \
719 do { \
720 (attrs) = CODING_ID_ATTRS ((coding)->id); \
721 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
722 } while (0)
723
724
725 /* Safely get one byte from the source text pointed by SRC which ends
726 at SRC_END, and set C to that byte. If there are not enough bytes
727 in the source, it jumps to `no_more_source'. If multibytep is
728 nonzero, and a multibyte character is found at SRC, set C to the
729 negative value of the character code. The caller should declare
730 and set these variables appropriately in advance:
731 src, src_end, multibytep */
732
733 #define ONE_MORE_BYTE(c) \
734 do { \
735 if (src == src_end) \
736 { \
737 if (src_base < src) \
738 record_conversion_result \
739 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
740 goto no_more_source; \
741 } \
742 c = *src++; \
743 if (multibytep && (c & 0x80)) \
744 { \
745 if ((c & 0xFE) == 0xC0) \
746 c = ((c & 1) << 6) | *src++; \
747 else \
748 { \
749 c = - string_char (--src, &src, NULL); \
750 record_conversion_result \
751 (coding, CODING_RESULT_INVALID_SRC); \
752 } \
753 } \
754 consumed_chars++; \
755 } while (0)
756
757
758 #define ONE_MORE_BYTE_NO_CHECK(c) \
759 do { \
760 c = *src++; \
761 if (multibytep && (c & 0x80)) \
762 { \
763 if ((c & 0xFE) == 0xC0) \
764 c = ((c & 1) << 6) | *src++; \
765 else \
766 { \
767 c = - string_char (--src, &src, NULL); \
768 record_conversion_result \
769 (coding, CODING_RESULT_INVALID_SRC); \
770 } \
771 } \
772 consumed_chars++; \
773 } while (0)
774
775
776 /* Store a byte C in the place pointed by DST and increment DST to the
777 next free point, and increment PRODUCED_CHARS. The caller should
778 assure that C is 0..127, and declare and set the variable `dst'
779 appropriately in advance.
780 */
781
782
783 #define EMIT_ONE_ASCII_BYTE(c) \
784 do { \
785 produced_chars++; \
786 *dst++ = (c); \
787 } while (0)
788
789
790 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
791
792 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
793 do { \
794 produced_chars += 2; \
795 *dst++ = (c1), *dst++ = (c2); \
796 } while (0)
797
798
799 /* Store a byte C in the place pointed by DST and increment DST to the
800 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
801 nonzero, store in an appropriate multibyte from. The caller should
802 declare and set the variables `dst' and `multibytep' appropriately
803 in advance. */
804
805 #define EMIT_ONE_BYTE(c) \
806 do { \
807 produced_chars++; \
808 if (multibytep) \
809 { \
810 int ch = (c); \
811 if (ch >= 0x80) \
812 ch = BYTE8_TO_CHAR (ch); \
813 CHAR_STRING_ADVANCE (ch, dst); \
814 } \
815 else \
816 *dst++ = (c); \
817 } while (0)
818
819
820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
821
822 #define EMIT_TWO_BYTES(c1, c2) \
823 do { \
824 produced_chars += 2; \
825 if (multibytep) \
826 { \
827 int ch; \
828 \
829 ch = (c1); \
830 if (ch >= 0x80) \
831 ch = BYTE8_TO_CHAR (ch); \
832 CHAR_STRING_ADVANCE (ch, dst); \
833 ch = (c2); \
834 if (ch >= 0x80) \
835 ch = BYTE8_TO_CHAR (ch); \
836 CHAR_STRING_ADVANCE (ch, dst); \
837 } \
838 else \
839 { \
840 *dst++ = (c1); \
841 *dst++ = (c2); \
842 } \
843 } while (0)
844
845
846 #define EMIT_THREE_BYTES(c1, c2, c3) \
847 do { \
848 EMIT_ONE_BYTE (c1); \
849 EMIT_TWO_BYTES (c2, c3); \
850 } while (0)
851
852
853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
854 do { \
855 EMIT_TWO_BYTES (c1, c2); \
856 EMIT_TWO_BYTES (c3, c4); \
857 } while (0)
858
859
860 static void
861 record_conversion_result (struct coding_system *coding,
862 enum coding_result_code result)
863 {
864 coding->result = result;
865 switch (result)
866 {
867 case CODING_RESULT_INSUFFICIENT_SRC:
868 Vlast_code_conversion_error = Qinsufficient_source;
869 break;
870 case CODING_RESULT_INCONSISTENT_EOL:
871 Vlast_code_conversion_error = Qinconsistent_eol;
872 break;
873 case CODING_RESULT_INVALID_SRC:
874 Vlast_code_conversion_error = Qinvalid_source;
875 break;
876 case CODING_RESULT_INTERRUPT:
877 Vlast_code_conversion_error = Qinterrupted;
878 break;
879 case CODING_RESULT_INSUFFICIENT_MEM:
880 Vlast_code_conversion_error = Qinsufficient_memory;
881 break;
882 }
883 }
884
885 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
886 do { \
887 charset_map_loaded = 0; \
888 c = DECODE_CHAR (charset, code); \
889 if (charset_map_loaded) \
890 { \
891 const unsigned char *orig = coding->source; \
892 EMACS_INT offset; \
893 \
894 coding_set_source (coding); \
895 offset = coding->source - orig; \
896 src += offset; \
897 src_base += offset; \
898 src_end += offset; \
899 } \
900 } while (0)
901
902
903 #define ASSURE_DESTINATION(bytes) \
904 do { \
905 if (dst + (bytes) >= dst_end) \
906 { \
907 int more_bytes = charbuf_end - charbuf + (bytes); \
908 \
909 dst = alloc_destination (coding, more_bytes, dst); \
910 dst_end = coding->destination + coding->dst_bytes; \
911 } \
912 } while (0)
913
914
915
916 static void
917 coding_set_source (coding)
918 struct coding_system *coding;
919 {
920 if (BUFFERP (coding->src_object))
921 {
922 struct buffer *buf = XBUFFER (coding->src_object);
923
924 if (coding->src_pos < 0)
925 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
926 else
927 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
928 }
929 else if (STRINGP (coding->src_object))
930 {
931 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
932 }
933 else
934 /* Otherwise, the source is C string and is never relocated
935 automatically. Thus we don't have to update anything. */
936 ;
937 }
938
939 static void
940 coding_set_destination (coding)
941 struct coding_system *coding;
942 {
943 if (BUFFERP (coding->dst_object))
944 {
945 if (coding->src_pos < 0)
946 {
947 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
948 coding->dst_bytes = (GAP_END_ADDR
949 - (coding->src_bytes - coding->consumed)
950 - coding->destination);
951 }
952 else
953 {
954 /* We are sure that coding->dst_pos_byte is before the gap
955 of the buffer. */
956 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
957 + coding->dst_pos_byte - 1);
958 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
959 - coding->destination);
960 }
961 }
962 else
963 /* Otherwise, the destination is C string and is never relocated
964 automatically. Thus we don't have to update anything. */
965 ;
966 }
967
968
969 static void
970 coding_alloc_by_realloc (coding, bytes)
971 struct coding_system *coding;
972 EMACS_INT bytes;
973 {
974 coding->destination = (unsigned char *) xrealloc (coding->destination,
975 coding->dst_bytes + bytes);
976 coding->dst_bytes += bytes;
977 }
978
979 static void
980 coding_alloc_by_making_gap (coding, bytes)
981 struct coding_system *coding;
982 EMACS_INT bytes;
983 {
984 if (BUFFERP (coding->dst_object)
985 && EQ (coding->src_object, coding->dst_object))
986 {
987 EMACS_INT add = coding->src_bytes - coding->consumed;
988
989 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
990 make_gap (bytes);
991 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
992 }
993 else
994 {
995 Lisp_Object this_buffer;
996
997 this_buffer = Fcurrent_buffer ();
998 set_buffer_internal (XBUFFER (coding->dst_object));
999 make_gap (bytes);
1000 set_buffer_internal (XBUFFER (this_buffer));
1001 }
1002 }
1003
1004
1005 static unsigned char *
1006 alloc_destination (coding, nbytes, dst)
1007 struct coding_system *coding;
1008 EMACS_INT nbytes;
1009 unsigned char *dst;
1010 {
1011 EMACS_INT offset = dst - coding->destination;
1012
1013 if (BUFFERP (coding->dst_object))
1014 coding_alloc_by_making_gap (coding, nbytes);
1015 else
1016 coding_alloc_by_realloc (coding, nbytes);
1017 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1018 coding_set_destination (coding);
1019 dst = coding->destination + offset;
1020 return dst;
1021 }
1022
1023 /** Macros for annotations. */
1024
1025 /* Maximum length of annotation data (sum of annotations for
1026 composition and charset). */
1027 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1028
1029 /* An annotation data is stored in the array coding->charbuf in this
1030 format:
1031 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1032 LENGTH is the number of elements in the annotation.
1033 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1034 FROM and TO specify the range of text annotated. They are relative
1035 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1036
1037 The format of the following elements depend on ANNOTATION_MASK.
1038
1039 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1040 follows:
1041 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1042 METHOD is one of enum composition_method.
1043 Optionnal COMPOSITION-COMPONENTS are characters and composition
1044 rules.
1045
1046 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1047 follows. */
1048
1049 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1050 do { \
1051 *(buf)++ = -(len); \
1052 *(buf)++ = (mask); \
1053 *(buf)++ = (from); \
1054 *(buf)++ = (to); \
1055 coding->annotated = 1; \
1056 } while (0);
1057
1058 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1059 do { \
1060 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1061 *buf++ = method; \
1062 } while (0)
1063
1064
1065 #define ADD_CHARSET_DATA(buf, from, to, id) \
1066 do { \
1067 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1068 *buf++ = id; \
1069 } while (0)
1070
1071 \f
1072 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1073
1074
1075
1076 \f
1077 /*** 3. UTF-8 ***/
1078
1079 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1080 Check if a text is encoded in UTF-8. If it is, return 1, else
1081 return 0. */
1082
1083 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1084 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1085 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1086 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1087 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1088 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1089
1090 static int
1091 detect_coding_utf_8 (coding, detect_info)
1092 struct coding_system *coding;
1093 struct coding_detection_info *detect_info;
1094 {
1095 const unsigned char *src = coding->source, *src_base;
1096 const unsigned char *src_end = coding->source + coding->src_bytes;
1097 int multibytep = coding->src_multibyte;
1098 int consumed_chars = 0;
1099 int found = 0;
1100
1101 detect_info->checked |= CATEGORY_MASK_UTF_8;
1102 /* A coding system of this category is always ASCII compatible. */
1103 src += coding->head_ascii;
1104
1105 while (1)
1106 {
1107 int c, c1, c2, c3, c4;
1108
1109 src_base = src;
1110 ONE_MORE_BYTE (c);
1111 if (c < 0 || UTF_8_1_OCTET_P (c))
1112 continue;
1113 ONE_MORE_BYTE (c1);
1114 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1115 break;
1116 if (UTF_8_2_OCTET_LEADING_P (c))
1117 {
1118 found = CATEGORY_MASK_UTF_8;
1119 continue;
1120 }
1121 ONE_MORE_BYTE (c2);
1122 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1123 break;
1124 if (UTF_8_3_OCTET_LEADING_P (c))
1125 {
1126 found = CATEGORY_MASK_UTF_8;
1127 continue;
1128 }
1129 ONE_MORE_BYTE (c3);
1130 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1131 break;
1132 if (UTF_8_4_OCTET_LEADING_P (c))
1133 {
1134 found = CATEGORY_MASK_UTF_8;
1135 continue;
1136 }
1137 ONE_MORE_BYTE (c4);
1138 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1139 break;
1140 if (UTF_8_5_OCTET_LEADING_P (c))
1141 {
1142 found = CATEGORY_MASK_UTF_8;
1143 continue;
1144 }
1145 break;
1146 }
1147 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1148 return 0;
1149
1150 no_more_source:
1151 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1152 {
1153 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1154 return 0;
1155 }
1156 detect_info->found |= found;
1157 return 1;
1158 }
1159
1160
1161 static void
1162 decode_coding_utf_8 (coding)
1163 struct coding_system *coding;
1164 {
1165 const unsigned char *src = coding->source + coding->consumed;
1166 const unsigned char *src_end = coding->source + coding->src_bytes;
1167 const unsigned char *src_base;
1168 int *charbuf = coding->charbuf;
1169 int *charbuf_end = charbuf + coding->charbuf_size;
1170 int consumed_chars = 0, consumed_chars_base;
1171 int multibytep = coding->src_multibyte;
1172 Lisp_Object attr, charset_list;
1173
1174 CODING_GET_INFO (coding, attr, charset_list);
1175
1176 while (1)
1177 {
1178 int c, c1, c2, c3, c4, c5;
1179
1180 src_base = src;
1181 consumed_chars_base = consumed_chars;
1182
1183 if (charbuf >= charbuf_end)
1184 break;
1185
1186 ONE_MORE_BYTE (c1);
1187 if (c1 < 0)
1188 {
1189 c = - c1;
1190 }
1191 else if (UTF_8_1_OCTET_P(c1))
1192 {
1193 c = c1;
1194 }
1195 else
1196 {
1197 ONE_MORE_BYTE (c2);
1198 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1199 goto invalid_code;
1200 if (UTF_8_2_OCTET_LEADING_P (c1))
1201 {
1202 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1203 /* Reject overlong sequences here and below. Encoders
1204 producing them are incorrect, they can be misleading,
1205 and they mess up read/write invariance. */
1206 if (c < 128)
1207 goto invalid_code;
1208 }
1209 else
1210 {
1211 ONE_MORE_BYTE (c3);
1212 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1213 goto invalid_code;
1214 if (UTF_8_3_OCTET_LEADING_P (c1))
1215 {
1216 c = (((c1 & 0xF) << 12)
1217 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1218 if (c < 0x800
1219 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1220 goto invalid_code;
1221 }
1222 else
1223 {
1224 ONE_MORE_BYTE (c4);
1225 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1226 goto invalid_code;
1227 if (UTF_8_4_OCTET_LEADING_P (c1))
1228 {
1229 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1230 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1231 if (c < 0x10000)
1232 goto invalid_code;
1233 }
1234 else
1235 {
1236 ONE_MORE_BYTE (c5);
1237 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1238 goto invalid_code;
1239 if (UTF_8_5_OCTET_LEADING_P (c1))
1240 {
1241 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1242 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1243 | (c5 & 0x3F));
1244 if ((c > MAX_CHAR) || (c < 0x200000))
1245 goto invalid_code;
1246 }
1247 else
1248 goto invalid_code;
1249 }
1250 }
1251 }
1252 }
1253
1254 *charbuf++ = c;
1255 continue;
1256
1257 invalid_code:
1258 src = src_base;
1259 consumed_chars = consumed_chars_base;
1260 ONE_MORE_BYTE (c);
1261 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1262 coding->errors++;
1263 }
1264
1265 no_more_source:
1266 coding->consumed_char += consumed_chars_base;
1267 coding->consumed = src_base - coding->source;
1268 coding->charbuf_used = charbuf - coding->charbuf;
1269 }
1270
1271
1272 static int
1273 encode_coding_utf_8 (coding)
1274 struct coding_system *coding;
1275 {
1276 int multibytep = coding->dst_multibyte;
1277 int *charbuf = coding->charbuf;
1278 int *charbuf_end = charbuf + coding->charbuf_used;
1279 unsigned char *dst = coding->destination + coding->produced;
1280 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1281 int produced_chars = 0;
1282 int c;
1283
1284 if (multibytep)
1285 {
1286 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1287
1288 while (charbuf < charbuf_end)
1289 {
1290 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1291
1292 ASSURE_DESTINATION (safe_room);
1293 c = *charbuf++;
1294 if (CHAR_BYTE8_P (c))
1295 {
1296 c = CHAR_TO_BYTE8 (c);
1297 EMIT_ONE_BYTE (c);
1298 }
1299 else
1300 {
1301 CHAR_STRING_ADVANCE (c, pend);
1302 for (p = str; p < pend; p++)
1303 EMIT_ONE_BYTE (*p);
1304 }
1305 }
1306 }
1307 else
1308 {
1309 int safe_room = MAX_MULTIBYTE_LENGTH;
1310
1311 while (charbuf < charbuf_end)
1312 {
1313 ASSURE_DESTINATION (safe_room);
1314 c = *charbuf++;
1315 dst += CHAR_STRING (c, dst);
1316 produced_chars++;
1317 }
1318 }
1319 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1320 coding->produced_char += produced_chars;
1321 coding->produced = dst - coding->destination;
1322 return 0;
1323 }
1324
1325
1326 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1327 Check if a text is encoded in one of UTF-16 based coding systems.
1328 If it is, return 1, else return 0. */
1329
1330 #define UTF_16_HIGH_SURROGATE_P(val) \
1331 (((val) & 0xFC00) == 0xD800)
1332
1333 #define UTF_16_LOW_SURROGATE_P(val) \
1334 (((val) & 0xFC00) == 0xDC00)
1335
1336 #define UTF_16_INVALID_P(val) \
1337 (((val) == 0xFFFE) \
1338 || ((val) == 0xFFFF) \
1339 || UTF_16_LOW_SURROGATE_P (val))
1340
1341
1342 static int
1343 detect_coding_utf_16 (coding, detect_info)
1344 struct coding_system *coding;
1345 struct coding_detection_info *detect_info;
1346 {
1347 const unsigned char *src = coding->source, *src_base = src;
1348 const unsigned char *src_end = coding->source + coding->src_bytes;
1349 int multibytep = coding->src_multibyte;
1350 int consumed_chars = 0;
1351 int c1, c2;
1352
1353 detect_info->checked |= CATEGORY_MASK_UTF_16;
1354 if (coding->mode & CODING_MODE_LAST_BLOCK
1355 && (coding->src_chars & 1))
1356 {
1357 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1358 return 0;
1359 }
1360
1361 ONE_MORE_BYTE (c1);
1362 ONE_MORE_BYTE (c2);
1363 if ((c1 == 0xFF) && (c2 == 0xFE))
1364 {
1365 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1366 | CATEGORY_MASK_UTF_16_AUTO);
1367 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1368 | CATEGORY_MASK_UTF_16_BE_NOSIG
1369 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1370 }
1371 else if ((c1 == 0xFE) && (c2 == 0xFF))
1372 {
1373 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1374 | CATEGORY_MASK_UTF_16_AUTO);
1375 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1376 | CATEGORY_MASK_UTF_16_BE_NOSIG
1377 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1378 }
1379 else if (c1 >= 0 && c2 >= 0)
1380 {
1381 unsigned char b1[256], b2[256];
1382 int b1_variants = 1, b2_variants = 1;
1383 int n;
1384
1385 bzero (b1, 256), bzero (b2, 256);
1386 b1[c1]++, b2[c2]++;
1387 for (n = 0; n < 256 && src < src_end; n++)
1388 {
1389 src_base = src;
1390 ONE_MORE_BYTE (c1);
1391 ONE_MORE_BYTE (c2);
1392 if (c1 < 0 || c2 < 0)
1393 break;
1394 if (! b1[c1++]) b1_variants++;
1395 if (! b2[c2++]) b2_variants++;
1396 }
1397 if (b1_variants < b2_variants)
1398 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1399 else
1400 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1401 detect_info->rejected
1402 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1403 }
1404 no_more_source:
1405 return 1;
1406 }
1407
1408 static void
1409 decode_coding_utf_16 (coding)
1410 struct coding_system *coding;
1411 {
1412 const unsigned char *src = coding->source + coding->consumed;
1413 const unsigned char *src_end = coding->source + coding->src_bytes;
1414 const unsigned char *src_base;
1415 int *charbuf = coding->charbuf;
1416 int *charbuf_end = charbuf + coding->charbuf_size;
1417 int consumed_chars = 0, consumed_chars_base;
1418 int multibytep = coding->src_multibyte;
1419 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1420 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1421 int surrogate = CODING_UTF_16_SURROGATE (coding);
1422 Lisp_Object attr, charset_list;
1423
1424 CODING_GET_INFO (coding, attr, charset_list);
1425
1426 if (bom == utf_16_with_bom)
1427 {
1428 int c, c1, c2;
1429
1430 src_base = src;
1431 ONE_MORE_BYTE (c1);
1432 ONE_MORE_BYTE (c2);
1433 c = (c1 << 8) | c2;
1434
1435 if (endian == utf_16_big_endian
1436 ? c != 0xFEFF : c != 0xFFFE)
1437 {
1438 /* The first two bytes are not BOM. Treat them as bytes
1439 for a normal character. */
1440 src = src_base;
1441 coding->errors++;
1442 }
1443 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1444 }
1445 else if (bom == utf_16_detect_bom)
1446 {
1447 /* We have already tried to detect BOM and failed in
1448 detect_coding. */
1449 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1450 }
1451
1452 while (1)
1453 {
1454 int c, c1, c2;
1455
1456 src_base = src;
1457 consumed_chars_base = consumed_chars;
1458
1459 if (charbuf + 2 >= charbuf_end)
1460 break;
1461
1462 ONE_MORE_BYTE (c1);
1463 if (c1 < 0)
1464 {
1465 *charbuf++ = -c1;
1466 continue;
1467 }
1468 ONE_MORE_BYTE (c2);
1469 if (c2 < 0)
1470 {
1471 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1472 *charbuf++ = -c2;
1473 continue;
1474 }
1475 c = (endian == utf_16_big_endian
1476 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1477 if (surrogate)
1478 {
1479 if (! UTF_16_LOW_SURROGATE_P (c))
1480 {
1481 if (endian == utf_16_big_endian)
1482 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1483 else
1484 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1485 *charbuf++ = c1;
1486 *charbuf++ = c2;
1487 coding->errors++;
1488 if (UTF_16_HIGH_SURROGATE_P (c))
1489 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1490 else
1491 *charbuf++ = c;
1492 }
1493 else
1494 {
1495 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1496 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1497 *charbuf++ = c;
1498 }
1499 }
1500 else
1501 {
1502 if (UTF_16_HIGH_SURROGATE_P (c))
1503 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1504 else
1505 *charbuf++ = c;
1506 }
1507 }
1508
1509 no_more_source:
1510 coding->consumed_char += consumed_chars_base;
1511 coding->consumed = src_base - coding->source;
1512 coding->charbuf_used = charbuf - coding->charbuf;
1513 }
1514
1515 static int
1516 encode_coding_utf_16 (coding)
1517 struct coding_system *coding;
1518 {
1519 int multibytep = coding->dst_multibyte;
1520 int *charbuf = coding->charbuf;
1521 int *charbuf_end = charbuf + coding->charbuf_used;
1522 unsigned char *dst = coding->destination + coding->produced;
1523 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1524 int safe_room = 8;
1525 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1526 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1527 int produced_chars = 0;
1528 Lisp_Object attrs, charset_list;
1529 int c;
1530
1531 CODING_GET_INFO (coding, attrs, charset_list);
1532
1533 if (bom != utf_16_without_bom)
1534 {
1535 ASSURE_DESTINATION (safe_room);
1536 if (big_endian)
1537 EMIT_TWO_BYTES (0xFE, 0xFF);
1538 else
1539 EMIT_TWO_BYTES (0xFF, 0xFE);
1540 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1541 }
1542
1543 while (charbuf < charbuf_end)
1544 {
1545 ASSURE_DESTINATION (safe_room);
1546 c = *charbuf++;
1547 if (c >= MAX_UNICODE_CHAR)
1548 c = coding->default_char;
1549
1550 if (c < 0x10000)
1551 {
1552 if (big_endian)
1553 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1554 else
1555 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1556 }
1557 else
1558 {
1559 int c1, c2;
1560
1561 c -= 0x10000;
1562 c1 = (c >> 10) + 0xD800;
1563 c2 = (c & 0x3FF) + 0xDC00;
1564 if (big_endian)
1565 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1566 else
1567 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1568 }
1569 }
1570 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1571 coding->produced = dst - coding->destination;
1572 coding->produced_char += produced_chars;
1573 return 0;
1574 }
1575
1576 \f
1577 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1578
1579 /* Emacs' internal format for representation of multiple character
1580 sets is a kind of multi-byte encoding, i.e. characters are
1581 represented by variable-length sequences of one-byte codes.
1582
1583 ASCII characters and control characters (e.g. `tab', `newline') are
1584 represented by one-byte sequences which are their ASCII codes, in
1585 the range 0x00 through 0x7F.
1586
1587 8-bit characters of the range 0x80..0x9F are represented by
1588 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1589 code + 0x20).
1590
1591 8-bit characters of the range 0xA0..0xFF are represented by
1592 one-byte sequences which are their 8-bit code.
1593
1594 The other characters are represented by a sequence of `base
1595 leading-code', optional `extended leading-code', and one or two
1596 `position-code's. The length of the sequence is determined by the
1597 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1598 whereas extended leading-code and position-code take the range 0xA0
1599 through 0xFF. See `charset.h' for more details about leading-code
1600 and position-code.
1601
1602 --- CODE RANGE of Emacs' internal format ---
1603 character set range
1604 ------------- -----
1605 ascii 0x00..0x7F
1606 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1607 eight-bit-graphic 0xA0..0xBF
1608 ELSE 0x81..0x9D + [0xA0..0xFF]+
1609 ---------------------------------------------
1610
1611 As this is the internal character representation, the format is
1612 usually not used externally (i.e. in a file or in a data sent to a
1613 process). But, it is possible to have a text externally in this
1614 format (i.e. by encoding by the coding system `emacs-mule').
1615
1616 In that case, a sequence of one-byte codes has a slightly different
1617 form.
1618
1619 At first, all characters in eight-bit-control are represented by
1620 one-byte sequences which are their 8-bit code.
1621
1622 Next, character composition data are represented by the byte
1623 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1624 where,
1625 METHOD is 0xF0 plus one of composition method (enum
1626 composition_method),
1627
1628 BYTES is 0xA0 plus a byte length of this composition data,
1629
1630 CHARS is 0x20 plus a number of characters composed by this
1631 data,
1632
1633 COMPONENTs are characters of multibye form or composition
1634 rules encoded by two-byte of ASCII codes.
1635
1636 In addition, for backward compatibility, the following formats are
1637 also recognized as composition data on decoding.
1638
1639 0x80 MSEQ ...
1640 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1641
1642 Here,
1643 MSEQ is a multibyte form but in these special format:
1644 ASCII: 0xA0 ASCII_CODE+0x80,
1645 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1646 RULE is a one byte code of the range 0xA0..0xF0 that
1647 represents a composition rule.
1648 */
1649
1650 char emacs_mule_bytes[256];
1651
1652 int
1653 emacs_mule_char (coding, src, nbytes, nchars, id)
1654 struct coding_system *coding;
1655 const unsigned char *src;
1656 int *nbytes, *nchars, *id;
1657 {
1658 const unsigned char *src_end = coding->source + coding->src_bytes;
1659 const unsigned char *src_base = src;
1660 int multibytep = coding->src_multibyte;
1661 struct charset *charset;
1662 unsigned code;
1663 int c;
1664 int consumed_chars = 0;
1665
1666 ONE_MORE_BYTE (c);
1667 if (c < 0)
1668 {
1669 c = -c;
1670 charset = emacs_mule_charset[0];
1671 }
1672 else
1673 {
1674 switch (emacs_mule_bytes[c])
1675 {
1676 case 2:
1677 if (! (charset = emacs_mule_charset[c]))
1678 goto invalid_code;
1679 ONE_MORE_BYTE (c);
1680 if (c < 0)
1681 goto invalid_code;
1682 code = c & 0x7F;
1683 break;
1684
1685 case 3:
1686 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1687 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1688 {
1689 ONE_MORE_BYTE (c);
1690 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1691 goto invalid_code;
1692 ONE_MORE_BYTE (c);
1693 if (c < 0)
1694 goto invalid_code;
1695 code = c & 0x7F;
1696 }
1697 else
1698 {
1699 if (! (charset = emacs_mule_charset[c]))
1700 goto invalid_code;
1701 ONE_MORE_BYTE (c);
1702 if (c < 0)
1703 goto invalid_code;
1704 code = (c & 0x7F) << 8;
1705 ONE_MORE_BYTE (c);
1706 if (c < 0)
1707 goto invalid_code;
1708 code |= c & 0x7F;
1709 }
1710 break;
1711
1712 case 4:
1713 ONE_MORE_BYTE (c);
1714 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1715 goto invalid_code;
1716 ONE_MORE_BYTE (c);
1717 if (c < 0)
1718 goto invalid_code;
1719 code = (c & 0x7F) << 8;
1720 ONE_MORE_BYTE (c);
1721 if (c < 0)
1722 goto invalid_code;
1723 code |= c & 0x7F;
1724 break;
1725
1726 case 1:
1727 code = c;
1728 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1729 ? charset_ascii : charset_eight_bit);
1730 break;
1731
1732 default:
1733 abort ();
1734 }
1735 c = DECODE_CHAR (charset, code);
1736 if (c < 0)
1737 goto invalid_code;
1738 }
1739 *nbytes = src - src_base;
1740 *nchars = consumed_chars;
1741 if (id)
1742 *id = charset->id;
1743 return c;
1744
1745 no_more_source:
1746 return -2;
1747
1748 invalid_code:
1749 return -1;
1750 }
1751
1752
1753 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1754 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1755 else return 0. */
1756
1757 static int
1758 detect_coding_emacs_mule (coding, detect_info)
1759 struct coding_system *coding;
1760 struct coding_detection_info *detect_info;
1761 {
1762 const unsigned char *src = coding->source, *src_base;
1763 const unsigned char *src_end = coding->source + coding->src_bytes;
1764 int multibytep = coding->src_multibyte;
1765 int consumed_chars = 0;
1766 int c;
1767 int found = 0;
1768
1769 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1770 /* A coding system of this category is always ASCII compatible. */
1771 src += coding->head_ascii;
1772
1773 while (1)
1774 {
1775 src_base = src;
1776 ONE_MORE_BYTE (c);
1777 if (c < 0)
1778 continue;
1779 if (c == 0x80)
1780 {
1781 /* Perhaps the start of composite character. We simple skip
1782 it because analyzing it is too heavy for detecting. But,
1783 at least, we check that the composite character
1784 constitues of more than 4 bytes. */
1785 const unsigned char *src_base;
1786
1787 repeat:
1788 src_base = src;
1789 do
1790 {
1791 ONE_MORE_BYTE (c);
1792 }
1793 while (c >= 0xA0);
1794
1795 if (src - src_base <= 4)
1796 break;
1797 found = CATEGORY_MASK_EMACS_MULE;
1798 if (c == 0x80)
1799 goto repeat;
1800 }
1801
1802 if (c < 0x80)
1803 {
1804 if (c < 0x20
1805 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1806 break;
1807 }
1808 else
1809 {
1810 const unsigned char *src_base = src - 1;
1811
1812 do
1813 {
1814 ONE_MORE_BYTE (c);
1815 }
1816 while (c >= 0xA0);
1817 if (src - src_base != emacs_mule_bytes[*src_base])
1818 break;
1819 found = CATEGORY_MASK_EMACS_MULE;
1820 }
1821 }
1822 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1823 return 0;
1824
1825 no_more_source:
1826 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1827 {
1828 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1829 return 0;
1830 }
1831 detect_info->found |= found;
1832 return 1;
1833 }
1834
1835
1836 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1837
1838 /* Decode a character represented as a component of composition
1839 sequence of Emacs 20/21 style at SRC. Set C to that character and
1840 update SRC to the head of next character (or an encoded composition
1841 rule). If SRC doesn't points a composition component, set C to -1.
1842 If SRC points an invalid byte sequence, global exit by a return
1843 value 0. */
1844
1845 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1846 if (1) \
1847 { \
1848 int c; \
1849 int nbytes, nchars; \
1850 \
1851 if (src == src_end) \
1852 break; \
1853 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1854 if (c < 0) \
1855 { \
1856 if (c == -2) \
1857 break; \
1858 goto invalid_code; \
1859 } \
1860 *buf++ = c; \
1861 src += nbytes; \
1862 consumed_chars += nchars; \
1863 } \
1864 else
1865
1866
1867 /* Decode a composition rule represented as a component of composition
1868 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1869 and increment BUF. If SRC points an invalid byte sequence, set C
1870 to -1. */
1871
1872 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1873 do { \
1874 int c, gref, nref; \
1875 \
1876 if (src >= src_end) \
1877 goto invalid_code; \
1878 ONE_MORE_BYTE_NO_CHECK (c); \
1879 c -= 0x20; \
1880 if (c < 0 || c >= 81) \
1881 goto invalid_code; \
1882 \
1883 gref = c / 9, nref = c % 9; \
1884 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1885 } while (0)
1886
1887
1888 /* Decode a composition rule represented as a component of composition
1889 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1890 and increment BUF. If SRC points an invalid byte sequence, set C
1891 to -1. */
1892
1893 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1894 do { \
1895 int gref, nref; \
1896 \
1897 if (src + 1>= src_end) \
1898 goto invalid_code; \
1899 ONE_MORE_BYTE_NO_CHECK (gref); \
1900 gref -= 0x20; \
1901 ONE_MORE_BYTE_NO_CHECK (nref); \
1902 nref -= 0x20; \
1903 if (gref < 0 || gref >= 81 \
1904 || nref < 0 || nref >= 81) \
1905 goto invalid_code; \
1906 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1907 } while (0)
1908
1909
1910 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1911 do { \
1912 /* Emacs 21 style format. The first three bytes at SRC are \
1913 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1914 the byte length of this composition information, CHARS is the \
1915 number of characters composed by this composition. */ \
1916 enum composition_method method = c - 0xF2; \
1917 int *charbuf_base = charbuf; \
1918 int from, to; \
1919 int consumed_chars_limit; \
1920 int nbytes, nchars; \
1921 \
1922 ONE_MORE_BYTE (c); \
1923 if (c < 0) \
1924 goto invalid_code; \
1925 nbytes = c - 0xA0; \
1926 if (nbytes < 3) \
1927 goto invalid_code; \
1928 ONE_MORE_BYTE (c); \
1929 if (c < 0) \
1930 goto invalid_code; \
1931 nchars = c - 0xA0; \
1932 from = coding->produced + char_offset; \
1933 to = from + nchars; \
1934 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1935 consumed_chars_limit = consumed_chars_base + nbytes; \
1936 if (method != COMPOSITION_RELATIVE) \
1937 { \
1938 int i = 0; \
1939 while (consumed_chars < consumed_chars_limit) \
1940 { \
1941 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1942 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1943 else \
1944 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1945 i++; \
1946 } \
1947 if (consumed_chars < consumed_chars_limit) \
1948 goto invalid_code; \
1949 charbuf_base[0] -= i; \
1950 } \
1951 } while (0)
1952
1953
1954 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1955 do { \
1956 /* Emacs 20 style format for relative composition. */ \
1957 /* Store multibyte form of characters to be composed. */ \
1958 enum composition_method method = COMPOSITION_RELATIVE; \
1959 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1960 int *buf = components; \
1961 int i, j; \
1962 int from, to; \
1963 \
1964 src = src_base; \
1965 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1966 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1967 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1968 if (i < 2) \
1969 goto invalid_code; \
1970 from = coding->produced_char + char_offset; \
1971 to = from + i; \
1972 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1973 for (j = 0; j < i; j++) \
1974 *charbuf++ = components[j]; \
1975 } while (0)
1976
1977
1978 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1979 do { \
1980 /* Emacs 20 style format for rule-base composition. */ \
1981 /* Store multibyte form of characters to be composed. */ \
1982 enum composition_method method = COMPOSITION_WITH_RULE; \
1983 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1984 int *buf = components; \
1985 int i, j; \
1986 int from, to; \
1987 \
1988 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1989 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1990 { \
1991 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1992 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1993 } \
1994 if (i < 1 || (buf - components) % 2 == 0) \
1995 goto invalid_code; \
1996 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1997 goto no_more_source; \
1998 from = coding->produced_char + char_offset; \
1999 to = from + i; \
2000 ADD_COMPOSITION_DATA (buf, from, to, method); \
2001 for (j = 0; j < i; j++) \
2002 *charbuf++ = components[j]; \
2003 for (j = 0; j < i; j += 2) \
2004 *charbuf++ = components[j]; \
2005 } while (0)
2006
2007
2008 static void
2009 decode_coding_emacs_mule (coding)
2010 struct coding_system *coding;
2011 {
2012 const unsigned char *src = coding->source + coding->consumed;
2013 const unsigned char *src_end = coding->source + coding->src_bytes;
2014 const unsigned char *src_base;
2015 int *charbuf = coding->charbuf;
2016 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2017 int consumed_chars = 0, consumed_chars_base;
2018 int multibytep = coding->src_multibyte;
2019 Lisp_Object attrs, charset_list;
2020 int char_offset = coding->produced_char;
2021 int last_offset = char_offset;
2022 int last_id = charset_ascii;
2023
2024 CODING_GET_INFO (coding, attrs, charset_list);
2025
2026 while (1)
2027 {
2028 int c;
2029
2030 src_base = src;
2031 consumed_chars_base = consumed_chars;
2032
2033 if (charbuf >= charbuf_end)
2034 break;
2035
2036 ONE_MORE_BYTE (c);
2037 if (c < 0)
2038 {
2039 *charbuf++ = -c;
2040 char_offset++;
2041 }
2042 else if (c < 0x80)
2043 {
2044 *charbuf++ = c;
2045 char_offset++;
2046 }
2047 else if (c == 0x80)
2048 {
2049 ONE_MORE_BYTE (c);
2050 if (c < 0)
2051 goto invalid_code;
2052 if (c - 0xF2 >= COMPOSITION_RELATIVE
2053 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2054 DECODE_EMACS_MULE_21_COMPOSITION (c);
2055 else if (c < 0xC0)
2056 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2057 else if (c == 0xFF)
2058 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2059 else
2060 goto invalid_code;
2061 }
2062 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2063 {
2064 int nbytes, nchars;
2065 int id;
2066
2067 src = src_base;
2068 consumed_chars = consumed_chars_base;
2069 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2070 if (c < 0)
2071 {
2072 if (c == -2)
2073 break;
2074 goto invalid_code;
2075 }
2076 if (last_id != id)
2077 {
2078 if (last_id != charset_ascii)
2079 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2080 last_id = id;
2081 last_offset = char_offset;
2082 }
2083 *charbuf++ = c;
2084 src += nbytes;
2085 consumed_chars += nchars;
2086 char_offset++;
2087 }
2088 continue;
2089
2090 invalid_code:
2091 src = src_base;
2092 consumed_chars = consumed_chars_base;
2093 ONE_MORE_BYTE (c);
2094 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2095 char_offset++;
2096 coding->errors++;
2097 }
2098
2099 no_more_source:
2100 if (last_id != charset_ascii)
2101 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2102 coding->consumed_char += consumed_chars_base;
2103 coding->consumed = src_base - coding->source;
2104 coding->charbuf_used = charbuf - coding->charbuf;
2105 }
2106
2107
2108 #define EMACS_MULE_LEADING_CODES(id, codes) \
2109 do { \
2110 if (id < 0xA0) \
2111 codes[0] = id, codes[1] = 0; \
2112 else if (id < 0xE0) \
2113 codes[0] = 0x9A, codes[1] = id; \
2114 else if (id < 0xF0) \
2115 codes[0] = 0x9B, codes[1] = id; \
2116 else if (id < 0xF5) \
2117 codes[0] = 0x9C, codes[1] = id; \
2118 else \
2119 codes[0] = 0x9D, codes[1] = id; \
2120 } while (0);
2121
2122
2123 static int
2124 encode_coding_emacs_mule (coding)
2125 struct coding_system *coding;
2126 {
2127 int multibytep = coding->dst_multibyte;
2128 int *charbuf = coding->charbuf;
2129 int *charbuf_end = charbuf + coding->charbuf_used;
2130 unsigned char *dst = coding->destination + coding->produced;
2131 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2132 int safe_room = 8;
2133 int produced_chars = 0;
2134 Lisp_Object attrs, charset_list;
2135 int c;
2136 int preferred_charset_id = -1;
2137
2138 CODING_GET_INFO (coding, attrs, charset_list);
2139 if (! EQ (charset_list, Vemacs_mule_charset_list))
2140 {
2141 CODING_ATTR_CHARSET_LIST (attrs)
2142 = charset_list = Vemacs_mule_charset_list;
2143 }
2144
2145 while (charbuf < charbuf_end)
2146 {
2147 ASSURE_DESTINATION (safe_room);
2148 c = *charbuf++;
2149
2150 if (c < 0)
2151 {
2152 /* Handle an annotation. */
2153 switch (*charbuf)
2154 {
2155 case CODING_ANNOTATE_COMPOSITION_MASK:
2156 /* Not yet implemented. */
2157 break;
2158 case CODING_ANNOTATE_CHARSET_MASK:
2159 preferred_charset_id = charbuf[3];
2160 if (preferred_charset_id >= 0
2161 && NILP (Fmemq (make_number (preferred_charset_id),
2162 charset_list)))
2163 preferred_charset_id = -1;
2164 break;
2165 default:
2166 abort ();
2167 }
2168 charbuf += -c - 1;
2169 continue;
2170 }
2171
2172 if (ASCII_CHAR_P (c))
2173 EMIT_ONE_ASCII_BYTE (c);
2174 else if (CHAR_BYTE8_P (c))
2175 {
2176 c = CHAR_TO_BYTE8 (c);
2177 EMIT_ONE_BYTE (c);
2178 }
2179 else
2180 {
2181 struct charset *charset;
2182 unsigned code;
2183 int dimension;
2184 int emacs_mule_id;
2185 unsigned char leading_codes[2];
2186
2187 if (preferred_charset_id >= 0)
2188 {
2189 charset = CHARSET_FROM_ID (preferred_charset_id);
2190 if (! CHAR_CHARSET_P (c, charset))
2191 charset = char_charset (c, charset_list, NULL);
2192 }
2193 else
2194 charset = char_charset (c, charset_list, &code);
2195 if (! charset)
2196 {
2197 c = coding->default_char;
2198 if (ASCII_CHAR_P (c))
2199 {
2200 EMIT_ONE_ASCII_BYTE (c);
2201 continue;
2202 }
2203 charset = char_charset (c, charset_list, &code);
2204 }
2205 dimension = CHARSET_DIMENSION (charset);
2206 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2207 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2208 EMIT_ONE_BYTE (leading_codes[0]);
2209 if (leading_codes[1])
2210 EMIT_ONE_BYTE (leading_codes[1]);
2211 if (dimension == 1)
2212 EMIT_ONE_BYTE (code | 0x80);
2213 else
2214 {
2215 code |= 0x8080;
2216 EMIT_ONE_BYTE (code >> 8);
2217 EMIT_ONE_BYTE (code & 0xFF);
2218 }
2219 }
2220 }
2221 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2222 coding->produced_char += produced_chars;
2223 coding->produced = dst - coding->destination;
2224 return 0;
2225 }
2226
2227 \f
2228 /*** 7. ISO2022 handlers ***/
2229
2230 /* The following note describes the coding system ISO2022 briefly.
2231 Since the intention of this note is to help understand the
2232 functions in this file, some parts are NOT ACCURATE or are OVERLY
2233 SIMPLIFIED. For thorough understanding, please refer to the
2234 original document of ISO2022. This is equivalent to the standard
2235 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2236
2237 ISO2022 provides many mechanisms to encode several character sets
2238 in 7-bit and 8-bit environments. For 7-bit environments, all text
2239 is encoded using bytes less than 128. This may make the encoded
2240 text a little bit longer, but the text passes more easily through
2241 several types of gateway, some of which strip off the MSB (Most
2242 Significant Bit).
2243
2244 There are two kinds of character sets: control character sets and
2245 graphic character sets. The former contain control characters such
2246 as `newline' and `escape' to provide control functions (control
2247 functions are also provided by escape sequences). The latter
2248 contain graphic characters such as 'A' and '-'. Emacs recognizes
2249 two control character sets and many graphic character sets.
2250
2251 Graphic character sets are classified into one of the following
2252 four classes, according to the number of bytes (DIMENSION) and
2253 number of characters in one dimension (CHARS) of the set:
2254 - DIMENSION1_CHARS94
2255 - DIMENSION1_CHARS96
2256 - DIMENSION2_CHARS94
2257 - DIMENSION2_CHARS96
2258
2259 In addition, each character set is assigned an identification tag,
2260 unique for each set, called the "final character" (denoted as <F>
2261 hereafter). The <F> of each character set is decided by ECMA(*)
2262 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2263 (0x30..0x3F are for private use only).
2264
2265 Note (*): ECMA = European Computer Manufacturers Association
2266
2267 Here are examples of graphic character sets [NAME(<F>)]:
2268 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2269 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2270 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2271 o DIMENSION2_CHARS96 -- none for the moment
2272
2273 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2274 C0 [0x00..0x1F] -- control character plane 0
2275 GL [0x20..0x7F] -- graphic character plane 0
2276 C1 [0x80..0x9F] -- control character plane 1
2277 GR [0xA0..0xFF] -- graphic character plane 1
2278
2279 A control character set is directly designated and invoked to C0 or
2280 C1 by an escape sequence. The most common case is that:
2281 - ISO646's control character set is designated/invoked to C0, and
2282 - ISO6429's control character set is designated/invoked to C1,
2283 and usually these designations/invocations are omitted in encoded
2284 text. In a 7-bit environment, only C0 can be used, and a control
2285 character for C1 is encoded by an appropriate escape sequence to
2286 fit into the environment. All control characters for C1 are
2287 defined to have corresponding escape sequences.
2288
2289 A graphic character set is at first designated to one of four
2290 graphic registers (G0 through G3), then these graphic registers are
2291 invoked to GL or GR. These designations and invocations can be
2292 done independently. The most common case is that G0 is invoked to
2293 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2294 these invocations and designations are omitted in encoded text.
2295 In a 7-bit environment, only GL can be used.
2296
2297 When a graphic character set of CHARS94 is invoked to GL, codes
2298 0x20 and 0x7F of the GL area work as control characters SPACE and
2299 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2300 be used.
2301
2302 There are two ways of invocation: locking-shift and single-shift.
2303 With locking-shift, the invocation lasts until the next different
2304 invocation, whereas with single-shift, the invocation affects the
2305 following character only and doesn't affect the locking-shift
2306 state. Invocations are done by the following control characters or
2307 escape sequences:
2308
2309 ----------------------------------------------------------------------
2310 abbrev function cntrl escape seq description
2311 ----------------------------------------------------------------------
2312 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2313 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2314 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2315 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2316 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2317 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2318 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2319 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2320 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2321 ----------------------------------------------------------------------
2322 (*) These are not used by any known coding system.
2323
2324 Control characters for these functions are defined by macros
2325 ISO_CODE_XXX in `coding.h'.
2326
2327 Designations are done by the following escape sequences:
2328 ----------------------------------------------------------------------
2329 escape sequence description
2330 ----------------------------------------------------------------------
2331 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2332 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2333 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2334 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2335 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2336 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2337 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2338 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2339 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2340 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2341 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2342 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2343 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2344 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2345 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2346 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2347 ----------------------------------------------------------------------
2348
2349 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2350 of dimension 1, chars 94, and final character <F>, etc...
2351
2352 Note (*): Although these designations are not allowed in ISO2022,
2353 Emacs accepts them on decoding, and produces them on encoding
2354 CHARS96 character sets in a coding system which is characterized as
2355 7-bit environment, non-locking-shift, and non-single-shift.
2356
2357 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2358 '(' must be omitted. We refer to this as "short-form" hereafter.
2359
2360 Now you may notice that there are a lot of ways of encoding the
2361 same multilingual text in ISO2022. Actually, there exist many
2362 coding systems such as Compound Text (used in X11's inter client
2363 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2364 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2365 localized platforms), and all of these are variants of ISO2022.
2366
2367 In addition to the above, Emacs handles two more kinds of escape
2368 sequences: ISO6429's direction specification and Emacs' private
2369 sequence for specifying character composition.
2370
2371 ISO6429's direction specification takes the following form:
2372 o CSI ']' -- end of the current direction
2373 o CSI '0' ']' -- end of the current direction
2374 o CSI '1' ']' -- start of left-to-right text
2375 o CSI '2' ']' -- start of right-to-left text
2376 The control character CSI (0x9B: control sequence introducer) is
2377 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2378
2379 Character composition specification takes the following form:
2380 o ESC '0' -- start relative composition
2381 o ESC '1' -- end composition
2382 o ESC '2' -- start rule-base composition (*)
2383 o ESC '3' -- start relative composition with alternate chars (**)
2384 o ESC '4' -- start rule-base composition with alternate chars (**)
2385 Since these are not standard escape sequences of any ISO standard,
2386 the use of them with these meanings is restricted to Emacs only.
2387
2388 (*) This form is used only in Emacs 20.7 and older versions,
2389 but newer versions can safely decode it.
2390 (**) This form is used only in Emacs 21.1 and newer versions,
2391 and older versions can't decode it.
2392
2393 Here's a list of example usages of these composition escape
2394 sequences (categorized by `enum composition_method').
2395
2396 COMPOSITION_RELATIVE:
2397 ESC 0 CHAR [ CHAR ] ESC 1
2398 COMPOSITION_WITH_RULE:
2399 ESC 2 CHAR [ RULE CHAR ] ESC 1
2400 COMPOSITION_WITH_ALTCHARS:
2401 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2402 COMPOSITION_WITH_RULE_ALTCHARS:
2403 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2404
2405 enum iso_code_class_type iso_code_class[256];
2406
2407 #define SAFE_CHARSET_P(coding, id) \
2408 ((id) <= (coding)->max_charset_id \
2409 && (coding)->safe_charsets[id] >= 0)
2410
2411
2412 #define SHIFT_OUT_OK(category) \
2413 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2414
2415 static void
2416 setup_iso_safe_charsets (attrs)
2417 Lisp_Object attrs;
2418 {
2419 Lisp_Object charset_list, safe_charsets;
2420 Lisp_Object request;
2421 Lisp_Object reg_usage;
2422 Lisp_Object tail;
2423 int reg94, reg96;
2424 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2425 int max_charset_id;
2426
2427 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2428 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2429 && ! EQ (charset_list, Viso_2022_charset_list))
2430 {
2431 CODING_ATTR_CHARSET_LIST (attrs)
2432 = charset_list = Viso_2022_charset_list;
2433 ASET (attrs, coding_attr_safe_charsets, Qnil);
2434 }
2435
2436 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2437 return;
2438
2439 max_charset_id = 0;
2440 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2441 {
2442 int id = XINT (XCAR (tail));
2443 if (max_charset_id < id)
2444 max_charset_id = id;
2445 }
2446
2447 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2448 make_number (255));
2449 request = AREF (attrs, coding_attr_iso_request);
2450 reg_usage = AREF (attrs, coding_attr_iso_usage);
2451 reg94 = XINT (XCAR (reg_usage));
2452 reg96 = XINT (XCDR (reg_usage));
2453
2454 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2455 {
2456 Lisp_Object id;
2457 Lisp_Object reg;
2458 struct charset *charset;
2459
2460 id = XCAR (tail);
2461 charset = CHARSET_FROM_ID (XINT (id));
2462 reg = Fcdr (Fassq (id, request));
2463 if (! NILP (reg))
2464 SSET (safe_charsets, XINT (id), XINT (reg));
2465 else if (charset->iso_chars_96)
2466 {
2467 if (reg96 < 4)
2468 SSET (safe_charsets, XINT (id), reg96);
2469 }
2470 else
2471 {
2472 if (reg94 < 4)
2473 SSET (safe_charsets, XINT (id), reg94);
2474 }
2475 }
2476 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2477 }
2478
2479
2480 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2481 Check if a text is encoded in one of ISO-2022 based codig systems.
2482 If it is, return 1, else return 0. */
2483
2484 static int
2485 detect_coding_iso_2022 (coding, detect_info)
2486 struct coding_system *coding;
2487 struct coding_detection_info *detect_info;
2488 {
2489 const unsigned char *src = coding->source, *src_base = src;
2490 const unsigned char *src_end = coding->source + coding->src_bytes;
2491 int multibytep = coding->src_multibyte;
2492 int single_shifting = 0;
2493 int id;
2494 int c, c1;
2495 int consumed_chars = 0;
2496 int i;
2497 int rejected = 0;
2498 int found = 0;
2499
2500 detect_info->checked |= CATEGORY_MASK_ISO;
2501
2502 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2503 {
2504 struct coding_system *this = &(coding_categories[i]);
2505 Lisp_Object attrs, val;
2506
2507 attrs = CODING_ID_ATTRS (this->id);
2508 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2509 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2510 setup_iso_safe_charsets (attrs);
2511 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2512 this->max_charset_id = SCHARS (val) - 1;
2513 this->safe_charsets = (char *) SDATA (val);
2514 }
2515
2516 /* A coding system of this category is always ASCII compatible. */
2517 src += coding->head_ascii;
2518
2519 while (rejected != CATEGORY_MASK_ISO)
2520 {
2521 src_base = src;
2522 ONE_MORE_BYTE (c);
2523 switch (c)
2524 {
2525 case ISO_CODE_ESC:
2526 if (inhibit_iso_escape_detection)
2527 break;
2528 single_shifting = 0;
2529 ONE_MORE_BYTE (c);
2530 if (c >= '(' && c <= '/')
2531 {
2532 /* Designation sequence for a charset of dimension 1. */
2533 ONE_MORE_BYTE (c1);
2534 if (c1 < ' ' || c1 >= 0x80
2535 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2536 /* Invalid designation sequence. Just ignore. */
2537 break;
2538 }
2539 else if (c == '$')
2540 {
2541 /* Designation sequence for a charset of dimension 2. */
2542 ONE_MORE_BYTE (c);
2543 if (c >= '@' && c <= 'B')
2544 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2545 id = iso_charset_table[1][0][c];
2546 else if (c >= '(' && c <= '/')
2547 {
2548 ONE_MORE_BYTE (c1);
2549 if (c1 < ' ' || c1 >= 0x80
2550 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2551 /* Invalid designation sequence. Just ignore. */
2552 break;
2553 }
2554 else
2555 /* Invalid designation sequence. Just ignore it. */
2556 break;
2557 }
2558 else if (c == 'N' || c == 'O')
2559 {
2560 /* ESC <Fe> for SS2 or SS3. */
2561 single_shifting = 1;
2562 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2563 break;
2564 }
2565 else if (c >= '0' && c <= '4')
2566 {
2567 /* ESC <Fp> for start/end composition. */
2568 found |= CATEGORY_MASK_ISO;
2569 break;
2570 }
2571 else
2572 {
2573 /* Invalid escape sequence. Just ignore it. */
2574 break;
2575 }
2576
2577 /* We found a valid designation sequence for CHARSET. */
2578 rejected |= CATEGORY_MASK_ISO_8BIT;
2579 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2580 id))
2581 found |= CATEGORY_MASK_ISO_7;
2582 else
2583 rejected |= CATEGORY_MASK_ISO_7;
2584 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2585 id))
2586 found |= CATEGORY_MASK_ISO_7_TIGHT;
2587 else
2588 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2589 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2590 id))
2591 found |= CATEGORY_MASK_ISO_7_ELSE;
2592 else
2593 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2594 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2595 id))
2596 found |= CATEGORY_MASK_ISO_8_ELSE;
2597 else
2598 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2599 break;
2600
2601 case ISO_CODE_SO:
2602 case ISO_CODE_SI:
2603 /* Locking shift out/in. */
2604 if (inhibit_iso_escape_detection)
2605 break;
2606 single_shifting = 0;
2607 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2608 found |= CATEGORY_MASK_ISO_ELSE;
2609 break;
2610
2611 case ISO_CODE_CSI:
2612 /* Control sequence introducer. */
2613 single_shifting = 0;
2614 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2615 found |= CATEGORY_MASK_ISO_8_ELSE;
2616 goto check_extra_latin;
2617
2618
2619 case ISO_CODE_SS2:
2620 case ISO_CODE_SS3:
2621 /* Single shift. */
2622 if (inhibit_iso_escape_detection)
2623 break;
2624 single_shifting = 1;
2625 rejected |= CATEGORY_MASK_ISO_7BIT;
2626 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2627 & CODING_ISO_FLAG_SINGLE_SHIFT)
2628 found |= CATEGORY_MASK_ISO_8_1;
2629 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2630 & CODING_ISO_FLAG_SINGLE_SHIFT)
2631 found |= CATEGORY_MASK_ISO_8_2;
2632 goto check_extra_latin;
2633
2634 default:
2635 if (c < 0)
2636 continue;
2637 if (c < 0x80)
2638 {
2639 single_shifting = 0;
2640 break;
2641 }
2642 if (c >= 0xA0)
2643 {
2644 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2645 found |= CATEGORY_MASK_ISO_8_1;
2646 /* Check the length of succeeding codes of the range
2647 0xA0..0FF. If the byte length is even, we include
2648 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2649 only when we are not single shifting. */
2650 if (! single_shifting
2651 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2652 {
2653 int i = 1;
2654 while (src < src_end)
2655 {
2656 ONE_MORE_BYTE (c);
2657 if (c < 0xA0)
2658 break;
2659 i++;
2660 }
2661
2662 if (i & 1 && src < src_end)
2663 rejected |= CATEGORY_MASK_ISO_8_2;
2664 else
2665 found |= CATEGORY_MASK_ISO_8_2;
2666 }
2667 break;
2668 }
2669 check_extra_latin:
2670 single_shifting = 0;
2671 if (! VECTORP (Vlatin_extra_code_table)
2672 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2673 {
2674 rejected = CATEGORY_MASK_ISO;
2675 break;
2676 }
2677 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2678 & CODING_ISO_FLAG_LATIN_EXTRA)
2679 found |= CATEGORY_MASK_ISO_8_1;
2680 else
2681 rejected |= CATEGORY_MASK_ISO_8_1;
2682 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2683 & CODING_ISO_FLAG_LATIN_EXTRA)
2684 found |= CATEGORY_MASK_ISO_8_2;
2685 else
2686 rejected |= CATEGORY_MASK_ISO_8_2;
2687 }
2688 }
2689 detect_info->rejected |= CATEGORY_MASK_ISO;
2690 return 0;
2691
2692 no_more_source:
2693 detect_info->rejected |= rejected;
2694 detect_info->found |= (found & ~rejected);
2695 return 1;
2696 }
2697
2698
2699 /* Set designation state into CODING. */
2700 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2701 do { \
2702 int id, prev; \
2703 \
2704 if (final < '0' || final >= 128 \
2705 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2706 || !SAFE_CHARSET_P (coding, id)) \
2707 { \
2708 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2709 goto invalid_code; \
2710 } \
2711 prev = CODING_ISO_DESIGNATION (coding, reg); \
2712 if (id == charset_jisx0201_roman) \
2713 { \
2714 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2715 id = charset_ascii; \
2716 } \
2717 else if (id == charset_jisx0208_1978) \
2718 { \
2719 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2720 id = charset_jisx0208; \
2721 } \
2722 CODING_ISO_DESIGNATION (coding, reg) = id; \
2723 /* If there was an invalid designation to REG previously, and this \
2724 designation is ASCII to REG, we should keep this designation \
2725 sequence. */ \
2726 if (prev == -2 && id == charset_ascii) \
2727 goto invalid_code; \
2728 } while (0)
2729
2730
2731 #define MAYBE_FINISH_COMPOSITION() \
2732 do { \
2733 int i; \
2734 if (composition_state == COMPOSING_NO) \
2735 break; \
2736 /* It is assured that we have enough room for producing \
2737 characters stored in the table `components'. */ \
2738 if (charbuf + component_idx > charbuf_end) \
2739 goto no_more_source; \
2740 composition_state = COMPOSING_NO; \
2741 if (method == COMPOSITION_RELATIVE \
2742 || method == COMPOSITION_WITH_ALTCHARS) \
2743 { \
2744 for (i = 0; i < component_idx; i++) \
2745 *charbuf++ = components[i]; \
2746 char_offset += component_idx; \
2747 } \
2748 else \
2749 { \
2750 for (i = 0; i < component_idx; i += 2) \
2751 *charbuf++ = components[i]; \
2752 char_offset += (component_idx / 2) + 1; \
2753 } \
2754 } while (0)
2755
2756
2757 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2758 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2759 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2760 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2761 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2762 */
2763
2764 #define DECODE_COMPOSITION_START(c1) \
2765 do { \
2766 if (c1 == '0' \
2767 && composition_state == COMPOSING_COMPONENT_RULE) \
2768 { \
2769 component_len = component_idx; \
2770 composition_state = COMPOSING_CHAR; \
2771 } \
2772 else \
2773 { \
2774 const unsigned char *p; \
2775 \
2776 MAYBE_FINISH_COMPOSITION (); \
2777 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2778 goto no_more_source; \
2779 for (p = src; p < src_end - 1; p++) \
2780 if (*p == ISO_CODE_ESC && p[1] == '1') \
2781 break; \
2782 if (p == src_end - 1) \
2783 { \
2784 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2785 goto invalid_code; \
2786 goto no_more_source; \
2787 } \
2788 \
2789 /* This is surely the start of a composition. */ \
2790 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2791 : c1 == '2' ? COMPOSITION_WITH_RULE \
2792 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2793 : COMPOSITION_WITH_RULE_ALTCHARS); \
2794 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2795 : COMPOSING_COMPONENT_CHAR); \
2796 component_idx = component_len = 0; \
2797 } \
2798 } while (0)
2799
2800
2801 /* Handle compositoin end sequence ESC 1. */
2802
2803 #define DECODE_COMPOSITION_END() \
2804 do { \
2805 int nchars = (component_len > 0 ? component_idx - component_len \
2806 : method == COMPOSITION_RELATIVE ? component_idx \
2807 : (component_idx + 1) / 2); \
2808 int i; \
2809 int *saved_charbuf = charbuf; \
2810 int from = char_offset; \
2811 int to = from + nchars; \
2812 \
2813 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2814 if (method != COMPOSITION_RELATIVE) \
2815 { \
2816 if (component_len == 0) \
2817 for (i = 0; i < component_idx; i++) \
2818 *charbuf++ = components[i]; \
2819 else \
2820 for (i = 0; i < component_len; i++) \
2821 *charbuf++ = components[i]; \
2822 *saved_charbuf = saved_charbuf - charbuf; \
2823 } \
2824 if (method == COMPOSITION_WITH_RULE) \
2825 for (i = 0; i < component_idx; i += 2, char_offset++) \
2826 *charbuf++ = components[i]; \
2827 else \
2828 for (i = component_len; i < component_idx; i++, char_offset++) \
2829 *charbuf++ = components[i]; \
2830 coding->annotated = 1; \
2831 composition_state = COMPOSING_NO; \
2832 } while (0)
2833
2834
2835 /* Decode a composition rule from the byte C1 (and maybe one more byte
2836 from SRC) and store one encoded composition rule in
2837 coding->cmp_data. */
2838
2839 #define DECODE_COMPOSITION_RULE(c1) \
2840 do { \
2841 (c1) -= 32; \
2842 if (c1 < 81) /* old format (before ver.21) */ \
2843 { \
2844 int gref = (c1) / 9; \
2845 int nref = (c1) % 9; \
2846 if (gref == 4) gref = 10; \
2847 if (nref == 4) nref = 10; \
2848 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2849 } \
2850 else if (c1 < 93) /* new format (after ver.21) */ \
2851 { \
2852 ONE_MORE_BYTE (c2); \
2853 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2854 } \
2855 else \
2856 c1 = 0; \
2857 } while (0)
2858
2859
2860 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2861
2862 static void
2863 decode_coding_iso_2022 (coding)
2864 struct coding_system *coding;
2865 {
2866 const unsigned char *src = coding->source + coding->consumed;
2867 const unsigned char *src_end = coding->source + coding->src_bytes;
2868 const unsigned char *src_base;
2869 int *charbuf = coding->charbuf;
2870 int *charbuf_end
2871 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2872 int consumed_chars = 0, consumed_chars_base;
2873 int multibytep = coding->src_multibyte;
2874 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2875 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2876 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2877 struct charset *charset;
2878 int c;
2879 /* For handling composition sequence. */
2880 #define COMPOSING_NO 0
2881 #define COMPOSING_CHAR 1
2882 #define COMPOSING_RULE 2
2883 #define COMPOSING_COMPONENT_CHAR 3
2884 #define COMPOSING_COMPONENT_RULE 4
2885
2886 int composition_state = COMPOSING_NO;
2887 enum composition_method method;
2888 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2889 int component_idx;
2890 int component_len;
2891 Lisp_Object attrs, charset_list;
2892 int char_offset = coding->produced_char;
2893 int last_offset = char_offset;
2894 int last_id = charset_ascii;
2895
2896 CODING_GET_INFO (coding, attrs, charset_list);
2897 setup_iso_safe_charsets (attrs);
2898
2899 while (1)
2900 {
2901 int c1, c2;
2902
2903 src_base = src;
2904 consumed_chars_base = consumed_chars;
2905
2906 if (charbuf >= charbuf_end)
2907 break;
2908
2909 ONE_MORE_BYTE (c1);
2910 if (c1 < 0)
2911 goto invalid_code;
2912
2913 /* We produce at most one character. */
2914 switch (iso_code_class [c1])
2915 {
2916 case ISO_0x20_or_0x7F:
2917 if (composition_state != COMPOSING_NO)
2918 {
2919 if (composition_state == COMPOSING_RULE
2920 || composition_state == COMPOSING_COMPONENT_RULE)
2921 {
2922 DECODE_COMPOSITION_RULE (c1);
2923 components[component_idx++] = c1;
2924 composition_state--;
2925 continue;
2926 }
2927 }
2928 if (charset_id_0 < 0
2929 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2930 /* This is SPACE or DEL. */
2931 charset = CHARSET_FROM_ID (charset_ascii);
2932 else
2933 charset = CHARSET_FROM_ID (charset_id_0);
2934 break;
2935
2936 case ISO_graphic_plane_0:
2937 if (composition_state != COMPOSING_NO)
2938 {
2939 if (composition_state == COMPOSING_RULE
2940 || composition_state == COMPOSING_COMPONENT_RULE)
2941 {
2942 DECODE_COMPOSITION_RULE (c1);
2943 components[component_idx++] = c1;
2944 composition_state--;
2945 continue;
2946 }
2947 }
2948 charset = CHARSET_FROM_ID (charset_id_0);
2949 break;
2950
2951 case ISO_0xA0_or_0xFF:
2952 if (charset_id_1 < 0
2953 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2954 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2955 goto invalid_code;
2956 /* This is a graphic character, we fall down ... */
2957
2958 case ISO_graphic_plane_1:
2959 if (charset_id_1 < 0)
2960 goto invalid_code;
2961 charset = CHARSET_FROM_ID (charset_id_1);
2962 break;
2963
2964 case ISO_control_0:
2965 MAYBE_FINISH_COMPOSITION ();
2966 charset = CHARSET_FROM_ID (charset_ascii);
2967 break;
2968
2969 case ISO_control_1:
2970 MAYBE_FINISH_COMPOSITION ();
2971 goto invalid_code;
2972
2973 case ISO_shift_out:
2974 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2975 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2976 goto invalid_code;
2977 CODING_ISO_INVOCATION (coding, 0) = 1;
2978 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2979 continue;
2980
2981 case ISO_shift_in:
2982 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2983 goto invalid_code;
2984 CODING_ISO_INVOCATION (coding, 0) = 0;
2985 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2986 continue;
2987
2988 case ISO_single_shift_2_7:
2989 case ISO_single_shift_2:
2990 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2991 goto invalid_code;
2992 /* SS2 is handled as an escape sequence of ESC 'N' */
2993 c1 = 'N';
2994 goto label_escape_sequence;
2995
2996 case ISO_single_shift_3:
2997 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2998 goto invalid_code;
2999 /* SS2 is handled as an escape sequence of ESC 'O' */
3000 c1 = 'O';
3001 goto label_escape_sequence;
3002
3003 case ISO_control_sequence_introducer:
3004 /* CSI is handled as an escape sequence of ESC '[' ... */
3005 c1 = '[';
3006 goto label_escape_sequence;
3007
3008 case ISO_escape:
3009 ONE_MORE_BYTE (c1);
3010 label_escape_sequence:
3011 /* Escape sequences handled here are invocation,
3012 designation, direction specification, and character
3013 composition specification. */
3014 switch (c1)
3015 {
3016 case '&': /* revision of following character set */
3017 ONE_MORE_BYTE (c1);
3018 if (!(c1 >= '@' && c1 <= '~'))
3019 goto invalid_code;
3020 ONE_MORE_BYTE (c1);
3021 if (c1 != ISO_CODE_ESC)
3022 goto invalid_code;
3023 ONE_MORE_BYTE (c1);
3024 goto label_escape_sequence;
3025
3026 case '$': /* designation of 2-byte character set */
3027 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3028 goto invalid_code;
3029 ONE_MORE_BYTE (c1);
3030 if (c1 >= '@' && c1 <= 'B')
3031 { /* designation of JISX0208.1978, GB2312.1980,
3032 or JISX0208.1980 */
3033 DECODE_DESIGNATION (0, 2, 0, c1);
3034 }
3035 else if (c1 >= 0x28 && c1 <= 0x2B)
3036 { /* designation of DIMENSION2_CHARS94 character set */
3037 ONE_MORE_BYTE (c2);
3038 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3039 }
3040 else if (c1 >= 0x2C && c1 <= 0x2F)
3041 { /* designation of DIMENSION2_CHARS96 character set */
3042 ONE_MORE_BYTE (c2);
3043 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3044 }
3045 else
3046 goto invalid_code;
3047 /* We must update these variables now. */
3048 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3049 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3050 continue;
3051
3052 case 'n': /* invocation of locking-shift-2 */
3053 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3054 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3055 goto invalid_code;
3056 CODING_ISO_INVOCATION (coding, 0) = 2;
3057 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3058 continue;
3059
3060 case 'o': /* invocation of locking-shift-3 */
3061 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3062 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3063 goto invalid_code;
3064 CODING_ISO_INVOCATION (coding, 0) = 3;
3065 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3066 continue;
3067
3068 case 'N': /* invocation of single-shift-2 */
3069 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3070 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3071 goto invalid_code;
3072 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3073 ONE_MORE_BYTE (c1);
3074 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3075 goto invalid_code;
3076 break;
3077
3078 case 'O': /* invocation of single-shift-3 */
3079 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3080 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3081 goto invalid_code;
3082 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3083 ONE_MORE_BYTE (c1);
3084 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3085 goto invalid_code;
3086 break;
3087
3088 case '0': case '2': case '3': case '4': /* start composition */
3089 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3090 goto invalid_code;
3091 DECODE_COMPOSITION_START (c1);
3092 continue;
3093
3094 case '1': /* end composition */
3095 if (composition_state == COMPOSING_NO)
3096 goto invalid_code;
3097 DECODE_COMPOSITION_END ();
3098 continue;
3099
3100 case '[': /* specification of direction */
3101 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3102 goto invalid_code;
3103 /* For the moment, nested direction is not supported.
3104 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3105 left-to-right, and nozero means right-to-left. */
3106 ONE_MORE_BYTE (c1);
3107 switch (c1)
3108 {
3109 case ']': /* end of the current direction */
3110 coding->mode &= ~CODING_MODE_DIRECTION;
3111
3112 case '0': /* end of the current direction */
3113 case '1': /* start of left-to-right direction */
3114 ONE_MORE_BYTE (c1);
3115 if (c1 == ']')
3116 coding->mode &= ~CODING_MODE_DIRECTION;
3117 else
3118 goto invalid_code;
3119 break;
3120
3121 case '2': /* start of right-to-left direction */
3122 ONE_MORE_BYTE (c1);
3123 if (c1 == ']')
3124 coding->mode |= CODING_MODE_DIRECTION;
3125 else
3126 goto invalid_code;
3127 break;
3128
3129 default:
3130 goto invalid_code;
3131 }
3132 continue;
3133
3134 case '%':
3135 ONE_MORE_BYTE (c1);
3136 if (c1 == '/')
3137 {
3138 /* CTEXT extended segment:
3139 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3140 We keep these bytes as is for the moment.
3141 They may be decoded by post-read-conversion. */
3142 int dim, M, L;
3143 int size;
3144
3145 ONE_MORE_BYTE (dim);
3146 ONE_MORE_BYTE (M);
3147 ONE_MORE_BYTE (L);
3148 size = ((M - 128) * 128) + (L - 128);
3149 if (charbuf + 8 + size > charbuf_end)
3150 goto break_loop;
3151 *charbuf++ = ISO_CODE_ESC;
3152 *charbuf++ = '%';
3153 *charbuf++ = '/';
3154 *charbuf++ = dim;
3155 *charbuf++ = BYTE8_TO_CHAR (M);
3156 *charbuf++ = BYTE8_TO_CHAR (L);
3157 while (size-- > 0)
3158 {
3159 ONE_MORE_BYTE (c1);
3160 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3161 }
3162 }
3163 else if (c1 == 'G')
3164 {
3165 /* XFree86 extension for embedding UTF-8 in CTEXT:
3166 ESC % G --UTF-8-BYTES-- ESC % @
3167 We keep these bytes as is for the moment.
3168 They may be decoded by post-read-conversion. */
3169 int *p = charbuf;
3170
3171 if (p + 6 > charbuf_end)
3172 goto break_loop;
3173 *p++ = ISO_CODE_ESC;
3174 *p++ = '%';
3175 *p++ = 'G';
3176 while (p < charbuf_end)
3177 {
3178 ONE_MORE_BYTE (c1);
3179 if (c1 == ISO_CODE_ESC
3180 && src + 1 < src_end
3181 && src[0] == '%'
3182 && src[1] == '@')
3183 break;
3184 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3185 }
3186 if (p + 3 > charbuf_end)
3187 goto break_loop;
3188 *p++ = ISO_CODE_ESC;
3189 *p++ = '%';
3190 *p++ = '@';
3191 charbuf = p;
3192 }
3193 else
3194 goto invalid_code;
3195 continue;
3196 break;
3197
3198 default:
3199 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3200 goto invalid_code;
3201 if (c1 >= 0x28 && c1 <= 0x2B)
3202 { /* designation of DIMENSION1_CHARS94 character set */
3203 ONE_MORE_BYTE (c2);
3204 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3205 }
3206 else if (c1 >= 0x2C && c1 <= 0x2F)
3207 { /* designation of DIMENSION1_CHARS96 character set */
3208 ONE_MORE_BYTE (c2);
3209 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3210 }
3211 else
3212 goto invalid_code;
3213 /* We must update these variables now. */
3214 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3215 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3216 continue;
3217 }
3218 }
3219
3220 if (charset->id != charset_ascii
3221 && last_id != charset->id)
3222 {
3223 if (last_id != charset_ascii)
3224 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3225 last_id = charset->id;
3226 last_offset = char_offset;
3227 }
3228
3229 /* Now we know CHARSET and 1st position code C1 of a character.
3230 Produce a decoded character while getting 2nd position code
3231 C2 if necessary. */
3232 c1 &= 0x7F;
3233 if (CHARSET_DIMENSION (charset) > 1)
3234 {
3235 ONE_MORE_BYTE (c2);
3236 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3237 /* C2 is not in a valid range. */
3238 goto invalid_code;
3239 c1 = (c1 << 8) | (c2 & 0x7F);
3240 if (CHARSET_DIMENSION (charset) > 2)
3241 {
3242 ONE_MORE_BYTE (c2);
3243 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3244 /* C2 is not in a valid range. */
3245 goto invalid_code;
3246 c1 = (c1 << 8) | (c2 & 0x7F);
3247 }
3248 }
3249
3250 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3251 if (c < 0)
3252 {
3253 MAYBE_FINISH_COMPOSITION ();
3254 for (; src_base < src; src_base++, char_offset++)
3255 {
3256 if (ASCII_BYTE_P (*src_base))
3257 *charbuf++ = *src_base;
3258 else
3259 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3260 }
3261 }
3262 else if (composition_state == COMPOSING_NO)
3263 {
3264 *charbuf++ = c;
3265 char_offset++;
3266 }
3267 else
3268 {
3269 components[component_idx++] = c;
3270 if (method == COMPOSITION_WITH_RULE
3271 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3272 && composition_state == COMPOSING_COMPONENT_CHAR))
3273 composition_state++;
3274 }
3275 continue;
3276
3277 invalid_code:
3278 MAYBE_FINISH_COMPOSITION ();
3279 src = src_base;
3280 consumed_chars = consumed_chars_base;
3281 ONE_MORE_BYTE (c);
3282 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3283 char_offset++;
3284 coding->errors++;
3285 continue;
3286
3287 break_loop:
3288 break;
3289 }
3290
3291 no_more_source:
3292 if (last_id != charset_ascii)
3293 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3294 coding->consumed_char += consumed_chars_base;
3295 coding->consumed = src_base - coding->source;
3296 coding->charbuf_used = charbuf - coding->charbuf;
3297 }
3298
3299
3300 /* ISO2022 encoding stuff. */
3301
3302 /*
3303 It is not enough to say just "ISO2022" on encoding, we have to
3304 specify more details. In Emacs, each coding system of ISO2022
3305 variant has the following specifications:
3306 1. Initial designation to G0 thru G3.
3307 2. Allows short-form designation?
3308 3. ASCII should be designated to G0 before control characters?
3309 4. ASCII should be designated to G0 at end of line?
3310 5. 7-bit environment or 8-bit environment?
3311 6. Use locking-shift?
3312 7. Use Single-shift?
3313 And the following two are only for Japanese:
3314 8. Use ASCII in place of JIS0201-1976-Roman?
3315 9. Use JISX0208-1983 in place of JISX0208-1978?
3316 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3317 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3318 details.
3319 */
3320
3321 /* Produce codes (escape sequence) for designating CHARSET to graphic
3322 register REG at DST, and increment DST. If <final-char> of CHARSET is
3323 '@', 'A', or 'B' and the coding system CODING allows, produce
3324 designation sequence of short-form. */
3325
3326 #define ENCODE_DESIGNATION(charset, reg, coding) \
3327 do { \
3328 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3329 char *intermediate_char_94 = "()*+"; \
3330 char *intermediate_char_96 = ",-./"; \
3331 int revision = -1; \
3332 int c; \
3333 \
3334 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3335 revision = CHARSET_ISO_REVISION (charset); \
3336 \
3337 if (revision >= 0) \
3338 { \
3339 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3340 EMIT_ONE_BYTE ('@' + revision); \
3341 } \
3342 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3343 if (CHARSET_DIMENSION (charset) == 1) \
3344 { \
3345 if (! CHARSET_ISO_CHARS_96 (charset)) \
3346 c = intermediate_char_94[reg]; \
3347 else \
3348 c = intermediate_char_96[reg]; \
3349 EMIT_ONE_ASCII_BYTE (c); \
3350 } \
3351 else \
3352 { \
3353 EMIT_ONE_ASCII_BYTE ('$'); \
3354 if (! CHARSET_ISO_CHARS_96 (charset)) \
3355 { \
3356 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3357 || reg != 0 \
3358 || final_char < '@' || final_char > 'B') \
3359 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3360 } \
3361 else \
3362 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3363 } \
3364 EMIT_ONE_ASCII_BYTE (final_char); \
3365 \
3366 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3367 } while (0)
3368
3369
3370 /* The following two macros produce codes (control character or escape
3371 sequence) for ISO2022 single-shift functions (single-shift-2 and
3372 single-shift-3). */
3373
3374 #define ENCODE_SINGLE_SHIFT_2 \
3375 do { \
3376 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3377 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3378 else \
3379 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3380 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3381 } while (0)
3382
3383
3384 #define ENCODE_SINGLE_SHIFT_3 \
3385 do { \
3386 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3387 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3388 else \
3389 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3390 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3391 } while (0)
3392
3393
3394 /* The following four macros produce codes (control character or
3395 escape sequence) for ISO2022 locking-shift functions (shift-in,
3396 shift-out, locking-shift-2, and locking-shift-3). */
3397
3398 #define ENCODE_SHIFT_IN \
3399 do { \
3400 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3401 CODING_ISO_INVOCATION (coding, 0) = 0; \
3402 } while (0)
3403
3404
3405 #define ENCODE_SHIFT_OUT \
3406 do { \
3407 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3408 CODING_ISO_INVOCATION (coding, 0) = 1; \
3409 } while (0)
3410
3411
3412 #define ENCODE_LOCKING_SHIFT_2 \
3413 do { \
3414 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3415 CODING_ISO_INVOCATION (coding, 0) = 2; \
3416 } while (0)
3417
3418
3419 #define ENCODE_LOCKING_SHIFT_3 \
3420 do { \
3421 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3422 CODING_ISO_INVOCATION (coding, 0) = 3; \
3423 } while (0)
3424
3425
3426 /* Produce codes for a DIMENSION1 character whose character set is
3427 CHARSET and whose position-code is C1. Designation and invocation
3428 sequences are also produced in advance if necessary. */
3429
3430 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3431 do { \
3432 int id = CHARSET_ID (charset); \
3433 \
3434 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3435 && id == charset_ascii) \
3436 { \
3437 id = charset_jisx0201_roman; \
3438 charset = CHARSET_FROM_ID (id); \
3439 } \
3440 \
3441 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3442 { \
3443 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3444 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3445 else \
3446 EMIT_ONE_BYTE (c1 | 0x80); \
3447 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3448 break; \
3449 } \
3450 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3451 { \
3452 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3453 break; \
3454 } \
3455 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3456 { \
3457 EMIT_ONE_BYTE (c1 | 0x80); \
3458 break; \
3459 } \
3460 else \
3461 /* Since CHARSET is not yet invoked to any graphic planes, we \
3462 must invoke it, or, at first, designate it to some graphic \
3463 register. Then repeat the loop to actually produce the \
3464 character. */ \
3465 dst = encode_invocation_designation (charset, coding, dst, \
3466 &produced_chars); \
3467 } while (1)
3468
3469
3470 /* Produce codes for a DIMENSION2 character whose character set is
3471 CHARSET and whose position-codes are C1 and C2. Designation and
3472 invocation codes are also produced in advance if necessary. */
3473
3474 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3475 do { \
3476 int id = CHARSET_ID (charset); \
3477 \
3478 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3479 && id == charset_jisx0208) \
3480 { \
3481 id = charset_jisx0208_1978; \
3482 charset = CHARSET_FROM_ID (id); \
3483 } \
3484 \
3485 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3486 { \
3487 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3488 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3489 else \
3490 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3491 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3492 break; \
3493 } \
3494 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3495 { \
3496 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3497 break; \
3498 } \
3499 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3500 { \
3501 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3502 break; \
3503 } \
3504 else \
3505 /* Since CHARSET is not yet invoked to any graphic planes, we \
3506 must invoke it, or, at first, designate it to some graphic \
3507 register. Then repeat the loop to actually produce the \
3508 character. */ \
3509 dst = encode_invocation_designation (charset, coding, dst, \
3510 &produced_chars); \
3511 } while (1)
3512
3513
3514 #define ENCODE_ISO_CHARACTER(charset, c) \
3515 do { \
3516 int code = ENCODE_CHAR ((charset),(c)); \
3517 \
3518 if (CHARSET_DIMENSION (charset) == 1) \
3519 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3520 else \
3521 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3522 } while (0)
3523
3524
3525 /* Produce designation and invocation codes at a place pointed by DST
3526 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3527 Return new DST. */
3528
3529 unsigned char *
3530 encode_invocation_designation (charset, coding, dst, p_nchars)
3531 struct charset *charset;
3532 struct coding_system *coding;
3533 unsigned char *dst;
3534 int *p_nchars;
3535 {
3536 int multibytep = coding->dst_multibyte;
3537 int produced_chars = *p_nchars;
3538 int reg; /* graphic register number */
3539 int id = CHARSET_ID (charset);
3540
3541 /* At first, check designations. */
3542 for (reg = 0; reg < 4; reg++)
3543 if (id == CODING_ISO_DESIGNATION (coding, reg))
3544 break;
3545
3546 if (reg >= 4)
3547 {
3548 /* CHARSET is not yet designated to any graphic registers. */
3549 /* At first check the requested designation. */
3550 reg = CODING_ISO_REQUEST (coding, id);
3551 if (reg < 0)
3552 /* Since CHARSET requests no special designation, designate it
3553 to graphic register 0. */
3554 reg = 0;
3555
3556 ENCODE_DESIGNATION (charset, reg, coding);
3557 }
3558
3559 if (CODING_ISO_INVOCATION (coding, 0) != reg
3560 && CODING_ISO_INVOCATION (coding, 1) != reg)
3561 {
3562 /* Since the graphic register REG is not invoked to any graphic
3563 planes, invoke it to graphic plane 0. */
3564 switch (reg)
3565 {
3566 case 0: /* graphic register 0 */
3567 ENCODE_SHIFT_IN;
3568 break;
3569
3570 case 1: /* graphic register 1 */
3571 ENCODE_SHIFT_OUT;
3572 break;
3573
3574 case 2: /* graphic register 2 */
3575 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3576 ENCODE_SINGLE_SHIFT_2;
3577 else
3578 ENCODE_LOCKING_SHIFT_2;
3579 break;
3580
3581 case 3: /* graphic register 3 */
3582 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3583 ENCODE_SINGLE_SHIFT_3;
3584 else
3585 ENCODE_LOCKING_SHIFT_3;
3586 break;
3587 }
3588 }
3589
3590 *p_nchars = produced_chars;
3591 return dst;
3592 }
3593
3594 /* The following three macros produce codes for indicating direction
3595 of text. */
3596 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3597 do { \
3598 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3599 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3600 else \
3601 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3602 } while (0)
3603
3604
3605 #define ENCODE_DIRECTION_R2L() \
3606 do { \
3607 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3608 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3609 } while (0)
3610
3611
3612 #define ENCODE_DIRECTION_L2R() \
3613 do { \
3614 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3615 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3616 } while (0)
3617
3618
3619 /* Produce codes for designation and invocation to reset the graphic
3620 planes and registers to initial state. */
3621 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3622 do { \
3623 int reg; \
3624 struct charset *charset; \
3625 \
3626 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3627 ENCODE_SHIFT_IN; \
3628 for (reg = 0; reg < 4; reg++) \
3629 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3630 && (CODING_ISO_DESIGNATION (coding, reg) \
3631 != CODING_ISO_INITIAL (coding, reg))) \
3632 { \
3633 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3634 ENCODE_DESIGNATION (charset, reg, coding); \
3635 } \
3636 } while (0)
3637
3638
3639 /* Produce designation sequences of charsets in the line started from
3640 SRC to a place pointed by DST, and return updated DST.
3641
3642 If the current block ends before any end-of-line, we may fail to
3643 find all the necessary designations. */
3644
3645 static unsigned char *
3646 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3647 struct coding_system *coding;
3648 int *charbuf, *charbuf_end;
3649 unsigned char *dst;
3650 {
3651 struct charset *charset;
3652 /* Table of charsets to be designated to each graphic register. */
3653 int r[4];
3654 int c, found = 0, reg;
3655 int produced_chars = 0;
3656 int multibytep = coding->dst_multibyte;
3657 Lisp_Object attrs;
3658 Lisp_Object charset_list;
3659
3660 attrs = CODING_ID_ATTRS (coding->id);
3661 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3662 if (EQ (charset_list, Qiso_2022))
3663 charset_list = Viso_2022_charset_list;
3664
3665 for (reg = 0; reg < 4; reg++)
3666 r[reg] = -1;
3667
3668 while (found < 4)
3669 {
3670 int id;
3671
3672 c = *charbuf++;
3673 if (c == '\n')
3674 break;
3675 charset = char_charset (c, charset_list, NULL);
3676 id = CHARSET_ID (charset);
3677 reg = CODING_ISO_REQUEST (coding, id);
3678 if (reg >= 0 && r[reg] < 0)
3679 {
3680 found++;
3681 r[reg] = id;
3682 }
3683 }
3684
3685 if (found)
3686 {
3687 for (reg = 0; reg < 4; reg++)
3688 if (r[reg] >= 0
3689 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3690 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3691 }
3692
3693 return dst;
3694 }
3695
3696 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3697
3698 static int
3699 encode_coding_iso_2022 (coding)
3700 struct coding_system *coding;
3701 {
3702 int multibytep = coding->dst_multibyte;
3703 int *charbuf = coding->charbuf;
3704 int *charbuf_end = charbuf + coding->charbuf_used;
3705 unsigned char *dst = coding->destination + coding->produced;
3706 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3707 int safe_room = 16;
3708 int bol_designation
3709 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3710 && CODING_ISO_BOL (coding));
3711 int produced_chars = 0;
3712 Lisp_Object attrs, eol_type, charset_list;
3713 int ascii_compatible;
3714 int c;
3715 int preferred_charset_id = -1;
3716
3717 CODING_GET_INFO (coding, attrs, charset_list);
3718 eol_type = CODING_ID_EOL_TYPE (coding->id);
3719 if (VECTORP (eol_type))
3720 eol_type = Qunix;
3721
3722 setup_iso_safe_charsets (attrs);
3723 /* Charset list may have been changed. */
3724 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3725 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3726
3727 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3728
3729 while (charbuf < charbuf_end)
3730 {
3731 ASSURE_DESTINATION (safe_room);
3732
3733 if (bol_designation)
3734 {
3735 unsigned char *dst_prev = dst;
3736
3737 /* We have to produce designation sequences if any now. */
3738 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3739 bol_designation = 0;
3740 /* We are sure that designation sequences are all ASCII bytes. */
3741 produced_chars += dst - dst_prev;
3742 }
3743
3744 c = *charbuf++;
3745
3746 if (c < 0)
3747 {
3748 /* Handle an annotation. */
3749 switch (*charbuf)
3750 {
3751 case CODING_ANNOTATE_COMPOSITION_MASK:
3752 /* Not yet implemented. */
3753 break;
3754 case CODING_ANNOTATE_CHARSET_MASK:
3755 preferred_charset_id = charbuf[3];
3756 if (preferred_charset_id >= 0
3757 && NILP (Fmemq (make_number (preferred_charset_id),
3758 charset_list)))
3759 preferred_charset_id = -1;
3760 break;
3761 default:
3762 abort ();
3763 }
3764 charbuf += -c - 1;
3765 continue;
3766 }
3767
3768 /* Now encode the character C. */
3769 if (c < 0x20 || c == 0x7F)
3770 {
3771 if (c == '\n'
3772 || (c == '\r' && EQ (eol_type, Qmac)))
3773 {
3774 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3775 ENCODE_RESET_PLANE_AND_REGISTER ();
3776 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3777 {
3778 int i;
3779
3780 for (i = 0; i < 4; i++)
3781 CODING_ISO_DESIGNATION (coding, i)
3782 = CODING_ISO_INITIAL (coding, i);
3783 }
3784 bol_designation
3785 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3786 }
3787 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3788 ENCODE_RESET_PLANE_AND_REGISTER ();
3789 EMIT_ONE_ASCII_BYTE (c);
3790 }
3791 else if (ASCII_CHAR_P (c))
3792 {
3793 if (ascii_compatible)
3794 EMIT_ONE_ASCII_BYTE (c);
3795 else
3796 {
3797 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3798 ENCODE_ISO_CHARACTER (charset, c);
3799 }
3800 }
3801 else if (CHAR_BYTE8_P (c))
3802 {
3803 c = CHAR_TO_BYTE8 (c);
3804 EMIT_ONE_BYTE (c);
3805 }
3806 else
3807 {
3808 struct charset *charset;
3809
3810 if (preferred_charset_id >= 0)
3811 {
3812 charset = CHARSET_FROM_ID (preferred_charset_id);
3813 if (! CHAR_CHARSET_P (c, charset))
3814 charset = char_charset (c, charset_list, NULL);
3815 }
3816 else
3817 charset = char_charset (c, charset_list, NULL);
3818 if (!charset)
3819 {
3820 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3821 {
3822 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3823 charset = CHARSET_FROM_ID (charset_ascii);
3824 }
3825 else
3826 {
3827 c = coding->default_char;
3828 charset = char_charset (c, charset_list, NULL);
3829 }
3830 }
3831 ENCODE_ISO_CHARACTER (charset, c);
3832 }
3833 }
3834
3835 if (coding->mode & CODING_MODE_LAST_BLOCK
3836 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3837 {
3838 ASSURE_DESTINATION (safe_room);
3839 ENCODE_RESET_PLANE_AND_REGISTER ();
3840 }
3841 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3842 CODING_ISO_BOL (coding) = bol_designation;
3843 coding->produced_char += produced_chars;
3844 coding->produced = dst - coding->destination;
3845 return 0;
3846 }
3847
3848 \f
3849 /*** 8,9. SJIS and BIG5 handlers ***/
3850
3851 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3852 quite widely. So, for the moment, Emacs supports them in the bare
3853 C code. But, in the future, they may be supported only by CCL. */
3854
3855 /* SJIS is a coding system encoding three character sets: ASCII, right
3856 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3857 as is. A character of charset katakana-jisx0201 is encoded by
3858 "position-code + 0x80". A character of charset japanese-jisx0208
3859 is encoded in 2-byte but two position-codes are divided and shifted
3860 so that it fit in the range below.
3861
3862 --- CODE RANGE of SJIS ---
3863 (character set) (range)
3864 ASCII 0x00 .. 0x7F
3865 KATAKANA-JISX0201 0xA0 .. 0xDF
3866 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3867 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3868 -------------------------------
3869
3870 */
3871
3872 /* BIG5 is a coding system encoding two character sets: ASCII and
3873 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3874 character set and is encoded in two-byte.
3875
3876 --- CODE RANGE of BIG5 ---
3877 (character set) (range)
3878 ASCII 0x00 .. 0x7F
3879 Big5 (1st byte) 0xA1 .. 0xFE
3880 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3881 --------------------------
3882
3883 */
3884
3885 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3886 Check if a text is encoded in SJIS. If it is, return
3887 CATEGORY_MASK_SJIS, else return 0. */
3888
3889 static int
3890 detect_coding_sjis (coding, detect_info)
3891 struct coding_system *coding;
3892 struct coding_detection_info *detect_info;
3893 {
3894 const unsigned char *src = coding->source, *src_base;
3895 const unsigned char *src_end = coding->source + coding->src_bytes;
3896 int multibytep = coding->src_multibyte;
3897 int consumed_chars = 0;
3898 int found = 0;
3899 int c;
3900
3901 detect_info->checked |= CATEGORY_MASK_SJIS;
3902 /* A coding system of this category is always ASCII compatible. */
3903 src += coding->head_ascii;
3904
3905 while (1)
3906 {
3907 src_base = src;
3908 ONE_MORE_BYTE (c);
3909 if (c < 0x80)
3910 continue;
3911 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3912 {
3913 ONE_MORE_BYTE (c);
3914 if (c < 0x40 || c == 0x7F || c > 0xFC)
3915 break;
3916 found = CATEGORY_MASK_SJIS;
3917 }
3918 else if (c >= 0xA0 && c < 0xE0)
3919 found = CATEGORY_MASK_SJIS;
3920 else
3921 break;
3922 }
3923 detect_info->rejected |= CATEGORY_MASK_SJIS;
3924 return 0;
3925
3926 no_more_source:
3927 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3928 {
3929 detect_info->rejected |= CATEGORY_MASK_SJIS;
3930 return 0;
3931 }
3932 detect_info->found |= found;
3933 return 1;
3934 }
3935
3936 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3937 Check if a text is encoded in BIG5. If it is, return
3938 CATEGORY_MASK_BIG5, else return 0. */
3939
3940 static int
3941 detect_coding_big5 (coding, detect_info)
3942 struct coding_system *coding;
3943 struct coding_detection_info *detect_info;
3944 {
3945 const unsigned char *src = coding->source, *src_base;
3946 const unsigned char *src_end = coding->source + coding->src_bytes;
3947 int multibytep = coding->src_multibyte;
3948 int consumed_chars = 0;
3949 int found = 0;
3950 int c;
3951
3952 detect_info->checked |= CATEGORY_MASK_BIG5;
3953 /* A coding system of this category is always ASCII compatible. */
3954 src += coding->head_ascii;
3955
3956 while (1)
3957 {
3958 src_base = src;
3959 ONE_MORE_BYTE (c);
3960 if (c < 0x80)
3961 continue;
3962 if (c >= 0xA1)
3963 {
3964 ONE_MORE_BYTE (c);
3965 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3966 return 0;
3967 found = CATEGORY_MASK_BIG5;
3968 }
3969 else
3970 break;
3971 }
3972 detect_info->rejected |= CATEGORY_MASK_BIG5;
3973 return 0;
3974
3975 no_more_source:
3976 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3977 {
3978 detect_info->rejected |= CATEGORY_MASK_BIG5;
3979 return 0;
3980 }
3981 detect_info->found |= found;
3982 return 1;
3983 }
3984
3985 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3986 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3987
3988 static void
3989 decode_coding_sjis (coding)
3990 struct coding_system *coding;
3991 {
3992 const unsigned char *src = coding->source + coding->consumed;
3993 const unsigned char *src_end = coding->source + coding->src_bytes;
3994 const unsigned char *src_base;
3995 int *charbuf = coding->charbuf;
3996 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3997 int consumed_chars = 0, consumed_chars_base;
3998 int multibytep = coding->src_multibyte;
3999 struct charset *charset_roman, *charset_kanji, *charset_kana;
4000 Lisp_Object attrs, charset_list, val;
4001 int char_offset = coding->produced_char;
4002 int last_offset = char_offset;
4003 int last_id = charset_ascii;
4004
4005 CODING_GET_INFO (coding, attrs, charset_list);
4006
4007 val = charset_list;
4008 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4009 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4010 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4011
4012 while (1)
4013 {
4014 int c, c1;
4015 struct charset *charset;
4016
4017 src_base = src;
4018 consumed_chars_base = consumed_chars;
4019
4020 if (charbuf >= charbuf_end)
4021 break;
4022
4023 ONE_MORE_BYTE (c);
4024 if (c < 0)
4025 goto invalid_code;
4026 if (c < 0x80)
4027 charset = charset_roman;
4028 else
4029 {
4030 if (c >= 0xF0)
4031 goto invalid_code;
4032 if (c < 0xA0 || c >= 0xE0)
4033 {
4034 /* SJIS -> JISX0208 */
4035 ONE_MORE_BYTE (c1);
4036 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4037 goto invalid_code;
4038 c = (c << 8) | c1;
4039 SJIS_TO_JIS (c);
4040 charset = charset_kanji;
4041 }
4042 else if (c > 0xA0)
4043 {
4044 /* SJIS -> JISX0201-Kana */
4045 c &= 0x7F;
4046 charset = charset_kana;
4047 }
4048 else
4049 goto invalid_code;
4050 }
4051 if (charset->id != charset_ascii
4052 && last_id != charset->id)
4053 {
4054 if (last_id != charset_ascii)
4055 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4056 last_id = charset->id;
4057 last_offset = char_offset;
4058 }
4059 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4060 *charbuf++ = c;
4061 char_offset++;
4062 continue;
4063
4064 invalid_code:
4065 src = src_base;
4066 consumed_chars = consumed_chars_base;
4067 ONE_MORE_BYTE (c);
4068 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4069 char_offset++;
4070 coding->errors++;
4071 }
4072
4073 no_more_source:
4074 if (last_id != charset_ascii)
4075 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4076 coding->consumed_char += consumed_chars_base;
4077 coding->consumed = src_base - coding->source;
4078 coding->charbuf_used = charbuf - coding->charbuf;
4079 }
4080
4081 static void
4082 decode_coding_big5 (coding)
4083 struct coding_system *coding;
4084 {
4085 const unsigned char *src = coding->source + coding->consumed;
4086 const unsigned char *src_end = coding->source + coding->src_bytes;
4087 const unsigned char *src_base;
4088 int *charbuf = coding->charbuf;
4089 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4090 int consumed_chars = 0, consumed_chars_base;
4091 int multibytep = coding->src_multibyte;
4092 struct charset *charset_roman, *charset_big5;
4093 Lisp_Object attrs, charset_list, val;
4094 int char_offset = coding->produced_char;
4095 int last_offset = char_offset;
4096 int last_id = charset_ascii;
4097
4098 CODING_GET_INFO (coding, attrs, charset_list);
4099 val = charset_list;
4100 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4101 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4102
4103 while (1)
4104 {
4105 int c, c1;
4106 struct charset *charset;
4107
4108 src_base = src;
4109 consumed_chars_base = consumed_chars;
4110
4111 if (charbuf >= charbuf_end)
4112 break;
4113
4114 ONE_MORE_BYTE (c);
4115
4116 if (c < 0)
4117 goto invalid_code;
4118 if (c < 0x80)
4119 charset = charset_roman;
4120 else
4121 {
4122 /* BIG5 -> Big5 */
4123 if (c < 0xA1 || c > 0xFE)
4124 goto invalid_code;
4125 ONE_MORE_BYTE (c1);
4126 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4127 goto invalid_code;
4128 c = c << 8 | c1;
4129 charset = charset_big5;
4130 }
4131 if (charset->id != charset_ascii
4132 && last_id != charset->id)
4133 {
4134 if (last_id != charset_ascii)
4135 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4136 last_id = charset->id;
4137 last_offset = char_offset;
4138 }
4139 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4140 *charbuf++ = c;
4141 char_offset++;
4142 continue;
4143
4144 invalid_code:
4145 src = src_base;
4146 consumed_chars = consumed_chars_base;
4147 ONE_MORE_BYTE (c);
4148 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4149 char_offset++;
4150 coding->errors++;
4151 }
4152
4153 no_more_source:
4154 if (last_id != charset_ascii)
4155 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4156 coding->consumed_char += consumed_chars_base;
4157 coding->consumed = src_base - coding->source;
4158 coding->charbuf_used = charbuf - coding->charbuf;
4159 }
4160
4161 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4162 This function can encode charsets `ascii', `katakana-jisx0201',
4163 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4164 are sure that all these charsets are registered as official charset
4165 (i.e. do not have extended leading-codes). Characters of other
4166 charsets are produced without any encoding. If SJIS_P is 1, encode
4167 SJIS text, else encode BIG5 text. */
4168
4169 static int
4170 encode_coding_sjis (coding)
4171 struct coding_system *coding;
4172 {
4173 int multibytep = coding->dst_multibyte;
4174 int *charbuf = coding->charbuf;
4175 int *charbuf_end = charbuf + coding->charbuf_used;
4176 unsigned char *dst = coding->destination + coding->produced;
4177 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4178 int safe_room = 4;
4179 int produced_chars = 0;
4180 Lisp_Object attrs, charset_list, val;
4181 int ascii_compatible;
4182 struct charset *charset_roman, *charset_kanji, *charset_kana;
4183 int c;
4184
4185 CODING_GET_INFO (coding, attrs, charset_list);
4186 val = charset_list;
4187 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4188 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4189 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4190
4191 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4192
4193 while (charbuf < charbuf_end)
4194 {
4195 ASSURE_DESTINATION (safe_room);
4196 c = *charbuf++;
4197 /* Now encode the character C. */
4198 if (ASCII_CHAR_P (c) && ascii_compatible)
4199 EMIT_ONE_ASCII_BYTE (c);
4200 else if (CHAR_BYTE8_P (c))
4201 {
4202 c = CHAR_TO_BYTE8 (c);
4203 EMIT_ONE_BYTE (c);
4204 }
4205 else
4206 {
4207 unsigned code;
4208 struct charset *charset = char_charset (c, charset_list, &code);
4209
4210 if (!charset)
4211 {
4212 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4213 {
4214 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4215 charset = CHARSET_FROM_ID (charset_ascii);
4216 }
4217 else
4218 {
4219 c = coding->default_char;
4220 charset = char_charset (c, charset_list, &code);
4221 }
4222 }
4223 if (code == CHARSET_INVALID_CODE (charset))
4224 abort ();
4225 if (charset == charset_kanji)
4226 {
4227 int c1, c2;
4228 JIS_TO_SJIS (code);
4229 c1 = code >> 8, c2 = code & 0xFF;
4230 EMIT_TWO_BYTES (c1, c2);
4231 }
4232 else if (charset == charset_kana)
4233 EMIT_ONE_BYTE (code | 0x80);
4234 else
4235 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4236 }
4237 }
4238 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4239 coding->produced_char += produced_chars;
4240 coding->produced = dst - coding->destination;
4241 return 0;
4242 }
4243
4244 static int
4245 encode_coding_big5 (coding)
4246 struct coding_system *coding;
4247 {
4248 int multibytep = coding->dst_multibyte;
4249 int *charbuf = coding->charbuf;
4250 int *charbuf_end = charbuf + coding->charbuf_used;
4251 unsigned char *dst = coding->destination + coding->produced;
4252 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4253 int safe_room = 4;
4254 int produced_chars = 0;
4255 Lisp_Object attrs, charset_list, val;
4256 int ascii_compatible;
4257 struct charset *charset_roman, *charset_big5;
4258 int c;
4259
4260 CODING_GET_INFO (coding, attrs, charset_list);
4261 val = charset_list;
4262 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4263 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4264 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4265
4266 while (charbuf < charbuf_end)
4267 {
4268 ASSURE_DESTINATION (safe_room);
4269 c = *charbuf++;
4270 /* Now encode the character C. */
4271 if (ASCII_CHAR_P (c) && ascii_compatible)
4272 EMIT_ONE_ASCII_BYTE (c);
4273 else if (CHAR_BYTE8_P (c))
4274 {
4275 c = CHAR_TO_BYTE8 (c);
4276 EMIT_ONE_BYTE (c);
4277 }
4278 else
4279 {
4280 unsigned code;
4281 struct charset *charset = char_charset (c, charset_list, &code);
4282
4283 if (! charset)
4284 {
4285 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4286 {
4287 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4288 charset = CHARSET_FROM_ID (charset_ascii);
4289 }
4290 else
4291 {
4292 c = coding->default_char;
4293 charset = char_charset (c, charset_list, &code);
4294 }
4295 }
4296 if (code == CHARSET_INVALID_CODE (charset))
4297 abort ();
4298 if (charset == charset_big5)
4299 {
4300 int c1, c2;
4301
4302 c1 = code >> 8, c2 = code & 0xFF;
4303 EMIT_TWO_BYTES (c1, c2);
4304 }
4305 else
4306 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4307 }
4308 }
4309 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4310 coding->produced_char += produced_chars;
4311 coding->produced = dst - coding->destination;
4312 return 0;
4313 }
4314
4315 \f
4316 /*** 10. CCL handlers ***/
4317
4318 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4319 Check if a text is encoded in a coding system of which
4320 encoder/decoder are written in CCL program. If it is, return
4321 CATEGORY_MASK_CCL, else return 0. */
4322
4323 static int
4324 detect_coding_ccl (coding, detect_info)
4325 struct coding_system *coding;
4326 struct coding_detection_info *detect_info;
4327 {
4328 const unsigned char *src = coding->source, *src_base;
4329 const unsigned char *src_end = coding->source + coding->src_bytes;
4330 int multibytep = coding->src_multibyte;
4331 int consumed_chars = 0;
4332 int found = 0;
4333 unsigned char *valids = CODING_CCL_VALIDS (coding);
4334 int head_ascii = coding->head_ascii;
4335 Lisp_Object attrs;
4336
4337 detect_info->checked |= CATEGORY_MASK_CCL;
4338
4339 coding = &coding_categories[coding_category_ccl];
4340 attrs = CODING_ID_ATTRS (coding->id);
4341 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4342 src += head_ascii;
4343
4344 while (1)
4345 {
4346 int c;
4347
4348 src_base = src;
4349 ONE_MORE_BYTE (c);
4350 if (c < 0 || ! valids[c])
4351 break;
4352 if ((valids[c] > 1))
4353 found = CATEGORY_MASK_CCL;
4354 }
4355 detect_info->rejected |= CATEGORY_MASK_CCL;
4356 return 0;
4357
4358 no_more_source:
4359 detect_info->found |= found;
4360 return 1;
4361 }
4362
4363 static void
4364 decode_coding_ccl (coding)
4365 struct coding_system *coding;
4366 {
4367 const unsigned char *src = coding->source + coding->consumed;
4368 const unsigned char *src_end = coding->source + coding->src_bytes;
4369 int *charbuf = coding->charbuf;
4370 int *charbuf_end = charbuf + coding->charbuf_size;
4371 int consumed_chars = 0;
4372 int multibytep = coding->src_multibyte;
4373 struct ccl_program ccl;
4374 int source_charbuf[1024];
4375 int source_byteidx[1024];
4376 Lisp_Object attrs, charset_list;
4377
4378 CODING_GET_INFO (coding, attrs, charset_list);
4379 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4380
4381 while (src < src_end)
4382 {
4383 const unsigned char *p = src;
4384 int *source, *source_end;
4385 int i = 0;
4386
4387 if (multibytep)
4388 while (i < 1024 && p < src_end)
4389 {
4390 source_byteidx[i] = p - src;
4391 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4392 }
4393 else
4394 while (i < 1024 && p < src_end)
4395 source_charbuf[i++] = *p++;
4396
4397 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4398 ccl.last_block = 1;
4399
4400 source = source_charbuf;
4401 source_end = source + i;
4402 while (source < source_end)
4403 {
4404 ccl_driver (&ccl, source, charbuf,
4405 source_end - source, charbuf_end - charbuf,
4406 charset_list);
4407 source += ccl.consumed;
4408 charbuf += ccl.produced;
4409 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4410 break;
4411 }
4412 if (source < source_end)
4413 src += source_byteidx[source - source_charbuf];
4414 else
4415 src = p;
4416 consumed_chars += source - source_charbuf;
4417
4418 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4419 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4420 break;
4421 }
4422
4423 switch (ccl.status)
4424 {
4425 case CCL_STAT_SUSPEND_BY_SRC:
4426 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4427 break;
4428 case CCL_STAT_SUSPEND_BY_DST:
4429 break;
4430 case CCL_STAT_QUIT:
4431 case CCL_STAT_INVALID_CMD:
4432 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4433 break;
4434 default:
4435 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4436 break;
4437 }
4438 coding->consumed_char += consumed_chars;
4439 coding->consumed = src - coding->source;
4440 coding->charbuf_used = charbuf - coding->charbuf;
4441 }
4442
4443 static int
4444 encode_coding_ccl (coding)
4445 struct coding_system *coding;
4446 {
4447 struct ccl_program ccl;
4448 int multibytep = coding->dst_multibyte;
4449 int *charbuf = coding->charbuf;
4450 int *charbuf_end = charbuf + coding->charbuf_used;
4451 unsigned char *dst = coding->destination + coding->produced;
4452 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4453 unsigned char *adjusted_dst_end = dst_end - 1;
4454 int destination_charbuf[1024];
4455 int i, produced_chars = 0;
4456 Lisp_Object attrs, charset_list;
4457
4458 CODING_GET_INFO (coding, attrs, charset_list);
4459 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4460
4461 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4462 ccl.dst_multibyte = coding->dst_multibyte;
4463
4464 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4465 {
4466 int dst_bytes = dst_end - dst;
4467 if (dst_bytes > 1024)
4468 dst_bytes = 1024;
4469
4470 ccl_driver (&ccl, charbuf, destination_charbuf,
4471 charbuf_end - charbuf, dst_bytes, charset_list);
4472 charbuf += ccl.consumed;
4473 if (multibytep)
4474 for (i = 0; i < ccl.produced; i++)
4475 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4476 else
4477 {
4478 for (i = 0; i < ccl.produced; i++)
4479 *dst++ = destination_charbuf[i] & 0xFF;
4480 produced_chars += ccl.produced;
4481 }
4482 }
4483
4484 switch (ccl.status)
4485 {
4486 case CCL_STAT_SUSPEND_BY_SRC:
4487 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4488 break;
4489 case CCL_STAT_SUSPEND_BY_DST:
4490 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4491 break;
4492 case CCL_STAT_QUIT:
4493 case CCL_STAT_INVALID_CMD:
4494 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4495 break;
4496 default:
4497 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4498 break;
4499 }
4500
4501 coding->produced_char += produced_chars;
4502 coding->produced = dst - coding->destination;
4503 return 0;
4504 }
4505
4506
4507 \f
4508 /*** 10, 11. no-conversion handlers ***/
4509
4510 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4511
4512 static void
4513 decode_coding_raw_text (coding)
4514 struct coding_system *coding;
4515 {
4516 coding->chars_at_source = 1;
4517 coding->consumed_char = 0;
4518 coding->consumed = 0;
4519 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4520 }
4521
4522 static int
4523 encode_coding_raw_text (coding)
4524 struct coding_system *coding;
4525 {
4526 int multibytep = coding->dst_multibyte;
4527 int *charbuf = coding->charbuf;
4528 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4529 unsigned char *dst = coding->destination + coding->produced;
4530 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4531 int produced_chars = 0;
4532 int c;
4533
4534 if (multibytep)
4535 {
4536 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4537
4538 if (coding->src_multibyte)
4539 while (charbuf < charbuf_end)
4540 {
4541 ASSURE_DESTINATION (safe_room);
4542 c = *charbuf++;
4543 if (ASCII_CHAR_P (c))
4544 EMIT_ONE_ASCII_BYTE (c);
4545 else if (CHAR_BYTE8_P (c))
4546 {
4547 c = CHAR_TO_BYTE8 (c);
4548 EMIT_ONE_BYTE (c);
4549 }
4550 else
4551 {
4552 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4553
4554 CHAR_STRING_ADVANCE (c, p1);
4555 while (p0 < p1)
4556 {
4557 EMIT_ONE_BYTE (*p0);
4558 p0++;
4559 }
4560 }
4561 }
4562 else
4563 while (charbuf < charbuf_end)
4564 {
4565 ASSURE_DESTINATION (safe_room);
4566 c = *charbuf++;
4567 EMIT_ONE_BYTE (c);
4568 }
4569 }
4570 else
4571 {
4572 if (coding->src_multibyte)
4573 {
4574 int safe_room = MAX_MULTIBYTE_LENGTH;
4575
4576 while (charbuf < charbuf_end)
4577 {
4578 ASSURE_DESTINATION (safe_room);
4579 c = *charbuf++;
4580 if (ASCII_CHAR_P (c))
4581 *dst++ = c;
4582 else if (CHAR_BYTE8_P (c))
4583 *dst++ = CHAR_TO_BYTE8 (c);
4584 else
4585 CHAR_STRING_ADVANCE (c, dst);
4586 produced_chars++;
4587 }
4588 }
4589 else
4590 {
4591 ASSURE_DESTINATION (charbuf_end - charbuf);
4592 while (charbuf < charbuf_end && dst < dst_end)
4593 *dst++ = *charbuf++;
4594 produced_chars = dst - (coding->destination + coding->dst_bytes);
4595 }
4596 }
4597 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4598 coding->produced_char += produced_chars;
4599 coding->produced = dst - coding->destination;
4600 return 0;
4601 }
4602
4603 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4604 Check if a text is encoded in a charset-based coding system. If it
4605 is, return 1, else return 0. */
4606
4607 static int
4608 detect_coding_charset (coding, detect_info)
4609 struct coding_system *coding;
4610 struct coding_detection_info *detect_info;
4611 {
4612 const unsigned char *src = coding->source, *src_base;
4613 const unsigned char *src_end = coding->source + coding->src_bytes;
4614 int multibytep = coding->src_multibyte;
4615 int consumed_chars = 0;
4616 Lisp_Object attrs, valids;
4617 int found = 0;
4618
4619 detect_info->checked |= CATEGORY_MASK_CHARSET;
4620
4621 coding = &coding_categories[coding_category_charset];
4622 attrs = CODING_ID_ATTRS (coding->id);
4623 valids = AREF (attrs, coding_attr_charset_valids);
4624
4625 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4626 src += coding->head_ascii;
4627
4628 while (1)
4629 {
4630 int c;
4631
4632 src_base = src;
4633 ONE_MORE_BYTE (c);
4634 if (c < 0)
4635 continue;
4636 if (NILP (AREF (valids, c)))
4637 break;
4638 if (c >= 0x80)
4639 found = CATEGORY_MASK_CHARSET;
4640 }
4641 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4642 return 0;
4643
4644 no_more_source:
4645 detect_info->found |= found;
4646 return 1;
4647 }
4648
4649 static void
4650 decode_coding_charset (coding)
4651 struct coding_system *coding;
4652 {
4653 const unsigned char *src = coding->source + coding->consumed;
4654 const unsigned char *src_end = coding->source + coding->src_bytes;
4655 const unsigned char *src_base;
4656 int *charbuf = coding->charbuf;
4657 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4658 int consumed_chars = 0, consumed_chars_base;
4659 int multibytep = coding->src_multibyte;
4660 Lisp_Object attrs, charset_list, valids;
4661 int char_offset = coding->produced_char;
4662 int last_offset = char_offset;
4663 int last_id = charset_ascii;
4664
4665 CODING_GET_INFO (coding, attrs, charset_list);
4666 valids = AREF (attrs, coding_attr_charset_valids);
4667
4668 while (1)
4669 {
4670 int c;
4671 Lisp_Object val;
4672 struct charset *charset;
4673 int dim;
4674 int len = 1;
4675 unsigned code;
4676
4677 src_base = src;
4678 consumed_chars_base = consumed_chars;
4679
4680 if (charbuf >= charbuf_end)
4681 break;
4682
4683 ONE_MORE_BYTE (c);
4684 if (c < 0)
4685 goto invalid_code;
4686 code = c;
4687
4688 val = AREF (valids, c);
4689 if (NILP (val))
4690 goto invalid_code;
4691 if (INTEGERP (val))
4692 {
4693 charset = CHARSET_FROM_ID (XFASTINT (val));
4694 dim = CHARSET_DIMENSION (charset);
4695 while (len < dim)
4696 {
4697 ONE_MORE_BYTE (c);
4698 code = (code << 8) | c;
4699 len++;
4700 }
4701 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4702 charset, code, c);
4703 }
4704 else
4705 {
4706 /* VAL is a list of charset IDs. It is assured that the
4707 list is sorted by charset dimensions (smaller one
4708 comes first). */
4709 while (CONSP (val))
4710 {
4711 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4712 dim = CHARSET_DIMENSION (charset);
4713 while (len < dim)
4714 {
4715 ONE_MORE_BYTE (c);
4716 code = (code << 8) | c;
4717 len++;
4718 }
4719 CODING_DECODE_CHAR (coding, src, src_base,
4720 src_end, charset, code, c);
4721 if (c >= 0)
4722 break;
4723 val = XCDR (val);
4724 }
4725 }
4726 if (c < 0)
4727 goto invalid_code;
4728 if (charset->id != charset_ascii
4729 && last_id != charset->id)
4730 {
4731 if (last_id != charset_ascii)
4732 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4733 last_id = charset->id;
4734 last_offset = char_offset;
4735 }
4736
4737 *charbuf++ = c;
4738 char_offset++;
4739 continue;
4740
4741 invalid_code:
4742 src = src_base;
4743 consumed_chars = consumed_chars_base;
4744 ONE_MORE_BYTE (c);
4745 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4746 char_offset++;
4747 coding->errors++;
4748 }
4749
4750 no_more_source:
4751 if (last_id != charset_ascii)
4752 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4753 coding->consumed_char += consumed_chars_base;
4754 coding->consumed = src_base - coding->source;
4755 coding->charbuf_used = charbuf - coding->charbuf;
4756 }
4757
4758 static int
4759 encode_coding_charset (coding)
4760 struct coding_system *coding;
4761 {
4762 int multibytep = coding->dst_multibyte;
4763 int *charbuf = coding->charbuf;
4764 int *charbuf_end = charbuf + coding->charbuf_used;
4765 unsigned char *dst = coding->destination + coding->produced;
4766 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4767 int safe_room = MAX_MULTIBYTE_LENGTH;
4768 int produced_chars = 0;
4769 Lisp_Object attrs, charset_list;
4770 int ascii_compatible;
4771 int c;
4772
4773 CODING_GET_INFO (coding, attrs, charset_list);
4774 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4775
4776 while (charbuf < charbuf_end)
4777 {
4778 struct charset *charset;
4779 unsigned code;
4780
4781 ASSURE_DESTINATION (safe_room);
4782 c = *charbuf++;
4783 if (ascii_compatible && ASCII_CHAR_P (c))
4784 EMIT_ONE_ASCII_BYTE (c);
4785 else if (CHAR_BYTE8_P (c))
4786 {
4787 c = CHAR_TO_BYTE8 (c);
4788 EMIT_ONE_BYTE (c);
4789 }
4790 else
4791 {
4792 charset = char_charset (c, charset_list, &code);
4793 if (charset)
4794 {
4795 if (CHARSET_DIMENSION (charset) == 1)
4796 EMIT_ONE_BYTE (code);
4797 else if (CHARSET_DIMENSION (charset) == 2)
4798 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4799 else if (CHARSET_DIMENSION (charset) == 3)
4800 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4801 else
4802 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4803 (code >> 8) & 0xFF, code & 0xFF);
4804 }
4805 else
4806 {
4807 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4808 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4809 else
4810 c = coding->default_char;
4811 EMIT_ONE_BYTE (c);
4812 }
4813 }
4814 }
4815
4816 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4817 coding->produced_char += produced_chars;
4818 coding->produced = dst - coding->destination;
4819 return 0;
4820 }
4821
4822 \f
4823 /*** 7. C library functions ***/
4824
4825 /* Setup coding context CODING from information about CODING_SYSTEM.
4826 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4827 CODING_SYSTEM is invalid, signal an error. */
4828
4829 void
4830 setup_coding_system (coding_system, coding)
4831 Lisp_Object coding_system;
4832 struct coding_system *coding;
4833 {
4834 Lisp_Object attrs;
4835 Lisp_Object eol_type;
4836 Lisp_Object coding_type;
4837 Lisp_Object val;
4838
4839 if (NILP (coding_system))
4840 coding_system = Qno_conversion;
4841
4842 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4843
4844 attrs = CODING_ID_ATTRS (coding->id);
4845 eol_type = CODING_ID_EOL_TYPE (coding->id);
4846
4847 coding->mode = 0;
4848 coding->head_ascii = -1;
4849 coding->common_flags
4850 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4851 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4852 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4853 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4854 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4855 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4856 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4857
4858 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4859 coding->max_charset_id = SCHARS (val) - 1;
4860 coding->safe_charsets = (char *) SDATA (val);
4861 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4862
4863 coding_type = CODING_ATTR_TYPE (attrs);
4864 if (EQ (coding_type, Qundecided))
4865 {
4866 coding->detector = NULL;
4867 coding->decoder = decode_coding_raw_text;
4868 coding->encoder = encode_coding_raw_text;
4869 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4870 }
4871 else if (EQ (coding_type, Qiso_2022))
4872 {
4873 int i;
4874 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4875
4876 /* Invoke graphic register 0 to plane 0. */
4877 CODING_ISO_INVOCATION (coding, 0) = 0;
4878 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4879 CODING_ISO_INVOCATION (coding, 1)
4880 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4881 /* Setup the initial status of designation. */
4882 for (i = 0; i < 4; i++)
4883 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4884 /* Not single shifting initially. */
4885 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4886 /* Beginning of buffer should also be regarded as bol. */
4887 CODING_ISO_BOL (coding) = 1;
4888 coding->detector = detect_coding_iso_2022;
4889 coding->decoder = decode_coding_iso_2022;
4890 coding->encoder = encode_coding_iso_2022;
4891 if (flags & CODING_ISO_FLAG_SAFE)
4892 coding->mode |= CODING_MODE_SAFE_ENCODING;
4893 coding->common_flags
4894 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4895 | CODING_REQUIRE_FLUSHING_MASK);
4896 if (flags & CODING_ISO_FLAG_COMPOSITION)
4897 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4898 if (flags & CODING_ISO_FLAG_DESIGNATION)
4899 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4900 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4901 {
4902 setup_iso_safe_charsets (attrs);
4903 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4904 coding->max_charset_id = SCHARS (val) - 1;
4905 coding->safe_charsets = (char *) SDATA (val);
4906 }
4907 CODING_ISO_FLAGS (coding) = flags;
4908 }
4909 else if (EQ (coding_type, Qcharset))
4910 {
4911 coding->detector = detect_coding_charset;
4912 coding->decoder = decode_coding_charset;
4913 coding->encoder = encode_coding_charset;
4914 coding->common_flags
4915 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4916 }
4917 else if (EQ (coding_type, Qutf_8))
4918 {
4919 coding->detector = detect_coding_utf_8;
4920 coding->decoder = decode_coding_utf_8;
4921 coding->encoder = encode_coding_utf_8;
4922 coding->common_flags
4923 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4924 }
4925 else if (EQ (coding_type, Qutf_16))
4926 {
4927 val = AREF (attrs, coding_attr_utf_16_bom);
4928 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4929 : EQ (val, Qt) ? utf_16_with_bom
4930 : utf_16_without_bom);
4931 val = AREF (attrs, coding_attr_utf_16_endian);
4932 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4933 : utf_16_little_endian);
4934 CODING_UTF_16_SURROGATE (coding) = 0;
4935 coding->detector = detect_coding_utf_16;
4936 coding->decoder = decode_coding_utf_16;
4937 coding->encoder = encode_coding_utf_16;
4938 coding->common_flags
4939 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4940 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4941 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4942 }
4943 else if (EQ (coding_type, Qccl))
4944 {
4945 coding->detector = detect_coding_ccl;
4946 coding->decoder = decode_coding_ccl;
4947 coding->encoder = encode_coding_ccl;
4948 coding->common_flags
4949 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4950 | CODING_REQUIRE_FLUSHING_MASK);
4951 }
4952 else if (EQ (coding_type, Qemacs_mule))
4953 {
4954 coding->detector = detect_coding_emacs_mule;
4955 coding->decoder = decode_coding_emacs_mule;
4956 coding->encoder = encode_coding_emacs_mule;
4957 coding->common_flags
4958 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4959 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4960 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4961 {
4962 Lisp_Object tail, safe_charsets;
4963 int max_charset_id = 0;
4964
4965 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4966 tail = XCDR (tail))
4967 if (max_charset_id < XFASTINT (XCAR (tail)))
4968 max_charset_id = XFASTINT (XCAR (tail));
4969 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4970 make_number (255));
4971 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4972 tail = XCDR (tail))
4973 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4974 coding->max_charset_id = max_charset_id;
4975 coding->safe_charsets = (char *) SDATA (safe_charsets);
4976 }
4977 }
4978 else if (EQ (coding_type, Qshift_jis))
4979 {
4980 coding->detector = detect_coding_sjis;
4981 coding->decoder = decode_coding_sjis;
4982 coding->encoder = encode_coding_sjis;
4983 coding->common_flags
4984 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4985 }
4986 else if (EQ (coding_type, Qbig5))
4987 {
4988 coding->detector = detect_coding_big5;
4989 coding->decoder = decode_coding_big5;
4990 coding->encoder = encode_coding_big5;
4991 coding->common_flags
4992 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4993 }
4994 else /* EQ (coding_type, Qraw_text) */
4995 {
4996 coding->detector = NULL;
4997 coding->decoder = decode_coding_raw_text;
4998 coding->encoder = encode_coding_raw_text;
4999 }
5000
5001 return;
5002 }
5003
5004 /* Return raw-text or one of its subsidiaries that has the same
5005 eol_type as CODING-SYSTEM. */
5006
5007 Lisp_Object
5008 raw_text_coding_system (coding_system)
5009 Lisp_Object coding_system;
5010 {
5011 Lisp_Object spec, attrs;
5012 Lisp_Object eol_type, raw_text_eol_type;
5013
5014 if (NILP (coding_system))
5015 return Qraw_text;
5016 spec = CODING_SYSTEM_SPEC (coding_system);
5017 attrs = AREF (spec, 0);
5018
5019 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5020 return coding_system;
5021
5022 eol_type = AREF (spec, 2);
5023 if (VECTORP (eol_type))
5024 return Qraw_text;
5025 spec = CODING_SYSTEM_SPEC (Qraw_text);
5026 raw_text_eol_type = AREF (spec, 2);
5027 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5028 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5029 : AREF (raw_text_eol_type, 2));
5030 }
5031
5032
5033 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5034 does, return one of the subsidiary that has the same eol-spec as
5035 PARENT. Otherwise, return CODING_SYSTEM. */
5036
5037 Lisp_Object
5038 coding_inherit_eol_type (coding_system, parent)
5039 Lisp_Object coding_system, parent;
5040 {
5041 Lisp_Object spec, eol_type;
5042
5043 if (NILP (coding_system))
5044 coding_system = Qraw_text;
5045 spec = CODING_SYSTEM_SPEC (coding_system);
5046 eol_type = AREF (spec, 2);
5047 if (VECTORP (eol_type)
5048 && ! NILP (parent))
5049 {
5050 Lisp_Object parent_spec;
5051 Lisp_Object parent_eol_type;
5052
5053 parent_spec
5054 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5055 parent_eol_type = AREF (parent_spec, 2);
5056 if (EQ (parent_eol_type, Qunix))
5057 coding_system = AREF (eol_type, 0);
5058 else if (EQ (parent_eol_type, Qdos))
5059 coding_system = AREF (eol_type, 1);
5060 else if (EQ (parent_eol_type, Qmac))
5061 coding_system = AREF (eol_type, 2);
5062 }
5063 return coding_system;
5064 }
5065
5066 /* Emacs has a mechanism to automatically detect a coding system if it
5067 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5068 it's impossible to distinguish some coding systems accurately
5069 because they use the same range of codes. So, at first, coding
5070 systems are categorized into 7, those are:
5071
5072 o coding-category-emacs-mule
5073
5074 The category for a coding system which has the same code range
5075 as Emacs' internal format. Assigned the coding-system (Lisp
5076 symbol) `emacs-mule' by default.
5077
5078 o coding-category-sjis
5079
5080 The category for a coding system which has the same code range
5081 as SJIS. Assigned the coding-system (Lisp
5082 symbol) `japanese-shift-jis' by default.
5083
5084 o coding-category-iso-7
5085
5086 The category for a coding system which has the same code range
5087 as ISO2022 of 7-bit environment. This doesn't use any locking
5088 shift and single shift functions. This can encode/decode all
5089 charsets. Assigned the coding-system (Lisp symbol)
5090 `iso-2022-7bit' by default.
5091
5092 o coding-category-iso-7-tight
5093
5094 Same as coding-category-iso-7 except that this can
5095 encode/decode only the specified charsets.
5096
5097 o coding-category-iso-8-1
5098
5099 The category for a coding system which has the same code range
5100 as ISO2022 of 8-bit environment and graphic plane 1 used only
5101 for DIMENSION1 charset. This doesn't use any locking shift
5102 and single shift functions. Assigned the coding-system (Lisp
5103 symbol) `iso-latin-1' by default.
5104
5105 o coding-category-iso-8-2
5106
5107 The category for a coding system which has the same code range
5108 as ISO2022 of 8-bit environment and graphic plane 1 used only
5109 for DIMENSION2 charset. This doesn't use any locking shift
5110 and single shift functions. Assigned the coding-system (Lisp
5111 symbol) `japanese-iso-8bit' by default.
5112
5113 o coding-category-iso-7-else
5114
5115 The category for a coding system which has the same code range
5116 as ISO2022 of 7-bit environemnt but uses locking shift or
5117 single shift functions. Assigned the coding-system (Lisp
5118 symbol) `iso-2022-7bit-lock' by default.
5119
5120 o coding-category-iso-8-else
5121
5122 The category for a coding system which has the same code range
5123 as ISO2022 of 8-bit environemnt but uses locking shift or
5124 single shift functions. Assigned the coding-system (Lisp
5125 symbol) `iso-2022-8bit-ss2' by default.
5126
5127 o coding-category-big5
5128
5129 The category for a coding system which has the same code range
5130 as BIG5. Assigned the coding-system (Lisp symbol)
5131 `cn-big5' by default.
5132
5133 o coding-category-utf-8
5134
5135 The category for a coding system which has the same code range
5136 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5137 symbol) `utf-8' by default.
5138
5139 o coding-category-utf-16-be
5140
5141 The category for a coding system in which a text has an
5142 Unicode signature (cf. Unicode Standard) in the order of BIG
5143 endian at the head. Assigned the coding-system (Lisp symbol)
5144 `utf-16-be' by default.
5145
5146 o coding-category-utf-16-le
5147
5148 The category for a coding system in which a text has an
5149 Unicode signature (cf. Unicode Standard) in the order of
5150 LITTLE endian at the head. Assigned the coding-system (Lisp
5151 symbol) `utf-16-le' by default.
5152
5153 o coding-category-ccl
5154
5155 The category for a coding system of which encoder/decoder is
5156 written in CCL programs. The default value is nil, i.e., no
5157 coding system is assigned.
5158
5159 o coding-category-binary
5160
5161 The category for a coding system not categorized in any of the
5162 above. Assigned the coding-system (Lisp symbol)
5163 `no-conversion' by default.
5164
5165 Each of them is a Lisp symbol and the value is an actual
5166 `coding-system's (this is also a Lisp symbol) assigned by a user.
5167 What Emacs does actually is to detect a category of coding system.
5168 Then, it uses a `coding-system' assigned to it. If Emacs can't
5169 decide only one possible category, it selects a category of the
5170 highest priority. Priorities of categories are also specified by a
5171 user in a Lisp variable `coding-category-list'.
5172
5173 */
5174
5175 #define EOL_SEEN_NONE 0
5176 #define EOL_SEEN_LF 1
5177 #define EOL_SEEN_CR 2
5178 #define EOL_SEEN_CRLF 4
5179
5180 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5181 SOURCE is encoded. If CATEGORY is one of
5182 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5183 two-byte, else they are encoded by one-byte.
5184
5185 Return one of EOL_SEEN_XXX. */
5186
5187 #define MAX_EOL_CHECK_COUNT 3
5188
5189 static int
5190 detect_eol (source, src_bytes, category)
5191 unsigned char *source;
5192 EMACS_INT src_bytes;
5193 enum coding_category category;
5194 {
5195 unsigned char *src = source, *src_end = src + src_bytes;
5196 unsigned char c;
5197 int total = 0;
5198 int eol_seen = EOL_SEEN_NONE;
5199
5200 if ((1 << category) & CATEGORY_MASK_UTF_16)
5201 {
5202 int msb, lsb;
5203
5204 msb = category == (coding_category_utf_16_le
5205 | coding_category_utf_16_le_nosig);
5206 lsb = 1 - msb;
5207
5208 while (src + 1 < src_end)
5209 {
5210 c = src[lsb];
5211 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5212 {
5213 int this_eol;
5214
5215 if (c == '\n')
5216 this_eol = EOL_SEEN_LF;
5217 else if (src + 3 >= src_end
5218 || src[msb + 2] != 0
5219 || src[lsb + 2] != '\n')
5220 this_eol = EOL_SEEN_CR;
5221 else
5222 this_eol = EOL_SEEN_CRLF;
5223
5224 if (eol_seen == EOL_SEEN_NONE)
5225 /* This is the first end-of-line. */
5226 eol_seen = this_eol;
5227 else if (eol_seen != this_eol)
5228 {
5229 /* The found type is different from what found before. */
5230 eol_seen = EOL_SEEN_LF;
5231 break;
5232 }
5233 if (++total == MAX_EOL_CHECK_COUNT)
5234 break;
5235 }
5236 src += 2;
5237 }
5238 }
5239 else
5240 {
5241 while (src < src_end)
5242 {
5243 c = *src++;
5244 if (c == '\n' || c == '\r')
5245 {
5246 int this_eol;
5247
5248 if (c == '\n')
5249 this_eol = EOL_SEEN_LF;
5250 else if (src >= src_end || *src != '\n')
5251 this_eol = EOL_SEEN_CR;
5252 else
5253 this_eol = EOL_SEEN_CRLF, src++;
5254
5255 if (eol_seen == EOL_SEEN_NONE)
5256 /* This is the first end-of-line. */
5257 eol_seen = this_eol;
5258 else if (eol_seen != this_eol)
5259 {
5260 /* The found type is different from what found before. */
5261 eol_seen = EOL_SEEN_LF;
5262 break;
5263 }
5264 if (++total == MAX_EOL_CHECK_COUNT)
5265 break;
5266 }
5267 }
5268 }
5269 return eol_seen;
5270 }
5271
5272
5273 static Lisp_Object
5274 adjust_coding_eol_type (coding, eol_seen)
5275 struct coding_system *coding;
5276 int eol_seen;
5277 {
5278 Lisp_Object eol_type;
5279
5280 eol_type = CODING_ID_EOL_TYPE (coding->id);
5281 if (eol_seen & EOL_SEEN_LF)
5282 {
5283 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5284 eol_type = Qunix;
5285 }
5286 else if (eol_seen & EOL_SEEN_CRLF)
5287 {
5288 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5289 eol_type = Qdos;
5290 }
5291 else if (eol_seen & EOL_SEEN_CR)
5292 {
5293 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5294 eol_type = Qmac;
5295 }
5296 return eol_type;
5297 }
5298
5299 /* Detect how a text specified in CODING is encoded. If a coding
5300 system is detected, update fields of CODING by the detected coding
5301 system. */
5302
5303 void
5304 detect_coding (coding)
5305 struct coding_system *coding;
5306 {
5307 const unsigned char *src, *src_end;
5308 Lisp_Object attrs, coding_type;
5309
5310 coding->consumed = coding->consumed_char = 0;
5311 coding->produced = coding->produced_char = 0;
5312 coding_set_source (coding);
5313
5314 src_end = coding->source + coding->src_bytes;
5315
5316 /* If we have not yet decided the text encoding type, detect it
5317 now. */
5318 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5319 {
5320 int c, i;
5321
5322 for (i = 0, src = coding->source; src < src_end; i++, src++)
5323 {
5324 c = *src;
5325 if (c & 0x80 || (c < 0x20 && (c == 0
5326 || c == ISO_CODE_ESC
5327 || c == ISO_CODE_SI
5328 || c == ISO_CODE_SO)))
5329 break;
5330 }
5331 coding->head_ascii = src - (coding->source + coding->consumed);
5332
5333 if (coding->head_ascii < coding->src_bytes)
5334 {
5335 struct coding_detection_info detect_info;
5336 enum coding_category category;
5337 struct coding_system *this;
5338
5339 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5340 for (i = 0; i < coding_category_raw_text; i++)
5341 {
5342 category = coding_priorities[i];
5343 this = coding_categories + category;
5344 if (this->id < 0)
5345 {
5346 /* No coding system of this category is defined. */
5347 detect_info.rejected |= (1 << category);
5348 }
5349 else if (category >= coding_category_raw_text)
5350 continue;
5351 else if (detect_info.checked & (1 << category))
5352 {
5353 if (detect_info.found & (1 << category))
5354 break;
5355 }
5356 else if ((*(this->detector)) (coding, &detect_info)
5357 && detect_info.found & (1 << category))
5358 {
5359 if (category == coding_category_utf_16_auto)
5360 {
5361 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5362 category = coding_category_utf_16_le;
5363 else
5364 category = coding_category_utf_16_be;
5365 }
5366 break;
5367 }
5368 }
5369 if (i < coding_category_raw_text)
5370 setup_coding_system (CODING_ID_NAME (this->id), coding);
5371 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5372 setup_coding_system (Qraw_text, coding);
5373 else if (detect_info.rejected)
5374 for (i = 0; i < coding_category_raw_text; i++)
5375 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5376 {
5377 this = coding_categories + coding_priorities[i];
5378 setup_coding_system (CODING_ID_NAME (this->id), coding);
5379 break;
5380 }
5381 }
5382 }
5383 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5384 == coding_category_utf_16_auto)
5385 {
5386 Lisp_Object coding_systems;
5387 struct coding_detection_info detect_info;
5388
5389 coding_systems
5390 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5391 detect_info.found = detect_info.rejected = 0;
5392 if (CONSP (coding_systems)
5393 && detect_coding_utf_16 (coding, &detect_info))
5394 {
5395 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5396 setup_coding_system (XCAR (coding_systems), coding);
5397 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5398 setup_coding_system (XCDR (coding_systems), coding);
5399 }
5400 }
5401 }
5402
5403
5404 static void
5405 decode_eol (coding)
5406 struct coding_system *coding;
5407 {
5408 Lisp_Object eol_type;
5409 unsigned char *p, *pbeg, *pend;
5410
5411 eol_type = CODING_ID_EOL_TYPE (coding->id);
5412 if (EQ (eol_type, Qunix))
5413 return;
5414
5415 if (NILP (coding->dst_object))
5416 pbeg = coding->destination;
5417 else
5418 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5419 pend = pbeg + coding->produced;
5420
5421 if (VECTORP (eol_type))
5422 {
5423 int eol_seen = EOL_SEEN_NONE;
5424
5425 for (p = pbeg; p < pend; p++)
5426 {
5427 if (*p == '\n')
5428 eol_seen |= EOL_SEEN_LF;
5429 else if (*p == '\r')
5430 {
5431 if (p + 1 < pend && *(p + 1) == '\n')
5432 {
5433 eol_seen |= EOL_SEEN_CRLF;
5434 p++;
5435 }
5436 else
5437 eol_seen |= EOL_SEEN_CR;
5438 }
5439 }
5440 if (eol_seen != EOL_SEEN_NONE
5441 && eol_seen != EOL_SEEN_LF
5442 && eol_seen != EOL_SEEN_CRLF
5443 && eol_seen != EOL_SEEN_CR)
5444 eol_seen = EOL_SEEN_LF;
5445 if (eol_seen != EOL_SEEN_NONE)
5446 eol_type = adjust_coding_eol_type (coding, eol_seen);
5447 }
5448
5449 if (EQ (eol_type, Qmac))
5450 {
5451 for (p = pbeg; p < pend; p++)
5452 if (*p == '\r')
5453 *p = '\n';
5454 }
5455 else if (EQ (eol_type, Qdos))
5456 {
5457 int n = 0;
5458
5459 if (NILP (coding->dst_object))
5460 {
5461 for (p = pend - 2; p >= pbeg; p--)
5462 if (*p == '\r')
5463 {
5464 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5465 n++;
5466 }
5467 }
5468 else
5469 {
5470 for (p = pend - 2; p >= pbeg; p--)
5471 if (*p == '\r')
5472 {
5473 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5474 int pos = BYTE_TO_CHAR (pos_byte);
5475
5476 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5477 n++;
5478 }
5479 }
5480 coding->produced -= n;
5481 coding->produced_char -= n;
5482 }
5483 }
5484
5485 static void
5486 translate_chars (coding, table)
5487 struct coding_system *coding;
5488 Lisp_Object table;
5489 {
5490 int *charbuf = coding->charbuf;
5491 int *charbuf_end = charbuf + coding->charbuf_used;
5492 int c;
5493
5494 if (coding->chars_at_source)
5495 return;
5496
5497 while (charbuf < charbuf_end)
5498 {
5499 c = *charbuf;
5500 if (c < 0)
5501 charbuf += c;
5502 else
5503 *charbuf++ = translate_char (table, c);
5504 }
5505 }
5506
5507 static int
5508 produce_chars (coding)
5509 struct coding_system *coding;
5510 {
5511 unsigned char *dst = coding->destination + coding->produced;
5512 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5513 int produced;
5514 int produced_chars = 0;
5515
5516 if (! coding->chars_at_source)
5517 {
5518 /* Characters are in coding->charbuf. */
5519 int *buf = coding->charbuf;
5520 int *buf_end = buf + coding->charbuf_used;
5521 unsigned char *adjusted_dst_end;
5522
5523 if (BUFFERP (coding->src_object)
5524 && EQ (coding->src_object, coding->dst_object))
5525 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5526 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5527
5528 while (buf < buf_end)
5529 {
5530 int c = *buf++;
5531
5532 if (dst >= adjusted_dst_end)
5533 {
5534 dst = alloc_destination (coding,
5535 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5536 dst);
5537 dst_end = coding->destination + coding->dst_bytes;
5538 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5539 }
5540 if (c >= 0)
5541 {
5542 if (coding->dst_multibyte
5543 || ! CHAR_BYTE8_P (c))
5544 CHAR_STRING_ADVANCE (c, dst);
5545 else
5546 *dst++ = CHAR_TO_BYTE8 (c);
5547 produced_chars++;
5548 }
5549 else
5550 /* This is an annotation datum. (-C) is the length of
5551 it. */
5552 buf += -c - 1;
5553 }
5554 }
5555 else
5556 {
5557 const unsigned char *src = coding->source;
5558 const unsigned char *src_end = src + coding->src_bytes;
5559 Lisp_Object eol_type;
5560
5561 eol_type = CODING_ID_EOL_TYPE (coding->id);
5562
5563 if (coding->src_multibyte != coding->dst_multibyte)
5564 {
5565 if (coding->src_multibyte)
5566 {
5567 int multibytep = 1;
5568 int consumed_chars;
5569
5570 while (1)
5571 {
5572 const unsigned char *src_base = src;
5573 int c;
5574
5575 ONE_MORE_BYTE (c);
5576 if (c == '\r')
5577 {
5578 if (EQ (eol_type, Qdos))
5579 {
5580 if (src == src_end)
5581 {
5582 record_conversion_result
5583 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5584 goto no_more_source;
5585 }
5586 if (*src == '\n')
5587 c = *src++;
5588 }
5589 else if (EQ (eol_type, Qmac))
5590 c = '\n';
5591 }
5592 if (dst == dst_end)
5593 {
5594 coding->consumed = src - coding->source;
5595
5596 if (EQ (coding->src_object, coding->dst_object))
5597 dst_end = (unsigned char *) src;
5598 if (dst == dst_end)
5599 {
5600 dst = alloc_destination (coding, src_end - src + 1,
5601 dst);
5602 dst_end = coding->destination + coding->dst_bytes;
5603 coding_set_source (coding);
5604 src = coding->source + coding->consumed;
5605 src_end = coding->source + coding->src_bytes;
5606 }
5607 }
5608 *dst++ = c;
5609 produced_chars++;
5610 }
5611 no_more_source:
5612 ;
5613 }
5614 else
5615 while (src < src_end)
5616 {
5617 int multibytep = 1;
5618 int c = *src++;
5619
5620 if (c == '\r')
5621 {
5622 if (EQ (eol_type, Qdos))
5623 {
5624 if (src < src_end
5625 && *src == '\n')
5626 c = *src++;
5627 }
5628 else if (EQ (eol_type, Qmac))
5629 c = '\n';
5630 }
5631 if (dst >= dst_end - 1)
5632 {
5633 coding->consumed = src - coding->source;
5634
5635 if (EQ (coding->src_object, coding->dst_object))
5636 dst_end = (unsigned char *) src;
5637 if (dst >= dst_end - 1)
5638 {
5639 dst = alloc_destination (coding, src_end - src + 2,
5640 dst);
5641 dst_end = coding->destination + coding->dst_bytes;
5642 coding_set_source (coding);
5643 src = coding->source + coding->consumed;
5644 src_end = coding->source + coding->src_bytes;
5645 }
5646 }
5647 EMIT_ONE_BYTE (c);
5648 }
5649 }
5650 else
5651 {
5652 if (!EQ (coding->src_object, coding->dst_object))
5653 {
5654 int require = coding->src_bytes - coding->dst_bytes;
5655
5656 if (require > 0)
5657 {
5658 EMACS_INT offset = src - coding->source;
5659
5660 dst = alloc_destination (coding, require, dst);
5661 coding_set_source (coding);
5662 src = coding->source + offset;
5663 src_end = coding->source + coding->src_bytes;
5664 }
5665 }
5666 produced_chars = coding->src_chars;
5667 while (src < src_end)
5668 {
5669 int c = *src++;
5670
5671 if (c == '\r')
5672 {
5673 if (EQ (eol_type, Qdos))
5674 {
5675 if (src < src_end
5676 && *src == '\n')
5677 c = *src++;
5678 produced_chars--;
5679 }
5680 else if (EQ (eol_type, Qmac))
5681 c = '\n';
5682 }
5683 *dst++ = c;
5684 }
5685 }
5686 coding->consumed = coding->src_bytes;
5687 coding->consumed_char = coding->src_chars;
5688 }
5689
5690 produced = dst - (coding->destination + coding->produced);
5691 if (BUFFERP (coding->dst_object))
5692 insert_from_gap (produced_chars, produced);
5693 coding->produced += produced;
5694 coding->produced_char += produced_chars;
5695 return produced_chars;
5696 }
5697
5698 /* Compose text in CODING->object according to the annotation data at
5699 CHARBUF. CHARBUF is an array:
5700 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5701 */
5702
5703 static INLINE void
5704 produce_composition (coding, charbuf)
5705 struct coding_system *coding;
5706 int *charbuf;
5707 {
5708 int len;
5709 EMACS_INT from, to;
5710 enum composition_method method;
5711 Lisp_Object components;
5712
5713 len = -charbuf[0];
5714 from = coding->dst_pos + charbuf[2];
5715 to = coding->dst_pos + charbuf[3];
5716 method = (enum composition_method) (charbuf[4]);
5717
5718 if (method == COMPOSITION_RELATIVE)
5719 components = Qnil;
5720 else
5721 {
5722 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5723 int i;
5724
5725 len -= 5;
5726 charbuf += 5;
5727 for (i = 0; i < len; i++)
5728 args[i] = make_number (charbuf[i]);
5729 components = (method == COMPOSITION_WITH_ALTCHARS
5730 ? Fstring (len, args) : Fvector (len, args));
5731 }
5732 compose_text (from, to, components, Qnil, coding->dst_object);
5733 }
5734
5735
5736 /* Put `charset' property on text in CODING->object according to
5737 the annotation data at CHARBUF. CHARBUF is an array:
5738 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5739 */
5740
5741 static INLINE void
5742 produce_charset (coding, charbuf)
5743 struct coding_system *coding;
5744 int *charbuf;
5745 {
5746 EMACS_INT from = coding->dst_pos + charbuf[2];
5747 EMACS_INT to = coding->dst_pos + charbuf[3];
5748 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5749
5750 Fput_text_property (make_number (from), make_number (to),
5751 Qcharset, CHARSET_NAME (charset),
5752 coding->dst_object);
5753 }
5754
5755
5756 #define CHARBUF_SIZE 0x4000
5757
5758 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5759 do { \
5760 int size = CHARBUF_SIZE;; \
5761 \
5762 coding->charbuf = NULL; \
5763 while (size > 1024) \
5764 { \
5765 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5766 if (coding->charbuf) \
5767 break; \
5768 size >>= 1; \
5769 } \
5770 if (! coding->charbuf) \
5771 { \
5772 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5773 return coding->result; \
5774 } \
5775 coding->charbuf_size = size; \
5776 } while (0)
5777
5778
5779 static void
5780 produce_annotation (coding)
5781 struct coding_system *coding;
5782 {
5783 int *charbuf = coding->charbuf;
5784 int *charbuf_end = charbuf + coding->charbuf_used;
5785
5786 if (NILP (coding->dst_object))
5787 return;
5788
5789 while (charbuf < charbuf_end)
5790 {
5791 if (*charbuf >= 0)
5792 charbuf++;
5793 else
5794 {
5795 int len = -*charbuf;
5796 switch (charbuf[1])
5797 {
5798 case CODING_ANNOTATE_COMPOSITION_MASK:
5799 produce_composition (coding, charbuf);
5800 break;
5801 case CODING_ANNOTATE_CHARSET_MASK:
5802 produce_charset (coding, charbuf);
5803 break;
5804 default:
5805 abort ();
5806 }
5807 charbuf += len;
5808 }
5809 }
5810 }
5811
5812 /* Decode the data at CODING->src_object into CODING->dst_object.
5813 CODING->src_object is a buffer, a string, or nil.
5814 CODING->dst_object is a buffer.
5815
5816 If CODING->src_object is a buffer, it must be the current buffer.
5817 In this case, if CODING->src_pos is positive, it is a position of
5818 the source text in the buffer, otherwise, the source text is in the
5819 gap area of the buffer, and CODING->src_pos specifies the offset of
5820 the text from GPT (which must be the same as PT). If this is the
5821 same buffer as CODING->dst_object, CODING->src_pos must be
5822 negative.
5823
5824 If CODING->src_object is a string, CODING->src_pos in an index to
5825 that string.
5826
5827 If CODING->src_object is nil, CODING->source must already point to
5828 the non-relocatable memory area. In this case, CODING->src_pos is
5829 an offset from CODING->source.
5830
5831 The decoded data is inserted at the current point of the buffer
5832 CODING->dst_object.
5833 */
5834
5835 static int
5836 decode_coding (coding)
5837 struct coding_system *coding;
5838 {
5839 Lisp_Object attrs;
5840 Lisp_Object undo_list;
5841
5842 if (BUFFERP (coding->src_object)
5843 && coding->src_pos > 0
5844 && coding->src_pos < GPT
5845 && coding->src_pos + coding->src_chars > GPT)
5846 move_gap_both (coding->src_pos, coding->src_pos_byte);
5847
5848 undo_list = Qt;
5849 if (BUFFERP (coding->dst_object))
5850 {
5851 if (current_buffer != XBUFFER (coding->dst_object))
5852 set_buffer_internal (XBUFFER (coding->dst_object));
5853 if (GPT != PT)
5854 move_gap_both (PT, PT_BYTE);
5855 undo_list = current_buffer->undo_list;
5856 current_buffer->undo_list = Qt;
5857 }
5858
5859 coding->consumed = coding->consumed_char = 0;
5860 coding->produced = coding->produced_char = 0;
5861 coding->chars_at_source = 0;
5862 record_conversion_result (coding, CODING_RESULT_SUCCESS);
5863 coding->errors = 0;
5864
5865 ALLOC_CONVERSION_WORK_AREA (coding);
5866
5867 attrs = CODING_ID_ATTRS (coding->id);
5868
5869 do
5870 {
5871 coding_set_source (coding);
5872 coding->annotated = 0;
5873 (*(coding->decoder)) (coding);
5874 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5875 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5876 else if (!NILP (Vstandard_translation_table_for_decode))
5877 translate_chars (coding, Vstandard_translation_table_for_decode);
5878 coding_set_destination (coding);
5879 produce_chars (coding);
5880 if (coding->annotated)
5881 produce_annotation (coding);
5882 }
5883 while (coding->consumed < coding->src_bytes
5884 && ! coding->result);
5885
5886 coding->carryover_bytes = 0;
5887 if (coding->consumed < coding->src_bytes)
5888 {
5889 int nbytes = coding->src_bytes - coding->consumed;
5890 const unsigned char *src;
5891
5892 coding_set_source (coding);
5893 coding_set_destination (coding);
5894 src = coding->source + coding->consumed;
5895
5896 if (coding->mode & CODING_MODE_LAST_BLOCK)
5897 {
5898 /* Flush out unprocessed data as binary chars. We are sure
5899 that the number of data is less than the size of
5900 coding->charbuf. */
5901 coding->charbuf_used = 0;
5902 while (nbytes-- > 0)
5903 {
5904 int c = *src++;
5905
5906 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5907 }
5908 produce_chars (coding);
5909 }
5910 else
5911 {
5912 /* Record unprocessed bytes in coding->carryover. We are
5913 sure that the number of data is less than the size of
5914 coding->carryover. */
5915 unsigned char *p = coding->carryover;
5916
5917 coding->carryover_bytes = nbytes;
5918 while (nbytes-- > 0)
5919 *p++ = *src++;
5920 }
5921 coding->consumed = coding->src_bytes;
5922 }
5923
5924 if (BUFFERP (coding->dst_object))
5925 {
5926 current_buffer->undo_list = undo_list;
5927 record_insert (coding->dst_pos, coding->produced_char);
5928 }
5929 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5930 decode_eol (coding);
5931 return coding->result;
5932 }
5933
5934
5935 /* Extract an annotation datum from a composition starting at POS and
5936 ending before LIMIT of CODING->src_object (buffer or string), store
5937 the data in BUF, set *STOP to a starting position of the next
5938 composition (if any) or to LIMIT, and return the address of the
5939 next element of BUF.
5940
5941 If such an annotation is not found, set *STOP to a starting
5942 position of a composition after POS (if any) or to LIMIT, and
5943 return BUF. */
5944
5945 static INLINE int *
5946 handle_composition_annotation (pos, limit, coding, buf, stop)
5947 EMACS_INT pos, limit;
5948 struct coding_system *coding;
5949 int *buf;
5950 EMACS_INT *stop;
5951 {
5952 EMACS_INT start, end;
5953 Lisp_Object prop;
5954
5955 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5956 || end > limit)
5957 *stop = limit;
5958 else if (start > pos)
5959 *stop = start;
5960 else
5961 {
5962 if (start == pos)
5963 {
5964 /* We found a composition. Store the corresponding
5965 annotation data in BUF. */
5966 int *head = buf;
5967 enum composition_method method = COMPOSITION_METHOD (prop);
5968 int nchars = COMPOSITION_LENGTH (prop);
5969
5970 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5971 if (method != COMPOSITION_RELATIVE)
5972 {
5973 Lisp_Object components;
5974 int len, i, i_byte;
5975
5976 components = COMPOSITION_COMPONENTS (prop);
5977 if (VECTORP (components))
5978 {
5979 len = XVECTOR (components)->size;
5980 for (i = 0; i < len; i++)
5981 *buf++ = XINT (AREF (components, i));
5982 }
5983 else if (STRINGP (components))
5984 {
5985 len = SCHARS (components);
5986 i = i_byte = 0;
5987 while (i < len)
5988 {
5989 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5990 buf++;
5991 }
5992 }
5993 else if (INTEGERP (components))
5994 {
5995 len = 1;
5996 *buf++ = XINT (components);
5997 }
5998 else if (CONSP (components))
5999 {
6000 for (len = 0; CONSP (components);
6001 len++, components = XCDR (components))
6002 *buf++ = XINT (XCAR (components));
6003 }
6004 else
6005 abort ();
6006 *head -= len;
6007 }
6008 }
6009
6010 if (find_composition (end, limit, &start, &end, &prop,
6011 coding->src_object)
6012 && end <= limit)
6013 *stop = start;
6014 else
6015 *stop = limit;
6016 }
6017 return buf;
6018 }
6019
6020
6021 /* Extract an annotation datum from a text property `charset' at POS of
6022 CODING->src_object (buffer of string), store the data in BUF, set
6023 *STOP to the position where the value of `charset' property changes
6024 (limiting by LIMIT), and return the address of the next element of
6025 BUF.
6026
6027 If the property value is nil, set *STOP to the position where the
6028 property value is non-nil (limiting by LIMIT), and return BUF. */
6029
6030 static INLINE int *
6031 handle_charset_annotation (pos, limit, coding, buf, stop)
6032 EMACS_INT pos, limit;
6033 struct coding_system *coding;
6034 int *buf;
6035 EMACS_INT *stop;
6036 {
6037 Lisp_Object val, next;
6038 int id;
6039
6040 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6041 if (! NILP (val) && CHARSETP (val))
6042 id = XINT (CHARSET_SYMBOL_ID (val));
6043 else
6044 id = -1;
6045 ADD_CHARSET_DATA (buf, 0, 0, id);
6046 next = Fnext_single_property_change (make_number (pos), Qcharset,
6047 coding->src_object,
6048 make_number (limit));
6049 *stop = XINT (next);
6050 return buf;
6051 }
6052
6053
6054 static void
6055 consume_chars (coding)
6056 struct coding_system *coding;
6057 {
6058 int *buf = coding->charbuf;
6059 int *buf_end = coding->charbuf + coding->charbuf_size;
6060 const unsigned char *src = coding->source + coding->consumed;
6061 const unsigned char *src_end = coding->source + coding->src_bytes;
6062 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6063 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6064 int multibytep = coding->src_multibyte;
6065 Lisp_Object eol_type;
6066 int c;
6067 EMACS_INT stop, stop_composition, stop_charset;
6068
6069 eol_type = CODING_ID_EOL_TYPE (coding->id);
6070 if (VECTORP (eol_type))
6071 eol_type = Qunix;
6072
6073 /* Note: composition handling is not yet implemented. */
6074 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6075
6076 if (NILP (coding->src_object))
6077 stop = stop_composition = stop_charset = end_pos;
6078 else
6079 {
6080 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6081 stop = stop_composition = pos;
6082 else
6083 stop = stop_composition = end_pos;
6084 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6085 stop = stop_charset = pos;
6086 else
6087 stop_charset = end_pos;
6088 }
6089
6090 /* Compensate for CRLF and conversion. */
6091 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6092 while (buf < buf_end)
6093 {
6094 if (pos == stop)
6095 {
6096 if (pos == end_pos)
6097 break;
6098 if (pos == stop_composition)
6099 buf = handle_composition_annotation (pos, end_pos, coding,
6100 buf, &stop_composition);
6101 if (pos == stop_charset)
6102 buf = handle_charset_annotation (pos, end_pos, coding,
6103 buf, &stop_charset);
6104 stop = (stop_composition < stop_charset
6105 ? stop_composition : stop_charset);
6106 }
6107
6108 if (! multibytep)
6109 {
6110 EMACS_INT bytes;
6111
6112 if (! CODING_FOR_UNIBYTE (coding)
6113 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6114 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6115 else
6116 c = *src++, pos++;
6117 }
6118 else
6119 c = STRING_CHAR_ADVANCE (src), pos++;
6120 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6121 c = '\n';
6122 if (! EQ (eol_type, Qunix))
6123 {
6124 if (c == '\n')
6125 {
6126 if (EQ (eol_type, Qdos))
6127 *buf++ = '\r';
6128 else
6129 c = '\r';
6130 }
6131 }
6132 *buf++ = c;
6133 }
6134
6135 coding->consumed = src - coding->source;
6136 coding->consumed_char = pos - coding->src_pos;
6137 coding->charbuf_used = buf - coding->charbuf;
6138 coding->chars_at_source = 0;
6139 }
6140
6141
6142 /* Encode the text at CODING->src_object into CODING->dst_object.
6143 CODING->src_object is a buffer or a string.
6144 CODING->dst_object is a buffer or nil.
6145
6146 If CODING->src_object is a buffer, it must be the current buffer.
6147 In this case, if CODING->src_pos is positive, it is a position of
6148 the source text in the buffer, otherwise. the source text is in the
6149 gap area of the buffer, and coding->src_pos specifies the offset of
6150 the text from GPT (which must be the same as PT). If this is the
6151 same buffer as CODING->dst_object, CODING->src_pos must be
6152 negative and CODING should not have `pre-write-conversion'.
6153
6154 If CODING->src_object is a string, CODING should not have
6155 `pre-write-conversion'.
6156
6157 If CODING->dst_object is a buffer, the encoded data is inserted at
6158 the current point of that buffer.
6159
6160 If CODING->dst_object is nil, the encoded data is placed at the
6161 memory area specified by CODING->destination. */
6162
6163 static int
6164 encode_coding (coding)
6165 struct coding_system *coding;
6166 {
6167 Lisp_Object attrs;
6168
6169 attrs = CODING_ID_ATTRS (coding->id);
6170
6171 if (BUFFERP (coding->dst_object))
6172 {
6173 set_buffer_internal (XBUFFER (coding->dst_object));
6174 coding->dst_multibyte
6175 = ! NILP (current_buffer->enable_multibyte_characters);
6176 }
6177
6178 coding->consumed = coding->consumed_char = 0;
6179 coding->produced = coding->produced_char = 0;
6180 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6181 coding->errors = 0;
6182
6183 ALLOC_CONVERSION_WORK_AREA (coding);
6184
6185 do {
6186 coding_set_source (coding);
6187 consume_chars (coding);
6188
6189 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6190 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6191 else if (!NILP (Vstandard_translation_table_for_encode))
6192 translate_chars (coding, Vstandard_translation_table_for_encode);
6193
6194 coding_set_destination (coding);
6195 (*(coding->encoder)) (coding);
6196 } while (coding->consumed_char < coding->src_chars);
6197
6198 if (BUFFERP (coding->dst_object))
6199 insert_from_gap (coding->produced_char, coding->produced);
6200
6201 return (coding->result);
6202 }
6203
6204
6205 /* Name (or base name) of work buffer for code conversion. */
6206 static Lisp_Object Vcode_conversion_workbuf_name;
6207
6208 /* A working buffer used by the top level conversion. Once it is
6209 created, it is never destroyed. It has the name
6210 Vcode_conversion_workbuf_name. The other working buffers are
6211 destroyed after the use is finished, and their names are modified
6212 versions of Vcode_conversion_workbuf_name. */
6213 static Lisp_Object Vcode_conversion_reused_workbuf;
6214
6215 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6216 static int reused_workbuf_in_use;
6217
6218
6219 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6220 multibyteness of returning buffer. */
6221
6222 Lisp_Object
6223 make_conversion_work_buffer (multibyte)
6224 {
6225 Lisp_Object name, workbuf;
6226 struct buffer *current;
6227
6228 if (reused_workbuf_in_use++)
6229 {
6230 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6231 workbuf = Fget_buffer_create (name);
6232 }
6233 else
6234 {
6235 name = Vcode_conversion_workbuf_name;
6236 workbuf = Fget_buffer_create (name);
6237 if (NILP (Vcode_conversion_reused_workbuf))
6238 Vcode_conversion_reused_workbuf = workbuf;
6239 }
6240 current = current_buffer;
6241 set_buffer_internal (XBUFFER (workbuf));
6242 Ferase_buffer ();
6243 current_buffer->undo_list = Qt;
6244 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6245 set_buffer_internal (current);
6246 return workbuf;
6247 }
6248
6249
6250 static Lisp_Object
6251 code_conversion_restore (arg)
6252 Lisp_Object arg;
6253 {
6254 Lisp_Object current, workbuf;
6255
6256 current = XCAR (arg);
6257 workbuf = XCDR (arg);
6258 if (! NILP (workbuf))
6259 {
6260 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6261 reused_workbuf_in_use = 0;
6262 else if (! NILP (Fbuffer_live_p (workbuf)))
6263 Fkill_buffer (workbuf);
6264 }
6265 set_buffer_internal (XBUFFER (current));
6266 return Qnil;
6267 }
6268
6269 Lisp_Object
6270 code_conversion_save (with_work_buf, multibyte)
6271 int with_work_buf, multibyte;
6272 {
6273 Lisp_Object workbuf = Qnil;
6274
6275 if (with_work_buf)
6276 workbuf = make_conversion_work_buffer (multibyte);
6277 record_unwind_protect (code_conversion_restore,
6278 Fcons (Fcurrent_buffer (), workbuf));
6279 return workbuf;
6280 }
6281
6282 int
6283 decode_coding_gap (coding, chars, bytes)
6284 struct coding_system *coding;
6285 EMACS_INT chars, bytes;
6286 {
6287 int count = specpdl_ptr - specpdl;
6288 Lisp_Object attrs;
6289
6290 code_conversion_save (0, 0);
6291
6292 coding->src_object = Fcurrent_buffer ();
6293 coding->src_chars = chars;
6294 coding->src_bytes = bytes;
6295 coding->src_pos = -chars;
6296 coding->src_pos_byte = -bytes;
6297 coding->src_multibyte = chars < bytes;
6298 coding->dst_object = coding->src_object;
6299 coding->dst_pos = PT;
6300 coding->dst_pos_byte = PT_BYTE;
6301 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6302 coding->mode |= CODING_MODE_LAST_BLOCK;
6303
6304 if (CODING_REQUIRE_DETECTION (coding))
6305 detect_coding (coding);
6306
6307 decode_coding (coding);
6308
6309 attrs = CODING_ID_ATTRS (coding->id);
6310 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6311 {
6312 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6313 Lisp_Object val;
6314
6315 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6316 val = call1 (CODING_ATTR_POST_READ (attrs),
6317 make_number (coding->produced_char));
6318 CHECK_NATNUM (val);
6319 coding->produced_char += Z - prev_Z;
6320 coding->produced += Z_BYTE - prev_Z_BYTE;
6321 }
6322
6323 unbind_to (count, Qnil);
6324 return coding->result;
6325 }
6326
6327 int
6328 encode_coding_gap (coding, chars, bytes)
6329 struct coding_system *coding;
6330 EMACS_INT chars, bytes;
6331 {
6332 int count = specpdl_ptr - specpdl;
6333
6334 code_conversion_save (0, 0);
6335
6336 coding->src_object = Fcurrent_buffer ();
6337 coding->src_chars = chars;
6338 coding->src_bytes = bytes;
6339 coding->src_pos = -chars;
6340 coding->src_pos_byte = -bytes;
6341 coding->src_multibyte = chars < bytes;
6342 coding->dst_object = coding->src_object;
6343 coding->dst_pos = PT;
6344 coding->dst_pos_byte = PT_BYTE;
6345
6346 encode_coding (coding);
6347
6348 unbind_to (count, Qnil);
6349 return coding->result;
6350 }
6351
6352
6353 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6354 SRC_OBJECT into DST_OBJECT by coding context CODING.
6355
6356 SRC_OBJECT is a buffer, a string, or Qnil.
6357
6358 If it is a buffer, the text is at point of the buffer. FROM and TO
6359 are positions in the buffer.
6360
6361 If it is a string, the text is at the beginning of the string.
6362 FROM and TO are indices to the string.
6363
6364 If it is nil, the text is at coding->source. FROM and TO are
6365 indices to coding->source.
6366
6367 DST_OBJECT is a buffer, Qt, or Qnil.
6368
6369 If it is a buffer, the decoded text is inserted at point of the
6370 buffer. If the buffer is the same as SRC_OBJECT, the source text
6371 is deleted.
6372
6373 If it is Qt, a string is made from the decoded text, and
6374 set in CODING->dst_object.
6375
6376 If it is Qnil, the decoded text is stored at CODING->destination.
6377 The caller must allocate CODING->dst_bytes bytes at
6378 CODING->destination by xmalloc. If the decoded text is longer than
6379 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6380 */
6381
6382 void
6383 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6384 dst_object)
6385 struct coding_system *coding;
6386 Lisp_Object src_object;
6387 EMACS_INT from, from_byte, to, to_byte;
6388 Lisp_Object dst_object;
6389 {
6390 int count = specpdl_ptr - specpdl;
6391 unsigned char *destination;
6392 EMACS_INT dst_bytes;
6393 EMACS_INT chars = to - from;
6394 EMACS_INT bytes = to_byte - from_byte;
6395 Lisp_Object attrs;
6396 Lisp_Object buffer;
6397 int saved_pt = -1, saved_pt_byte;
6398
6399 buffer = Fcurrent_buffer ();
6400
6401 if (NILP (dst_object))
6402 {
6403 destination = coding->destination;
6404 dst_bytes = coding->dst_bytes;
6405 }
6406
6407 coding->src_object = src_object;
6408 coding->src_chars = chars;
6409 coding->src_bytes = bytes;
6410 coding->src_multibyte = chars < bytes;
6411
6412 if (STRINGP (src_object))
6413 {
6414 coding->src_pos = from;
6415 coding->src_pos_byte = from_byte;
6416 }
6417 else if (BUFFERP (src_object))
6418 {
6419 set_buffer_internal (XBUFFER (src_object));
6420 if (from != GPT)
6421 move_gap_both (from, from_byte);
6422 if (EQ (src_object, dst_object))
6423 {
6424 saved_pt = PT, saved_pt_byte = PT_BYTE;
6425 TEMP_SET_PT_BOTH (from, from_byte);
6426 del_range_both (from, from_byte, to, to_byte, 1);
6427 coding->src_pos = -chars;
6428 coding->src_pos_byte = -bytes;
6429 }
6430 else
6431 {
6432 coding->src_pos = from;
6433 coding->src_pos_byte = from_byte;
6434 }
6435 }
6436
6437 if (CODING_REQUIRE_DETECTION (coding))
6438 detect_coding (coding);
6439 attrs = CODING_ID_ATTRS (coding->id);
6440
6441 if (EQ (dst_object, Qt)
6442 || (! NILP (CODING_ATTR_POST_READ (attrs))
6443 && NILP (dst_object)))
6444 {
6445 coding->dst_object = code_conversion_save (1, 1);
6446 coding->dst_pos = BEG;
6447 coding->dst_pos_byte = BEG_BYTE;
6448 coding->dst_multibyte = 1;
6449 }
6450 else if (BUFFERP (dst_object))
6451 {
6452 code_conversion_save (0, 0);
6453 coding->dst_object = dst_object;
6454 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6455 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6456 coding->dst_multibyte
6457 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6458 }
6459 else
6460 {
6461 code_conversion_save (0, 0);
6462 coding->dst_object = Qnil;
6463 coding->dst_multibyte = 1;
6464 }
6465
6466 decode_coding (coding);
6467
6468 if (BUFFERP (coding->dst_object))
6469 set_buffer_internal (XBUFFER (coding->dst_object));
6470
6471 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6472 {
6473 struct gcpro gcpro1, gcpro2;
6474 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6475 Lisp_Object val;
6476
6477 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6478 GCPRO2 (coding->src_object, coding->dst_object);
6479 val = call1 (CODING_ATTR_POST_READ (attrs),
6480 make_number (coding->produced_char));
6481 UNGCPRO;
6482 CHECK_NATNUM (val);
6483 coding->produced_char += Z - prev_Z;
6484 coding->produced += Z_BYTE - prev_Z_BYTE;
6485 }
6486
6487 if (EQ (dst_object, Qt))
6488 {
6489 coding->dst_object = Fbuffer_string ();
6490 }
6491 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6492 {
6493 set_buffer_internal (XBUFFER (coding->dst_object));
6494 if (dst_bytes < coding->produced)
6495 {
6496 destination
6497 = (unsigned char *) xrealloc (destination, coding->produced);
6498 if (! destination)
6499 {
6500 record_conversion_result (coding,
6501 CODING_RESULT_INSUFFICIENT_DST);
6502 unbind_to (count, Qnil);
6503 return;
6504 }
6505 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6506 move_gap_both (BEGV, BEGV_BYTE);
6507 bcopy (BEGV_ADDR, destination, coding->produced);
6508 coding->destination = destination;
6509 }
6510 }
6511
6512 if (saved_pt >= 0)
6513 {
6514 /* This is the case of:
6515 (BUFFERP (src_object) && EQ (src_object, dst_object))
6516 As we have moved PT while replacing the original buffer
6517 contents, we must recover it now. */
6518 set_buffer_internal (XBUFFER (src_object));
6519 if (saved_pt < from)
6520 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6521 else if (saved_pt < from + chars)
6522 TEMP_SET_PT_BOTH (from, from_byte);
6523 else if (! NILP (current_buffer->enable_multibyte_characters))
6524 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6525 saved_pt_byte + (coding->produced - bytes));
6526 else
6527 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6528 saved_pt_byte + (coding->produced - bytes));
6529 }
6530
6531 unbind_to (count, coding->dst_object);
6532 }
6533
6534
6535 void
6536 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6537 dst_object)
6538 struct coding_system *coding;
6539 Lisp_Object src_object;
6540 EMACS_INT from, from_byte, to, to_byte;
6541 Lisp_Object dst_object;
6542 {
6543 int count = specpdl_ptr - specpdl;
6544 EMACS_INT chars = to - from;
6545 EMACS_INT bytes = to_byte - from_byte;
6546 Lisp_Object attrs;
6547 Lisp_Object buffer;
6548 int saved_pt = -1, saved_pt_byte;
6549
6550 buffer = Fcurrent_buffer ();
6551
6552 coding->src_object = src_object;
6553 coding->src_chars = chars;
6554 coding->src_bytes = bytes;
6555 coding->src_multibyte = chars < bytes;
6556
6557 attrs = CODING_ID_ATTRS (coding->id);
6558
6559 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6560 {
6561 coding->src_object = code_conversion_save (1, coding->src_multibyte);
6562 set_buffer_internal (XBUFFER (coding->src_object));
6563 if (STRINGP (src_object))
6564 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6565 else if (BUFFERP (src_object))
6566 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6567 else
6568 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6569
6570 if (EQ (src_object, dst_object))
6571 {
6572 set_buffer_internal (XBUFFER (src_object));
6573 saved_pt = PT, saved_pt_byte = PT_BYTE;
6574 del_range_both (from, from_byte, to, to_byte, 1);
6575 set_buffer_internal (XBUFFER (coding->src_object));
6576 }
6577
6578 call2 (CODING_ATTR_PRE_WRITE (attrs),
6579 make_number (BEG), make_number (Z));
6580 coding->src_object = Fcurrent_buffer ();
6581 if (BEG != GPT)
6582 move_gap_both (BEG, BEG_BYTE);
6583 coding->src_chars = Z - BEG;
6584 coding->src_bytes = Z_BYTE - BEG_BYTE;
6585 coding->src_pos = BEG;
6586 coding->src_pos_byte = BEG_BYTE;
6587 coding->src_multibyte = Z < Z_BYTE;
6588 }
6589 else if (STRINGP (src_object))
6590 {
6591 code_conversion_save (0, 0);
6592 coding->src_pos = from;
6593 coding->src_pos_byte = from_byte;
6594 }
6595 else if (BUFFERP (src_object))
6596 {
6597 code_conversion_save (0, 0);
6598 set_buffer_internal (XBUFFER (src_object));
6599 if (EQ (src_object, dst_object))
6600 {
6601 saved_pt = PT, saved_pt_byte = PT_BYTE;
6602 coding->src_object = del_range_1 (from, to, 1, 1);
6603 coding->src_pos = 0;
6604 coding->src_pos_byte = 0;
6605 }
6606 else
6607 {
6608 if (from < GPT && to >= GPT)
6609 move_gap_both (from, from_byte);
6610 coding->src_pos = from;
6611 coding->src_pos_byte = from_byte;
6612 }
6613 }
6614 else
6615 code_conversion_save (0, 0);
6616
6617 if (BUFFERP (dst_object))
6618 {
6619 coding->dst_object = dst_object;
6620 if (EQ (src_object, dst_object))
6621 {
6622 coding->dst_pos = from;
6623 coding->dst_pos_byte = from_byte;
6624 }
6625 else
6626 {
6627 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6628 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6629 }
6630 coding->dst_multibyte
6631 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6632 }
6633 else if (EQ (dst_object, Qt))
6634 {
6635 coding->dst_object = Qnil;
6636 coding->dst_bytes = coding->src_chars;
6637 if (coding->dst_bytes == 0)
6638 coding->dst_bytes = 1;
6639 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6640 coding->dst_multibyte = 0;
6641 }
6642 else
6643 {
6644 coding->dst_object = Qnil;
6645 coding->dst_multibyte = 0;
6646 }
6647
6648 encode_coding (coding);
6649
6650 if (EQ (dst_object, Qt))
6651 {
6652 if (BUFFERP (coding->dst_object))
6653 coding->dst_object = Fbuffer_string ();
6654 else
6655 {
6656 coding->dst_object
6657 = make_unibyte_string ((char *) coding->destination,
6658 coding->produced);
6659 xfree (coding->destination);
6660 }
6661 }
6662
6663 if (saved_pt >= 0)
6664 {
6665 /* This is the case of:
6666 (BUFFERP (src_object) && EQ (src_object, dst_object))
6667 As we have moved PT while replacing the original buffer
6668 contents, we must recover it now. */
6669 set_buffer_internal (XBUFFER (src_object));
6670 if (saved_pt < from)
6671 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6672 else if (saved_pt < from + chars)
6673 TEMP_SET_PT_BOTH (from, from_byte);
6674 else if (! NILP (current_buffer->enable_multibyte_characters))
6675 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6676 saved_pt_byte + (coding->produced - bytes));
6677 else
6678 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6679 saved_pt_byte + (coding->produced - bytes));
6680 }
6681
6682 unbind_to (count, Qnil);
6683 }
6684
6685
6686 Lisp_Object
6687 preferred_coding_system ()
6688 {
6689 int id = coding_categories[coding_priorities[0]].id;
6690
6691 return CODING_ID_NAME (id);
6692 }
6693
6694 \f
6695 #ifdef emacs
6696 /*** 8. Emacs Lisp library functions ***/
6697
6698 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6699 doc: /* Return t if OBJECT is nil or a coding-system.
6700 See the documentation of `define-coding-system' for information
6701 about coding-system objects. */)
6702 (obj)
6703 Lisp_Object obj;
6704 {
6705 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6706 }
6707
6708 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6709 Sread_non_nil_coding_system, 1, 1, 0,
6710 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6711 (prompt)
6712 Lisp_Object prompt;
6713 {
6714 Lisp_Object val;
6715 do
6716 {
6717 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6718 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6719 }
6720 while (SCHARS (val) == 0);
6721 return (Fintern (val, Qnil));
6722 }
6723
6724 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6725 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6726 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6727 (prompt, default_coding_system)
6728 Lisp_Object prompt, default_coding_system;
6729 {
6730 Lisp_Object val;
6731 if (SYMBOLP (default_coding_system))
6732 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6733 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6734 Qt, Qnil, Qcoding_system_history,
6735 default_coding_system, Qnil);
6736 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6737 }
6738
6739 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6740 1, 1, 0,
6741 doc: /* Check validity of CODING-SYSTEM.
6742 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6743 (coding_system)
6744 Lisp_Object coding_system;
6745 {
6746 CHECK_SYMBOL (coding_system);
6747 if (!NILP (Fcoding_system_p (coding_system)))
6748 return coding_system;
6749 while (1)
6750 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6751 }
6752
6753 \f
6754 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6755 HIGHEST is nonzero, return the coding system of the highest
6756 priority among the detected coding systems. Otherwize return a
6757 list of detected coding systems sorted by their priorities. If
6758 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6759 multibyte form but contains only ASCII and eight-bit chars.
6760 Otherwise, the bytes are raw bytes.
6761
6762 CODING-SYSTEM controls the detection as below:
6763
6764 If it is nil, detect both text-format and eol-format. If the
6765 text-format part of CODING-SYSTEM is already specified
6766 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6767 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6768 detect only text-format. */
6769
6770 Lisp_Object
6771 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6772 coding_system)
6773 const unsigned char *src;
6774 int src_chars, src_bytes, highest;
6775 int multibytep;
6776 Lisp_Object coding_system;
6777 {
6778 const unsigned char *src_end = src + src_bytes;
6779 Lisp_Object attrs, eol_type;
6780 Lisp_Object val;
6781 struct coding_system coding;
6782 int id;
6783 struct coding_detection_info detect_info;
6784 enum coding_category base_category;
6785
6786 if (NILP (coding_system))
6787 coding_system = Qundecided;
6788 setup_coding_system (coding_system, &coding);
6789 attrs = CODING_ID_ATTRS (coding.id);
6790 eol_type = CODING_ID_EOL_TYPE (coding.id);
6791 coding_system = CODING_ATTR_BASE_NAME (attrs);
6792
6793 coding.source = src;
6794 coding.src_chars = src_chars;
6795 coding.src_bytes = src_bytes;
6796 coding.src_multibyte = multibytep;
6797 coding.consumed = 0;
6798 coding.mode |= CODING_MODE_LAST_BLOCK;
6799
6800 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6801
6802 /* At first, detect text-format if necessary. */
6803 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6804 if (base_category == coding_category_undecided)
6805 {
6806 enum coding_category category;
6807 struct coding_system *this;
6808 int c, i;
6809
6810 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6811 for (i = 0; src < src_end; i++, src++)
6812 {
6813 c = *src;
6814 if (c & 0x80 || (c < 0x20 && (c == 0
6815 || c == ISO_CODE_ESC
6816 || c == ISO_CODE_SI
6817 || c == ISO_CODE_SO)))
6818 break;
6819 }
6820 coding.head_ascii = src - coding.source;
6821
6822 if (src < src_end)
6823 for (i = 0; i < coding_category_raw_text; i++)
6824 {
6825 category = coding_priorities[i];
6826 this = coding_categories + category;
6827
6828 if (this->id < 0)
6829 {
6830 /* No coding system of this category is defined. */
6831 detect_info.rejected |= (1 << category);
6832 }
6833 else if (category >= coding_category_raw_text)
6834 continue;
6835 else if (detect_info.checked & (1 << category))
6836 {
6837 if (highest
6838 && (detect_info.found & (1 << category)))
6839 break;
6840 }
6841 else
6842 {
6843 if ((*(this->detector)) (&coding, &detect_info)
6844 && highest
6845 && (detect_info.found & (1 << category)))
6846 {
6847 if (category == coding_category_utf_16_auto)
6848 {
6849 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6850 category = coding_category_utf_16_le;
6851 else
6852 category = coding_category_utf_16_be;
6853 }
6854 break;
6855 }
6856 }
6857 }
6858
6859 if (detect_info.rejected == CATEGORY_MASK_ANY)
6860 {
6861 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6862 id = coding_categories[coding_category_raw_text].id;
6863 val = Fcons (make_number (id), Qnil);
6864 }
6865 else if (! detect_info.rejected && ! detect_info.found)
6866 {
6867 detect_info.found = CATEGORY_MASK_ANY;
6868 id = coding_categories[coding_category_undecided].id;
6869 val = Fcons (make_number (id), Qnil);
6870 }
6871 else if (highest)
6872 {
6873 if (detect_info.found)
6874 {
6875 detect_info.found = 1 << category;
6876 val = Fcons (make_number (this->id), Qnil);
6877 }
6878 else
6879 for (i = 0; i < coding_category_raw_text; i++)
6880 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6881 {
6882 detect_info.found = 1 << coding_priorities[i];
6883 id = coding_categories[coding_priorities[i]].id;
6884 val = Fcons (make_number (id), Qnil);
6885 break;
6886 }
6887 }
6888 else
6889 {
6890 int mask = detect_info.rejected | detect_info.found;
6891 int found = 0;
6892 val = Qnil;
6893
6894 for (i = coding_category_raw_text - 1; i >= 0; i--)
6895 {
6896 category = coding_priorities[i];
6897 if (! (mask & (1 << category)))
6898 {
6899 found |= 1 << category;
6900 id = coding_categories[category].id;
6901 val = Fcons (make_number (id), val);
6902 }
6903 }
6904 for (i = coding_category_raw_text - 1; i >= 0; i--)
6905 {
6906 category = coding_priorities[i];
6907 if (detect_info.found & (1 << category))
6908 {
6909 id = coding_categories[category].id;
6910 val = Fcons (make_number (id), val);
6911 }
6912 }
6913 detect_info.found |= found;
6914 }
6915 }
6916 else if (base_category == coding_category_utf_16_auto)
6917 {
6918 if (detect_coding_utf_16 (&coding, &detect_info))
6919 {
6920 enum coding_category category;
6921 struct coding_system *this;
6922
6923 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6924 this = coding_categories + coding_category_utf_16_le;
6925 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6926 this = coding_categories + coding_category_utf_16_be;
6927 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6928 this = coding_categories + coding_category_utf_16_be_nosig;
6929 else
6930 this = coding_categories + coding_category_utf_16_le_nosig;
6931 val = Fcons (make_number (this->id), Qnil);
6932 }
6933 }
6934 else
6935 {
6936 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6937 val = Fcons (make_number (coding.id), Qnil);
6938 }
6939
6940 /* Then, detect eol-format if necessary. */
6941 {
6942 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6943 Lisp_Object tail;
6944
6945 if (VECTORP (eol_type))
6946 {
6947 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6948 normal_eol = detect_eol (coding.source, src_bytes,
6949 coding_category_raw_text);
6950 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6951 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6952 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6953 coding_category_utf_16_be);
6954 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6955 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6956 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6957 coding_category_utf_16_le);
6958 }
6959 else
6960 {
6961 if (EQ (eol_type, Qunix))
6962 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6963 else if (EQ (eol_type, Qdos))
6964 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6965 else
6966 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6967 }
6968
6969 for (tail = val; CONSP (tail); tail = XCDR (tail))
6970 {
6971 enum coding_category category;
6972 int this_eol;
6973
6974 id = XINT (XCAR (tail));
6975 attrs = CODING_ID_ATTRS (id);
6976 category = XINT (CODING_ATTR_CATEGORY (attrs));
6977 eol_type = CODING_ID_EOL_TYPE (id);
6978 if (VECTORP (eol_type))
6979 {
6980 if (category == coding_category_utf_16_be
6981 || category == coding_category_utf_16_be_nosig)
6982 this_eol = utf_16_be_eol;
6983 else if (category == coding_category_utf_16_le
6984 || category == coding_category_utf_16_le_nosig)
6985 this_eol = utf_16_le_eol;
6986 else
6987 this_eol = normal_eol;
6988
6989 if (this_eol == EOL_SEEN_LF)
6990 XSETCAR (tail, AREF (eol_type, 0));
6991 else if (this_eol == EOL_SEEN_CRLF)
6992 XSETCAR (tail, AREF (eol_type, 1));
6993 else if (this_eol == EOL_SEEN_CR)
6994 XSETCAR (tail, AREF (eol_type, 2));
6995 else
6996 XSETCAR (tail, CODING_ID_NAME (id));
6997 }
6998 else
6999 XSETCAR (tail, CODING_ID_NAME (id));
7000 }
7001 }
7002
7003 return (highest ? XCAR (val) : val);
7004 }
7005
7006
7007 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7008 2, 3, 0,
7009 doc: /* Detect coding system of the text in the region between START and END.
7010 Return a list of possible coding systems ordered by priority.
7011
7012 If only ASCII characters are found, it returns a list of single element
7013 `undecided' or its subsidiary coding system according to a detected
7014 end-of-line format.
7015
7016 If optional argument HIGHEST is non-nil, return the coding system of
7017 highest priority. */)
7018 (start, end, highest)
7019 Lisp_Object start, end, highest;
7020 {
7021 int from, to;
7022 int from_byte, to_byte;
7023
7024 CHECK_NUMBER_COERCE_MARKER (start);
7025 CHECK_NUMBER_COERCE_MARKER (end);
7026
7027 validate_region (&start, &end);
7028 from = XINT (start), to = XINT (end);
7029 from_byte = CHAR_TO_BYTE (from);
7030 to_byte = CHAR_TO_BYTE (to);
7031
7032 if (from < GPT && to >= GPT)
7033 move_gap_both (to, to_byte);
7034
7035 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7036 to - from, to_byte - from_byte,
7037 !NILP (highest),
7038 !NILP (current_buffer
7039 ->enable_multibyte_characters),
7040 Qnil);
7041 }
7042
7043 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7044 1, 2, 0,
7045 doc: /* Detect coding system of the text in STRING.
7046 Return a list of possible coding systems ordered by priority.
7047
7048 If only ASCII characters are found, it returns a list of single element
7049 `undecided' or its subsidiary coding system according to a detected
7050 end-of-line format.
7051
7052 If optional argument HIGHEST is non-nil, return the coding system of
7053 highest priority. */)
7054 (string, highest)
7055 Lisp_Object string, highest;
7056 {
7057 CHECK_STRING (string);
7058
7059 return detect_coding_system (SDATA (string),
7060 SCHARS (string), SBYTES (string),
7061 !NILP (highest), STRING_MULTIBYTE (string),
7062 Qnil);
7063 }
7064
7065
7066 static INLINE int
7067 char_encodable_p (c, attrs)
7068 int c;
7069 Lisp_Object attrs;
7070 {
7071 Lisp_Object tail;
7072 struct charset *charset;
7073
7074 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7075 CONSP (tail); tail = XCDR (tail))
7076 {
7077 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7078 if (CHAR_CHARSET_P (c, charset))
7079 break;
7080 }
7081 return (! NILP (tail));
7082 }
7083
7084
7085 /* Return a list of coding systems that safely encode the text between
7086 START and END. If EXCLUDE is non-nil, it is a list of coding
7087 systems not to check. The returned list doesn't contain any such
7088 coding systems. In any case, if the text contains only ASCII or is
7089 unibyte, return t. */
7090
7091 DEFUN ("find-coding-systems-region-internal",
7092 Ffind_coding_systems_region_internal,
7093 Sfind_coding_systems_region_internal, 2, 3, 0,
7094 doc: /* Internal use only. */)
7095 (start, end, exclude)
7096 Lisp_Object start, end, exclude;
7097 {
7098 Lisp_Object coding_attrs_list, safe_codings;
7099 EMACS_INT start_byte, end_byte;
7100 const unsigned char *p, *pbeg, *pend;
7101 int c;
7102 Lisp_Object tail, elt;
7103
7104 if (STRINGP (start))
7105 {
7106 if (!STRING_MULTIBYTE (start)
7107 || SCHARS (start) == SBYTES (start))
7108 return Qt;
7109 start_byte = 0;
7110 end_byte = SBYTES (start);
7111 }
7112 else
7113 {
7114 CHECK_NUMBER_COERCE_MARKER (start);
7115 CHECK_NUMBER_COERCE_MARKER (end);
7116 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7117 args_out_of_range (start, end);
7118 if (NILP (current_buffer->enable_multibyte_characters))
7119 return Qt;
7120 start_byte = CHAR_TO_BYTE (XINT (start));
7121 end_byte = CHAR_TO_BYTE (XINT (end));
7122 if (XINT (end) - XINT (start) == end_byte - start_byte)
7123 return Qt;
7124
7125 if (XINT (start) < GPT && XINT (end) > GPT)
7126 {
7127 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7128 move_gap_both (XINT (start), start_byte);
7129 else
7130 move_gap_both (XINT (end), end_byte);
7131 }
7132 }
7133
7134 coding_attrs_list = Qnil;
7135 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7136 if (NILP (exclude)
7137 || NILP (Fmemq (XCAR (tail), exclude)))
7138 {
7139 Lisp_Object attrs;
7140
7141 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7142 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7143 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7144 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7145 }
7146
7147 if (STRINGP (start))
7148 p = pbeg = SDATA (start);
7149 else
7150 p = pbeg = BYTE_POS_ADDR (start_byte);
7151 pend = p + (end_byte - start_byte);
7152
7153 while (p < pend && ASCII_BYTE_P (*p)) p++;
7154 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7155
7156 while (p < pend)
7157 {
7158 if (ASCII_BYTE_P (*p))
7159 p++;
7160 else
7161 {
7162 c = STRING_CHAR_ADVANCE (p);
7163
7164 charset_map_loaded = 0;
7165 for (tail = coding_attrs_list; CONSP (tail);)
7166 {
7167 elt = XCAR (tail);
7168 if (NILP (elt))
7169 tail = XCDR (tail);
7170 else if (char_encodable_p (c, elt))
7171 tail = XCDR (tail);
7172 else if (CONSP (XCDR (tail)))
7173 {
7174 XSETCAR (tail, XCAR (XCDR (tail)));
7175 XSETCDR (tail, XCDR (XCDR (tail)));
7176 }
7177 else
7178 {
7179 XSETCAR (tail, Qnil);
7180 tail = XCDR (tail);
7181 }
7182 }
7183 if (charset_map_loaded)
7184 {
7185 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7186
7187 if (STRINGP (start))
7188 pbeg = SDATA (start);
7189 else
7190 pbeg = BYTE_POS_ADDR (start_byte);
7191 p = pbeg + p_offset;
7192 pend = pbeg + pend_offset;
7193 }
7194 }
7195 }
7196
7197 safe_codings = Qnil;
7198 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7199 if (! NILP (XCAR (tail)))
7200 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7201
7202 return safe_codings;
7203 }
7204
7205
7206 DEFUN ("unencodable-char-position", Funencodable_char_position,
7207 Sunencodable_char_position, 3, 5, 0,
7208 doc: /*
7209 Return position of first un-encodable character in a region.
7210 START and END specfiy the region and CODING-SYSTEM specifies the
7211 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7212
7213 If optional 4th argument COUNT is non-nil, it specifies at most how
7214 many un-encodable characters to search. In this case, the value is a
7215 list of positions.
7216
7217 If optional 5th argument STRING is non-nil, it is a string to search
7218 for un-encodable characters. In that case, START and END are indexes
7219 to the string. */)
7220 (start, end, coding_system, count, string)
7221 Lisp_Object start, end, coding_system, count, string;
7222 {
7223 int n;
7224 struct coding_system coding;
7225 Lisp_Object attrs, charset_list;
7226 Lisp_Object positions;
7227 int from, to;
7228 const unsigned char *p, *stop, *pend;
7229 int ascii_compatible;
7230
7231 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7232 attrs = CODING_ID_ATTRS (coding.id);
7233 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7234 return Qnil;
7235 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7236 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7237
7238 if (NILP (string))
7239 {
7240 validate_region (&start, &end);
7241 from = XINT (start);
7242 to = XINT (end);
7243 if (NILP (current_buffer->enable_multibyte_characters)
7244 || (ascii_compatible
7245 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7246 return Qnil;
7247 p = CHAR_POS_ADDR (from);
7248 pend = CHAR_POS_ADDR (to);
7249 if (from < GPT && to >= GPT)
7250 stop = GPT_ADDR;
7251 else
7252 stop = pend;
7253 }
7254 else
7255 {
7256 CHECK_STRING (string);
7257 CHECK_NATNUM (start);
7258 CHECK_NATNUM (end);
7259 from = XINT (start);
7260 to = XINT (end);
7261 if (from > to
7262 || to > SCHARS (string))
7263 args_out_of_range_3 (string, start, end);
7264 if (! STRING_MULTIBYTE (string))
7265 return Qnil;
7266 p = SDATA (string) + string_char_to_byte (string, from);
7267 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7268 if (ascii_compatible && (to - from) == (pend - p))
7269 return Qnil;
7270 }
7271
7272 if (NILP (count))
7273 n = 1;
7274 else
7275 {
7276 CHECK_NATNUM (count);
7277 n = XINT (count);
7278 }
7279
7280 positions = Qnil;
7281 while (1)
7282 {
7283 int c;
7284
7285 if (ascii_compatible)
7286 while (p < stop && ASCII_BYTE_P (*p))
7287 p++, from++;
7288 if (p >= stop)
7289 {
7290 if (p >= pend)
7291 break;
7292 stop = pend;
7293 p = GAP_END_ADDR;
7294 }
7295
7296 c = STRING_CHAR_ADVANCE (p);
7297 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7298 && ! char_charset (c, charset_list, NULL))
7299 {
7300 positions = Fcons (make_number (from), positions);
7301 n--;
7302 if (n == 0)
7303 break;
7304 }
7305
7306 from++;
7307 }
7308
7309 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7310 }
7311
7312
7313 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7314 Scheck_coding_systems_region, 3, 3, 0,
7315 doc: /* Check if the region is encodable by coding systems.
7316
7317 START and END are buffer positions specifying the region.
7318 CODING-SYSTEM-LIST is a list of coding systems to check.
7319
7320 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7321 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7322 whole region, POS0, POS1, ... are buffer positions where non-encodable
7323 characters are found.
7324
7325 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7326 value is nil.
7327
7328 START may be a string. In that case, check if the string is
7329 encodable, and the value contains indices to the string instead of
7330 buffer positions. END is ignored. */)
7331 (start, end, coding_system_list)
7332 Lisp_Object start, end, coding_system_list;
7333 {
7334 Lisp_Object list;
7335 EMACS_INT start_byte, end_byte;
7336 int pos;
7337 const unsigned char *p, *pbeg, *pend;
7338 int c;
7339 Lisp_Object tail, elt;
7340
7341 if (STRINGP (start))
7342 {
7343 if (!STRING_MULTIBYTE (start)
7344 && SCHARS (start) != SBYTES (start))
7345 return Qnil;
7346 start_byte = 0;
7347 end_byte = SBYTES (start);
7348 pos = 0;
7349 }
7350 else
7351 {
7352 CHECK_NUMBER_COERCE_MARKER (start);
7353 CHECK_NUMBER_COERCE_MARKER (end);
7354 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7355 args_out_of_range (start, end);
7356 if (NILP (current_buffer->enable_multibyte_characters))
7357 return Qnil;
7358 start_byte = CHAR_TO_BYTE (XINT (start));
7359 end_byte = CHAR_TO_BYTE (XINT (end));
7360 if (XINT (end) - XINT (start) == end_byte - start_byte)
7361 return Qt;
7362
7363 if (XINT (start) < GPT && XINT (end) > GPT)
7364 {
7365 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7366 move_gap_both (XINT (start), start_byte);
7367 else
7368 move_gap_both (XINT (end), end_byte);
7369 }
7370 pos = XINT (start);
7371 }
7372
7373 list = Qnil;
7374 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7375 {
7376 elt = XCAR (tail);
7377 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7378 Qnil)),
7379 list);
7380 }
7381
7382 if (STRINGP (start))
7383 p = pbeg = SDATA (start);
7384 else
7385 p = pbeg = BYTE_POS_ADDR (start_byte);
7386 pend = p + (end_byte - start_byte);
7387
7388 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7389 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7390
7391 while (p < pend)
7392 {
7393 if (ASCII_BYTE_P (*p))
7394 p++;
7395 else
7396 {
7397 c = STRING_CHAR_ADVANCE (p);
7398
7399 charset_map_loaded = 0;
7400 for (tail = list; CONSP (tail); tail = XCDR (tail))
7401 {
7402 elt = XCDR (XCAR (tail));
7403 if (! char_encodable_p (c, XCAR (elt)))
7404 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7405 }
7406 if (charset_map_loaded)
7407 {
7408 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7409
7410 if (STRINGP (start))
7411 pbeg = SDATA (start);
7412 else
7413 pbeg = BYTE_POS_ADDR (start_byte);
7414 p = pbeg + p_offset;
7415 pend = pbeg + pend_offset;
7416 }
7417 }
7418 pos++;
7419 }
7420
7421 tail = list;
7422 list = Qnil;
7423 for (; CONSP (tail); tail = XCDR (tail))
7424 {
7425 elt = XCAR (tail);
7426 if (CONSP (XCDR (XCDR (elt))))
7427 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7428 list);
7429 }
7430
7431 return list;
7432 }
7433
7434
7435
7436 Lisp_Object
7437 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7438 Lisp_Object start, end, coding_system, dst_object;
7439 int encodep, norecord;
7440 {
7441 struct coding_system coding;
7442 EMACS_INT from, from_byte, to, to_byte;
7443 Lisp_Object src_object;
7444
7445 CHECK_NUMBER_COERCE_MARKER (start);
7446 CHECK_NUMBER_COERCE_MARKER (end);
7447 if (NILP (coding_system))
7448 coding_system = Qno_conversion;
7449 else
7450 CHECK_CODING_SYSTEM (coding_system);
7451 src_object = Fcurrent_buffer ();
7452 if (NILP (dst_object))
7453 dst_object = src_object;
7454 else if (! EQ (dst_object, Qt))
7455 CHECK_BUFFER (dst_object);
7456
7457 validate_region (&start, &end);
7458 from = XFASTINT (start);
7459 from_byte = CHAR_TO_BYTE (from);
7460 to = XFASTINT (end);
7461 to_byte = CHAR_TO_BYTE (to);
7462
7463 setup_coding_system (coding_system, &coding);
7464 coding.mode |= CODING_MODE_LAST_BLOCK;
7465
7466 if (encodep)
7467 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7468 dst_object);
7469 else
7470 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7471 dst_object);
7472 if (! norecord)
7473 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7474
7475 return (BUFFERP (dst_object)
7476 ? make_number (coding.produced_char)
7477 : coding.dst_object);
7478 }
7479
7480
7481 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7482 3, 4, "r\nzCoding system: ",
7483 doc: /* Decode the current region from the specified coding system.
7484 When called from a program, takes four arguments:
7485 START, END, CODING-SYSTEM, and DESTINATION.
7486 START and END are buffer positions.
7487
7488 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7489 If nil, the region between START and END is replace by the decoded text.
7490 If buffer, the decoded text is inserted in the buffer.
7491 If t, the decoded text is returned.
7492
7493 This function sets `last-coding-system-used' to the precise coding system
7494 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7495 not fully specified.)
7496 It returns the length of the decoded text. */)
7497 (start, end, coding_system, destination)
7498 Lisp_Object start, end, coding_system, destination;
7499 {
7500 return code_convert_region (start, end, coding_system, destination, 0, 0);
7501 }
7502
7503 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7504 3, 4, "r\nzCoding system: ",
7505 doc: /* Encode the current region by specified coding system.
7506 When called from a program, takes three arguments:
7507 START, END, and CODING-SYSTEM. START and END are buffer positions.
7508
7509 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7510 If nil, the region between START and END is replace by the encoded text.
7511 If buffer, the encoded text is inserted in the buffer.
7512 If t, the encoded text is returned.
7513
7514 This function sets `last-coding-system-used' to the precise coding system
7515 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7516 not fully specified.)
7517 It returns the length of the encoded text. */)
7518 (start, end, coding_system, destination)
7519 Lisp_Object start, end, coding_system, destination;
7520 {
7521 return code_convert_region (start, end, coding_system, destination, 1, 0);
7522 }
7523
7524 Lisp_Object
7525 code_convert_string (string, coding_system, dst_object,
7526 encodep, nocopy, norecord)
7527 Lisp_Object string, coding_system, dst_object;
7528 int encodep, nocopy, norecord;
7529 {
7530 struct coding_system coding;
7531 EMACS_INT chars, bytes;
7532
7533 CHECK_STRING (string);
7534 if (NILP (coding_system))
7535 {
7536 if (! norecord)
7537 Vlast_coding_system_used = Qno_conversion;
7538 if (NILP (dst_object))
7539 return (nocopy ? Fcopy_sequence (string) : string);
7540 }
7541
7542 if (NILP (coding_system))
7543 coding_system = Qno_conversion;
7544 else
7545 CHECK_CODING_SYSTEM (coding_system);
7546 if (NILP (dst_object))
7547 dst_object = Qt;
7548 else if (! EQ (dst_object, Qt))
7549 CHECK_BUFFER (dst_object);
7550
7551 setup_coding_system (coding_system, &coding);
7552 coding.mode |= CODING_MODE_LAST_BLOCK;
7553 chars = SCHARS (string);
7554 bytes = SBYTES (string);
7555 if (encodep)
7556 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7557 else
7558 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7559 if (! norecord)
7560 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7561
7562 return (BUFFERP (dst_object)
7563 ? make_number (coding.produced_char)
7564 : coding.dst_object);
7565 }
7566
7567
7568 /* Encode or decode STRING according to CODING_SYSTEM.
7569 Do not set Vlast_coding_system_used.
7570
7571 This function is called only from macros DECODE_FILE and
7572 ENCODE_FILE, thus we ignore character composition. */
7573
7574 Lisp_Object
7575 code_convert_string_norecord (string, coding_system, encodep)
7576 Lisp_Object string, coding_system;
7577 int encodep;
7578 {
7579 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7580 }
7581
7582
7583 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7584 2, 4, 0,
7585 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7586
7587 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7588 if the decoding operation is trivial.
7589
7590 Optional fourth arg BUFFER non-nil meant that the decoded text is
7591 inserted in BUFFER instead of returned as a string. In this case,
7592 the return value is BUFFER.
7593
7594 This function sets `last-coding-system-used' to the precise coding system
7595 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7596 not fully specified. */)
7597 (string, coding_system, nocopy, buffer)
7598 Lisp_Object string, coding_system, nocopy, buffer;
7599 {
7600 return code_convert_string (string, coding_system, buffer,
7601 0, ! NILP (nocopy), 0);
7602 }
7603
7604 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7605 2, 4, 0,
7606 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7607
7608 Optional third arg NOCOPY non-nil means it is OK to return STRING
7609 itself if the encoding operation is trivial.
7610
7611 Optional fourth arg BUFFER non-nil meant that the encoded text is
7612 inserted in BUFFER instead of returned as a string. In this case,
7613 the return value is BUFFER.
7614
7615 This function sets `last-coding-system-used' to the precise coding system
7616 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7617 not fully specified.) */)
7618 (string, coding_system, nocopy, buffer)
7619 Lisp_Object string, coding_system, nocopy, buffer;
7620 {
7621 return code_convert_string (string, coding_system, buffer,
7622 1, ! NILP (nocopy), 1);
7623 }
7624
7625 \f
7626 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7627 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7628 Return the corresponding character. */)
7629 (code)
7630 Lisp_Object code;
7631 {
7632 Lisp_Object spec, attrs, val;
7633 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7634 int c;
7635
7636 CHECK_NATNUM (code);
7637 c = XFASTINT (code);
7638 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7639 attrs = AREF (spec, 0);
7640
7641 if (ASCII_BYTE_P (c)
7642 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7643 return code;
7644
7645 val = CODING_ATTR_CHARSET_LIST (attrs);
7646 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7647 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7648 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7649
7650 if (c <= 0x7F)
7651 charset = charset_roman;
7652 else if (c >= 0xA0 && c < 0xDF)
7653 {
7654 charset = charset_kana;
7655 c -= 0x80;
7656 }
7657 else
7658 {
7659 int s1 = c >> 8, s2 = c & 0xFF;
7660
7661 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7662 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7663 error ("Invalid code: %d", code);
7664 SJIS_TO_JIS (c);
7665 charset = charset_kanji;
7666 }
7667 c = DECODE_CHAR (charset, c);
7668 if (c < 0)
7669 error ("Invalid code: %d", code);
7670 return make_number (c);
7671 }
7672
7673
7674 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7675 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7676 Return the corresponding code in SJIS. */)
7677 (ch)
7678 Lisp_Object ch;
7679 {
7680 Lisp_Object spec, attrs, charset_list;
7681 int c;
7682 struct charset *charset;
7683 unsigned code;
7684
7685 CHECK_CHARACTER (ch);
7686 c = XFASTINT (ch);
7687 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7688 attrs = AREF (spec, 0);
7689
7690 if (ASCII_CHAR_P (c)
7691 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7692 return ch;
7693
7694 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7695 charset = char_charset (c, charset_list, &code);
7696 if (code == CHARSET_INVALID_CODE (charset))
7697 error ("Can't encode by shift_jis encoding: %d", c);
7698 JIS_TO_SJIS (code);
7699
7700 return make_number (code);
7701 }
7702
7703 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7704 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7705 Return the corresponding character. */)
7706 (code)
7707 Lisp_Object code;
7708 {
7709 Lisp_Object spec, attrs, val;
7710 struct charset *charset_roman, *charset_big5, *charset;
7711 int c;
7712
7713 CHECK_NATNUM (code);
7714 c = XFASTINT (code);
7715 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7716 attrs = AREF (spec, 0);
7717
7718 if (ASCII_BYTE_P (c)
7719 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7720 return code;
7721
7722 val = CODING_ATTR_CHARSET_LIST (attrs);
7723 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7724 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7725
7726 if (c <= 0x7F)
7727 charset = charset_roman;
7728 else
7729 {
7730 int b1 = c >> 8, b2 = c & 0x7F;
7731 if (b1 < 0xA1 || b1 > 0xFE
7732 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7733 error ("Invalid code: %d", code);
7734 charset = charset_big5;
7735 }
7736 c = DECODE_CHAR (charset, (unsigned )c);
7737 if (c < 0)
7738 error ("Invalid code: %d", code);
7739 return make_number (c);
7740 }
7741
7742 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7743 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7744 Return the corresponding character code in Big5. */)
7745 (ch)
7746 Lisp_Object ch;
7747 {
7748 Lisp_Object spec, attrs, charset_list;
7749 struct charset *charset;
7750 int c;
7751 unsigned code;
7752
7753 CHECK_CHARACTER (ch);
7754 c = XFASTINT (ch);
7755 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7756 attrs = AREF (spec, 0);
7757 if (ASCII_CHAR_P (c)
7758 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7759 return ch;
7760
7761 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7762 charset = char_charset (c, charset_list, &code);
7763 if (code == CHARSET_INVALID_CODE (charset))
7764 error ("Can't encode by Big5 encoding: %d", c);
7765
7766 return make_number (code);
7767 }
7768
7769 \f
7770 DEFUN ("set-terminal-coding-system-internal",
7771 Fset_terminal_coding_system_internal,
7772 Sset_terminal_coding_system_internal, 1, 1, 0,
7773 doc: /* Internal use only. */)
7774 (coding_system)
7775 Lisp_Object coding_system;
7776 {
7777 CHECK_SYMBOL (coding_system);
7778 setup_coding_system (Fcheck_coding_system (coding_system),
7779 &terminal_coding);
7780
7781 /* We had better not send unsafe characters to terminal. */
7782 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7783 /* Characer composition should be disabled. */
7784 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7785 terminal_coding.src_multibyte = 1;
7786 terminal_coding.dst_multibyte = 0;
7787 return Qnil;
7788 }
7789
7790 DEFUN ("set-safe-terminal-coding-system-internal",
7791 Fset_safe_terminal_coding_system_internal,
7792 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7793 doc: /* Internal use only. */)
7794 (coding_system)
7795 Lisp_Object coding_system;
7796 {
7797 CHECK_SYMBOL (coding_system);
7798 setup_coding_system (Fcheck_coding_system (coding_system),
7799 &safe_terminal_coding);
7800 /* Characer composition should be disabled. */
7801 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7802 safe_terminal_coding.src_multibyte = 1;
7803 safe_terminal_coding.dst_multibyte = 0;
7804 return Qnil;
7805 }
7806
7807 DEFUN ("terminal-coding-system",
7808 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7809 doc: /* Return coding system specified for terminal output. */)
7810 ()
7811 {
7812 return CODING_ID_NAME (terminal_coding.id);
7813 }
7814
7815 DEFUN ("set-keyboard-coding-system-internal",
7816 Fset_keyboard_coding_system_internal,
7817 Sset_keyboard_coding_system_internal, 1, 1, 0,
7818 doc: /* Internal use only. */)
7819 (coding_system)
7820 Lisp_Object coding_system;
7821 {
7822 CHECK_SYMBOL (coding_system);
7823 setup_coding_system (Fcheck_coding_system (coding_system),
7824 &keyboard_coding);
7825 /* Characer composition should be disabled. */
7826 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7827 return Qnil;
7828 }
7829
7830 DEFUN ("keyboard-coding-system",
7831 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7832 doc: /* Return coding system specified for decoding keyboard input. */)
7833 ()
7834 {
7835 return CODING_ID_NAME (keyboard_coding.id);
7836 }
7837
7838 \f
7839 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7840 Sfind_operation_coding_system, 1, MANY, 0,
7841 doc: /* Choose a coding system for an operation based on the target name.
7842 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7843 DECODING-SYSTEM is the coding system to use for decoding
7844 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7845 for encoding (in case OPERATION does encoding).
7846
7847 The first argument OPERATION specifies an I/O primitive:
7848 For file I/O, `insert-file-contents' or `write-region'.
7849 For process I/O, `call-process', `call-process-region', or `start-process'.
7850 For network I/O, `open-network-stream'.
7851
7852 The remaining arguments should be the same arguments that were passed
7853 to the primitive. Depending on which primitive, one of those arguments
7854 is selected as the TARGET. For example, if OPERATION does file I/O,
7855 whichever argument specifies the file name is TARGET.
7856
7857 TARGET has a meaning which depends on OPERATION:
7858 For file I/O, TARGET is a file name.
7859 For process I/O, TARGET is a process name.
7860 For network I/O, TARGET is a service name or a port number
7861
7862 This function looks up what specified for TARGET in,
7863 `file-coding-system-alist', `process-coding-system-alist',
7864 or `network-coding-system-alist' depending on OPERATION.
7865 They may specify a coding system, a cons of coding systems,
7866 or a function symbol to call.
7867 In the last case, we call the function with one argument,
7868 which is a list of all the arguments given to this function.
7869
7870 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7871 (nargs, args)
7872 int nargs;
7873 Lisp_Object *args;
7874 {
7875 Lisp_Object operation, target_idx, target, val;
7876 register Lisp_Object chain;
7877
7878 if (nargs < 2)
7879 error ("Too few arguments");
7880 operation = args[0];
7881 if (!SYMBOLP (operation)
7882 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7883 error ("Invalid first arguement");
7884 if (nargs < 1 + XINT (target_idx))
7885 error ("Too few arguments for operation: %s",
7886 SDATA (SYMBOL_NAME (operation)));
7887 target = args[XINT (target_idx) + 1];
7888 if (!(STRINGP (target)
7889 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7890 error ("Invalid %dth argument", XINT (target_idx) + 1);
7891
7892 chain = ((EQ (operation, Qinsert_file_contents)
7893 || EQ (operation, Qwrite_region))
7894 ? Vfile_coding_system_alist
7895 : (EQ (operation, Qopen_network_stream)
7896 ? Vnetwork_coding_system_alist
7897 : Vprocess_coding_system_alist));
7898 if (NILP (chain))
7899 return Qnil;
7900
7901 for (; CONSP (chain); chain = XCDR (chain))
7902 {
7903 Lisp_Object elt;
7904
7905 elt = XCAR (chain);
7906 if (CONSP (elt)
7907 && ((STRINGP (target)
7908 && STRINGP (XCAR (elt))
7909 && fast_string_match (XCAR (elt), target) >= 0)
7910 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7911 {
7912 val = XCDR (elt);
7913 /* Here, if VAL is both a valid coding system and a valid
7914 function symbol, we return VAL as a coding system. */
7915 if (CONSP (val))
7916 return val;
7917 if (! SYMBOLP (val))
7918 return Qnil;
7919 if (! NILP (Fcoding_system_p (val)))
7920 return Fcons (val, val);
7921 if (! NILP (Ffboundp (val)))
7922 {
7923 val = call1 (val, Flist (nargs, args));
7924 if (CONSP (val))
7925 return val;
7926 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7927 return Fcons (val, val);
7928 }
7929 return Qnil;
7930 }
7931 }
7932 return Qnil;
7933 }
7934
7935 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7936 Sset_coding_system_priority, 0, MANY, 0,
7937 doc: /* Assign higher priority to the coding systems given as arguments.
7938 If multiple coding systems belongs to the same category,
7939 all but the first one are ignored.
7940
7941 usage: (set-coding-system-priority ...) */)
7942 (nargs, args)
7943 int nargs;
7944 Lisp_Object *args;
7945 {
7946 int i, j;
7947 int changed[coding_category_max];
7948 enum coding_category priorities[coding_category_max];
7949
7950 bzero (changed, sizeof changed);
7951
7952 for (i = j = 0; i < nargs; i++)
7953 {
7954 enum coding_category category;
7955 Lisp_Object spec, attrs;
7956
7957 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7958 attrs = AREF (spec, 0);
7959 category = XINT (CODING_ATTR_CATEGORY (attrs));
7960 if (changed[category])
7961 /* Ignore this coding system because a coding system of the
7962 same category already had a higher priority. */
7963 continue;
7964 changed[category] = 1;
7965 priorities[j++] = category;
7966 if (coding_categories[category].id >= 0
7967 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7968 setup_coding_system (args[i], &coding_categories[category]);
7969 Fset (AREF (Vcoding_category_table, category), args[i]);
7970 }
7971
7972 /* Now we have decided top J priorities. Reflect the order of the
7973 original priorities to the remaining priorities. */
7974
7975 for (i = j, j = 0; i < coding_category_max; i++, j++)
7976 {
7977 while (j < coding_category_max
7978 && changed[coding_priorities[j]])
7979 j++;
7980 if (j == coding_category_max)
7981 abort ();
7982 priorities[i] = coding_priorities[j];
7983 }
7984
7985 bcopy (priorities, coding_priorities, sizeof priorities);
7986
7987 /* Update `coding-category-list'. */
7988 Vcoding_category_list = Qnil;
7989 for (i = coding_category_max - 1; i >= 0; i--)
7990 Vcoding_category_list
7991 = Fcons (AREF (Vcoding_category_table, priorities[i]),
7992 Vcoding_category_list);
7993
7994 return Qnil;
7995 }
7996
7997 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7998 Scoding_system_priority_list, 0, 1, 0,
7999 doc: /* Return a list of coding systems ordered by their priorities.
8000 HIGHESTP non-nil means just return the highest priority one. */)
8001 (highestp)
8002 Lisp_Object highestp;
8003 {
8004 int i;
8005 Lisp_Object val;
8006
8007 for (i = 0, val = Qnil; i < coding_category_max; i++)
8008 {
8009 enum coding_category category = coding_priorities[i];
8010 int id = coding_categories[category].id;
8011 Lisp_Object attrs;
8012
8013 if (id < 0)
8014 continue;
8015 attrs = CODING_ID_ATTRS (id);
8016 if (! NILP (highestp))
8017 return CODING_ATTR_BASE_NAME (attrs);
8018 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8019 }
8020 return Fnreverse (val);
8021 }
8022
8023 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8024
8025 static Lisp_Object
8026 make_subsidiaries (base)
8027 Lisp_Object base;
8028 {
8029 Lisp_Object subsidiaries;
8030 int base_name_len = SBYTES (SYMBOL_NAME (base));
8031 char *buf = (char *) alloca (base_name_len + 6);
8032 int i;
8033
8034 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8035 subsidiaries = Fmake_vector (make_number (3), Qnil);
8036 for (i = 0; i < 3; i++)
8037 {
8038 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8039 ASET (subsidiaries, i, intern (buf));
8040 }
8041 return subsidiaries;
8042 }
8043
8044
8045 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8046 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8047 doc: /* For internal use only.
8048 usage: (define-coding-system-internal ...) */)
8049 (nargs, args)
8050 int nargs;
8051 Lisp_Object *args;
8052 {
8053 Lisp_Object name;
8054 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8055 Lisp_Object attrs; /* Vector of attributes. */
8056 Lisp_Object eol_type;
8057 Lisp_Object aliases;
8058 Lisp_Object coding_type, charset_list, safe_charsets;
8059 enum coding_category category;
8060 Lisp_Object tail, val;
8061 int max_charset_id = 0;
8062 int i;
8063
8064 if (nargs < coding_arg_max)
8065 goto short_args;
8066
8067 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8068
8069 name = args[coding_arg_name];
8070 CHECK_SYMBOL (name);
8071 CODING_ATTR_BASE_NAME (attrs) = name;
8072
8073 val = args[coding_arg_mnemonic];
8074 if (! STRINGP (val))
8075 CHECK_CHARACTER (val);
8076 CODING_ATTR_MNEMONIC (attrs) = val;
8077
8078 coding_type = args[coding_arg_coding_type];
8079 CHECK_SYMBOL (coding_type);
8080 CODING_ATTR_TYPE (attrs) = coding_type;
8081
8082 charset_list = args[coding_arg_charset_list];
8083 if (SYMBOLP (charset_list))
8084 {
8085 if (EQ (charset_list, Qiso_2022))
8086 {
8087 if (! EQ (coding_type, Qiso_2022))
8088 error ("Invalid charset-list");
8089 charset_list = Viso_2022_charset_list;
8090 }
8091 else if (EQ (charset_list, Qemacs_mule))
8092 {
8093 if (! EQ (coding_type, Qemacs_mule))
8094 error ("Invalid charset-list");
8095 charset_list = Vemacs_mule_charset_list;
8096 }
8097 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8098 if (max_charset_id < XFASTINT (XCAR (tail)))
8099 max_charset_id = XFASTINT (XCAR (tail));
8100 }
8101 else
8102 {
8103 charset_list = Fcopy_sequence (charset_list);
8104 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8105 {
8106 struct charset *charset;
8107
8108 val = Fcar (tail);
8109 CHECK_CHARSET_GET_CHARSET (val, charset);
8110 if (EQ (coding_type, Qiso_2022)
8111 ? CHARSET_ISO_FINAL (charset) < 0
8112 : EQ (coding_type, Qemacs_mule)
8113 ? CHARSET_EMACS_MULE_ID (charset) < 0
8114 : 0)
8115 error ("Can't handle charset `%s'",
8116 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8117
8118 XSETCAR (tail, make_number (charset->id));
8119 if (max_charset_id < charset->id)
8120 max_charset_id = charset->id;
8121 }
8122 }
8123 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8124
8125 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8126 make_number (255));
8127 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8128 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8129 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8130
8131 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8132
8133 val = args[coding_arg_decode_translation_table];
8134 if (! NILP (val))
8135 CHECK_CHAR_TABLE (val);
8136 CODING_ATTR_DECODE_TBL (attrs) = val;
8137
8138 val = args[coding_arg_encode_translation_table];
8139 if (! NILP (val))
8140 CHECK_CHAR_TABLE (val);
8141 CODING_ATTR_ENCODE_TBL (attrs) = val;
8142
8143 val = args[coding_arg_post_read_conversion];
8144 CHECK_SYMBOL (val);
8145 CODING_ATTR_POST_READ (attrs) = val;
8146
8147 val = args[coding_arg_pre_write_conversion];
8148 CHECK_SYMBOL (val);
8149 CODING_ATTR_PRE_WRITE (attrs) = val;
8150
8151 val = args[coding_arg_default_char];
8152 if (NILP (val))
8153 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8154 else
8155 {
8156 CHECK_CHARACTER (val);
8157 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8158 }
8159
8160 val = args[coding_arg_for_unibyte];
8161 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8162
8163 val = args[coding_arg_plist];
8164 CHECK_LIST (val);
8165 CODING_ATTR_PLIST (attrs) = val;
8166
8167 if (EQ (coding_type, Qcharset))
8168 {
8169 /* Generate a lisp vector of 256 elements. Each element is nil,
8170 integer, or a list of charset IDs.
8171
8172 If Nth element is nil, the byte code N is invalid in this
8173 coding system.
8174
8175 If Nth element is a number NUM, N is the first byte of a
8176 charset whose ID is NUM.
8177
8178 If Nth element is a list of charset IDs, N is the first byte
8179 of one of them. The list is sorted by dimensions of the
8180 charsets. A charset of smaller dimension comes firtst. */
8181 val = Fmake_vector (make_number (256), Qnil);
8182
8183 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8184 {
8185 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8186 int dim = CHARSET_DIMENSION (charset);
8187 int idx = (dim - 1) * 4;
8188
8189 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8190 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8191
8192 for (i = charset->code_space[idx];
8193 i <= charset->code_space[idx + 1]; i++)
8194 {
8195 Lisp_Object tmp, tmp2;
8196 int dim2;
8197
8198 tmp = AREF (val, i);
8199 if (NILP (tmp))
8200 tmp = XCAR (tail);
8201 else if (NUMBERP (tmp))
8202 {
8203 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8204 if (dim < dim2)
8205 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8206 else
8207 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8208 }
8209 else
8210 {
8211 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8212 {
8213 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8214 if (dim < dim2)
8215 break;
8216 }
8217 if (NILP (tmp2))
8218 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8219 else
8220 {
8221 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8222 XSETCAR (tmp2, XCAR (tail));
8223 }
8224 }
8225 ASET (val, i, tmp);
8226 }
8227 }
8228 ASET (attrs, coding_attr_charset_valids, val);
8229 category = coding_category_charset;
8230 }
8231 else if (EQ (coding_type, Qccl))
8232 {
8233 Lisp_Object valids;
8234
8235 if (nargs < coding_arg_ccl_max)
8236 goto short_args;
8237
8238 val = args[coding_arg_ccl_decoder];
8239 CHECK_CCL_PROGRAM (val);
8240 if (VECTORP (val))
8241 val = Fcopy_sequence (val);
8242 ASET (attrs, coding_attr_ccl_decoder, val);
8243
8244 val = args[coding_arg_ccl_encoder];
8245 CHECK_CCL_PROGRAM (val);
8246 if (VECTORP (val))
8247 val = Fcopy_sequence (val);
8248 ASET (attrs, coding_attr_ccl_encoder, val);
8249
8250 val = args[coding_arg_ccl_valids];
8251 valids = Fmake_string (make_number (256), make_number (0));
8252 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8253 {
8254 int from, to;
8255
8256 val = Fcar (tail);
8257 if (INTEGERP (val))
8258 {
8259 from = to = XINT (val);
8260 if (from < 0 || from > 255)
8261 args_out_of_range_3 (val, make_number (0), make_number (255));
8262 }
8263 else
8264 {
8265 CHECK_CONS (val);
8266 CHECK_NATNUM_CAR (val);
8267 CHECK_NATNUM_CDR (val);
8268 from = XINT (XCAR (val));
8269 if (from > 255)
8270 args_out_of_range_3 (XCAR (val),
8271 make_number (0), make_number (255));
8272 to = XINT (XCDR (val));
8273 if (to < from || to > 255)
8274 args_out_of_range_3 (XCDR (val),
8275 XCAR (val), make_number (255));
8276 }
8277 for (i = from; i <= to; i++)
8278 SSET (valids, i, 1);
8279 }
8280 ASET (attrs, coding_attr_ccl_valids, valids);
8281
8282 category = coding_category_ccl;
8283 }
8284 else if (EQ (coding_type, Qutf_16))
8285 {
8286 Lisp_Object bom, endian;
8287
8288 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8289
8290 if (nargs < coding_arg_utf16_max)
8291 goto short_args;
8292
8293 bom = args[coding_arg_utf16_bom];
8294 if (! NILP (bom) && ! EQ (bom, Qt))
8295 {
8296 CHECK_CONS (bom);
8297 val = XCAR (bom);
8298 CHECK_CODING_SYSTEM (val);
8299 val = XCDR (bom);
8300 CHECK_CODING_SYSTEM (val);
8301 }
8302 ASET (attrs, coding_attr_utf_16_bom, bom);
8303
8304 endian = args[coding_arg_utf16_endian];
8305 CHECK_SYMBOL (endian);
8306 if (NILP (endian))
8307 endian = Qbig;
8308 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8309 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8310 ASET (attrs, coding_attr_utf_16_endian, endian);
8311
8312 category = (CONSP (bom)
8313 ? coding_category_utf_16_auto
8314 : NILP (bom)
8315 ? (EQ (endian, Qbig)
8316 ? coding_category_utf_16_be_nosig
8317 : coding_category_utf_16_le_nosig)
8318 : (EQ (endian, Qbig)
8319 ? coding_category_utf_16_be
8320 : coding_category_utf_16_le));
8321 }
8322 else if (EQ (coding_type, Qiso_2022))
8323 {
8324 Lisp_Object initial, reg_usage, request, flags;
8325 int i;
8326
8327 if (nargs < coding_arg_iso2022_max)
8328 goto short_args;
8329
8330 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8331 CHECK_VECTOR (initial);
8332 for (i = 0; i < 4; i++)
8333 {
8334 val = Faref (initial, make_number (i));
8335 if (! NILP (val))
8336 {
8337 struct charset *charset;
8338
8339 CHECK_CHARSET_GET_CHARSET (val, charset);
8340 ASET (initial, i, make_number (CHARSET_ID (charset)));
8341 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8342 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8343 }
8344 else
8345 ASET (initial, i, make_number (-1));
8346 }
8347
8348 reg_usage = args[coding_arg_iso2022_reg_usage];
8349 CHECK_CONS (reg_usage);
8350 CHECK_NUMBER_CAR (reg_usage);
8351 CHECK_NUMBER_CDR (reg_usage);
8352
8353 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8354 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8355 {
8356 int id;
8357 Lisp_Object tmp;
8358
8359 val = Fcar (tail);
8360 CHECK_CONS (val);
8361 tmp = XCAR (val);
8362 CHECK_CHARSET_GET_ID (tmp, id);
8363 CHECK_NATNUM_CDR (val);
8364 if (XINT (XCDR (val)) >= 4)
8365 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8366 XSETCAR (val, make_number (id));
8367 }
8368
8369 flags = args[coding_arg_iso2022_flags];
8370 CHECK_NATNUM (flags);
8371 i = XINT (flags);
8372 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8373 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8374
8375 ASET (attrs, coding_attr_iso_initial, initial);
8376 ASET (attrs, coding_attr_iso_usage, reg_usage);
8377 ASET (attrs, coding_attr_iso_request, request);
8378 ASET (attrs, coding_attr_iso_flags, flags);
8379 setup_iso_safe_charsets (attrs);
8380
8381 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8382 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8383 | CODING_ISO_FLAG_SINGLE_SHIFT))
8384 ? coding_category_iso_7_else
8385 : EQ (args[coding_arg_charset_list], Qiso_2022)
8386 ? coding_category_iso_7
8387 : coding_category_iso_7_tight);
8388 else
8389 {
8390 int id = XINT (AREF (initial, 1));
8391
8392 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8393 || EQ (args[coding_arg_charset_list], Qiso_2022)
8394 || id < 0)
8395 ? coding_category_iso_8_else
8396 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8397 ? coding_category_iso_8_1
8398 : coding_category_iso_8_2);
8399 }
8400 if (category != coding_category_iso_8_1
8401 && category != coding_category_iso_8_2)
8402 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8403 }
8404 else if (EQ (coding_type, Qemacs_mule))
8405 {
8406 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8407 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8408 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8409 category = coding_category_emacs_mule;
8410 }
8411 else if (EQ (coding_type, Qshift_jis))
8412 {
8413
8414 struct charset *charset;
8415
8416 if (XINT (Flength (charset_list)) != 3)
8417 error ("There should be just three charsets");
8418
8419 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8420 if (CHARSET_DIMENSION (charset) != 1)
8421 error ("Dimension of charset %s is not one",
8422 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8423 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8424 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8425
8426 charset_list = XCDR (charset_list);
8427 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8428 if (CHARSET_DIMENSION (charset) != 1)
8429 error ("Dimension of charset %s is not one",
8430 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8431
8432 charset_list = XCDR (charset_list);
8433 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8434 if (CHARSET_DIMENSION (charset) != 2)
8435 error ("Dimension of charset %s is not two",
8436 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8437
8438 category = coding_category_sjis;
8439 Vsjis_coding_system = name;
8440 }
8441 else if (EQ (coding_type, Qbig5))
8442 {
8443 struct charset *charset;
8444
8445 if (XINT (Flength (charset_list)) != 2)
8446 error ("There should be just two charsets");
8447
8448 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8449 if (CHARSET_DIMENSION (charset) != 1)
8450 error ("Dimension of charset %s is not one",
8451 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8452 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8453 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8454
8455 charset_list = XCDR (charset_list);
8456 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8457 if (CHARSET_DIMENSION (charset) != 2)
8458 error ("Dimension of charset %s is not two",
8459 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8460
8461 category = coding_category_big5;
8462 Vbig5_coding_system = name;
8463 }
8464 else if (EQ (coding_type, Qraw_text))
8465 {
8466 category = coding_category_raw_text;
8467 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8468 }
8469 else if (EQ (coding_type, Qutf_8))
8470 {
8471 category = coding_category_utf_8;
8472 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8473 }
8474 else if (EQ (coding_type, Qundecided))
8475 category = coding_category_undecided;
8476 else
8477 error ("Invalid coding system type: %s",
8478 SDATA (SYMBOL_NAME (coding_type)));
8479
8480 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8481 CODING_ATTR_PLIST (attrs)
8482 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8483 CODING_ATTR_PLIST (attrs)));
8484
8485 eol_type = args[coding_arg_eol_type];
8486 if (! NILP (eol_type)
8487 && ! EQ (eol_type, Qunix)
8488 && ! EQ (eol_type, Qdos)
8489 && ! EQ (eol_type, Qmac))
8490 error ("Invalid eol-type");
8491
8492 aliases = Fcons (name, Qnil);
8493
8494 if (NILP (eol_type))
8495 {
8496 eol_type = make_subsidiaries (name);
8497 for (i = 0; i < 3; i++)
8498 {
8499 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8500
8501 this_name = AREF (eol_type, i);
8502 this_aliases = Fcons (this_name, Qnil);
8503 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8504 this_spec = Fmake_vector (make_number (3), attrs);
8505 ASET (this_spec, 1, this_aliases);
8506 ASET (this_spec, 2, this_eol_type);
8507 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8508 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8509 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8510 Vcoding_system_alist);
8511 }
8512 }
8513
8514 spec_vec = Fmake_vector (make_number (3), attrs);
8515 ASET (spec_vec, 1, aliases);
8516 ASET (spec_vec, 2, eol_type);
8517
8518 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8519 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8520 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8521 Vcoding_system_alist);
8522
8523 {
8524 int id = coding_categories[category].id;
8525
8526 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8527 setup_coding_system (name, &coding_categories[category]);
8528 }
8529
8530 return Qnil;
8531
8532 short_args:
8533 return Fsignal (Qwrong_number_of_arguments,
8534 Fcons (intern ("define-coding-system-internal"),
8535 make_number (nargs)));
8536 }
8537
8538
8539 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8540 Sdefine_coding_system_alias, 2, 2, 0,
8541 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8542 (alias, coding_system)
8543 Lisp_Object alias, coding_system;
8544 {
8545 Lisp_Object spec, aliases, eol_type;
8546
8547 CHECK_SYMBOL (alias);
8548 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8549 aliases = AREF (spec, 1);
8550 /* ALISES should be a list of length more than zero, and the first
8551 element is a base coding system. Append ALIAS at the tail of the
8552 list. */
8553 while (!NILP (XCDR (aliases)))
8554 aliases = XCDR (aliases);
8555 XSETCDR (aliases, Fcons (alias, Qnil));
8556
8557 eol_type = AREF (spec, 2);
8558 if (VECTORP (eol_type))
8559 {
8560 Lisp_Object subsidiaries;
8561 int i;
8562
8563 subsidiaries = make_subsidiaries (alias);
8564 for (i = 0; i < 3; i++)
8565 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8566 AREF (eol_type, i));
8567 }
8568
8569 Fputhash (alias, spec, Vcoding_system_hash_table);
8570 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8571 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8572 Vcoding_system_alist);
8573
8574 return Qnil;
8575 }
8576
8577 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8578 1, 1, 0,
8579 doc: /* Return the base of CODING-SYSTEM.
8580 Any alias or subsidiary coding system is not a base coding system. */)
8581 (coding_system)
8582 Lisp_Object coding_system;
8583 {
8584 Lisp_Object spec, attrs;
8585
8586 if (NILP (coding_system))
8587 return (Qno_conversion);
8588 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8589 attrs = AREF (spec, 0);
8590 return CODING_ATTR_BASE_NAME (attrs);
8591 }
8592
8593 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8594 1, 1, 0,
8595 doc: "Return the property list of CODING-SYSTEM.")
8596 (coding_system)
8597 Lisp_Object coding_system;
8598 {
8599 Lisp_Object spec, attrs;
8600
8601 if (NILP (coding_system))
8602 coding_system = Qno_conversion;
8603 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8604 attrs = AREF (spec, 0);
8605 return CODING_ATTR_PLIST (attrs);
8606 }
8607
8608
8609 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8610 1, 1, 0,
8611 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8612 (coding_system)
8613 Lisp_Object coding_system;
8614 {
8615 Lisp_Object spec;
8616
8617 if (NILP (coding_system))
8618 coding_system = Qno_conversion;
8619 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8620 return AREF (spec, 1);
8621 }
8622
8623 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8624 Scoding_system_eol_type, 1, 1, 0,
8625 doc: /* Return eol-type of CODING-SYSTEM.
8626 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8627
8628 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8629 and CR respectively.
8630
8631 A vector value indicates that a format of end-of-line should be
8632 detected automatically. Nth element of the vector is the subsidiary
8633 coding system whose eol-type is N. */)
8634 (coding_system)
8635 Lisp_Object coding_system;
8636 {
8637 Lisp_Object spec, eol_type;
8638 int n;
8639
8640 if (NILP (coding_system))
8641 coding_system = Qno_conversion;
8642 if (! CODING_SYSTEM_P (coding_system))
8643 return Qnil;
8644 spec = CODING_SYSTEM_SPEC (coding_system);
8645 eol_type = AREF (spec, 2);
8646 if (VECTORP (eol_type))
8647 return Fcopy_sequence (eol_type);
8648 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8649 return make_number (n);
8650 }
8651
8652 #endif /* emacs */
8653
8654 \f
8655 /*** 9. Post-amble ***/
8656
8657 void
8658 init_coding_once ()
8659 {
8660 int i;
8661
8662 for (i = 0; i < coding_category_max; i++)
8663 {
8664 coding_categories[i].id = -1;
8665 coding_priorities[i] = i;
8666 }
8667
8668 /* ISO2022 specific initialize routine. */
8669 for (i = 0; i < 0x20; i++)
8670 iso_code_class[i] = ISO_control_0;
8671 for (i = 0x21; i < 0x7F; i++)
8672 iso_code_class[i] = ISO_graphic_plane_0;
8673 for (i = 0x80; i < 0xA0; i++)
8674 iso_code_class[i] = ISO_control_1;
8675 for (i = 0xA1; i < 0xFF; i++)
8676 iso_code_class[i] = ISO_graphic_plane_1;
8677 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8678 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8679 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8680 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8681 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8682 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8683 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8684 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8685 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8686
8687 for (i = 0; i < 256; i++)
8688 {
8689 emacs_mule_bytes[i] = 1;
8690 }
8691 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8692 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8693 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8694 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8695 }
8696
8697 #ifdef emacs
8698
8699 void
8700 syms_of_coding ()
8701 {
8702 staticpro (&Vcoding_system_hash_table);
8703 {
8704 Lisp_Object args[2];
8705 args[0] = QCtest;
8706 args[1] = Qeq;
8707 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8708 }
8709
8710 staticpro (&Vsjis_coding_system);
8711 Vsjis_coding_system = Qnil;
8712
8713 staticpro (&Vbig5_coding_system);
8714 Vbig5_coding_system = Qnil;
8715
8716 staticpro (&Vcode_conversion_reused_workbuf);
8717 Vcode_conversion_reused_workbuf = Qnil;
8718
8719 staticpro (&Vcode_conversion_workbuf_name);
8720 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
8721
8722 reused_workbuf_in_use = 0;
8723
8724 DEFSYM (Qcharset, "charset");
8725 DEFSYM (Qtarget_idx, "target-idx");
8726 DEFSYM (Qcoding_system_history, "coding-system-history");
8727 Fset (Qcoding_system_history, Qnil);
8728
8729 /* Target FILENAME is the first argument. */
8730 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8731 /* Target FILENAME is the third argument. */
8732 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8733
8734 DEFSYM (Qcall_process, "call-process");
8735 /* Target PROGRAM is the first argument. */
8736 Fput (Qcall_process, Qtarget_idx, make_number (0));
8737
8738 DEFSYM (Qcall_process_region, "call-process-region");
8739 /* Target PROGRAM is the third argument. */
8740 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8741
8742 DEFSYM (Qstart_process, "start-process");
8743 /* Target PROGRAM is the third argument. */
8744 Fput (Qstart_process, Qtarget_idx, make_number (2));
8745
8746 DEFSYM (Qopen_network_stream, "open-network-stream");
8747 /* Target SERVICE is the fourth argument. */
8748 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8749
8750 DEFSYM (Qcoding_system, "coding-system");
8751 DEFSYM (Qcoding_aliases, "coding-aliases");
8752
8753 DEFSYM (Qeol_type, "eol-type");
8754 DEFSYM (Qunix, "unix");
8755 DEFSYM (Qdos, "dos");
8756
8757 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8758 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8759 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8760 DEFSYM (Qdefault_char, "default-char");
8761 DEFSYM (Qundecided, "undecided");
8762 DEFSYM (Qno_conversion, "no-conversion");
8763 DEFSYM (Qraw_text, "raw-text");
8764
8765 DEFSYM (Qiso_2022, "iso-2022");
8766
8767 DEFSYM (Qutf_8, "utf-8");
8768 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8769
8770 DEFSYM (Qutf_16, "utf-16");
8771 DEFSYM (Qbig, "big");
8772 DEFSYM (Qlittle, "little");
8773
8774 DEFSYM (Qshift_jis, "shift-jis");
8775 DEFSYM (Qbig5, "big5");
8776
8777 DEFSYM (Qcoding_system_p, "coding-system-p");
8778
8779 DEFSYM (Qcoding_system_error, "coding-system-error");
8780 Fput (Qcoding_system_error, Qerror_conditions,
8781 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8782 Fput (Qcoding_system_error, Qerror_message,
8783 build_string ("Invalid coding system"));
8784
8785 /* Intern this now in case it isn't already done.
8786 Setting this variable twice is harmless.
8787 But don't staticpro it here--that is done in alloc.c. */
8788 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8789
8790 DEFSYM (Qtranslation_table, "translation-table");
8791 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8792 DEFSYM (Qtranslation_table_id, "translation-table-id");
8793 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8794 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8795
8796 DEFSYM (Qvalid_codes, "valid-codes");
8797
8798 DEFSYM (Qemacs_mule, "emacs-mule");
8799
8800 DEFSYM (QCcategory, ":category");
8801
8802 Vcoding_category_table
8803 = Fmake_vector (make_number (coding_category_max), Qnil);
8804 staticpro (&Vcoding_category_table);
8805 /* Followings are target of code detection. */
8806 ASET (Vcoding_category_table, coding_category_iso_7,
8807 intern ("coding-category-iso-7"));
8808 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8809 intern ("coding-category-iso-7-tight"));
8810 ASET (Vcoding_category_table, coding_category_iso_8_1,
8811 intern ("coding-category-iso-8-1"));
8812 ASET (Vcoding_category_table, coding_category_iso_8_2,
8813 intern ("coding-category-iso-8-2"));
8814 ASET (Vcoding_category_table, coding_category_iso_7_else,
8815 intern ("coding-category-iso-7-else"));
8816 ASET (Vcoding_category_table, coding_category_iso_8_else,
8817 intern ("coding-category-iso-8-else"));
8818 ASET (Vcoding_category_table, coding_category_utf_8,
8819 intern ("coding-category-utf-8"));
8820 ASET (Vcoding_category_table, coding_category_utf_16_be,
8821 intern ("coding-category-utf-16-be"));
8822 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8823 intern ("coding-category-utf-16-auto"));
8824 ASET (Vcoding_category_table, coding_category_utf_16_le,
8825 intern ("coding-category-utf-16-le"));
8826 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8827 intern ("coding-category-utf-16-be-nosig"));
8828 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8829 intern ("coding-category-utf-16-le-nosig"));
8830 ASET (Vcoding_category_table, coding_category_charset,
8831 intern ("coding-category-charset"));
8832 ASET (Vcoding_category_table, coding_category_sjis,
8833 intern ("coding-category-sjis"));
8834 ASET (Vcoding_category_table, coding_category_big5,
8835 intern ("coding-category-big5"));
8836 ASET (Vcoding_category_table, coding_category_ccl,
8837 intern ("coding-category-ccl"));
8838 ASET (Vcoding_category_table, coding_category_emacs_mule,
8839 intern ("coding-category-emacs-mule"));
8840 /* Followings are NOT target of code detection. */
8841 ASET (Vcoding_category_table, coding_category_raw_text,
8842 intern ("coding-category-raw-text"));
8843 ASET (Vcoding_category_table, coding_category_undecided,
8844 intern ("coding-category-undecided"));
8845
8846 DEFSYM (Qinsufficient_source, "insufficient-source");
8847 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
8848 DEFSYM (Qinvalid_source, "invalid-source");
8849 DEFSYM (Qinterrupted, "interrupted");
8850 DEFSYM (Qinsufficient_memory, "insufficient-memory");
8851
8852 defsubr (&Scoding_system_p);
8853 defsubr (&Sread_coding_system);
8854 defsubr (&Sread_non_nil_coding_system);
8855 defsubr (&Scheck_coding_system);
8856 defsubr (&Sdetect_coding_region);
8857 defsubr (&Sdetect_coding_string);
8858 defsubr (&Sfind_coding_systems_region_internal);
8859 defsubr (&Sunencodable_char_position);
8860 defsubr (&Scheck_coding_systems_region);
8861 defsubr (&Sdecode_coding_region);
8862 defsubr (&Sencode_coding_region);
8863 defsubr (&Sdecode_coding_string);
8864 defsubr (&Sencode_coding_string);
8865 defsubr (&Sdecode_sjis_char);
8866 defsubr (&Sencode_sjis_char);
8867 defsubr (&Sdecode_big5_char);
8868 defsubr (&Sencode_big5_char);
8869 defsubr (&Sset_terminal_coding_system_internal);
8870 defsubr (&Sset_safe_terminal_coding_system_internal);
8871 defsubr (&Sterminal_coding_system);
8872 defsubr (&Sset_keyboard_coding_system_internal);
8873 defsubr (&Skeyboard_coding_system);
8874 defsubr (&Sfind_operation_coding_system);
8875 defsubr (&Sset_coding_system_priority);
8876 defsubr (&Sdefine_coding_system_internal);
8877 defsubr (&Sdefine_coding_system_alias);
8878 defsubr (&Scoding_system_base);
8879 defsubr (&Scoding_system_plist);
8880 defsubr (&Scoding_system_aliases);
8881 defsubr (&Scoding_system_eol_type);
8882 defsubr (&Scoding_system_priority_list);
8883
8884 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8885 doc: /* List of coding systems.
8886
8887 Do not alter the value of this variable manually. This variable should be
8888 updated by the functions `define-coding-system' and
8889 `define-coding-system-alias'. */);
8890 Vcoding_system_list = Qnil;
8891
8892 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8893 doc: /* Alist of coding system names.
8894 Each element is one element list of coding system name.
8895 This variable is given to `completing-read' as TABLE argument.
8896
8897 Do not alter the value of this variable manually. This variable should be
8898 updated by the functions `make-coding-system' and
8899 `define-coding-system-alias'. */);
8900 Vcoding_system_alist = Qnil;
8901
8902 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8903 doc: /* List of coding-categories (symbols) ordered by priority.
8904
8905 On detecting a coding system, Emacs tries code detection algorithms
8906 associated with each coding-category one by one in this order. When
8907 one algorithm agrees with a byte sequence of source text, the coding
8908 system bound to the corresponding coding-category is selected. */);
8909 {
8910 int i;
8911
8912 Vcoding_category_list = Qnil;
8913 for (i = coding_category_max - 1; i >= 0; i--)
8914 Vcoding_category_list
8915 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8916 Vcoding_category_list);
8917 }
8918
8919 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8920 doc: /* Specify the coding system for read operations.
8921 It is useful to bind this variable with `let', but do not set it globally.
8922 If the value is a coding system, it is used for decoding on read operation.
8923 If not, an appropriate element is used from one of the coding system alists:
8924 There are three such tables, `file-coding-system-alist',
8925 `process-coding-system-alist', and `network-coding-system-alist'. */);
8926 Vcoding_system_for_read = Qnil;
8927
8928 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8929 doc: /* Specify the coding system for write operations.
8930 Programs bind this variable with `let', but you should not set it globally.
8931 If the value is a coding system, it is used for encoding of output,
8932 when writing it to a file and when sending it to a file or subprocess.
8933
8934 If this does not specify a coding system, an appropriate element
8935 is used from one of the coding system alists:
8936 There are three such tables, `file-coding-system-alist',
8937 `process-coding-system-alist', and `network-coding-system-alist'.
8938 For output to files, if the above procedure does not specify a coding system,
8939 the value of `buffer-file-coding-system' is used. */);
8940 Vcoding_system_for_write = Qnil;
8941
8942 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8943 doc: /*
8944 Coding system used in the latest file or process I/O. */);
8945 Vlast_coding_system_used = Qnil;
8946
8947 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
8948 doc: /*
8949 Error status of the last code conversion.
8950
8951 When an error was detected in the last code conversion, this variable
8952 is set to one of the following symbols.
8953 `insufficient-source'
8954 `inconsistent-eol'
8955 `invalid-source'
8956 `interrupted'
8957 `insufficient-memory'
8958 When no error was detected, the value doesn't change. So, to check
8959 the error status of a code conversion by this variable, you must
8960 explicitly set this variable to nil before performing code
8961 conversion. */);
8962 Vlast_code_conversion_error = Qnil;
8963
8964 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8965 doc: /*
8966 *Non-nil means always inhibit code conversion of end-of-line format.
8967 See info node `Coding Systems' and info node `Text and Binary' concerning
8968 such conversion. */);
8969 inhibit_eol_conversion = 0;
8970
8971 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8972 doc: /*
8973 Non-nil means process buffer inherits coding system of process output.
8974 Bind it to t if the process output is to be treated as if it were a file
8975 read from some filesystem. */);
8976 inherit_process_coding_system = 0;
8977
8978 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8979 doc: /*
8980 Alist to decide a coding system to use for a file I/O operation.
8981 The format is ((PATTERN . VAL) ...),
8982 where PATTERN is a regular expression matching a file name,
8983 VAL is a coding system, a cons of coding systems, or a function symbol.
8984 If VAL is a coding system, it is used for both decoding and encoding
8985 the file contents.
8986 If VAL is a cons of coding systems, the car part is used for decoding,
8987 and the cdr part is used for encoding.
8988 If VAL is a function symbol, the function must return a coding system
8989 or a cons of coding systems which are used as above. The function gets
8990 the arguments with which `find-operation-coding-systems' was called.
8991
8992 See also the function `find-operation-coding-system'
8993 and the variable `auto-coding-alist'. */);
8994 Vfile_coding_system_alist = Qnil;
8995
8996 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8997 doc: /*
8998 Alist to decide a coding system to use for a process I/O operation.
8999 The format is ((PATTERN . VAL) ...),
9000 where PATTERN is a regular expression matching a program name,
9001 VAL is a coding system, a cons of coding systems, or a function symbol.
9002 If VAL is a coding system, it is used for both decoding what received
9003 from the program and encoding what sent to the program.
9004 If VAL is a cons of coding systems, the car part is used for decoding,
9005 and the cdr part is used for encoding.
9006 If VAL is a function symbol, the function must return a coding system
9007 or a cons of coding systems which are used as above.
9008
9009 See also the function `find-operation-coding-system'. */);
9010 Vprocess_coding_system_alist = Qnil;
9011
9012 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9013 doc: /*
9014 Alist to decide a coding system to use for a network I/O operation.
9015 The format is ((PATTERN . VAL) ...),
9016 where PATTERN is a regular expression matching a network service name
9017 or is a port number to connect to,
9018 VAL is a coding system, a cons of coding systems, or a function symbol.
9019 If VAL is a coding system, it is used for both decoding what received
9020 from the network stream and encoding what sent to the network stream.
9021 If VAL is a cons of coding systems, the car part is used for decoding,
9022 and the cdr part is used for encoding.
9023 If VAL is a function symbol, the function must return a coding system
9024 or a cons of coding systems which are used as above.
9025
9026 See also the function `find-operation-coding-system'. */);
9027 Vnetwork_coding_system_alist = Qnil;
9028
9029 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9030 doc: /* Coding system to use with system messages.
9031 Also used for decoding keyboard input on X Window system. */);
9032 Vlocale_coding_system = Qnil;
9033
9034 /* The eol mnemonics are reset in startup.el system-dependently. */
9035 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9036 doc: /*
9037 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9038 eol_mnemonic_unix = build_string (":");
9039
9040 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9041 doc: /*
9042 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9043 eol_mnemonic_dos = build_string ("\\");
9044
9045 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9046 doc: /*
9047 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9048 eol_mnemonic_mac = build_string ("/");
9049
9050 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9051 doc: /*
9052 *String displayed in mode line when end-of-line format is not yet determined. */);
9053 eol_mnemonic_undecided = build_string (":");
9054
9055 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9056 doc: /*
9057 *Non-nil enables character translation while encoding and decoding. */);
9058 Venable_character_translation = Qt;
9059
9060 DEFVAR_LISP ("standard-translation-table-for-decode",
9061 &Vstandard_translation_table_for_decode,
9062 doc: /* Table for translating characters while decoding. */);
9063 Vstandard_translation_table_for_decode = Qnil;
9064
9065 DEFVAR_LISP ("standard-translation-table-for-encode",
9066 &Vstandard_translation_table_for_encode,
9067 doc: /* Table for translating characters while encoding. */);
9068 Vstandard_translation_table_for_encode = Qnil;
9069
9070 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9071 doc: /* Alist of charsets vs revision numbers.
9072 While encoding, if a charset (car part of an element) is found,
9073 designate it with the escape sequence identifying revision (cdr part
9074 of the element). */);
9075 Vcharset_revision_table = Qnil;
9076
9077 DEFVAR_LISP ("default-process-coding-system",
9078 &Vdefault_process_coding_system,
9079 doc: /* Cons of coding systems used for process I/O by default.
9080 The car part is used for decoding a process output,
9081 the cdr part is used for encoding a text to be sent to a process. */);
9082 Vdefault_process_coding_system = Qnil;
9083
9084 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9085 doc: /*
9086 Table of extra Latin codes in the range 128..159 (inclusive).
9087 This is a vector of length 256.
9088 If Nth element is non-nil, the existence of code N in a file
9089 \(or output of subprocess) doesn't prevent it to be detected as
9090 a coding system of ISO 2022 variant which has a flag
9091 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9092 or reading output of a subprocess.
9093 Only 128th through 159th elements has a meaning. */);
9094 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9095
9096 DEFVAR_LISP ("select-safe-coding-system-function",
9097 &Vselect_safe_coding_system_function,
9098 doc: /*
9099 Function to call to select safe coding system for encoding a text.
9100
9101 If set, this function is called to force a user to select a proper
9102 coding system which can encode the text in the case that a default
9103 coding system used in each operation can't encode the text.
9104
9105 The default value is `select-safe-coding-system' (which see). */);
9106 Vselect_safe_coding_system_function = Qnil;
9107
9108 DEFVAR_BOOL ("coding-system-require-warning",
9109 &coding_system_require_warning,
9110 doc: /* Internal use only.
9111 If non-nil, on writing a file, `select-safe-coding-system-function' is
9112 called even if `coding-system-for-write' is non-nil. The command
9113 `universal-coding-system-argument' binds this variable to t temporarily. */);
9114 coding_system_require_warning = 0;
9115
9116
9117 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9118 &inhibit_iso_escape_detection,
9119 doc: /*
9120 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9121
9122 By default, on reading a file, Emacs tries to detect how the text is
9123 encoded. This code detection is sensitive to escape sequences. If
9124 the sequence is valid as ISO2022, the code is determined as one of
9125 the ISO2022 encodings, and the file is decoded by the corresponding
9126 coding system (e.g. `iso-2022-7bit').
9127
9128 However, there may be a case that you want to read escape sequences in
9129 a file as is. In such a case, you can set this variable to non-nil.
9130 Then, as the code detection ignores any escape sequences, no file is
9131 detected as encoded in some ISO2022 encoding. The result is that all
9132 escape sequences become visible in a buffer.
9133
9134 The default value is nil, and it is strongly recommended not to change
9135 it. That is because many Emacs Lisp source files that contain
9136 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9137 in Emacs's distribution, and they won't be decoded correctly on
9138 reading if you suppress escape sequence detection.
9139
9140 The other way to read escape sequences in a file without decoding is
9141 to explicitly specify some coding system that doesn't use ISO2022's
9142 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9143 inhibit_iso_escape_detection = 0;
9144
9145 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9146 doc: /* Char table for translating self-inserting characters.
9147 This is applied to the result of input methods, not their input. See also
9148 `keyboard-translate-table'. */);
9149 Vtranslation_table_for_input = Qnil;
9150
9151 {
9152 Lisp_Object args[coding_arg_max];
9153 Lisp_Object plist[16];
9154 int i;
9155
9156 for (i = 0; i < coding_arg_max; i++)
9157 args[i] = Qnil;
9158
9159 plist[0] = intern (":name");
9160 plist[1] = args[coding_arg_name] = Qno_conversion;
9161 plist[2] = intern (":mnemonic");
9162 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9163 plist[4] = intern (":coding-type");
9164 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9165 plist[6] = intern (":ascii-compatible-p");
9166 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9167 plist[8] = intern (":default-char");
9168 plist[9] = args[coding_arg_default_char] = make_number (0);
9169 plist[10] = intern (":for-unibyte");
9170 plist[11] = args[coding_arg_for_unibyte] = Qt;
9171 plist[12] = intern (":docstring");
9172 plist[13] = build_string ("Do no conversion.\n\
9173 \n\
9174 When you visit a file with this coding, the file is read into a\n\
9175 unibyte buffer as is, thus each byte of a file is treated as a\n\
9176 character.");
9177 plist[14] = intern (":eol-type");
9178 plist[15] = args[coding_arg_eol_type] = Qunix;
9179 args[coding_arg_plist] = Flist (16, plist);
9180 Fdefine_coding_system_internal (coding_arg_max, args);
9181 }
9182
9183 setup_coding_system (Qno_conversion, &keyboard_coding);
9184 setup_coding_system (Qno_conversion, &terminal_coding);
9185 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9186
9187 {
9188 int i;
9189
9190 for (i = 0; i < coding_category_max; i++)
9191 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9192 }
9193 }
9194
9195 char *
9196 emacs_strerror (error_number)
9197 int error_number;
9198 {
9199 char *str;
9200
9201 synchronize_system_messages_locale ();
9202 str = strerror (error_number);
9203
9204 if (! NILP (Vlocale_coding_system))
9205 {
9206 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9207 Vlocale_coding_system,
9208 0);
9209 str = (char *) SDATA (dec);
9210 }
9211
9212 return str;
9213 }
9214
9215 #endif /* emacs */