(Fdefine_coding_system_internal): Fix checking of ascii compatibility.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory;
315
316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318 Lisp_Object Qstart_process, Qopen_network_stream;
319 Lisp_Object Qtarget_idx;
320
321 int coding_system_require_warning;
322
323 Lisp_Object Vselect_safe_coding_system_function;
324
325 /* Mnemonic string for each format of end-of-line. */
326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
327 /* Mnemonic string to indicate format of end-of-line is not yet
328 decided. */
329 Lisp_Object eol_mnemonic_undecided;
330
331 #ifdef emacs
332
333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
334
335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
336
337 /* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339 Lisp_Object Qemacs_mule, Qraw_text;
340 Lisp_Object Qutf_8_emacs;
341
342 /* Coding-systems are handed between Emacs Lisp programs and C internal
343 routines by the following three variables. */
344 /* Coding-system for reading files and receiving data from process. */
345 Lisp_Object Vcoding_system_for_read;
346 /* Coding-system for writing files and sending data to process. */
347 Lisp_Object Vcoding_system_for_write;
348 /* Coding-system actually used in the latest I/O. */
349 Lisp_Object Vlast_coding_system_used;
350
351 /* A vector of length 256 which contains information about special
352 Latin codes (especially for dealing with Microsoft codes). */
353 Lisp_Object Vlatin_extra_code_table;
354
355 /* Flag to inhibit code conversion of end-of-line format. */
356 int inhibit_eol_conversion;
357
358 /* Flag to inhibit ISO2022 escape sequence detection. */
359 int inhibit_iso_escape_detection;
360
361 /* Flag to make buffer-file-coding-system inherit from process-coding. */
362 int inherit_process_coding_system;
363
364 /* Coding system to be used to encode text for terminal display. */
365 struct coding_system terminal_coding;
366
367 /* Coding system to be used to encode text for terminal display when
368 terminal coding system is nil. */
369 struct coding_system safe_terminal_coding;
370
371 /* Coding system of what is sent from terminal keyboard. */
372 struct coding_system keyboard_coding;
373
374 Lisp_Object Vfile_coding_system_alist;
375 Lisp_Object Vprocess_coding_system_alist;
376 Lisp_Object Vnetwork_coding_system_alist;
377
378 Lisp_Object Vlocale_coding_system;
379
380 #endif /* emacs */
381
382 /* Flag to tell if we look up translation table on character code
383 conversion. */
384 Lisp_Object Venable_character_translation;
385 /* Standard translation table to look up on decoding (reading). */
386 Lisp_Object Vstandard_translation_table_for_decode;
387 /* Standard translation table to look up on encoding (writing). */
388 Lisp_Object Vstandard_translation_table_for_encode;
389
390 Lisp_Object Qtranslation_table;
391 Lisp_Object Qtranslation_table_id;
392 Lisp_Object Qtranslation_table_for_decode;
393 Lisp_Object Qtranslation_table_for_encode;
394
395 /* Alist of charsets vs revision number. */
396 static Lisp_Object Vcharset_revision_table;
397
398 /* Default coding systems used for process I/O. */
399 Lisp_Object Vdefault_process_coding_system;
400
401 /* Char table for translating Quail and self-inserting input. */
402 Lisp_Object Vtranslation_table_for_input;
403
404 /* Two special coding systems. */
405 Lisp_Object Vsjis_coding_system;
406 Lisp_Object Vbig5_coding_system;
407
408
409 static int detect_coding_utf_8 P_ ((struct coding_system *,
410 struct coding_detection_info *info));
411 static void decode_coding_utf_8 P_ ((struct coding_system *));
412 static int encode_coding_utf_8 P_ ((struct coding_system *));
413
414 static int detect_coding_utf_16 P_ ((struct coding_system *,
415 struct coding_detection_info *info));
416 static void decode_coding_utf_16 P_ ((struct coding_system *));
417 static int encode_coding_utf_16 P_ ((struct coding_system *));
418
419 static int detect_coding_iso_2022 P_ ((struct coding_system *,
420 struct coding_detection_info *info));
421 static void decode_coding_iso_2022 P_ ((struct coding_system *));
422 static int encode_coding_iso_2022 P_ ((struct coding_system *));
423
424 static int detect_coding_emacs_mule P_ ((struct coding_system *,
425 struct coding_detection_info *info));
426 static void decode_coding_emacs_mule P_ ((struct coding_system *));
427 static int encode_coding_emacs_mule P_ ((struct coding_system *));
428
429 static int detect_coding_sjis P_ ((struct coding_system *,
430 struct coding_detection_info *info));
431 static void decode_coding_sjis P_ ((struct coding_system *));
432 static int encode_coding_sjis P_ ((struct coding_system *));
433
434 static int detect_coding_big5 P_ ((struct coding_system *,
435 struct coding_detection_info *info));
436 static void decode_coding_big5 P_ ((struct coding_system *));
437 static int encode_coding_big5 P_ ((struct coding_system *));
438
439 static int detect_coding_ccl P_ ((struct coding_system *,
440 struct coding_detection_info *info));
441 static void decode_coding_ccl P_ ((struct coding_system *));
442 static int encode_coding_ccl P_ ((struct coding_system *));
443
444 static void decode_coding_raw_text P_ ((struct coding_system *));
445 static int encode_coding_raw_text P_ ((struct coding_system *));
446
447
448 /* ISO2022 section */
449
450 #define CODING_ISO_INITIAL(coding, reg) \
451 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
452 coding_attr_iso_initial), \
453 reg)))
454
455
456 #define CODING_ISO_REQUEST(coding, charset_id) \
457 ((charset_id <= (coding)->max_charset_id \
458 ? (coding)->safe_charsets[charset_id] \
459 : -1))
460
461
462 #define CODING_ISO_FLAGS(coding) \
463 ((coding)->spec.iso_2022.flags)
464 #define CODING_ISO_DESIGNATION(coding, reg) \
465 ((coding)->spec.iso_2022.current_designation[reg])
466 #define CODING_ISO_INVOCATION(coding, plane) \
467 ((coding)->spec.iso_2022.current_invocation[plane])
468 #define CODING_ISO_SINGLE_SHIFTING(coding) \
469 ((coding)->spec.iso_2022.single_shifting)
470 #define CODING_ISO_BOL(coding) \
471 ((coding)->spec.iso_2022.bol)
472 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
473 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
474
475 /* Control characters of ISO2022. */
476 /* code */ /* function */
477 #define ISO_CODE_LF 0x0A /* line-feed */
478 #define ISO_CODE_CR 0x0D /* carriage-return */
479 #define ISO_CODE_SO 0x0E /* shift-out */
480 #define ISO_CODE_SI 0x0F /* shift-in */
481 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
482 #define ISO_CODE_ESC 0x1B /* escape */
483 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
484 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
485 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
486
487 /* All code (1-byte) of ISO2022 is classified into one of the
488 followings. */
489 enum iso_code_class_type
490 {
491 ISO_control_0, /* Control codes in the range
492 0x00..0x1F and 0x7F, except for the
493 following 5 codes. */
494 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
495 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
496 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
497 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
498 ISO_escape, /* ISO_CODE_SO (0x1B) */
499 ISO_control_1, /* Control codes in the range
500 0x80..0x9F, except for the
501 following 3 codes. */
502 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
503 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
504 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
505 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
506 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
507 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
508 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
509 };
510
511 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
512 `iso-flags' attribute of an iso2022 coding system. */
513
514 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
515 instead of the correct short-form sequence (e.g. ESC $ A). */
516 #define CODING_ISO_FLAG_LONG_FORM 0x0001
517
518 /* If set, reset graphic planes and registers at end-of-line to the
519 initial state. */
520 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
521
522 /* If set, reset graphic planes and registers before any control
523 characters to the initial state. */
524 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
525
526 /* If set, encode by 7-bit environment. */
527 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
528
529 /* If set, use locking-shift function. */
530 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
531
532 /* If set, use single-shift function. Overwrite
533 CODING_ISO_FLAG_LOCKING_SHIFT. */
534 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
535
536 /* If set, use designation escape sequence. */
537 #define CODING_ISO_FLAG_DESIGNATION 0x0040
538
539 /* If set, produce revision number sequence. */
540 #define CODING_ISO_FLAG_REVISION 0x0080
541
542 /* If set, produce ISO6429's direction specifying sequence. */
543 #define CODING_ISO_FLAG_DIRECTION 0x0100
544
545 /* If set, assume designation states are reset at beginning of line on
546 output. */
547 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
548
549 /* If set, designation sequence should be placed at beginning of line
550 on output. */
551 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
552
553 /* If set, do not encode unsafe charactes on output. */
554 #define CODING_ISO_FLAG_SAFE 0x0800
555
556 /* If set, extra latin codes (128..159) are accepted as a valid code
557 on input. */
558 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
559
560 #define CODING_ISO_FLAG_COMPOSITION 0x2000
561
562 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
563
564 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
565
566 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
567
568 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
569
570 /* A character to be produced on output if encoding of the original
571 character is prohibited by CODING_ISO_FLAG_SAFE. */
572 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
573
574
575 /* UTF-16 section */
576 #define CODING_UTF_16_BOM(coding) \
577 ((coding)->spec.utf_16.bom)
578
579 #define CODING_UTF_16_ENDIAN(coding) \
580 ((coding)->spec.utf_16.endian)
581
582 #define CODING_UTF_16_SURROGATE(coding) \
583 ((coding)->spec.utf_16.surrogate)
584
585
586 /* CCL section */
587 #define CODING_CCL_DECODER(coding) \
588 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
589 #define CODING_CCL_ENCODER(coding) \
590 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
591 #define CODING_CCL_VALIDS(coding) \
592 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
593
594 /* Index for each coding category in `coding_categories' */
595
596 enum coding_category
597 {
598 coding_category_iso_7,
599 coding_category_iso_7_tight,
600 coding_category_iso_8_1,
601 coding_category_iso_8_2,
602 coding_category_iso_7_else,
603 coding_category_iso_8_else,
604 coding_category_utf_8,
605 coding_category_utf_16_auto,
606 coding_category_utf_16_be,
607 coding_category_utf_16_le,
608 coding_category_utf_16_be_nosig,
609 coding_category_utf_16_le_nosig,
610 coding_category_charset,
611 coding_category_sjis,
612 coding_category_big5,
613 coding_category_ccl,
614 coding_category_emacs_mule,
615 /* All above are targets of code detection. */
616 coding_category_raw_text,
617 coding_category_undecided,
618 coding_category_max
619 };
620
621 /* Definitions of flag bits used in detect_coding_XXXX. */
622 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
623 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
629 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
630 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
631 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
632 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
633 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
634 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
635 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
636 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
637 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
638 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
639 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
640
641 /* This value is returned if detect_coding_mask () find nothing other
642 than ASCII characters. */
643 #define CATEGORY_MASK_ANY \
644 (CATEGORY_MASK_ISO_7 \
645 | CATEGORY_MASK_ISO_7_TIGHT \
646 | CATEGORY_MASK_ISO_8_1 \
647 | CATEGORY_MASK_ISO_8_2 \
648 | CATEGORY_MASK_ISO_7_ELSE \
649 | CATEGORY_MASK_ISO_8_ELSE \
650 | CATEGORY_MASK_UTF_8 \
651 | CATEGORY_MASK_UTF_16_BE \
652 | CATEGORY_MASK_UTF_16_LE \
653 | CATEGORY_MASK_UTF_16_BE_NOSIG \
654 | CATEGORY_MASK_UTF_16_LE_NOSIG \
655 | CATEGORY_MASK_CHARSET \
656 | CATEGORY_MASK_SJIS \
657 | CATEGORY_MASK_BIG5 \
658 | CATEGORY_MASK_CCL \
659 | CATEGORY_MASK_EMACS_MULE)
660
661
662 #define CATEGORY_MASK_ISO_7BIT \
663 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
664
665 #define CATEGORY_MASK_ISO_8BIT \
666 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
667
668 #define CATEGORY_MASK_ISO_ELSE \
669 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
670
671 #define CATEGORY_MASK_ISO_ESCAPE \
672 (CATEGORY_MASK_ISO_7 \
673 | CATEGORY_MASK_ISO_7_TIGHT \
674 | CATEGORY_MASK_ISO_7_ELSE \
675 | CATEGORY_MASK_ISO_8_ELSE)
676
677 #define CATEGORY_MASK_ISO \
678 ( CATEGORY_MASK_ISO_7BIT \
679 | CATEGORY_MASK_ISO_8BIT \
680 | CATEGORY_MASK_ISO_ELSE)
681
682 #define CATEGORY_MASK_UTF_16 \
683 (CATEGORY_MASK_UTF_16_BE \
684 | CATEGORY_MASK_UTF_16_LE \
685 | CATEGORY_MASK_UTF_16_BE_NOSIG \
686 | CATEGORY_MASK_UTF_16_LE_NOSIG)
687
688
689 /* List of symbols `coding-category-xxx' ordered by priority. This
690 variable is exposed to Emacs Lisp. */
691 static Lisp_Object Vcoding_category_list;
692
693 /* Table of coding categories (Lisp symbols). This variable is for
694 internal use oly. */
695 static Lisp_Object Vcoding_category_table;
696
697 /* Table of coding-categories ordered by priority. */
698 static enum coding_category coding_priorities[coding_category_max];
699
700 /* Nth element is a coding context for the coding system bound to the
701 Nth coding category. */
702 static struct coding_system coding_categories[coding_category_max];
703
704 /*** Commonly used macros and functions ***/
705
706 #ifndef min
707 #define min(a, b) ((a) < (b) ? (a) : (b))
708 #endif
709 #ifndef max
710 #define max(a, b) ((a) > (b) ? (a) : (b))
711 #endif
712
713 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
714 do { \
715 attrs = CODING_ID_ATTRS (coding->id); \
716 eol_type = CODING_ID_EOL_TYPE (coding->id); \
717 if (VECTORP (eol_type)) \
718 eol_type = Qunix; \
719 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
720 } while (0)
721
722
723 /* Safely get one byte from the source text pointed by SRC which ends
724 at SRC_END, and set C to that byte. If there are not enough bytes
725 in the source, it jumps to `no_more_source'. The caller
726 should declare and set these variables appropriately in advance:
727 src, src_end, multibytep
728 */
729
730 #define ONE_MORE_BYTE(c) \
731 do { \
732 if (src == src_end) \
733 { \
734 if (src_base < src) \
735 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
736 goto no_more_source; \
737 } \
738 c = *src++; \
739 if (multibytep && (c & 0x80)) \
740 { \
741 if ((c & 0xFE) != 0xC0) \
742 error ("Undecodable char found"); \
743 c = ((c & 1) << 6) | *src++; \
744 } \
745 consumed_chars++; \
746 } while (0)
747
748
749 #define ONE_MORE_BYTE_NO_CHECK(c) \
750 do { \
751 c = *src++; \
752 if (multibytep && (c & 0x80)) \
753 { \
754 if ((c & 0xFE) != 0xC0) \
755 error ("Undecodable char found"); \
756 c = ((c & 1) << 6) | *src++; \
757 } \
758 consumed_chars++; \
759 } while (0)
760
761
762 /* Store a byte C in the place pointed by DST and increment DST to the
763 next free point, and increment PRODUCED_CHARS. The caller should
764 assure that C is 0..127, and declare and set the variable `dst'
765 appropriately in advance.
766 */
767
768
769 #define EMIT_ONE_ASCII_BYTE(c) \
770 do { \
771 produced_chars++; \
772 *dst++ = (c); \
773 } while (0)
774
775
776 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
777
778 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
779 do { \
780 produced_chars += 2; \
781 *dst++ = (c1), *dst++ = (c2); \
782 } while (0)
783
784
785 /* Store a byte C in the place pointed by DST and increment DST to the
786 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
787 nonzero, store in an appropriate multibyte from. The caller should
788 declare and set the variables `dst' and `multibytep' appropriately
789 in advance. */
790
791 #define EMIT_ONE_BYTE(c) \
792 do { \
793 produced_chars++; \
794 if (multibytep) \
795 { \
796 int ch = (c); \
797 if (ch >= 0x80) \
798 ch = BYTE8_TO_CHAR (ch); \
799 CHAR_STRING_ADVANCE (ch, dst); \
800 } \
801 else \
802 *dst++ = (c); \
803 } while (0)
804
805
806 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
807
808 #define EMIT_TWO_BYTES(c1, c2) \
809 do { \
810 produced_chars += 2; \
811 if (multibytep) \
812 { \
813 int ch; \
814 \
815 ch = (c1); \
816 if (ch >= 0x80) \
817 ch = BYTE8_TO_CHAR (ch); \
818 CHAR_STRING_ADVANCE (ch, dst); \
819 ch = (c2); \
820 if (ch >= 0x80) \
821 ch = BYTE8_TO_CHAR (ch); \
822 CHAR_STRING_ADVANCE (ch, dst); \
823 } \
824 else \
825 { \
826 *dst++ = (c1); \
827 *dst++ = (c2); \
828 } \
829 } while (0)
830
831
832 #define EMIT_THREE_BYTES(c1, c2, c3) \
833 do { \
834 EMIT_ONE_BYTE (c1); \
835 EMIT_TWO_BYTES (c2, c3); \
836 } while (0)
837
838
839 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
840 do { \
841 EMIT_TWO_BYTES (c1, c2); \
842 EMIT_TWO_BYTES (c3, c4); \
843 } while (0)
844
845
846 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
847 do { \
848 charset_map_loaded = 0; \
849 c = DECODE_CHAR (charset, code); \
850 if (charset_map_loaded) \
851 { \
852 const unsigned char *orig = coding->source; \
853 EMACS_INT offset; \
854 \
855 coding_set_source (coding); \
856 offset = coding->source - orig; \
857 src += offset; \
858 src_base += offset; \
859 src_end += offset; \
860 } \
861 } while (0)
862
863
864 #define ASSURE_DESTINATION(bytes) \
865 do { \
866 if (dst + (bytes) >= dst_end) \
867 { \
868 int more_bytes = charbuf_end - charbuf + (bytes); \
869 \
870 dst = alloc_destination (coding, more_bytes, dst); \
871 dst_end = coding->destination + coding->dst_bytes; \
872 } \
873 } while (0)
874
875
876
877 static void
878 coding_set_source (coding)
879 struct coding_system *coding;
880 {
881 if (BUFFERP (coding->src_object))
882 {
883 struct buffer *buf = XBUFFER (coding->src_object);
884
885 if (coding->src_pos < 0)
886 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
887 else
888 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
889 }
890 else if (STRINGP (coding->src_object))
891 {
892 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
893 }
894 else
895 /* Otherwise, the source is C string and is never relocated
896 automatically. Thus we don't have to update anything. */
897 ;
898 }
899
900 static void
901 coding_set_destination (coding)
902 struct coding_system *coding;
903 {
904 if (BUFFERP (coding->dst_object))
905 {
906 if (coding->src_pos < 0)
907 {
908 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
909 coding->dst_bytes = (GAP_END_ADDR
910 - (coding->src_bytes - coding->consumed)
911 - coding->destination);
912 }
913 else
914 {
915 /* We are sure that coding->dst_pos_byte is before the gap
916 of the buffer. */
917 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
918 + coding->dst_pos_byte - 1);
919 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
920 - coding->destination);
921 }
922 }
923 else
924 /* Otherwise, the destination is C string and is never relocated
925 automatically. Thus we don't have to update anything. */
926 ;
927 }
928
929
930 static void
931 coding_alloc_by_realloc (coding, bytes)
932 struct coding_system *coding;
933 EMACS_INT bytes;
934 {
935 coding->destination = (unsigned char *) xrealloc (coding->destination,
936 coding->dst_bytes + bytes);
937 coding->dst_bytes += bytes;
938 }
939
940 static void
941 coding_alloc_by_making_gap (coding, bytes)
942 struct coding_system *coding;
943 EMACS_INT bytes;
944 {
945 if (BUFFERP (coding->dst_object)
946 && EQ (coding->src_object, coding->dst_object))
947 {
948 EMACS_INT add = coding->src_bytes - coding->consumed;
949
950 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
951 make_gap (bytes);
952 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
953 }
954 else
955 {
956 Lisp_Object this_buffer;
957
958 this_buffer = Fcurrent_buffer ();
959 set_buffer_internal (XBUFFER (coding->dst_object));
960 make_gap (bytes);
961 set_buffer_internal (XBUFFER (this_buffer));
962 }
963 }
964
965
966 static unsigned char *
967 alloc_destination (coding, nbytes, dst)
968 struct coding_system *coding;
969 EMACS_INT nbytes;
970 unsigned char *dst;
971 {
972 EMACS_INT offset = dst - coding->destination;
973
974 if (BUFFERP (coding->dst_object))
975 coding_alloc_by_making_gap (coding, nbytes);
976 else
977 coding_alloc_by_realloc (coding, nbytes);
978 coding->result = CODING_RESULT_SUCCESS;
979 coding_set_destination (coding);
980 dst = coding->destination + offset;
981 return dst;
982 }
983
984 /** Macros for annotations. */
985
986 /* Maximum length of annotation data (sum of annotations for
987 composition and charset). */
988 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
989
990 /* An annotation data is stored in the array coding->charbuf in this
991 format:
992 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
993 LENGTH is the number of elements in the annotation.
994 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
995 FROM and TO specify the range of text annotated. They are relative
996 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
997
998 The format of the following elements depend on ANNOTATION_MASK.
999
1000 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1001 follows:
1002 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1003 METHOD is one of enum composition_method.
1004 Optionnal COMPOSITION-COMPONENTS are characters and composition
1005 rules.
1006
1007 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1008 follows. */
1009
1010 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1011 do { \
1012 *(buf)++ = -(len); \
1013 *(buf)++ = (mask); \
1014 *(buf)++ = (from); \
1015 *(buf)++ = (to); \
1016 coding->annotated = 1; \
1017 } while (0);
1018
1019 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1020 do { \
1021 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1022 *buf++ = method; \
1023 } while (0)
1024
1025
1026 #define ADD_CHARSET_DATA(buf, from, to, id) \
1027 do { \
1028 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1029 *buf++ = id; \
1030 } while (0)
1031
1032 \f
1033 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1034
1035
1036
1037 \f
1038 /*** 3. UTF-8 ***/
1039
1040 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1041 Check if a text is encoded in UTF-8. If it is, return 1, else
1042 return 0. */
1043
1044 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1045 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1046 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1047 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1048 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1049 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1050
1051 static int
1052 detect_coding_utf_8 (coding, detect_info)
1053 struct coding_system *coding;
1054 struct coding_detection_info *detect_info;
1055 {
1056 const unsigned char *src = coding->source, *src_base = src;
1057 const unsigned char *src_end = coding->source + coding->src_bytes;
1058 int multibytep = coding->src_multibyte;
1059 int consumed_chars = 0;
1060 int found = 0;
1061 int incomplete;
1062
1063 detect_info->checked |= CATEGORY_MASK_UTF_8;
1064 /* A coding system of this category is always ASCII compatible. */
1065 src += coding->head_ascii;
1066
1067 while (1)
1068 {
1069 int c, c1, c2, c3, c4;
1070
1071 incomplete = 0;
1072 ONE_MORE_BYTE (c);
1073 if (UTF_8_1_OCTET_P (c))
1074 continue;
1075 incomplete = 1;
1076 ONE_MORE_BYTE (c1);
1077 if (! UTF_8_EXTRA_OCTET_P (c1))
1078 break;
1079 if (UTF_8_2_OCTET_LEADING_P (c))
1080 {
1081 found = CATEGORY_MASK_UTF_8;
1082 continue;
1083 }
1084 ONE_MORE_BYTE (c2);
1085 if (! UTF_8_EXTRA_OCTET_P (c2))
1086 break;
1087 if (UTF_8_3_OCTET_LEADING_P (c))
1088 {
1089 found = CATEGORY_MASK_UTF_8;
1090 continue;
1091 }
1092 ONE_MORE_BYTE (c3);
1093 if (! UTF_8_EXTRA_OCTET_P (c3))
1094 break;
1095 if (UTF_8_4_OCTET_LEADING_P (c))
1096 {
1097 found = CATEGORY_MASK_UTF_8;
1098 continue;
1099 }
1100 ONE_MORE_BYTE (c4);
1101 if (! UTF_8_EXTRA_OCTET_P (c4))
1102 break;
1103 if (UTF_8_5_OCTET_LEADING_P (c))
1104 {
1105 found = CATEGORY_MASK_UTF_8;
1106 continue;
1107 }
1108 break;
1109 }
1110 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1111 return 0;
1112
1113 no_more_source:
1114 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1115 {
1116 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1117 return 0;
1118 }
1119 detect_info->found |= found;
1120 return 1;
1121 }
1122
1123
1124 static void
1125 decode_coding_utf_8 (coding)
1126 struct coding_system *coding;
1127 {
1128 const unsigned char *src = coding->source + coding->consumed;
1129 const unsigned char *src_end = coding->source + coding->src_bytes;
1130 const unsigned char *src_base;
1131 int *charbuf = coding->charbuf;
1132 int *charbuf_end = charbuf + coding->charbuf_size;
1133 int consumed_chars = 0, consumed_chars_base;
1134 int multibytep = coding->src_multibyte;
1135 Lisp_Object attr, eol_type, charset_list;
1136
1137 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1138
1139 while (1)
1140 {
1141 int c, c1, c2, c3, c4, c5;
1142
1143 src_base = src;
1144 consumed_chars_base = consumed_chars;
1145
1146 if (charbuf >= charbuf_end)
1147 break;
1148
1149 ONE_MORE_BYTE (c1);
1150 if (UTF_8_1_OCTET_P(c1))
1151 {
1152 c = c1;
1153 if (c == '\r')
1154 {
1155 if (EQ (eol_type, Qdos))
1156 {
1157 if (src == src_end)
1158 {
1159 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1160 goto no_more_source;
1161 }
1162 if (*src == '\n')
1163 ONE_MORE_BYTE (c);
1164 }
1165 else if (EQ (eol_type, Qmac))
1166 c = '\n';
1167 }
1168 }
1169 else
1170 {
1171 ONE_MORE_BYTE (c2);
1172 if (! UTF_8_EXTRA_OCTET_P (c2))
1173 goto invalid_code;
1174 if (UTF_8_2_OCTET_LEADING_P (c1))
1175 {
1176 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1177 /* Reject overlong sequences here and below. Encoders
1178 producing them are incorrect, they can be misleading,
1179 and they mess up read/write invariance. */
1180 if (c < 128)
1181 goto invalid_code;
1182 }
1183 else
1184 {
1185 ONE_MORE_BYTE (c3);
1186 if (! UTF_8_EXTRA_OCTET_P (c3))
1187 goto invalid_code;
1188 if (UTF_8_3_OCTET_LEADING_P (c1))
1189 {
1190 c = (((c1 & 0xF) << 12)
1191 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1192 if (c < 0x800
1193 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1194 goto invalid_code;
1195 }
1196 else
1197 {
1198 ONE_MORE_BYTE (c4);
1199 if (! UTF_8_EXTRA_OCTET_P (c4))
1200 goto invalid_code;
1201 if (UTF_8_4_OCTET_LEADING_P (c1))
1202 {
1203 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1204 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1205 if (c < 0x10000)
1206 goto invalid_code;
1207 }
1208 else
1209 {
1210 ONE_MORE_BYTE (c5);
1211 if (! UTF_8_EXTRA_OCTET_P (c5))
1212 goto invalid_code;
1213 if (UTF_8_5_OCTET_LEADING_P (c1))
1214 {
1215 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1216 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1217 | (c5 & 0x3F));
1218 if ((c > MAX_CHAR) || (c < 0x200000))
1219 goto invalid_code;
1220 }
1221 else
1222 goto invalid_code;
1223 }
1224 }
1225 }
1226 }
1227
1228 *charbuf++ = c;
1229 continue;
1230
1231 invalid_code:
1232 src = src_base;
1233 consumed_chars = consumed_chars_base;
1234 ONE_MORE_BYTE (c);
1235 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1236 coding->errors++;
1237 }
1238
1239 no_more_source:
1240 coding->consumed_char += consumed_chars_base;
1241 coding->consumed = src_base - coding->source;
1242 coding->charbuf_used = charbuf - coding->charbuf;
1243 }
1244
1245
1246 static int
1247 encode_coding_utf_8 (coding)
1248 struct coding_system *coding;
1249 {
1250 int multibytep = coding->dst_multibyte;
1251 int *charbuf = coding->charbuf;
1252 int *charbuf_end = charbuf + coding->charbuf_used;
1253 unsigned char *dst = coding->destination + coding->produced;
1254 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1255 int produced_chars = 0;
1256 int c;
1257
1258 if (multibytep)
1259 {
1260 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1261
1262 while (charbuf < charbuf_end)
1263 {
1264 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1265
1266 ASSURE_DESTINATION (safe_room);
1267 c = *charbuf++;
1268 if (CHAR_BYTE8_P (c))
1269 {
1270 c = CHAR_TO_BYTE8 (c);
1271 EMIT_ONE_BYTE (c);
1272 }
1273 else
1274 {
1275 CHAR_STRING_ADVANCE (c, pend);
1276 for (p = str; p < pend; p++)
1277 EMIT_ONE_BYTE (*p);
1278 }
1279 }
1280 }
1281 else
1282 {
1283 int safe_room = MAX_MULTIBYTE_LENGTH;
1284
1285 while (charbuf < charbuf_end)
1286 {
1287 ASSURE_DESTINATION (safe_room);
1288 c = *charbuf++;
1289 dst += CHAR_STRING (c, dst);
1290 produced_chars++;
1291 }
1292 }
1293 coding->result = CODING_RESULT_SUCCESS;
1294 coding->produced_char += produced_chars;
1295 coding->produced = dst - coding->destination;
1296 return 0;
1297 }
1298
1299
1300 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1301 Check if a text is encoded in one of UTF-16 based coding systems.
1302 If it is, return 1, else return 0. */
1303
1304 #define UTF_16_HIGH_SURROGATE_P(val) \
1305 (((val) & 0xFC00) == 0xD800)
1306
1307 #define UTF_16_LOW_SURROGATE_P(val) \
1308 (((val) & 0xFC00) == 0xDC00)
1309
1310 #define UTF_16_INVALID_P(val) \
1311 (((val) == 0xFFFE) \
1312 || ((val) == 0xFFFF) \
1313 || UTF_16_LOW_SURROGATE_P (val))
1314
1315
1316 static int
1317 detect_coding_utf_16 (coding, detect_info)
1318 struct coding_system *coding;
1319 struct coding_detection_info *detect_info;
1320 {
1321 const unsigned char *src = coding->source, *src_base = src;
1322 const unsigned char *src_end = coding->source + coding->src_bytes;
1323 int multibytep = coding->src_multibyte;
1324 int consumed_chars = 0;
1325 int c1, c2;
1326
1327 detect_info->checked |= CATEGORY_MASK_UTF_16;
1328
1329 if (coding->mode & CODING_MODE_LAST_BLOCK
1330 && (coding->src_bytes & 1))
1331 {
1332 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1333 return 0;
1334 }
1335 ONE_MORE_BYTE (c1);
1336 ONE_MORE_BYTE (c2);
1337
1338 if ((c1 == 0xFF) && (c2 == 0xFE))
1339 {
1340 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1341 | CATEGORY_MASK_UTF_16_AUTO);
1342 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1343 }
1344 else if ((c1 == 0xFE) && (c2 == 0xFF))
1345 {
1346 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1347 | CATEGORY_MASK_UTF_16_AUTO);
1348 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1349 }
1350 no_more_source:
1351 return 1;
1352 }
1353
1354 static void
1355 decode_coding_utf_16 (coding)
1356 struct coding_system *coding;
1357 {
1358 const unsigned char *src = coding->source + coding->consumed;
1359 const unsigned char *src_end = coding->source + coding->src_bytes;
1360 const unsigned char *src_base;
1361 int *charbuf = coding->charbuf;
1362 int *charbuf_end = charbuf + coding->charbuf_size;
1363 int consumed_chars = 0, consumed_chars_base;
1364 int multibytep = coding->src_multibyte;
1365 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1366 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1367 int surrogate = CODING_UTF_16_SURROGATE (coding);
1368 Lisp_Object attr, eol_type, charset_list;
1369
1370 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1371
1372 if (bom == utf_16_with_bom)
1373 {
1374 int c, c1, c2;
1375
1376 src_base = src;
1377 ONE_MORE_BYTE (c1);
1378 ONE_MORE_BYTE (c2);
1379 c = (c1 << 8) | c2;
1380
1381 if (endian == utf_16_big_endian
1382 ? c != 0xFEFF : c != 0xFFFE)
1383 {
1384 /* The first two bytes are not BOM. Treat them as bytes
1385 for a normal character. */
1386 src = src_base;
1387 coding->errors++;
1388 }
1389 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1390 }
1391 else if (bom == utf_16_detect_bom)
1392 {
1393 /* We have already tried to detect BOM and failed in
1394 detect_coding. */
1395 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1396 }
1397
1398 while (1)
1399 {
1400 int c, c1, c2;
1401
1402 src_base = src;
1403 consumed_chars_base = consumed_chars;
1404
1405 if (charbuf + 2 >= charbuf_end)
1406 break;
1407
1408 ONE_MORE_BYTE (c1);
1409 ONE_MORE_BYTE (c2);
1410 c = (endian == utf_16_big_endian
1411 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1412 if (surrogate)
1413 {
1414 if (! UTF_16_LOW_SURROGATE_P (c))
1415 {
1416 if (endian == utf_16_big_endian)
1417 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1418 else
1419 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1420 *charbuf++ = c1;
1421 *charbuf++ = c2;
1422 coding->errors++;
1423 if (UTF_16_HIGH_SURROGATE_P (c))
1424 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1425 else
1426 *charbuf++ = c;
1427 }
1428 else
1429 {
1430 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1431 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1432 *charbuf++ = c;
1433 }
1434 }
1435 else
1436 {
1437 if (UTF_16_HIGH_SURROGATE_P (c))
1438 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1439 else
1440 *charbuf++ = c;
1441 }
1442 }
1443
1444 no_more_source:
1445 coding->consumed_char += consumed_chars_base;
1446 coding->consumed = src_base - coding->source;
1447 coding->charbuf_used = charbuf - coding->charbuf;
1448 }
1449
1450 static int
1451 encode_coding_utf_16 (coding)
1452 struct coding_system *coding;
1453 {
1454 int multibytep = coding->dst_multibyte;
1455 int *charbuf = coding->charbuf;
1456 int *charbuf_end = charbuf + coding->charbuf_used;
1457 unsigned char *dst = coding->destination + coding->produced;
1458 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1459 int safe_room = 8;
1460 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1461 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1462 int produced_chars = 0;
1463 Lisp_Object attrs, eol_type, charset_list;
1464 int c;
1465
1466 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1467
1468 if (bom != utf_16_without_bom)
1469 {
1470 ASSURE_DESTINATION (safe_room);
1471 if (big_endian)
1472 EMIT_TWO_BYTES (0xFE, 0xFF);
1473 else
1474 EMIT_TWO_BYTES (0xFF, 0xFE);
1475 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1476 }
1477
1478 while (charbuf < charbuf_end)
1479 {
1480 ASSURE_DESTINATION (safe_room);
1481 c = *charbuf++;
1482 if (c >= MAX_UNICODE_CHAR)
1483 c = coding->default_char;
1484
1485 if (c < 0x10000)
1486 {
1487 if (big_endian)
1488 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1489 else
1490 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1491 }
1492 else
1493 {
1494 int c1, c2;
1495
1496 c -= 0x10000;
1497 c1 = (c >> 10) + 0xD800;
1498 c2 = (c & 0x3FF) + 0xDC00;
1499 if (big_endian)
1500 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1501 else
1502 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1503 }
1504 }
1505 coding->result = CODING_RESULT_SUCCESS;
1506 coding->produced = dst - coding->destination;
1507 coding->produced_char += produced_chars;
1508 return 0;
1509 }
1510
1511 \f
1512 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1513
1514 /* Emacs' internal format for representation of multiple character
1515 sets is a kind of multi-byte encoding, i.e. characters are
1516 represented by variable-length sequences of one-byte codes.
1517
1518 ASCII characters and control characters (e.g. `tab', `newline') are
1519 represented by one-byte sequences which are their ASCII codes, in
1520 the range 0x00 through 0x7F.
1521
1522 8-bit characters of the range 0x80..0x9F are represented by
1523 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1524 code + 0x20).
1525
1526 8-bit characters of the range 0xA0..0xFF are represented by
1527 one-byte sequences which are their 8-bit code.
1528
1529 The other characters are represented by a sequence of `base
1530 leading-code', optional `extended leading-code', and one or two
1531 `position-code's. The length of the sequence is determined by the
1532 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1533 whereas extended leading-code and position-code take the range 0xA0
1534 through 0xFF. See `charset.h' for more details about leading-code
1535 and position-code.
1536
1537 --- CODE RANGE of Emacs' internal format ---
1538 character set range
1539 ------------- -----
1540 ascii 0x00..0x7F
1541 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1542 eight-bit-graphic 0xA0..0xBF
1543 ELSE 0x81..0x9D + [0xA0..0xFF]+
1544 ---------------------------------------------
1545
1546 As this is the internal character representation, the format is
1547 usually not used externally (i.e. in a file or in a data sent to a
1548 process). But, it is possible to have a text externally in this
1549 format (i.e. by encoding by the coding system `emacs-mule').
1550
1551 In that case, a sequence of one-byte codes has a slightly different
1552 form.
1553
1554 At first, all characters in eight-bit-control are represented by
1555 one-byte sequences which are their 8-bit code.
1556
1557 Next, character composition data are represented by the byte
1558 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1559 where,
1560 METHOD is 0xF0 plus one of composition method (enum
1561 composition_method),
1562
1563 BYTES is 0xA0 plus a byte length of this composition data,
1564
1565 CHARS is 0x20 plus a number of characters composed by this
1566 data,
1567
1568 COMPONENTs are characters of multibye form or composition
1569 rules encoded by two-byte of ASCII codes.
1570
1571 In addition, for backward compatibility, the following formats are
1572 also recognized as composition data on decoding.
1573
1574 0x80 MSEQ ...
1575 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1576
1577 Here,
1578 MSEQ is a multibyte form but in these special format:
1579 ASCII: 0xA0 ASCII_CODE+0x80,
1580 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1581 RULE is a one byte code of the range 0xA0..0xF0 that
1582 represents a composition rule.
1583 */
1584
1585 char emacs_mule_bytes[256];
1586
1587 int
1588 emacs_mule_char (coding, src, nbytes, nchars, id)
1589 struct coding_system *coding;
1590 unsigned char *src;
1591 int *nbytes, *nchars, *id;
1592 {
1593 const unsigned char *src_end = coding->source + coding->src_bytes;
1594 const unsigned char *src_base = src;
1595 int multibytep = coding->src_multibyte;
1596 struct charset *charset;
1597 unsigned code;
1598 int c;
1599 int consumed_chars = 0;
1600
1601 ONE_MORE_BYTE (c);
1602 switch (emacs_mule_bytes[c])
1603 {
1604 case 2:
1605 if (! (charset = emacs_mule_charset[c]))
1606 goto invalid_code;
1607 ONE_MORE_BYTE (c);
1608 code = c & 0x7F;
1609 break;
1610
1611 case 3:
1612 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1613 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1614 {
1615 ONE_MORE_BYTE (c);
1616 if (! (charset = emacs_mule_charset[c]))
1617 goto invalid_code;
1618 ONE_MORE_BYTE (c);
1619 code = c & 0x7F;
1620 }
1621 else
1622 {
1623 if (! (charset = emacs_mule_charset[c]))
1624 goto invalid_code;
1625 ONE_MORE_BYTE (c);
1626 code = (c & 0x7F) << 8;
1627 ONE_MORE_BYTE (c);
1628 code |= c & 0x7F;
1629 }
1630 break;
1631
1632 case 4:
1633 ONE_MORE_BYTE (c);
1634 if (! (charset = emacs_mule_charset[c]))
1635 goto invalid_code;
1636 ONE_MORE_BYTE (c);
1637 code = (c & 0x7F) << 8;
1638 ONE_MORE_BYTE (c);
1639 code |= c & 0x7F;
1640 break;
1641
1642 case 1:
1643 code = c;
1644 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1645 ? charset_ascii : charset_eight_bit);
1646 break;
1647
1648 default:
1649 abort ();
1650 }
1651 c = DECODE_CHAR (charset, code);
1652 if (c < 0)
1653 goto invalid_code;
1654 *nbytes = src - src_base;
1655 *nchars = consumed_chars;
1656 if (id)
1657 *id = charset->id;
1658 return c;
1659
1660 no_more_source:
1661 return -2;
1662
1663 invalid_code:
1664 return -1;
1665 }
1666
1667
1668 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1669 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1670 else return 0. */
1671
1672 static int
1673 detect_coding_emacs_mule (coding, detect_info)
1674 struct coding_system *coding;
1675 struct coding_detection_info *detect_info;
1676 {
1677 const unsigned char *src = coding->source, *src_base = src;
1678 const unsigned char *src_end = coding->source + coding->src_bytes;
1679 int multibytep = coding->src_multibyte;
1680 int consumed_chars = 0;
1681 int c;
1682 int found = 0;
1683 int incomplete;
1684
1685 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1686 /* A coding system of this category is always ASCII compatible. */
1687 src += coding->head_ascii;
1688
1689 while (1)
1690 {
1691 incomplete = 0;
1692 ONE_MORE_BYTE (c);
1693 incomplete = 1;
1694
1695 if (c == 0x80)
1696 {
1697 /* Perhaps the start of composite character. We simple skip
1698 it because analyzing it is too heavy for detecting. But,
1699 at least, we check that the composite character
1700 constitues of more than 4 bytes. */
1701 const unsigned char *src_base;
1702
1703 repeat:
1704 src_base = src;
1705 do
1706 {
1707 ONE_MORE_BYTE (c);
1708 }
1709 while (c >= 0xA0);
1710
1711 if (src - src_base <= 4)
1712 break;
1713 found = CATEGORY_MASK_EMACS_MULE;
1714 if (c == 0x80)
1715 goto repeat;
1716 }
1717
1718 if (c < 0x80)
1719 {
1720 if (c < 0x20
1721 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1722 break;
1723 }
1724 else
1725 {
1726 const unsigned char *src_base = src - 1;
1727
1728 do
1729 {
1730 ONE_MORE_BYTE (c);
1731 }
1732 while (c >= 0xA0);
1733 if (src - src_base != emacs_mule_bytes[*src_base])
1734 break;
1735 found = CATEGORY_MASK_EMACS_MULE;
1736 }
1737 }
1738 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1739 return 0;
1740
1741 no_more_source:
1742 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1743 {
1744 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1745 return 0;
1746 }
1747 detect_info->found |= found;
1748 return 1;
1749 }
1750
1751
1752 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1753
1754 /* Decode a character represented as a component of composition
1755 sequence of Emacs 20/21 style at SRC. Set C to that character and
1756 update SRC to the head of next character (or an encoded composition
1757 rule). If SRC doesn't points a composition component, set C to -1.
1758 If SRC points an invalid byte sequence, global exit by a return
1759 value 0. */
1760
1761 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1762 if (1) \
1763 { \
1764 int c; \
1765 int nbytes, nchars; \
1766 \
1767 if (src == src_end) \
1768 break; \
1769 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1770 if (c < 0) \
1771 { \
1772 if (c == -2) \
1773 break; \
1774 goto invalid_code; \
1775 } \
1776 *buf++ = c; \
1777 src += nbytes; \
1778 consumed_chars += nchars; \
1779 } \
1780 else
1781
1782
1783 /* Decode a composition rule represented as a component of composition
1784 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1785 and increment BUF. If SRC points an invalid byte sequence, set C
1786 to -1. */
1787
1788 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1789 do { \
1790 int c, gref, nref; \
1791 \
1792 if (src >= src_end) \
1793 goto invalid_code; \
1794 ONE_MORE_BYTE_NO_CHECK (c); \
1795 c -= 0x20; \
1796 if (c < 0 || c >= 81) \
1797 goto invalid_code; \
1798 \
1799 gref = c / 9, nref = c % 9; \
1800 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1801 } while (0)
1802
1803
1804 /* Decode a composition rule represented as a component of composition
1805 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1806 and increment BUF. If SRC points an invalid byte sequence, set C
1807 to -1. */
1808
1809 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1810 do { \
1811 int gref, nref; \
1812 \
1813 if (src + 1>= src_end) \
1814 goto invalid_code; \
1815 ONE_MORE_BYTE_NO_CHECK (gref); \
1816 gref -= 0x20; \
1817 ONE_MORE_BYTE_NO_CHECK (nref); \
1818 nref -= 0x20; \
1819 if (gref < 0 || gref >= 81 \
1820 || nref < 0 || nref >= 81) \
1821 goto invalid_code; \
1822 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1823 } while (0)
1824
1825
1826 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1827 do { \
1828 /* Emacs 21 style format. The first three bytes at SRC are \
1829 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1830 the byte length of this composition information, CHARS is the \
1831 number of characters composed by this composition. */ \
1832 enum composition_method method = c - 0xF2; \
1833 int *charbuf_base = charbuf; \
1834 int from, to; \
1835 int consumed_chars_limit; \
1836 int nbytes, nchars; \
1837 \
1838 ONE_MORE_BYTE (c); \
1839 nbytes = c - 0xA0; \
1840 if (nbytes < 3) \
1841 goto invalid_code; \
1842 ONE_MORE_BYTE (c); \
1843 nchars = c - 0xA0; \
1844 from = coding->produced + char_offset; \
1845 to = from + nchars; \
1846 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1847 consumed_chars_limit = consumed_chars_base + nbytes; \
1848 if (method != COMPOSITION_RELATIVE) \
1849 { \
1850 int i = 0; \
1851 while (consumed_chars < consumed_chars_limit) \
1852 { \
1853 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1854 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1855 else \
1856 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1857 i++; \
1858 } \
1859 if (consumed_chars < consumed_chars_limit) \
1860 goto invalid_code; \
1861 charbuf_base[0] -= i; \
1862 } \
1863 } while (0)
1864
1865
1866 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1867 do { \
1868 /* Emacs 20 style format for relative composition. */ \
1869 /* Store multibyte form of characters to be composed. */ \
1870 enum composition_method method = COMPOSITION_RELATIVE; \
1871 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1872 int *buf = components; \
1873 int i, j; \
1874 int from, to; \
1875 \
1876 src = src_base; \
1877 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1878 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1879 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1880 if (i < 2) \
1881 goto invalid_code; \
1882 from = coding->produced_char + char_offset; \
1883 to = from + i; \
1884 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1885 for (j = 0; j < i; j++) \
1886 *charbuf++ = components[j]; \
1887 } while (0)
1888
1889
1890 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1891 do { \
1892 /* Emacs 20 style format for rule-base composition. */ \
1893 /* Store multibyte form of characters to be composed. */ \
1894 enum composition_method method = COMPOSITION_WITH_RULE; \
1895 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1896 int *buf = components; \
1897 int i, j; \
1898 int from, to; \
1899 \
1900 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1901 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1902 { \
1903 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1904 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1905 } \
1906 if (i < 1 || (buf - components) % 2 == 0) \
1907 goto invalid_code; \
1908 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1909 goto no_more_source; \
1910 from = coding->produced_char + char_offset; \
1911 to = from + i; \
1912 ADD_COMPOSITION_DATA (buf, from, to, method); \
1913 for (j = 0; j < i; j++) \
1914 *charbuf++ = components[j]; \
1915 for (j = 0; j < i; j += 2) \
1916 *charbuf++ = components[j]; \
1917 } while (0)
1918
1919
1920 static void
1921 decode_coding_emacs_mule (coding)
1922 struct coding_system *coding;
1923 {
1924 const unsigned char *src = coding->source + coding->consumed;
1925 const unsigned char *src_end = coding->source + coding->src_bytes;
1926 const unsigned char *src_base;
1927 int *charbuf = coding->charbuf;
1928 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1929 int consumed_chars = 0, consumed_chars_base;
1930 int multibytep = coding->src_multibyte;
1931 Lisp_Object attrs, eol_type, charset_list;
1932 int char_offset = coding->produced_char;
1933 int last_offset = char_offset;
1934 int last_id = charset_ascii;
1935
1936 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1937
1938 while (1)
1939 {
1940 int c;
1941
1942 src_base = src;
1943 consumed_chars_base = consumed_chars;
1944
1945 if (charbuf >= charbuf_end)
1946 break;
1947
1948 ONE_MORE_BYTE (c);
1949
1950 if (c < 0x80)
1951 {
1952 if (c == '\r')
1953 {
1954 if (EQ (eol_type, Qdos))
1955 {
1956 if (src == src_end)
1957 {
1958 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1959 goto no_more_source;
1960 }
1961 if (*src == '\n')
1962 ONE_MORE_BYTE (c);
1963 }
1964 else if (EQ (eol_type, Qmac))
1965 c = '\n';
1966 }
1967 *charbuf++ = c;
1968 char_offset++;
1969 }
1970 else if (c == 0x80)
1971 {
1972 ONE_MORE_BYTE (c);
1973 if (c - 0xF2 >= COMPOSITION_RELATIVE
1974 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1975 DECODE_EMACS_MULE_21_COMPOSITION (c);
1976 else if (c < 0xC0)
1977 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1978 else if (c == 0xFF)
1979 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1980 else
1981 goto invalid_code;
1982 }
1983 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1984 {
1985 int nbytes, nchars;
1986 int id;
1987
1988 src = src_base;
1989 consumed_chars = consumed_chars_base;
1990 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1991 if (c < 0)
1992 {
1993 if (c == -2)
1994 break;
1995 goto invalid_code;
1996 }
1997 if (last_id != id)
1998 {
1999 if (last_id != charset_ascii)
2000 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2001 last_id = id;
2002 last_offset = char_offset;
2003 }
2004 *charbuf++ = c;
2005 src += nbytes;
2006 consumed_chars += nchars;
2007 char_offset++;
2008 }
2009 continue;
2010
2011 invalid_code:
2012 src = src_base;
2013 consumed_chars = consumed_chars_base;
2014 ONE_MORE_BYTE (c);
2015 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2016 char_offset++;
2017 coding->errors++;
2018 }
2019
2020 no_more_source:
2021 if (last_id != charset_ascii)
2022 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2023 coding->consumed_char += consumed_chars_base;
2024 coding->consumed = src_base - coding->source;
2025 coding->charbuf_used = charbuf - coding->charbuf;
2026 }
2027
2028
2029 #define EMACS_MULE_LEADING_CODES(id, codes) \
2030 do { \
2031 if (id < 0xA0) \
2032 codes[0] = id, codes[1] = 0; \
2033 else if (id < 0xE0) \
2034 codes[0] = 0x9A, codes[1] = id; \
2035 else if (id < 0xF0) \
2036 codes[0] = 0x9B, codes[1] = id; \
2037 else if (id < 0xF5) \
2038 codes[0] = 0x9C, codes[1] = id; \
2039 else \
2040 codes[0] = 0x9D, codes[1] = id; \
2041 } while (0);
2042
2043
2044 static int
2045 encode_coding_emacs_mule (coding)
2046 struct coding_system *coding;
2047 {
2048 int multibytep = coding->dst_multibyte;
2049 int *charbuf = coding->charbuf;
2050 int *charbuf_end = charbuf + coding->charbuf_used;
2051 unsigned char *dst = coding->destination + coding->produced;
2052 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2053 int safe_room = 8;
2054 int produced_chars = 0;
2055 Lisp_Object attrs, eol_type, charset_list;
2056 int c;
2057 int preferred_charset_id = -1;
2058
2059 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2060 if (! EQ (charset_list, Vemacs_mule_charset_list))
2061 {
2062 CODING_ATTR_CHARSET_LIST (attrs)
2063 = charset_list = Vemacs_mule_charset_list;
2064 }
2065
2066 while (charbuf < charbuf_end)
2067 {
2068 ASSURE_DESTINATION (safe_room);
2069 c = *charbuf++;
2070
2071 if (c < 0)
2072 {
2073 /* Handle an annotation. */
2074 switch (*charbuf)
2075 {
2076 case CODING_ANNOTATE_COMPOSITION_MASK:
2077 /* Not yet implemented. */
2078 break;
2079 case CODING_ANNOTATE_CHARSET_MASK:
2080 preferred_charset_id = charbuf[3];
2081 if (preferred_charset_id >= 0
2082 && NILP (Fmemq (make_number (preferred_charset_id),
2083 charset_list)))
2084 preferred_charset_id = -1;
2085 break;
2086 default:
2087 abort ();
2088 }
2089 charbuf += -c - 1;
2090 continue;
2091 }
2092
2093 if (ASCII_CHAR_P (c))
2094 EMIT_ONE_ASCII_BYTE (c);
2095 else if (CHAR_BYTE8_P (c))
2096 {
2097 c = CHAR_TO_BYTE8 (c);
2098 EMIT_ONE_BYTE (c);
2099 }
2100 else
2101 {
2102 struct charset *charset;
2103 unsigned code;
2104 int dimension;
2105 int emacs_mule_id;
2106 unsigned char leading_codes[2];
2107
2108 if (preferred_charset_id >= 0)
2109 {
2110 charset = CHARSET_FROM_ID (preferred_charset_id);
2111 if (! CHAR_CHARSET_P (c, charset))
2112 charset = char_charset (c, charset_list, NULL);
2113 }
2114 else
2115 charset = char_charset (c, charset_list, &code);
2116 if (! charset)
2117 {
2118 c = coding->default_char;
2119 if (ASCII_CHAR_P (c))
2120 {
2121 EMIT_ONE_ASCII_BYTE (c);
2122 continue;
2123 }
2124 charset = char_charset (c, charset_list, &code);
2125 }
2126 dimension = CHARSET_DIMENSION (charset);
2127 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2128 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2129 EMIT_ONE_BYTE (leading_codes[0]);
2130 if (leading_codes[1])
2131 EMIT_ONE_BYTE (leading_codes[1]);
2132 if (dimension == 1)
2133 EMIT_ONE_BYTE (code | 0x80);
2134 else
2135 {
2136 code |= 0x8080;
2137 EMIT_ONE_BYTE (code >> 8);
2138 EMIT_ONE_BYTE (code & 0xFF);
2139 }
2140 }
2141 }
2142 coding->result = CODING_RESULT_SUCCESS;
2143 coding->produced_char += produced_chars;
2144 coding->produced = dst - coding->destination;
2145 return 0;
2146 }
2147
2148 \f
2149 /*** 7. ISO2022 handlers ***/
2150
2151 /* The following note describes the coding system ISO2022 briefly.
2152 Since the intention of this note is to help understand the
2153 functions in this file, some parts are NOT ACCURATE or are OVERLY
2154 SIMPLIFIED. For thorough understanding, please refer to the
2155 original document of ISO2022. This is equivalent to the standard
2156 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2157
2158 ISO2022 provides many mechanisms to encode several character sets
2159 in 7-bit and 8-bit environments. For 7-bit environments, all text
2160 is encoded using bytes less than 128. This may make the encoded
2161 text a little bit longer, but the text passes more easily through
2162 several types of gateway, some of which strip off the MSB (Most
2163 Significant Bit).
2164
2165 There are two kinds of character sets: control character sets and
2166 graphic character sets. The former contain control characters such
2167 as `newline' and `escape' to provide control functions (control
2168 functions are also provided by escape sequences). The latter
2169 contain graphic characters such as 'A' and '-'. Emacs recognizes
2170 two control character sets and many graphic character sets.
2171
2172 Graphic character sets are classified into one of the following
2173 four classes, according to the number of bytes (DIMENSION) and
2174 number of characters in one dimension (CHARS) of the set:
2175 - DIMENSION1_CHARS94
2176 - DIMENSION1_CHARS96
2177 - DIMENSION2_CHARS94
2178 - DIMENSION2_CHARS96
2179
2180 In addition, each character set is assigned an identification tag,
2181 unique for each set, called the "final character" (denoted as <F>
2182 hereafter). The <F> of each character set is decided by ECMA(*)
2183 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2184 (0x30..0x3F are for private use only).
2185
2186 Note (*): ECMA = European Computer Manufacturers Association
2187
2188 Here are examples of graphic character sets [NAME(<F>)]:
2189 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2190 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2191 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2192 o DIMENSION2_CHARS96 -- none for the moment
2193
2194 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2195 C0 [0x00..0x1F] -- control character plane 0
2196 GL [0x20..0x7F] -- graphic character plane 0
2197 C1 [0x80..0x9F] -- control character plane 1
2198 GR [0xA0..0xFF] -- graphic character plane 1
2199
2200 A control character set is directly designated and invoked to C0 or
2201 C1 by an escape sequence. The most common case is that:
2202 - ISO646's control character set is designated/invoked to C0, and
2203 - ISO6429's control character set is designated/invoked to C1,
2204 and usually these designations/invocations are omitted in encoded
2205 text. In a 7-bit environment, only C0 can be used, and a control
2206 character for C1 is encoded by an appropriate escape sequence to
2207 fit into the environment. All control characters for C1 are
2208 defined to have corresponding escape sequences.
2209
2210 A graphic character set is at first designated to one of four
2211 graphic registers (G0 through G3), then these graphic registers are
2212 invoked to GL or GR. These designations and invocations can be
2213 done independently. The most common case is that G0 is invoked to
2214 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2215 these invocations and designations are omitted in encoded text.
2216 In a 7-bit environment, only GL can be used.
2217
2218 When a graphic character set of CHARS94 is invoked to GL, codes
2219 0x20 and 0x7F of the GL area work as control characters SPACE and
2220 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2221 be used.
2222
2223 There are two ways of invocation: locking-shift and single-shift.
2224 With locking-shift, the invocation lasts until the next different
2225 invocation, whereas with single-shift, the invocation affects the
2226 following character only and doesn't affect the locking-shift
2227 state. Invocations are done by the following control characters or
2228 escape sequences:
2229
2230 ----------------------------------------------------------------------
2231 abbrev function cntrl escape seq description
2232 ----------------------------------------------------------------------
2233 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2234 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2235 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2236 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2237 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2238 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2239 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2240 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2241 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2242 ----------------------------------------------------------------------
2243 (*) These are not used by any known coding system.
2244
2245 Control characters for these functions are defined by macros
2246 ISO_CODE_XXX in `coding.h'.
2247
2248 Designations are done by the following escape sequences:
2249 ----------------------------------------------------------------------
2250 escape sequence description
2251 ----------------------------------------------------------------------
2252 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2253 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2254 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2255 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2256 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2257 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2258 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2259 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2260 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2261 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2262 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2263 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2264 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2265 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2266 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2267 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2268 ----------------------------------------------------------------------
2269
2270 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2271 of dimension 1, chars 94, and final character <F>, etc...
2272
2273 Note (*): Although these designations are not allowed in ISO2022,
2274 Emacs accepts them on decoding, and produces them on encoding
2275 CHARS96 character sets in a coding system which is characterized as
2276 7-bit environment, non-locking-shift, and non-single-shift.
2277
2278 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2279 '(' must be omitted. We refer to this as "short-form" hereafter.
2280
2281 Now you may notice that there are a lot of ways of encoding the
2282 same multilingual text in ISO2022. Actually, there exist many
2283 coding systems such as Compound Text (used in X11's inter client
2284 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2285 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2286 localized platforms), and all of these are variants of ISO2022.
2287
2288 In addition to the above, Emacs handles two more kinds of escape
2289 sequences: ISO6429's direction specification and Emacs' private
2290 sequence for specifying character composition.
2291
2292 ISO6429's direction specification takes the following form:
2293 o CSI ']' -- end of the current direction
2294 o CSI '0' ']' -- end of the current direction
2295 o CSI '1' ']' -- start of left-to-right text
2296 o CSI '2' ']' -- start of right-to-left text
2297 The control character CSI (0x9B: control sequence introducer) is
2298 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2299
2300 Character composition specification takes the following form:
2301 o ESC '0' -- start relative composition
2302 o ESC '1' -- end composition
2303 o ESC '2' -- start rule-base composition (*)
2304 o ESC '3' -- start relative composition with alternate chars (**)
2305 o ESC '4' -- start rule-base composition with alternate chars (**)
2306 Since these are not standard escape sequences of any ISO standard,
2307 the use of them with these meanings is restricted to Emacs only.
2308
2309 (*) This form is used only in Emacs 20.7 and older versions,
2310 but newer versions can safely decode it.
2311 (**) This form is used only in Emacs 21.1 and newer versions,
2312 and older versions can't decode it.
2313
2314 Here's a list of example usages of these composition escape
2315 sequences (categorized by `enum composition_method').
2316
2317 COMPOSITION_RELATIVE:
2318 ESC 0 CHAR [ CHAR ] ESC 1
2319 COMPOSITION_WITH_RULE:
2320 ESC 2 CHAR [ RULE CHAR ] ESC 1
2321 COMPOSITION_WITH_ALTCHARS:
2322 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2323 COMPOSITION_WITH_RULE_ALTCHARS:
2324 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2325
2326 enum iso_code_class_type iso_code_class[256];
2327
2328 #define SAFE_CHARSET_P(coding, id) \
2329 ((id) <= (coding)->max_charset_id \
2330 && (coding)->safe_charsets[id] >= 0)
2331
2332
2333 #define SHIFT_OUT_OK(category) \
2334 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2335
2336 static void
2337 setup_iso_safe_charsets (attrs)
2338 Lisp_Object attrs;
2339 {
2340 Lisp_Object charset_list, safe_charsets;
2341 Lisp_Object request;
2342 Lisp_Object reg_usage;
2343 Lisp_Object tail;
2344 int reg94, reg96;
2345 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2346 int max_charset_id;
2347
2348 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2349 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2350 && ! EQ (charset_list, Viso_2022_charset_list))
2351 {
2352 CODING_ATTR_CHARSET_LIST (attrs)
2353 = charset_list = Viso_2022_charset_list;
2354 ASET (attrs, coding_attr_safe_charsets, Qnil);
2355 }
2356
2357 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2358 return;
2359
2360 max_charset_id = 0;
2361 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2362 {
2363 int id = XINT (XCAR (tail));
2364 if (max_charset_id < id)
2365 max_charset_id = id;
2366 }
2367
2368 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2369 make_number (255));
2370 request = AREF (attrs, coding_attr_iso_request);
2371 reg_usage = AREF (attrs, coding_attr_iso_usage);
2372 reg94 = XINT (XCAR (reg_usage));
2373 reg96 = XINT (XCDR (reg_usage));
2374
2375 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2376 {
2377 Lisp_Object id;
2378 Lisp_Object reg;
2379 struct charset *charset;
2380
2381 id = XCAR (tail);
2382 charset = CHARSET_FROM_ID (XINT (id));
2383 reg = Fcdr (Fassq (id, request));
2384 if (! NILP (reg))
2385 SSET (safe_charsets, XINT (id), XINT (reg));
2386 else if (charset->iso_chars_96)
2387 {
2388 if (reg96 < 4)
2389 SSET (safe_charsets, XINT (id), reg96);
2390 }
2391 else
2392 {
2393 if (reg94 < 4)
2394 SSET (safe_charsets, XINT (id), reg94);
2395 }
2396 }
2397 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2398 }
2399
2400
2401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2402 Check if a text is encoded in one of ISO-2022 based codig systems.
2403 If it is, return 1, else return 0. */
2404
2405 static int
2406 detect_coding_iso_2022 (coding, detect_info)
2407 struct coding_system *coding;
2408 struct coding_detection_info *detect_info;
2409 {
2410 const unsigned char *src = coding->source, *src_base = src;
2411 const unsigned char *src_end = coding->source + coding->src_bytes;
2412 int multibytep = coding->src_multibyte;
2413 int single_shifting = 0;
2414 int id;
2415 int c, c1;
2416 int consumed_chars = 0;
2417 int i;
2418 int rejected = 0;
2419 int found = 0;
2420
2421 detect_info->checked |= CATEGORY_MASK_ISO;
2422
2423 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2424 {
2425 struct coding_system *this = &(coding_categories[i]);
2426 Lisp_Object attrs, val;
2427
2428 attrs = CODING_ID_ATTRS (this->id);
2429 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2430 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2431 setup_iso_safe_charsets (attrs);
2432 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2433 this->max_charset_id = SCHARS (val) - 1;
2434 this->safe_charsets = (char *) SDATA (val);
2435 }
2436
2437 /* A coding system of this category is always ASCII compatible. */
2438 src += coding->head_ascii;
2439
2440 while (rejected != CATEGORY_MASK_ISO)
2441 {
2442 ONE_MORE_BYTE (c);
2443 switch (c)
2444 {
2445 case ISO_CODE_ESC:
2446 if (inhibit_iso_escape_detection)
2447 break;
2448 single_shifting = 0;
2449 ONE_MORE_BYTE (c);
2450 if (c >= '(' && c <= '/')
2451 {
2452 /* Designation sequence for a charset of dimension 1. */
2453 ONE_MORE_BYTE (c1);
2454 if (c1 < ' ' || c1 >= 0x80
2455 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2456 /* Invalid designation sequence. Just ignore. */
2457 break;
2458 }
2459 else if (c == '$')
2460 {
2461 /* Designation sequence for a charset of dimension 2. */
2462 ONE_MORE_BYTE (c);
2463 if (c >= '@' && c <= 'B')
2464 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2465 id = iso_charset_table[1][0][c];
2466 else if (c >= '(' && c <= '/')
2467 {
2468 ONE_MORE_BYTE (c1);
2469 if (c1 < ' ' || c1 >= 0x80
2470 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2471 /* Invalid designation sequence. Just ignore. */
2472 break;
2473 }
2474 else
2475 /* Invalid designation sequence. Just ignore it. */
2476 break;
2477 }
2478 else if (c == 'N' || c == 'O')
2479 {
2480 /* ESC <Fe> for SS2 or SS3. */
2481 single_shifting = 1;
2482 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2483 break;
2484 }
2485 else if (c >= '0' && c <= '4')
2486 {
2487 /* ESC <Fp> for start/end composition. */
2488 found |= CATEGORY_MASK_ISO;
2489 break;
2490 }
2491 else
2492 {
2493 /* Invalid escape sequence. Just ignore it. */
2494 break;
2495 }
2496
2497 /* We found a valid designation sequence for CHARSET. */
2498 rejected |= CATEGORY_MASK_ISO_8BIT;
2499 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2500 id))
2501 found |= CATEGORY_MASK_ISO_7;
2502 else
2503 rejected |= CATEGORY_MASK_ISO_7;
2504 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2505 id))
2506 found |= CATEGORY_MASK_ISO_7_TIGHT;
2507 else
2508 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2509 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2510 id))
2511 found |= CATEGORY_MASK_ISO_7_ELSE;
2512 else
2513 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2514 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2515 id))
2516 found |= CATEGORY_MASK_ISO_8_ELSE;
2517 else
2518 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2519 break;
2520
2521 case ISO_CODE_SO:
2522 case ISO_CODE_SI:
2523 /* Locking shift out/in. */
2524 if (inhibit_iso_escape_detection)
2525 break;
2526 single_shifting = 0;
2527 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2528 found |= CATEGORY_MASK_ISO_ELSE;
2529 break;
2530
2531 case ISO_CODE_CSI:
2532 /* Control sequence introducer. */
2533 single_shifting = 0;
2534 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2535 found |= CATEGORY_MASK_ISO_8_ELSE;
2536 goto check_extra_latin;
2537
2538
2539 case ISO_CODE_SS2:
2540 case ISO_CODE_SS3:
2541 /* Single shift. */
2542 if (inhibit_iso_escape_detection)
2543 break;
2544 single_shifting = 1;
2545 rejected |= CATEGORY_MASK_ISO_7BIT;
2546 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2547 & CODING_ISO_FLAG_SINGLE_SHIFT)
2548 found |= CATEGORY_MASK_ISO_8_1;
2549 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2550 & CODING_ISO_FLAG_SINGLE_SHIFT)
2551 found |= CATEGORY_MASK_ISO_8_2;
2552 goto check_extra_latin;
2553
2554 default:
2555 if (c < 0x80)
2556 {
2557 single_shifting = 0;
2558 break;
2559 }
2560 if (c >= 0xA0)
2561 {
2562 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2563 found |= CATEGORY_MASK_ISO_8_1;
2564 /* Check the length of succeeding codes of the range
2565 0xA0..0FF. If the byte length is even, we include
2566 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2567 only when we are not single shifting. */
2568 if (! single_shifting
2569 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2570 {
2571 int i = 1;
2572 while (src < src_end)
2573 {
2574 ONE_MORE_BYTE (c);
2575 if (c < 0xA0)
2576 break;
2577 i++;
2578 }
2579
2580 if (i & 1 && src < src_end)
2581 rejected |= CATEGORY_MASK_ISO_8_2;
2582 else
2583 found |= CATEGORY_MASK_ISO_8_2;
2584 }
2585 break;
2586 }
2587 check_extra_latin:
2588 single_shifting = 0;
2589 if (! VECTORP (Vlatin_extra_code_table)
2590 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2591 {
2592 rejected = CATEGORY_MASK_ISO;
2593 break;
2594 }
2595 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2596 & CODING_ISO_FLAG_LATIN_EXTRA)
2597 found |= CATEGORY_MASK_ISO_8_1;
2598 else
2599 rejected |= CATEGORY_MASK_ISO_8_1;
2600 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2601 & CODING_ISO_FLAG_LATIN_EXTRA)
2602 found |= CATEGORY_MASK_ISO_8_2;
2603 else
2604 rejected |= CATEGORY_MASK_ISO_8_2;
2605 }
2606 }
2607 detect_info->rejected |= CATEGORY_MASK_ISO;
2608 return 0;
2609
2610 no_more_source:
2611 detect_info->rejected |= rejected;
2612 detect_info->found |= (found & ~rejected);
2613 return 1;
2614 }
2615
2616
2617 /* Set designation state into CODING. */
2618 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2619 do { \
2620 int id, prev; \
2621 \
2622 if (final < '0' || final >= 128 \
2623 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2624 || !SAFE_CHARSET_P (coding, id)) \
2625 { \
2626 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2627 goto invalid_code; \
2628 } \
2629 prev = CODING_ISO_DESIGNATION (coding, reg); \
2630 if (id == charset_jisx0201_roman) \
2631 { \
2632 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2633 id = charset_ascii; \
2634 } \
2635 else if (id == charset_jisx0208_1978) \
2636 { \
2637 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2638 id = charset_jisx0208; \
2639 } \
2640 CODING_ISO_DESIGNATION (coding, reg) = id; \
2641 /* If there was an invalid designation to REG previously, and this \
2642 designation is ASCII to REG, we should keep this designation \
2643 sequence. */ \
2644 if (prev == -2 && id == charset_ascii) \
2645 goto invalid_code; \
2646 } while (0)
2647
2648
2649 #define MAYBE_FINISH_COMPOSITION() \
2650 do { \
2651 int i; \
2652 if (composition_state == COMPOSING_NO) \
2653 break; \
2654 /* It is assured that we have enough room for producing \
2655 characters stored in the table `components'. */ \
2656 if (charbuf + component_idx > charbuf_end) \
2657 goto no_more_source; \
2658 composition_state = COMPOSING_NO; \
2659 if (method == COMPOSITION_RELATIVE \
2660 || method == COMPOSITION_WITH_ALTCHARS) \
2661 { \
2662 for (i = 0; i < component_idx; i++) \
2663 *charbuf++ = components[i]; \
2664 char_offset += component_idx; \
2665 } \
2666 else \
2667 { \
2668 for (i = 0; i < component_idx; i += 2) \
2669 *charbuf++ = components[i]; \
2670 char_offset += (component_idx / 2) + 1; \
2671 } \
2672 } while (0)
2673
2674
2675 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2676 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2677 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2678 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2679 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2680 */
2681
2682 #define DECODE_COMPOSITION_START(c1) \
2683 do { \
2684 if (c1 == '0' \
2685 && composition_state == COMPOSING_COMPONENT_RULE) \
2686 { \
2687 component_len = component_idx; \
2688 composition_state = COMPOSING_CHAR; \
2689 } \
2690 else \
2691 { \
2692 const unsigned char *p; \
2693 \
2694 MAYBE_FINISH_COMPOSITION (); \
2695 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2696 goto no_more_source; \
2697 for (p = src; p < src_end - 1; p++) \
2698 if (*p == ISO_CODE_ESC && p[1] == '1') \
2699 break; \
2700 if (p == src_end - 1) \
2701 { \
2702 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2703 goto invalid_code; \
2704 goto no_more_source; \
2705 } \
2706 \
2707 /* This is surely the start of a composition. */ \
2708 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2709 : c1 == '2' ? COMPOSITION_WITH_RULE \
2710 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2711 : COMPOSITION_WITH_RULE_ALTCHARS); \
2712 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2713 : COMPOSING_COMPONENT_CHAR); \
2714 component_idx = component_len = 0; \
2715 } \
2716 } while (0)
2717
2718
2719 /* Handle compositoin end sequence ESC 1. */
2720
2721 #define DECODE_COMPOSITION_END() \
2722 do { \
2723 int nchars = (component_len > 0 ? component_idx - component_len \
2724 : method == COMPOSITION_RELATIVE ? component_idx \
2725 : (component_idx + 1) / 2); \
2726 int i; \
2727 int *saved_charbuf = charbuf; \
2728 int from = char_offset; \
2729 int to = from + nchars; \
2730 \
2731 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2732 if (method != COMPOSITION_RELATIVE) \
2733 { \
2734 if (component_len == 0) \
2735 for (i = 0; i < component_idx; i++) \
2736 *charbuf++ = components[i]; \
2737 else \
2738 for (i = 0; i < component_len; i++) \
2739 *charbuf++ = components[i]; \
2740 *saved_charbuf = saved_charbuf - charbuf; \
2741 } \
2742 if (method == COMPOSITION_WITH_RULE) \
2743 for (i = 0; i < component_idx; i += 2, char_offset++) \
2744 *charbuf++ = components[i]; \
2745 else \
2746 for (i = component_len; i < component_idx; i++, char_offset++) \
2747 *charbuf++ = components[i]; \
2748 coding->annotated = 1; \
2749 composition_state = COMPOSING_NO; \
2750 } while (0)
2751
2752
2753 /* Decode a composition rule from the byte C1 (and maybe one more byte
2754 from SRC) and store one encoded composition rule in
2755 coding->cmp_data. */
2756
2757 #define DECODE_COMPOSITION_RULE(c1) \
2758 do { \
2759 (c1) -= 32; \
2760 if (c1 < 81) /* old format (before ver.21) */ \
2761 { \
2762 int gref = (c1) / 9; \
2763 int nref = (c1) % 9; \
2764 if (gref == 4) gref = 10; \
2765 if (nref == 4) nref = 10; \
2766 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2767 } \
2768 else if (c1 < 93) /* new format (after ver.21) */ \
2769 { \
2770 ONE_MORE_BYTE (c2); \
2771 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2772 } \
2773 else \
2774 c1 = 0; \
2775 } while (0)
2776
2777
2778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2779
2780 static void
2781 decode_coding_iso_2022 (coding)
2782 struct coding_system *coding;
2783 {
2784 const unsigned char *src = coding->source + coding->consumed;
2785 const unsigned char *src_end = coding->source + coding->src_bytes;
2786 const unsigned char *src_base;
2787 int *charbuf = coding->charbuf;
2788 int *charbuf_end
2789 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2790 int consumed_chars = 0, consumed_chars_base;
2791 int multibytep = coding->src_multibyte;
2792 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2793 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2794 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2795 struct charset *charset;
2796 int c;
2797 /* For handling composition sequence. */
2798 #define COMPOSING_NO 0
2799 #define COMPOSING_CHAR 1
2800 #define COMPOSING_RULE 2
2801 #define COMPOSING_COMPONENT_CHAR 3
2802 #define COMPOSING_COMPONENT_RULE 4
2803
2804 int composition_state = COMPOSING_NO;
2805 enum composition_method method;
2806 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2807 int component_idx;
2808 int component_len;
2809 Lisp_Object attrs, eol_type, charset_list;
2810 int char_offset = coding->produced_char;
2811 int last_offset = char_offset;
2812 int last_id = charset_ascii;
2813
2814 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2815 setup_iso_safe_charsets (attrs);
2816
2817 while (1)
2818 {
2819 int c1, c2;
2820
2821 src_base = src;
2822 consumed_chars_base = consumed_chars;
2823
2824 if (charbuf >= charbuf_end)
2825 break;
2826
2827 ONE_MORE_BYTE (c1);
2828
2829 /* We produce at most one character. */
2830 switch (iso_code_class [c1])
2831 {
2832 case ISO_0x20_or_0x7F:
2833 if (composition_state != COMPOSING_NO)
2834 {
2835 if (composition_state == COMPOSING_RULE
2836 || composition_state == COMPOSING_COMPONENT_RULE)
2837 {
2838 DECODE_COMPOSITION_RULE (c1);
2839 components[component_idx++] = c1;
2840 composition_state--;
2841 continue;
2842 }
2843 }
2844 if (charset_id_0 < 0
2845 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2846 /* This is SPACE or DEL. */
2847 charset = CHARSET_FROM_ID (charset_ascii);
2848 else
2849 charset = CHARSET_FROM_ID (charset_id_0);
2850 break;
2851
2852 case ISO_graphic_plane_0:
2853 if (composition_state != COMPOSING_NO)
2854 {
2855 if (composition_state == COMPOSING_RULE
2856 || composition_state == COMPOSING_COMPONENT_RULE)
2857 {
2858 DECODE_COMPOSITION_RULE (c1);
2859 components[component_idx++] = c1;
2860 composition_state--;
2861 continue;
2862 }
2863 }
2864 charset = CHARSET_FROM_ID (charset_id_0);
2865 break;
2866
2867 case ISO_0xA0_or_0xFF:
2868 if (charset_id_1 < 0
2869 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2870 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2871 goto invalid_code;
2872 /* This is a graphic character, we fall down ... */
2873
2874 case ISO_graphic_plane_1:
2875 if (charset_id_1 < 0)
2876 goto invalid_code;
2877 charset = CHARSET_FROM_ID (charset_id_1);
2878 break;
2879
2880 case ISO_carriage_return:
2881 if (c1 == '\r')
2882 {
2883 if (EQ (eol_type, Qdos))
2884 {
2885 if (src == src_end)
2886 {
2887 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2888 goto no_more_source;
2889 }
2890 if (*src == '\n')
2891 ONE_MORE_BYTE (c1);
2892 }
2893 else if (EQ (eol_type, Qmac))
2894 c1 = '\n';
2895 }
2896 /* fall through */
2897
2898 case ISO_control_0:
2899 MAYBE_FINISH_COMPOSITION ();
2900 charset = CHARSET_FROM_ID (charset_ascii);
2901 break;
2902
2903 case ISO_control_1:
2904 MAYBE_FINISH_COMPOSITION ();
2905 goto invalid_code;
2906
2907 case ISO_shift_out:
2908 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2909 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2910 goto invalid_code;
2911 CODING_ISO_INVOCATION (coding, 0) = 1;
2912 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2913 continue;
2914
2915 case ISO_shift_in:
2916 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2917 goto invalid_code;
2918 CODING_ISO_INVOCATION (coding, 0) = 0;
2919 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2920 continue;
2921
2922 case ISO_single_shift_2_7:
2923 case ISO_single_shift_2:
2924 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2925 goto invalid_code;
2926 /* SS2 is handled as an escape sequence of ESC 'N' */
2927 c1 = 'N';
2928 goto label_escape_sequence;
2929
2930 case ISO_single_shift_3:
2931 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2932 goto invalid_code;
2933 /* SS2 is handled as an escape sequence of ESC 'O' */
2934 c1 = 'O';
2935 goto label_escape_sequence;
2936
2937 case ISO_control_sequence_introducer:
2938 /* CSI is handled as an escape sequence of ESC '[' ... */
2939 c1 = '[';
2940 goto label_escape_sequence;
2941
2942 case ISO_escape:
2943 ONE_MORE_BYTE (c1);
2944 label_escape_sequence:
2945 /* Escape sequences handled here are invocation,
2946 designation, direction specification, and character
2947 composition specification. */
2948 switch (c1)
2949 {
2950 case '&': /* revision of following character set */
2951 ONE_MORE_BYTE (c1);
2952 if (!(c1 >= '@' && c1 <= '~'))
2953 goto invalid_code;
2954 ONE_MORE_BYTE (c1);
2955 if (c1 != ISO_CODE_ESC)
2956 goto invalid_code;
2957 ONE_MORE_BYTE (c1);
2958 goto label_escape_sequence;
2959
2960 case '$': /* designation of 2-byte character set */
2961 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2962 goto invalid_code;
2963 ONE_MORE_BYTE (c1);
2964 if (c1 >= '@' && c1 <= 'B')
2965 { /* designation of JISX0208.1978, GB2312.1980,
2966 or JISX0208.1980 */
2967 DECODE_DESIGNATION (0, 2, 0, c1);
2968 }
2969 else if (c1 >= 0x28 && c1 <= 0x2B)
2970 { /* designation of DIMENSION2_CHARS94 character set */
2971 ONE_MORE_BYTE (c2);
2972 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2973 }
2974 else if (c1 >= 0x2C && c1 <= 0x2F)
2975 { /* designation of DIMENSION2_CHARS96 character set */
2976 ONE_MORE_BYTE (c2);
2977 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2978 }
2979 else
2980 goto invalid_code;
2981 /* We must update these variables now. */
2982 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2983 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2984 continue;
2985
2986 case 'n': /* invocation of locking-shift-2 */
2987 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2988 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2989 goto invalid_code;
2990 CODING_ISO_INVOCATION (coding, 0) = 2;
2991 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2992 continue;
2993
2994 case 'o': /* invocation of locking-shift-3 */
2995 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2996 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2997 goto invalid_code;
2998 CODING_ISO_INVOCATION (coding, 0) = 3;
2999 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3000 continue;
3001
3002 case 'N': /* invocation of single-shift-2 */
3003 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3004 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3005 goto invalid_code;
3006 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3007 ONE_MORE_BYTE (c1);
3008 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3009 goto invalid_code;
3010 break;
3011
3012 case 'O': /* invocation of single-shift-3 */
3013 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3014 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3015 goto invalid_code;
3016 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3017 ONE_MORE_BYTE (c1);
3018 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3019 goto invalid_code;
3020 break;
3021
3022 case '0': case '2': case '3': case '4': /* start composition */
3023 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3024 goto invalid_code;
3025 DECODE_COMPOSITION_START (c1);
3026 continue;
3027
3028 case '1': /* end composition */
3029 if (composition_state == COMPOSING_NO)
3030 goto invalid_code;
3031 DECODE_COMPOSITION_END ();
3032 continue;
3033
3034 case '[': /* specification of direction */
3035 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3036 goto invalid_code;
3037 /* For the moment, nested direction is not supported.
3038 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3039 left-to-right, and nozero means right-to-left. */
3040 ONE_MORE_BYTE (c1);
3041 switch (c1)
3042 {
3043 case ']': /* end of the current direction */
3044 coding->mode &= ~CODING_MODE_DIRECTION;
3045
3046 case '0': /* end of the current direction */
3047 case '1': /* start of left-to-right direction */
3048 ONE_MORE_BYTE (c1);
3049 if (c1 == ']')
3050 coding->mode &= ~CODING_MODE_DIRECTION;
3051 else
3052 goto invalid_code;
3053 break;
3054
3055 case '2': /* start of right-to-left direction */
3056 ONE_MORE_BYTE (c1);
3057 if (c1 == ']')
3058 coding->mode |= CODING_MODE_DIRECTION;
3059 else
3060 goto invalid_code;
3061 break;
3062
3063 default:
3064 goto invalid_code;
3065 }
3066 continue;
3067
3068 case '%':
3069 ONE_MORE_BYTE (c1);
3070 if (c1 == '/')
3071 {
3072 /* CTEXT extended segment:
3073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3074 We keep these bytes as is for the moment.
3075 They may be decoded by post-read-conversion. */
3076 int dim, M, L;
3077 int size;
3078
3079 ONE_MORE_BYTE (dim);
3080 ONE_MORE_BYTE (M);
3081 ONE_MORE_BYTE (L);
3082 size = ((M - 128) * 128) + (L - 128);
3083 if (charbuf + 8 + size > charbuf_end)
3084 goto break_loop;
3085 *charbuf++ = ISO_CODE_ESC;
3086 *charbuf++ = '%';
3087 *charbuf++ = '/';
3088 *charbuf++ = dim;
3089 *charbuf++ = BYTE8_TO_CHAR (M);
3090 *charbuf++ = BYTE8_TO_CHAR (L);
3091 while (size-- > 0)
3092 {
3093 ONE_MORE_BYTE (c1);
3094 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3095 }
3096 }
3097 else if (c1 == 'G')
3098 {
3099 /* XFree86 extension for embedding UTF-8 in CTEXT:
3100 ESC % G --UTF-8-BYTES-- ESC % @
3101 We keep these bytes as is for the moment.
3102 They may be decoded by post-read-conversion. */
3103 int *p = charbuf;
3104
3105 if (p + 6 > charbuf_end)
3106 goto break_loop;
3107 *p++ = ISO_CODE_ESC;
3108 *p++ = '%';
3109 *p++ = 'G';
3110 while (p < charbuf_end)
3111 {
3112 ONE_MORE_BYTE (c1);
3113 if (c1 == ISO_CODE_ESC
3114 && src + 1 < src_end
3115 && src[0] == '%'
3116 && src[1] == '@')
3117 break;
3118 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3119 }
3120 if (p + 3 > charbuf_end)
3121 goto break_loop;
3122 *p++ = ISO_CODE_ESC;
3123 *p++ = '%';
3124 *p++ = '@';
3125 charbuf = p;
3126 }
3127 else
3128 goto invalid_code;
3129 continue;
3130 break;
3131
3132 default:
3133 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3134 goto invalid_code;
3135 if (c1 >= 0x28 && c1 <= 0x2B)
3136 { /* designation of DIMENSION1_CHARS94 character set */
3137 ONE_MORE_BYTE (c2);
3138 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3139 }
3140 else if (c1 >= 0x2C && c1 <= 0x2F)
3141 { /* designation of DIMENSION1_CHARS96 character set */
3142 ONE_MORE_BYTE (c2);
3143 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3144 }
3145 else
3146 goto invalid_code;
3147 /* We must update these variables now. */
3148 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3149 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3150 continue;
3151 }
3152 }
3153
3154 if (charset->id != charset_ascii
3155 && last_id != charset->id)
3156 {
3157 if (last_id != charset_ascii)
3158 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3159 last_id = charset->id;
3160 last_offset = char_offset;
3161 }
3162
3163 /* Now we know CHARSET and 1st position code C1 of a character.
3164 Produce a decoded character while getting 2nd position code
3165 C2 if necessary. */
3166 c1 &= 0x7F;
3167 if (CHARSET_DIMENSION (charset) > 1)
3168 {
3169 ONE_MORE_BYTE (c2);
3170 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3171 /* C2 is not in a valid range. */
3172 goto invalid_code;
3173 c1 = (c1 << 8) | (c2 & 0x7F);
3174 if (CHARSET_DIMENSION (charset) > 2)
3175 {
3176 ONE_MORE_BYTE (c2);
3177 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3178 /* C2 is not in a valid range. */
3179 goto invalid_code;
3180 c1 = (c1 << 8) | (c2 & 0x7F);
3181 }
3182 }
3183
3184 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3185 if (c < 0)
3186 {
3187 MAYBE_FINISH_COMPOSITION ();
3188 for (; src_base < src; src_base++, char_offset++)
3189 {
3190 if (ASCII_BYTE_P (*src_base))
3191 *charbuf++ = *src_base;
3192 else
3193 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3194 }
3195 }
3196 else if (composition_state == COMPOSING_NO)
3197 {
3198 *charbuf++ = c;
3199 char_offset++;
3200 }
3201 else
3202 {
3203 components[component_idx++] = c;
3204 if (method == COMPOSITION_WITH_RULE
3205 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3206 && composition_state == COMPOSING_COMPONENT_CHAR))
3207 composition_state++;
3208 }
3209 continue;
3210
3211 invalid_code:
3212 MAYBE_FINISH_COMPOSITION ();
3213 src = src_base;
3214 consumed_chars = consumed_chars_base;
3215 ONE_MORE_BYTE (c);
3216 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3217 char_offset++;
3218 coding->errors++;
3219 continue;
3220
3221 break_loop:
3222 break;
3223 }
3224
3225 no_more_source:
3226 if (last_id != charset_ascii)
3227 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3228 coding->consumed_char += consumed_chars_base;
3229 coding->consumed = src_base - coding->source;
3230 coding->charbuf_used = charbuf - coding->charbuf;
3231 }
3232
3233
3234 /* ISO2022 encoding stuff. */
3235
3236 /*
3237 It is not enough to say just "ISO2022" on encoding, we have to
3238 specify more details. In Emacs, each coding system of ISO2022
3239 variant has the following specifications:
3240 1. Initial designation to G0 thru G3.
3241 2. Allows short-form designation?
3242 3. ASCII should be designated to G0 before control characters?
3243 4. ASCII should be designated to G0 at end of line?
3244 5. 7-bit environment or 8-bit environment?
3245 6. Use locking-shift?
3246 7. Use Single-shift?
3247 And the following two are only for Japanese:
3248 8. Use ASCII in place of JIS0201-1976-Roman?
3249 9. Use JISX0208-1983 in place of JISX0208-1978?
3250 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3251 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3252 details.
3253 */
3254
3255 /* Produce codes (escape sequence) for designating CHARSET to graphic
3256 register REG at DST, and increment DST. If <final-char> of CHARSET is
3257 '@', 'A', or 'B' and the coding system CODING allows, produce
3258 designation sequence of short-form. */
3259
3260 #define ENCODE_DESIGNATION(charset, reg, coding) \
3261 do { \
3262 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3263 char *intermediate_char_94 = "()*+"; \
3264 char *intermediate_char_96 = ",-./"; \
3265 int revision = -1; \
3266 int c; \
3267 \
3268 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3269 revision = CHARSET_ISO_REVISION (charset); \
3270 \
3271 if (revision >= 0) \
3272 { \
3273 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3274 EMIT_ONE_BYTE ('@' + revision); \
3275 } \
3276 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3277 if (CHARSET_DIMENSION (charset) == 1) \
3278 { \
3279 if (! CHARSET_ISO_CHARS_96 (charset)) \
3280 c = intermediate_char_94[reg]; \
3281 else \
3282 c = intermediate_char_96[reg]; \
3283 EMIT_ONE_ASCII_BYTE (c); \
3284 } \
3285 else \
3286 { \
3287 EMIT_ONE_ASCII_BYTE ('$'); \
3288 if (! CHARSET_ISO_CHARS_96 (charset)) \
3289 { \
3290 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3291 || reg != 0 \
3292 || final_char < '@' || final_char > 'B') \
3293 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3294 } \
3295 else \
3296 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3297 } \
3298 EMIT_ONE_ASCII_BYTE (final_char); \
3299 \
3300 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3301 } while (0)
3302
3303
3304 /* The following two macros produce codes (control character or escape
3305 sequence) for ISO2022 single-shift functions (single-shift-2 and
3306 single-shift-3). */
3307
3308 #define ENCODE_SINGLE_SHIFT_2 \
3309 do { \
3310 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3311 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3312 else \
3313 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3314 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3315 } while (0)
3316
3317
3318 #define ENCODE_SINGLE_SHIFT_3 \
3319 do { \
3320 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3321 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3322 else \
3323 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3324 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3325 } while (0)
3326
3327
3328 /* The following four macros produce codes (control character or
3329 escape sequence) for ISO2022 locking-shift functions (shift-in,
3330 shift-out, locking-shift-2, and locking-shift-3). */
3331
3332 #define ENCODE_SHIFT_IN \
3333 do { \
3334 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3335 CODING_ISO_INVOCATION (coding, 0) = 0; \
3336 } while (0)
3337
3338
3339 #define ENCODE_SHIFT_OUT \
3340 do { \
3341 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3342 CODING_ISO_INVOCATION (coding, 0) = 1; \
3343 } while (0)
3344
3345
3346 #define ENCODE_LOCKING_SHIFT_2 \
3347 do { \
3348 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3349 CODING_ISO_INVOCATION (coding, 0) = 2; \
3350 } while (0)
3351
3352
3353 #define ENCODE_LOCKING_SHIFT_3 \
3354 do { \
3355 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3356 CODING_ISO_INVOCATION (coding, 0) = 3; \
3357 } while (0)
3358
3359
3360 /* Produce codes for a DIMENSION1 character whose character set is
3361 CHARSET and whose position-code is C1. Designation and invocation
3362 sequences are also produced in advance if necessary. */
3363
3364 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3365 do { \
3366 int id = CHARSET_ID (charset); \
3367 \
3368 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3369 && id == charset_ascii) \
3370 { \
3371 id = charset_jisx0201_roman; \
3372 charset = CHARSET_FROM_ID (id); \
3373 } \
3374 \
3375 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3376 { \
3377 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3378 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3379 else \
3380 EMIT_ONE_BYTE (c1 | 0x80); \
3381 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3382 break; \
3383 } \
3384 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3385 { \
3386 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3387 break; \
3388 } \
3389 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3390 { \
3391 EMIT_ONE_BYTE (c1 | 0x80); \
3392 break; \
3393 } \
3394 else \
3395 /* Since CHARSET is not yet invoked to any graphic planes, we \
3396 must invoke it, or, at first, designate it to some graphic \
3397 register. Then repeat the loop to actually produce the \
3398 character. */ \
3399 dst = encode_invocation_designation (charset, coding, dst, \
3400 &produced_chars); \
3401 } while (1)
3402
3403
3404 /* Produce codes for a DIMENSION2 character whose character set is
3405 CHARSET and whose position-codes are C1 and C2. Designation and
3406 invocation codes are also produced in advance if necessary. */
3407
3408 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3409 do { \
3410 int id = CHARSET_ID (charset); \
3411 \
3412 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3413 && id == charset_jisx0208) \
3414 { \
3415 id = charset_jisx0208_1978; \
3416 charset = CHARSET_FROM_ID (id); \
3417 } \
3418 \
3419 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3420 { \
3421 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3422 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3423 else \
3424 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3425 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3426 break; \
3427 } \
3428 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3429 { \
3430 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3431 break; \
3432 } \
3433 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3434 { \
3435 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3436 break; \
3437 } \
3438 else \
3439 /* Since CHARSET is not yet invoked to any graphic planes, we \
3440 must invoke it, or, at first, designate it to some graphic \
3441 register. Then repeat the loop to actually produce the \
3442 character. */ \
3443 dst = encode_invocation_designation (charset, coding, dst, \
3444 &produced_chars); \
3445 } while (1)
3446
3447
3448 #define ENCODE_ISO_CHARACTER(charset, c) \
3449 do { \
3450 int code = ENCODE_CHAR ((charset),(c)); \
3451 \
3452 if (CHARSET_DIMENSION (charset) == 1) \
3453 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3454 else \
3455 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3456 } while (0)
3457
3458
3459 /* Produce designation and invocation codes at a place pointed by DST
3460 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3461 Return new DST. */
3462
3463 unsigned char *
3464 encode_invocation_designation (charset, coding, dst, p_nchars)
3465 struct charset *charset;
3466 struct coding_system *coding;
3467 unsigned char *dst;
3468 int *p_nchars;
3469 {
3470 int multibytep = coding->dst_multibyte;
3471 int produced_chars = *p_nchars;
3472 int reg; /* graphic register number */
3473 int id = CHARSET_ID (charset);
3474
3475 /* At first, check designations. */
3476 for (reg = 0; reg < 4; reg++)
3477 if (id == CODING_ISO_DESIGNATION (coding, reg))
3478 break;
3479
3480 if (reg >= 4)
3481 {
3482 /* CHARSET is not yet designated to any graphic registers. */
3483 /* At first check the requested designation. */
3484 reg = CODING_ISO_REQUEST (coding, id);
3485 if (reg < 0)
3486 /* Since CHARSET requests no special designation, designate it
3487 to graphic register 0. */
3488 reg = 0;
3489
3490 ENCODE_DESIGNATION (charset, reg, coding);
3491 }
3492
3493 if (CODING_ISO_INVOCATION (coding, 0) != reg
3494 && CODING_ISO_INVOCATION (coding, 1) != reg)
3495 {
3496 /* Since the graphic register REG is not invoked to any graphic
3497 planes, invoke it to graphic plane 0. */
3498 switch (reg)
3499 {
3500 case 0: /* graphic register 0 */
3501 ENCODE_SHIFT_IN;
3502 break;
3503
3504 case 1: /* graphic register 1 */
3505 ENCODE_SHIFT_OUT;
3506 break;
3507
3508 case 2: /* graphic register 2 */
3509 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3510 ENCODE_SINGLE_SHIFT_2;
3511 else
3512 ENCODE_LOCKING_SHIFT_2;
3513 break;
3514
3515 case 3: /* graphic register 3 */
3516 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3517 ENCODE_SINGLE_SHIFT_3;
3518 else
3519 ENCODE_LOCKING_SHIFT_3;
3520 break;
3521 }
3522 }
3523
3524 *p_nchars = produced_chars;
3525 return dst;
3526 }
3527
3528 /* The following three macros produce codes for indicating direction
3529 of text. */
3530 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3531 do { \
3532 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3533 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3534 else \
3535 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3536 } while (0)
3537
3538
3539 #define ENCODE_DIRECTION_R2L() \
3540 do { \
3541 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3542 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3543 } while (0)
3544
3545
3546 #define ENCODE_DIRECTION_L2R() \
3547 do { \
3548 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3549 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3550 } while (0)
3551
3552
3553 /* Produce codes for designation and invocation to reset the graphic
3554 planes and registers to initial state. */
3555 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3556 do { \
3557 int reg; \
3558 struct charset *charset; \
3559 \
3560 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3561 ENCODE_SHIFT_IN; \
3562 for (reg = 0; reg < 4; reg++) \
3563 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3564 && (CODING_ISO_DESIGNATION (coding, reg) \
3565 != CODING_ISO_INITIAL (coding, reg))) \
3566 { \
3567 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3568 ENCODE_DESIGNATION (charset, reg, coding); \
3569 } \
3570 } while (0)
3571
3572
3573 /* Produce designation sequences of charsets in the line started from
3574 SRC to a place pointed by DST, and return updated DST.
3575
3576 If the current block ends before any end-of-line, we may fail to
3577 find all the necessary designations. */
3578
3579 static unsigned char *
3580 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3581 struct coding_system *coding;
3582 int *charbuf, *charbuf_end;
3583 unsigned char *dst;
3584 {
3585 struct charset *charset;
3586 /* Table of charsets to be designated to each graphic register. */
3587 int r[4];
3588 int c, found = 0, reg;
3589 int produced_chars = 0;
3590 int multibytep = coding->dst_multibyte;
3591 Lisp_Object attrs;
3592 Lisp_Object charset_list;
3593
3594 attrs = CODING_ID_ATTRS (coding->id);
3595 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3596 if (EQ (charset_list, Qiso_2022))
3597 charset_list = Viso_2022_charset_list;
3598
3599 for (reg = 0; reg < 4; reg++)
3600 r[reg] = -1;
3601
3602 while (found < 4)
3603 {
3604 int id;
3605
3606 c = *charbuf++;
3607 if (c == '\n')
3608 break;
3609 charset = char_charset (c, charset_list, NULL);
3610 id = CHARSET_ID (charset);
3611 reg = CODING_ISO_REQUEST (coding, id);
3612 if (reg >= 0 && r[reg] < 0)
3613 {
3614 found++;
3615 r[reg] = id;
3616 }
3617 }
3618
3619 if (found)
3620 {
3621 for (reg = 0; reg < 4; reg++)
3622 if (r[reg] >= 0
3623 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3624 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3625 }
3626
3627 return dst;
3628 }
3629
3630 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3631
3632 static int
3633 encode_coding_iso_2022 (coding)
3634 struct coding_system *coding;
3635 {
3636 int multibytep = coding->dst_multibyte;
3637 int *charbuf = coding->charbuf;
3638 int *charbuf_end = charbuf + coding->charbuf_used;
3639 unsigned char *dst = coding->destination + coding->produced;
3640 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3641 int safe_room = 16;
3642 int bol_designation
3643 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3644 && CODING_ISO_BOL (coding));
3645 int produced_chars = 0;
3646 Lisp_Object attrs, eol_type, charset_list;
3647 int ascii_compatible;
3648 int c;
3649 int preferred_charset_id = -1;
3650
3651 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3652 setup_iso_safe_charsets (attrs);
3653 /* Charset list may have been changed. */
3654 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3655 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3656
3657 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3658
3659 while (charbuf < charbuf_end)
3660 {
3661 ASSURE_DESTINATION (safe_room);
3662
3663 if (bol_designation)
3664 {
3665 unsigned char *dst_prev = dst;
3666
3667 /* We have to produce designation sequences if any now. */
3668 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3669 bol_designation = 0;
3670 /* We are sure that designation sequences are all ASCII bytes. */
3671 produced_chars += dst - dst_prev;
3672 }
3673
3674 c = *charbuf++;
3675
3676 if (c < 0)
3677 {
3678 /* Handle an annotation. */
3679 switch (*charbuf)
3680 {
3681 case CODING_ANNOTATE_COMPOSITION_MASK:
3682 /* Not yet implemented. */
3683 break;
3684 case CODING_ANNOTATE_CHARSET_MASK:
3685 preferred_charset_id = charbuf[3];
3686 if (preferred_charset_id >= 0
3687 && NILP (Fmemq (make_number (preferred_charset_id),
3688 charset_list)))
3689 preferred_charset_id = -1;
3690 break;
3691 default:
3692 abort ();
3693 }
3694 charbuf += -c - 1;
3695 continue;
3696 }
3697
3698 /* Now encode the character C. */
3699 if (c < 0x20 || c == 0x7F)
3700 {
3701 if (c == '\n'
3702 || (c == '\r' && EQ (eol_type, Qmac)))
3703 {
3704 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3705 ENCODE_RESET_PLANE_AND_REGISTER ();
3706 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3707 {
3708 int i;
3709
3710 for (i = 0; i < 4; i++)
3711 CODING_ISO_DESIGNATION (coding, i)
3712 = CODING_ISO_INITIAL (coding, i);
3713 }
3714 bol_designation
3715 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3716 }
3717 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3718 ENCODE_RESET_PLANE_AND_REGISTER ();
3719 EMIT_ONE_ASCII_BYTE (c);
3720 }
3721 else if (ASCII_CHAR_P (c))
3722 {
3723 if (ascii_compatible)
3724 EMIT_ONE_ASCII_BYTE (c);
3725 else
3726 {
3727 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3728 ENCODE_ISO_CHARACTER (charset, c);
3729 }
3730 }
3731 else if (CHAR_BYTE8_P (c))
3732 {
3733 c = CHAR_TO_BYTE8 (c);
3734 EMIT_ONE_BYTE (c);
3735 }
3736 else
3737 {
3738 struct charset *charset;
3739
3740 if (preferred_charset_id >= 0)
3741 {
3742 charset = CHARSET_FROM_ID (preferred_charset_id);
3743 if (! CHAR_CHARSET_P (c, charset))
3744 charset = char_charset (c, charset_list, NULL);
3745 }
3746 else
3747 charset = char_charset (c, charset_list, NULL);
3748 if (!charset)
3749 {
3750 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3751 {
3752 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3753 charset = CHARSET_FROM_ID (charset_ascii);
3754 }
3755 else
3756 {
3757 c = coding->default_char;
3758 charset = char_charset (c, charset_list, NULL);
3759 }
3760 }
3761 ENCODE_ISO_CHARACTER (charset, c);
3762 }
3763 }
3764
3765 if (coding->mode & CODING_MODE_LAST_BLOCK
3766 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3767 {
3768 ASSURE_DESTINATION (safe_room);
3769 ENCODE_RESET_PLANE_AND_REGISTER ();
3770 }
3771 coding->result = CODING_RESULT_SUCCESS;
3772 CODING_ISO_BOL (coding) = bol_designation;
3773 coding->produced_char += produced_chars;
3774 coding->produced = dst - coding->destination;
3775 return 0;
3776 }
3777
3778 \f
3779 /*** 8,9. SJIS and BIG5 handlers ***/
3780
3781 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3782 quite widely. So, for the moment, Emacs supports them in the bare
3783 C code. But, in the future, they may be supported only by CCL. */
3784
3785 /* SJIS is a coding system encoding three character sets: ASCII, right
3786 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3787 as is. A character of charset katakana-jisx0201 is encoded by
3788 "position-code + 0x80". A character of charset japanese-jisx0208
3789 is encoded in 2-byte but two position-codes are divided and shifted
3790 so that it fit in the range below.
3791
3792 --- CODE RANGE of SJIS ---
3793 (character set) (range)
3794 ASCII 0x00 .. 0x7F
3795 KATAKANA-JISX0201 0xA0 .. 0xDF
3796 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3797 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3798 -------------------------------
3799
3800 */
3801
3802 /* BIG5 is a coding system encoding two character sets: ASCII and
3803 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3804 character set and is encoded in two-byte.
3805
3806 --- CODE RANGE of BIG5 ---
3807 (character set) (range)
3808 ASCII 0x00 .. 0x7F
3809 Big5 (1st byte) 0xA1 .. 0xFE
3810 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3811 --------------------------
3812
3813 */
3814
3815 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3816 Check if a text is encoded in SJIS. If it is, return
3817 CATEGORY_MASK_SJIS, else return 0. */
3818
3819 static int
3820 detect_coding_sjis (coding, detect_info)
3821 struct coding_system *coding;
3822 struct coding_detection_info *detect_info;
3823 {
3824 const unsigned char *src = coding->source, *src_base = src;
3825 const unsigned char *src_end = coding->source + coding->src_bytes;
3826 int multibytep = coding->src_multibyte;
3827 int consumed_chars = 0;
3828 int found = 0;
3829 int c;
3830 int incomplete;
3831
3832 detect_info->checked |= CATEGORY_MASK_SJIS;
3833 /* A coding system of this category is always ASCII compatible. */
3834 src += coding->head_ascii;
3835
3836 while (1)
3837 {
3838 incomplete = 0;
3839 ONE_MORE_BYTE (c);
3840 incomplete = 1;
3841 if (c < 0x80)
3842 continue;
3843 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3844 {
3845 ONE_MORE_BYTE (c);
3846 if (c < 0x40 || c == 0x7F || c > 0xFC)
3847 break;
3848 found = CATEGORY_MASK_SJIS;
3849 }
3850 else if (c >= 0xA0 && c < 0xE0)
3851 found = CATEGORY_MASK_SJIS;
3852 else
3853 break;
3854 }
3855 detect_info->rejected |= CATEGORY_MASK_SJIS;
3856 return 0;
3857
3858 no_more_source:
3859 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3860 {
3861 detect_info->rejected |= CATEGORY_MASK_SJIS;
3862 return 0;
3863 }
3864 detect_info->found |= found;
3865 return 1;
3866 }
3867
3868 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3869 Check if a text is encoded in BIG5. If it is, return
3870 CATEGORY_MASK_BIG5, else return 0. */
3871
3872 static int
3873 detect_coding_big5 (coding, detect_info)
3874 struct coding_system *coding;
3875 struct coding_detection_info *detect_info;
3876 {
3877 const unsigned char *src = coding->source, *src_base = src;
3878 const unsigned char *src_end = coding->source + coding->src_bytes;
3879 int multibytep = coding->src_multibyte;
3880 int consumed_chars = 0;
3881 int found = 0;
3882 int c;
3883 int incomplete;
3884
3885 detect_info->checked |= CATEGORY_MASK_BIG5;
3886 /* A coding system of this category is always ASCII compatible. */
3887 src += coding->head_ascii;
3888
3889 while (1)
3890 {
3891 incomplete = 0;
3892 ONE_MORE_BYTE (c);
3893 incomplete = 1;
3894 if (c < 0x80)
3895 continue;
3896 if (c >= 0xA1)
3897 {
3898 ONE_MORE_BYTE (c);
3899 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3900 return 0;
3901 found = CATEGORY_MASK_BIG5;
3902 }
3903 else
3904 break;
3905 }
3906 detect_info->rejected |= CATEGORY_MASK_BIG5;
3907 return 0;
3908
3909 no_more_source:
3910 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3911 {
3912 detect_info->rejected |= CATEGORY_MASK_BIG5;
3913 return 0;
3914 }
3915 detect_info->found |= found;
3916 return 1;
3917 }
3918
3919 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3920 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3921
3922 static void
3923 decode_coding_sjis (coding)
3924 struct coding_system *coding;
3925 {
3926 const unsigned char *src = coding->source + coding->consumed;
3927 const unsigned char *src_end = coding->source + coding->src_bytes;
3928 const unsigned char *src_base;
3929 int *charbuf = coding->charbuf;
3930 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3931 int consumed_chars = 0, consumed_chars_base;
3932 int multibytep = coding->src_multibyte;
3933 struct charset *charset_roman, *charset_kanji, *charset_kana;
3934 Lisp_Object attrs, eol_type, charset_list, val;
3935 int char_offset = coding->produced_char;
3936 int last_offset = char_offset;
3937 int last_id = charset_ascii;
3938
3939 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3940
3941 val = charset_list;
3942 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3943 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3944 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3945
3946 while (1)
3947 {
3948 int c, c1;
3949
3950 src_base = src;
3951 consumed_chars_base = consumed_chars;
3952
3953 if (charbuf >= charbuf_end)
3954 break;
3955
3956 ONE_MORE_BYTE (c);
3957
3958 if (c == '\r')
3959 {
3960 if (EQ (eol_type, Qdos))
3961 {
3962 if (src == src_end)
3963 {
3964 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3965 goto no_more_source;
3966 }
3967 if (*src == '\n')
3968 ONE_MORE_BYTE (c);
3969 }
3970 else if (EQ (eol_type, Qmac))
3971 c = '\n';
3972 }
3973 else
3974 {
3975 struct charset *charset;
3976
3977 if (c < 0x80)
3978 charset = charset_roman;
3979 else
3980 {
3981 if (c >= 0xF0)
3982 goto invalid_code;
3983 if (c < 0xA0 || c >= 0xE0)
3984 {
3985 /* SJIS -> JISX0208 */
3986 ONE_MORE_BYTE (c1);
3987 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3988 goto invalid_code;
3989 c = (c << 8) | c1;
3990 SJIS_TO_JIS (c);
3991 charset = charset_kanji;
3992 }
3993 else if (c > 0xA0)
3994 {
3995 /* SJIS -> JISX0201-Kana */
3996 c &= 0x7F;
3997 charset = charset_kana;
3998 }
3999 else
4000 goto invalid_code;
4001 }
4002 if (charset->id != charset_ascii
4003 && last_id != charset->id)
4004 {
4005 if (last_id != charset_ascii)
4006 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4007 last_id = charset->id;
4008 last_offset = char_offset;
4009 }
4010 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4011 }
4012 *charbuf++ = c;
4013 char_offset++;
4014 continue;
4015
4016 invalid_code:
4017 src = src_base;
4018 consumed_chars = consumed_chars_base;
4019 ONE_MORE_BYTE (c);
4020 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4021 char_offset++;
4022 coding->errors++;
4023 }
4024
4025 no_more_source:
4026 if (last_id != charset_ascii)
4027 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4028 coding->consumed_char += consumed_chars_base;
4029 coding->consumed = src_base - coding->source;
4030 coding->charbuf_used = charbuf - coding->charbuf;
4031 }
4032
4033 static void
4034 decode_coding_big5 (coding)
4035 struct coding_system *coding;
4036 {
4037 const unsigned char *src = coding->source + coding->consumed;
4038 const unsigned char *src_end = coding->source + coding->src_bytes;
4039 const unsigned char *src_base;
4040 int *charbuf = coding->charbuf;
4041 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4042 int consumed_chars = 0, consumed_chars_base;
4043 int multibytep = coding->src_multibyte;
4044 struct charset *charset_roman, *charset_big5;
4045 Lisp_Object attrs, eol_type, charset_list, val;
4046 int char_offset = coding->produced_char;
4047 int last_offset = char_offset;
4048 int last_id = charset_ascii;
4049
4050 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4051 val = charset_list;
4052 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4053 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4054
4055 while (1)
4056 {
4057 int c, c1;
4058
4059 src_base = src;
4060 consumed_chars_base = consumed_chars;
4061
4062 if (charbuf >= charbuf_end)
4063 break;
4064
4065 ONE_MORE_BYTE (c);
4066
4067 if (c == '\r')
4068 {
4069 if (EQ (eol_type, Qdos))
4070 {
4071 if (src == src_end)
4072 {
4073 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4074 goto no_more_source;
4075 }
4076 if (*src == '\n')
4077 ONE_MORE_BYTE (c);
4078 }
4079 else if (EQ (eol_type, Qmac))
4080 c = '\n';
4081 }
4082 else
4083 {
4084 struct charset *charset;
4085 if (c < 0x80)
4086 charset = charset_roman;
4087 else
4088 {
4089 /* BIG5 -> Big5 */
4090 if (c < 0xA1 || c > 0xFE)
4091 goto invalid_code;
4092 ONE_MORE_BYTE (c1);
4093 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4094 goto invalid_code;
4095 c = c << 8 | c1;
4096 charset = charset_big5;
4097 }
4098 if (charset->id != charset_ascii
4099 && last_id != charset->id)
4100 {
4101 if (last_id != charset_ascii)
4102 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4103 last_id = charset->id;
4104 last_offset = char_offset;
4105 }
4106 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4107 }
4108
4109 *charbuf++ = c;
4110 char_offset++;
4111 continue;
4112
4113 invalid_code:
4114 src = src_base;
4115 consumed_chars = consumed_chars_base;
4116 ONE_MORE_BYTE (c);
4117 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4118 char_offset++;
4119 coding->errors++;
4120 }
4121
4122 no_more_source:
4123 if (last_id != charset_ascii)
4124 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4125 coding->consumed_char += consumed_chars_base;
4126 coding->consumed = src_base - coding->source;
4127 coding->charbuf_used = charbuf - coding->charbuf;
4128 }
4129
4130 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4131 This function can encode charsets `ascii', `katakana-jisx0201',
4132 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4133 are sure that all these charsets are registered as official charset
4134 (i.e. do not have extended leading-codes). Characters of other
4135 charsets are produced without any encoding. If SJIS_P is 1, encode
4136 SJIS text, else encode BIG5 text. */
4137
4138 static int
4139 encode_coding_sjis (coding)
4140 struct coding_system *coding;
4141 {
4142 int multibytep = coding->dst_multibyte;
4143 int *charbuf = coding->charbuf;
4144 int *charbuf_end = charbuf + coding->charbuf_used;
4145 unsigned char *dst = coding->destination + coding->produced;
4146 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4147 int safe_room = 4;
4148 int produced_chars = 0;
4149 Lisp_Object attrs, eol_type, charset_list, val;
4150 int ascii_compatible;
4151 struct charset *charset_roman, *charset_kanji, *charset_kana;
4152 int c;
4153
4154 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4155 val = charset_list;
4156 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4157 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4158 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4159
4160 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4161
4162 while (charbuf < charbuf_end)
4163 {
4164 ASSURE_DESTINATION (safe_room);
4165 c = *charbuf++;
4166 /* Now encode the character C. */
4167 if (ASCII_CHAR_P (c) && ascii_compatible)
4168 EMIT_ONE_ASCII_BYTE (c);
4169 else if (CHAR_BYTE8_P (c))
4170 {
4171 c = CHAR_TO_BYTE8 (c);
4172 EMIT_ONE_BYTE (c);
4173 }
4174 else
4175 {
4176 unsigned code;
4177 struct charset *charset = char_charset (c, charset_list, &code);
4178
4179 if (!charset)
4180 {
4181 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4182 {
4183 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4184 charset = CHARSET_FROM_ID (charset_ascii);
4185 }
4186 else
4187 {
4188 c = coding->default_char;
4189 charset = char_charset (c, charset_list, &code);
4190 }
4191 }
4192 if (code == CHARSET_INVALID_CODE (charset))
4193 abort ();
4194 if (charset == charset_kanji)
4195 {
4196 int c1, c2;
4197 JIS_TO_SJIS (code);
4198 c1 = code >> 8, c2 = code & 0xFF;
4199 EMIT_TWO_BYTES (c1, c2);
4200 }
4201 else if (charset == charset_kana)
4202 EMIT_ONE_BYTE (code | 0x80);
4203 else
4204 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4205 }
4206 }
4207 coding->result = CODING_RESULT_SUCCESS;
4208 coding->produced_char += produced_chars;
4209 coding->produced = dst - coding->destination;
4210 return 0;
4211 }
4212
4213 static int
4214 encode_coding_big5 (coding)
4215 struct coding_system *coding;
4216 {
4217 int multibytep = coding->dst_multibyte;
4218 int *charbuf = coding->charbuf;
4219 int *charbuf_end = charbuf + coding->charbuf_used;
4220 unsigned char *dst = coding->destination + coding->produced;
4221 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4222 int safe_room = 4;
4223 int produced_chars = 0;
4224 Lisp_Object attrs, eol_type, charset_list, val;
4225 int ascii_compatible;
4226 struct charset *charset_roman, *charset_big5;
4227 int c;
4228
4229 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4230 val = charset_list;
4231 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4232 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4233 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4234
4235 while (charbuf < charbuf_end)
4236 {
4237 ASSURE_DESTINATION (safe_room);
4238 c = *charbuf++;
4239 /* Now encode the character C. */
4240 if (ASCII_CHAR_P (c) && ascii_compatible)
4241 EMIT_ONE_ASCII_BYTE (c);
4242 else if (CHAR_BYTE8_P (c))
4243 {
4244 c = CHAR_TO_BYTE8 (c);
4245 EMIT_ONE_BYTE (c);
4246 }
4247 else
4248 {
4249 unsigned code;
4250 struct charset *charset = char_charset (c, charset_list, &code);
4251
4252 if (! charset)
4253 {
4254 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4255 {
4256 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4257 charset = CHARSET_FROM_ID (charset_ascii);
4258 }
4259 else
4260 {
4261 c = coding->default_char;
4262 charset = char_charset (c, charset_list, &code);
4263 }
4264 }
4265 if (code == CHARSET_INVALID_CODE (charset))
4266 abort ();
4267 if (charset == charset_big5)
4268 {
4269 int c1, c2;
4270
4271 c1 = code >> 8, c2 = code & 0xFF;
4272 EMIT_TWO_BYTES (c1, c2);
4273 }
4274 else
4275 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4276 }
4277 }
4278 coding->result = CODING_RESULT_SUCCESS;
4279 coding->produced_char += produced_chars;
4280 coding->produced = dst - coding->destination;
4281 return 0;
4282 }
4283
4284 \f
4285 /*** 10. CCL handlers ***/
4286
4287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4288 Check if a text is encoded in a coding system of which
4289 encoder/decoder are written in CCL program. If it is, return
4290 CATEGORY_MASK_CCL, else return 0. */
4291
4292 static int
4293 detect_coding_ccl (coding, detect_info)
4294 struct coding_system *coding;
4295 struct coding_detection_info *detect_info;
4296 {
4297 const unsigned char *src = coding->source, *src_base = src;
4298 const unsigned char *src_end = coding->source + coding->src_bytes;
4299 int multibytep = coding->src_multibyte;
4300 int consumed_chars = 0;
4301 int found = 0;
4302 unsigned char *valids = CODING_CCL_VALIDS (coding);
4303 int head_ascii = coding->head_ascii;
4304 Lisp_Object attrs;
4305
4306 detect_info->checked |= CATEGORY_MASK_CCL;
4307
4308 coding = &coding_categories[coding_category_ccl];
4309 attrs = CODING_ID_ATTRS (coding->id);
4310 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4311 src += head_ascii;
4312
4313 while (1)
4314 {
4315 int c;
4316 ONE_MORE_BYTE (c);
4317 if (! valids[c])
4318 break;
4319 if ((valids[c] > 1))
4320 found = CATEGORY_MASK_CCL;
4321 }
4322 detect_info->rejected |= CATEGORY_MASK_CCL;
4323 return 0;
4324
4325 no_more_source:
4326 detect_info->found |= found;
4327 return 1;
4328 }
4329
4330 static void
4331 decode_coding_ccl (coding)
4332 struct coding_system *coding;
4333 {
4334 const unsigned char *src = coding->source + coding->consumed;
4335 const unsigned char *src_end = coding->source + coding->src_bytes;
4336 int *charbuf = coding->charbuf;
4337 int *charbuf_end = charbuf + coding->charbuf_size;
4338 int consumed_chars = 0;
4339 int multibytep = coding->src_multibyte;
4340 struct ccl_program ccl;
4341 int source_charbuf[1024];
4342 int source_byteidx[1024];
4343 Lisp_Object attrs, eol_type, charset_list;
4344
4345 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4346 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4347
4348 while (src < src_end)
4349 {
4350 const unsigned char *p = src;
4351 int *source, *source_end;
4352 int i = 0;
4353
4354 if (multibytep)
4355 while (i < 1024 && p < src_end)
4356 {
4357 source_byteidx[i] = p - src;
4358 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4359 }
4360 else
4361 while (i < 1024 && p < src_end)
4362 source_charbuf[i++] = *p++;
4363
4364 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4365 ccl.last_block = 1;
4366
4367 source = source_charbuf;
4368 source_end = source + i;
4369 while (source < source_end)
4370 {
4371 ccl_driver (&ccl, source, charbuf,
4372 source_end - source, charbuf_end - charbuf,
4373 charset_list);
4374 source += ccl.consumed;
4375 charbuf += ccl.produced;
4376 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4377 break;
4378 }
4379 if (source < source_end)
4380 src += source_byteidx[source - source_charbuf];
4381 else
4382 src = p;
4383 consumed_chars += source - source_charbuf;
4384
4385 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4386 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4387 break;
4388 }
4389
4390 switch (ccl.status)
4391 {
4392 case CCL_STAT_SUSPEND_BY_SRC:
4393 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4394 break;
4395 case CCL_STAT_SUSPEND_BY_DST:
4396 break;
4397 case CCL_STAT_QUIT:
4398 case CCL_STAT_INVALID_CMD:
4399 coding->result = CODING_RESULT_INTERRUPT;
4400 break;
4401 default:
4402 coding->result = CODING_RESULT_SUCCESS;
4403 break;
4404 }
4405 coding->consumed_char += consumed_chars;
4406 coding->consumed = src - coding->source;
4407 coding->charbuf_used = charbuf - coding->charbuf;
4408 }
4409
4410 static int
4411 encode_coding_ccl (coding)
4412 struct coding_system *coding;
4413 {
4414 struct ccl_program ccl;
4415 int multibytep = coding->dst_multibyte;
4416 int *charbuf = coding->charbuf;
4417 int *charbuf_end = charbuf + coding->charbuf_used;
4418 unsigned char *dst = coding->destination + coding->produced;
4419 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4420 unsigned char *adjusted_dst_end = dst_end - 1;
4421 int destination_charbuf[1024];
4422 int i, produced_chars = 0;
4423 Lisp_Object attrs, eol_type, charset_list;
4424
4425 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4426 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4427
4428 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4429 ccl.dst_multibyte = coding->dst_multibyte;
4430
4431 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4432 {
4433 int dst_bytes = dst_end - dst;
4434 if (dst_bytes > 1024)
4435 dst_bytes = 1024;
4436
4437 ccl_driver (&ccl, charbuf, destination_charbuf,
4438 charbuf_end - charbuf, dst_bytes, charset_list);
4439 charbuf += ccl.consumed;
4440 if (multibytep)
4441 for (i = 0; i < ccl.produced; i++)
4442 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4443 else
4444 {
4445 for (i = 0; i < ccl.produced; i++)
4446 *dst++ = destination_charbuf[i] & 0xFF;
4447 produced_chars += ccl.produced;
4448 }
4449 }
4450
4451 switch (ccl.status)
4452 {
4453 case CCL_STAT_SUSPEND_BY_SRC:
4454 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4455 break;
4456 case CCL_STAT_SUSPEND_BY_DST:
4457 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4458 break;
4459 case CCL_STAT_QUIT:
4460 case CCL_STAT_INVALID_CMD:
4461 coding->result = CODING_RESULT_INTERRUPT;
4462 break;
4463 default:
4464 coding->result = CODING_RESULT_SUCCESS;
4465 break;
4466 }
4467
4468 coding->produced_char += produced_chars;
4469 coding->produced = dst - coding->destination;
4470 return 0;
4471 }
4472
4473
4474 \f
4475 /*** 10, 11. no-conversion handlers ***/
4476
4477 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4478
4479 static void
4480 decode_coding_raw_text (coding)
4481 struct coding_system *coding;
4482 {
4483 coding->chars_at_source = 1;
4484 coding->consumed_char = 0;
4485 coding->consumed = 0;
4486 coding->result = CODING_RESULT_SUCCESS;
4487 }
4488
4489 static int
4490 encode_coding_raw_text (coding)
4491 struct coding_system *coding;
4492 {
4493 int multibytep = coding->dst_multibyte;
4494 int *charbuf = coding->charbuf;
4495 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4496 unsigned char *dst = coding->destination + coding->produced;
4497 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4498 int produced_chars = 0;
4499 int c;
4500
4501 if (multibytep)
4502 {
4503 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4504
4505 if (coding->src_multibyte)
4506 while (charbuf < charbuf_end)
4507 {
4508 ASSURE_DESTINATION (safe_room);
4509 c = *charbuf++;
4510 if (ASCII_CHAR_P (c))
4511 EMIT_ONE_ASCII_BYTE (c);
4512 else if (CHAR_BYTE8_P (c))
4513 {
4514 c = CHAR_TO_BYTE8 (c);
4515 EMIT_ONE_BYTE (c);
4516 }
4517 else
4518 {
4519 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4520
4521 CHAR_STRING_ADVANCE (c, p1);
4522 while (p0 < p1)
4523 {
4524 EMIT_ONE_BYTE (*p0);
4525 p0++;
4526 }
4527 }
4528 }
4529 else
4530 while (charbuf < charbuf_end)
4531 {
4532 ASSURE_DESTINATION (safe_room);
4533 c = *charbuf++;
4534 EMIT_ONE_BYTE (c);
4535 }
4536 }
4537 else
4538 {
4539 if (coding->src_multibyte)
4540 {
4541 int safe_room = MAX_MULTIBYTE_LENGTH;
4542
4543 while (charbuf < charbuf_end)
4544 {
4545 ASSURE_DESTINATION (safe_room);
4546 c = *charbuf++;
4547 if (ASCII_CHAR_P (c))
4548 *dst++ = c;
4549 else if (CHAR_BYTE8_P (c))
4550 *dst++ = CHAR_TO_BYTE8 (c);
4551 else
4552 CHAR_STRING_ADVANCE (c, dst);
4553 produced_chars++;
4554 }
4555 }
4556 else
4557 {
4558 ASSURE_DESTINATION (charbuf_end - charbuf);
4559 while (charbuf < charbuf_end && dst < dst_end)
4560 *dst++ = *charbuf++;
4561 produced_chars = dst - (coding->destination + coding->dst_bytes);
4562 }
4563 }
4564 coding->result = CODING_RESULT_SUCCESS;
4565 coding->produced_char += produced_chars;
4566 coding->produced = dst - coding->destination;
4567 return 0;
4568 }
4569
4570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4571 Check if a text is encoded in a charset-based coding system. If it
4572 is, return 1, else return 0. */
4573
4574 static int
4575 detect_coding_charset (coding, detect_info)
4576 struct coding_system *coding;
4577 struct coding_detection_info *detect_info;
4578 {
4579 const unsigned char *src = coding->source, *src_base = src;
4580 const unsigned char *src_end = coding->source + coding->src_bytes;
4581 int multibytep = coding->src_multibyte;
4582 int consumed_chars = 0;
4583 Lisp_Object attrs, valids;
4584 int found = 0;
4585
4586 detect_info->checked |= CATEGORY_MASK_CHARSET;
4587
4588 coding = &coding_categories[coding_category_charset];
4589 attrs = CODING_ID_ATTRS (coding->id);
4590 valids = AREF (attrs, coding_attr_charset_valids);
4591
4592 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4593 src += coding->head_ascii;
4594
4595 while (1)
4596 {
4597 int c;
4598
4599 ONE_MORE_BYTE (c);
4600 if (NILP (AREF (valids, c)))
4601 break;
4602 if (c >= 0x80)
4603 found = CATEGORY_MASK_CHARSET;
4604 }
4605 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4606 return 0;
4607
4608 no_more_source:
4609 detect_info->found |= found;
4610 return 1;
4611 }
4612
4613 static void
4614 decode_coding_charset (coding)
4615 struct coding_system *coding;
4616 {
4617 const unsigned char *src = coding->source + coding->consumed;
4618 const unsigned char *src_end = coding->source + coding->src_bytes;
4619 const unsigned char *src_base;
4620 int *charbuf = coding->charbuf;
4621 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4622 int consumed_chars = 0, consumed_chars_base;
4623 int multibytep = coding->src_multibyte;
4624 Lisp_Object attrs, eol_type, charset_list, valids;
4625 int char_offset = coding->produced_char;
4626 int last_offset = char_offset;
4627 int last_id = charset_ascii;
4628
4629 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4630 valids = AREF (attrs, coding_attr_charset_valids);
4631
4632 while (1)
4633 {
4634 int c;
4635
4636 src_base = src;
4637 consumed_chars_base = consumed_chars;
4638
4639 if (charbuf >= charbuf_end)
4640 break;
4641
4642 ONE_MORE_BYTE (c);
4643 if (c == '\r')
4644 {
4645 /* Here we assume that no charset maps '\r' to something
4646 else. */
4647 if (EQ (eol_type, Qdos))
4648 {
4649 if (src == src_end)
4650 {
4651 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4652 goto no_more_source;
4653 }
4654 if (*src == '\n')
4655 ONE_MORE_BYTE (c);
4656 }
4657 else if (EQ (eol_type, Qmac))
4658 c = '\n';
4659 }
4660 else
4661 {
4662 Lisp_Object val;
4663 struct charset *charset;
4664 int dim;
4665 int len = 1;
4666 unsigned code = c;
4667
4668 val = AREF (valids, c);
4669 if (NILP (val))
4670 goto invalid_code;
4671 if (INTEGERP (val))
4672 {
4673 charset = CHARSET_FROM_ID (XFASTINT (val));
4674 dim = CHARSET_DIMENSION (charset);
4675 while (len < dim)
4676 {
4677 ONE_MORE_BYTE (c);
4678 code = (code << 8) | c;
4679 len++;
4680 }
4681 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4682 charset, code, c);
4683 }
4684 else
4685 {
4686 /* VAL is a list of charset IDs. It is assured that the
4687 list is sorted by charset dimensions (smaller one
4688 comes first). */
4689 while (CONSP (val))
4690 {
4691 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4692 dim = CHARSET_DIMENSION (charset);
4693 while (len < dim)
4694 {
4695 ONE_MORE_BYTE (c);
4696 code = (code << 8) | c;
4697 len++;
4698 }
4699 CODING_DECODE_CHAR (coding, src, src_base,
4700 src_end, charset, code, c);
4701 if (c >= 0)
4702 break;
4703 val = XCDR (val);
4704 }
4705 }
4706 if (c < 0)
4707 goto invalid_code;
4708 if (charset->id != charset_ascii
4709 && last_id != charset->id)
4710 {
4711 if (last_id != charset_ascii)
4712 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4713 last_id = charset->id;
4714 last_offset = char_offset;
4715 }
4716 }
4717 *charbuf++ = c;
4718 char_offset++;
4719 continue;
4720
4721 invalid_code:
4722 src = src_base;
4723 consumed_chars = consumed_chars_base;
4724 ONE_MORE_BYTE (c);
4725 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4726 char_offset++;
4727 coding->errors++;
4728 }
4729
4730 no_more_source:
4731 if (last_id != charset_ascii)
4732 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4733 coding->consumed_char += consumed_chars_base;
4734 coding->consumed = src_base - coding->source;
4735 coding->charbuf_used = charbuf - coding->charbuf;
4736 }
4737
4738 static int
4739 encode_coding_charset (coding)
4740 struct coding_system *coding;
4741 {
4742 int multibytep = coding->dst_multibyte;
4743 int *charbuf = coding->charbuf;
4744 int *charbuf_end = charbuf + coding->charbuf_used;
4745 unsigned char *dst = coding->destination + coding->produced;
4746 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4747 int safe_room = MAX_MULTIBYTE_LENGTH;
4748 int produced_chars = 0;
4749 Lisp_Object attrs, eol_type, charset_list;
4750 int ascii_compatible;
4751 int c;
4752
4753 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4754 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4755
4756 while (charbuf < charbuf_end)
4757 {
4758 struct charset *charset;
4759 unsigned code;
4760
4761 ASSURE_DESTINATION (safe_room);
4762 c = *charbuf++;
4763 if (ascii_compatible && ASCII_CHAR_P (c))
4764 EMIT_ONE_ASCII_BYTE (c);
4765 else if (CHAR_BYTE8_P (c))
4766 {
4767 c = CHAR_TO_BYTE8 (c);
4768 EMIT_ONE_BYTE (c);
4769 }
4770 else
4771 {
4772 charset = char_charset (c, charset_list, &code);
4773 if (charset)
4774 {
4775 if (CHARSET_DIMENSION (charset) == 1)
4776 EMIT_ONE_BYTE (code);
4777 else if (CHARSET_DIMENSION (charset) == 2)
4778 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4779 else if (CHARSET_DIMENSION (charset) == 3)
4780 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4781 else
4782 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4783 (code >> 8) & 0xFF, code & 0xFF);
4784 }
4785 else
4786 {
4787 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4788 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4789 else
4790 c = coding->default_char;
4791 EMIT_ONE_BYTE (c);
4792 }
4793 }
4794 }
4795
4796 coding->result = CODING_RESULT_SUCCESS;
4797 coding->produced_char += produced_chars;
4798 coding->produced = dst - coding->destination;
4799 return 0;
4800 }
4801
4802 \f
4803 /*** 7. C library functions ***/
4804
4805 /* Setup coding context CODING from information about CODING_SYSTEM.
4806 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4807 CODING_SYSTEM is invalid, signal an error. */
4808
4809 void
4810 setup_coding_system (coding_system, coding)
4811 Lisp_Object coding_system;
4812 struct coding_system *coding;
4813 {
4814 Lisp_Object attrs;
4815 Lisp_Object eol_type;
4816 Lisp_Object coding_type;
4817 Lisp_Object val;
4818
4819 if (NILP (coding_system))
4820 coding_system = Qno_conversion;
4821
4822 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4823
4824 attrs = CODING_ID_ATTRS (coding->id);
4825 eol_type = CODING_ID_EOL_TYPE (coding->id);
4826
4827 coding->mode = 0;
4828 coding->head_ascii = -1;
4829 coding->common_flags
4830 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4831 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4832 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4833 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4834 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4835 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4836 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4837
4838 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4839 coding->max_charset_id = SCHARS (val) - 1;
4840 coding->safe_charsets = (char *) SDATA (val);
4841 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4842
4843 coding_type = CODING_ATTR_TYPE (attrs);
4844 if (EQ (coding_type, Qundecided))
4845 {
4846 coding->detector = NULL;
4847 coding->decoder = decode_coding_raw_text;
4848 coding->encoder = encode_coding_raw_text;
4849 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4850 }
4851 else if (EQ (coding_type, Qiso_2022))
4852 {
4853 int i;
4854 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4855
4856 /* Invoke graphic register 0 to plane 0. */
4857 CODING_ISO_INVOCATION (coding, 0) = 0;
4858 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4859 CODING_ISO_INVOCATION (coding, 1)
4860 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4861 /* Setup the initial status of designation. */
4862 for (i = 0; i < 4; i++)
4863 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4864 /* Not single shifting initially. */
4865 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4866 /* Beginning of buffer should also be regarded as bol. */
4867 CODING_ISO_BOL (coding) = 1;
4868 coding->detector = detect_coding_iso_2022;
4869 coding->decoder = decode_coding_iso_2022;
4870 coding->encoder = encode_coding_iso_2022;
4871 if (flags & CODING_ISO_FLAG_SAFE)
4872 coding->mode |= CODING_MODE_SAFE_ENCODING;
4873 coding->common_flags
4874 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4875 | CODING_REQUIRE_FLUSHING_MASK);
4876 if (flags & CODING_ISO_FLAG_COMPOSITION)
4877 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4878 if (flags & CODING_ISO_FLAG_DESIGNATION)
4879 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4880 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4881 {
4882 setup_iso_safe_charsets (attrs);
4883 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4884 coding->max_charset_id = SCHARS (val) - 1;
4885 coding->safe_charsets = (char *) SDATA (val);
4886 }
4887 CODING_ISO_FLAGS (coding) = flags;
4888 }
4889 else if (EQ (coding_type, Qcharset))
4890 {
4891 coding->detector = detect_coding_charset;
4892 coding->decoder = decode_coding_charset;
4893 coding->encoder = encode_coding_charset;
4894 coding->common_flags
4895 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4896 }
4897 else if (EQ (coding_type, Qutf_8))
4898 {
4899 coding->detector = detect_coding_utf_8;
4900 coding->decoder = decode_coding_utf_8;
4901 coding->encoder = encode_coding_utf_8;
4902 coding->common_flags
4903 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4904 }
4905 else if (EQ (coding_type, Qutf_16))
4906 {
4907 val = AREF (attrs, coding_attr_utf_16_bom);
4908 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4909 : EQ (val, Qt) ? utf_16_with_bom
4910 : utf_16_without_bom);
4911 val = AREF (attrs, coding_attr_utf_16_endian);
4912 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4913 : utf_16_little_endian);
4914 CODING_UTF_16_SURROGATE (coding) = 0;
4915 coding->detector = detect_coding_utf_16;
4916 coding->decoder = decode_coding_utf_16;
4917 coding->encoder = encode_coding_utf_16;
4918 coding->common_flags
4919 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4920 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4921 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4922 }
4923 else if (EQ (coding_type, Qccl))
4924 {
4925 coding->detector = detect_coding_ccl;
4926 coding->decoder = decode_coding_ccl;
4927 coding->encoder = encode_coding_ccl;
4928 coding->common_flags
4929 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4930 | CODING_REQUIRE_FLUSHING_MASK);
4931 }
4932 else if (EQ (coding_type, Qemacs_mule))
4933 {
4934 coding->detector = detect_coding_emacs_mule;
4935 coding->decoder = decode_coding_emacs_mule;
4936 coding->encoder = encode_coding_emacs_mule;
4937 coding->common_flags
4938 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4939 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4940 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4941 {
4942 Lisp_Object tail, safe_charsets;
4943 int max_charset_id = 0;
4944
4945 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4946 tail = XCDR (tail))
4947 if (max_charset_id < XFASTINT (XCAR (tail)))
4948 max_charset_id = XFASTINT (XCAR (tail));
4949 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4950 make_number (255));
4951 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4952 tail = XCDR (tail))
4953 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4954 coding->max_charset_id = max_charset_id;
4955 coding->safe_charsets = (char *) SDATA (safe_charsets);
4956 }
4957 }
4958 else if (EQ (coding_type, Qshift_jis))
4959 {
4960 coding->detector = detect_coding_sjis;
4961 coding->decoder = decode_coding_sjis;
4962 coding->encoder = encode_coding_sjis;
4963 coding->common_flags
4964 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4965 }
4966 else if (EQ (coding_type, Qbig5))
4967 {
4968 coding->detector = detect_coding_big5;
4969 coding->decoder = decode_coding_big5;
4970 coding->encoder = encode_coding_big5;
4971 coding->common_flags
4972 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4973 }
4974 else /* EQ (coding_type, Qraw_text) */
4975 {
4976 coding->detector = NULL;
4977 coding->decoder = decode_coding_raw_text;
4978 coding->encoder = encode_coding_raw_text;
4979 }
4980
4981 return;
4982 }
4983
4984 /* Return raw-text or one of its subsidiaries that has the same
4985 eol_type as CODING-SYSTEM. */
4986
4987 Lisp_Object
4988 raw_text_coding_system (coding_system)
4989 Lisp_Object coding_system;
4990 {
4991 Lisp_Object spec, attrs;
4992 Lisp_Object eol_type, raw_text_eol_type;
4993
4994 if (NILP (coding_system))
4995 return Qraw_text;
4996 spec = CODING_SYSTEM_SPEC (coding_system);
4997 attrs = AREF (spec, 0);
4998
4999 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5000 return coding_system;
5001
5002 eol_type = AREF (spec, 2);
5003 if (VECTORP (eol_type))
5004 return Qraw_text;
5005 spec = CODING_SYSTEM_SPEC (Qraw_text);
5006 raw_text_eol_type = AREF (spec, 2);
5007 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5008 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5009 : AREF (raw_text_eol_type, 2));
5010 }
5011
5012
5013 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5014 does, return one of the subsidiary that has the same eol-spec as
5015 PARENT. Otherwise, return CODING_SYSTEM. */
5016
5017 Lisp_Object
5018 coding_inherit_eol_type (coding_system, parent)
5019 Lisp_Object coding_system, parent;
5020 {
5021 Lisp_Object spec, eol_type;
5022
5023 if (NILP (coding_system))
5024 coding_system = Qraw_text;
5025 spec = CODING_SYSTEM_SPEC (coding_system);
5026 eol_type = AREF (spec, 2);
5027 if (VECTORP (eol_type)
5028 && ! NILP (parent))
5029 {
5030 Lisp_Object parent_spec;
5031 Lisp_Object parent_eol_type;
5032
5033 parent_spec
5034 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5035 parent_eol_type = AREF (parent_spec, 2);
5036 if (EQ (parent_eol_type, Qunix))
5037 coding_system = AREF (eol_type, 0);
5038 else if (EQ (parent_eol_type, Qdos))
5039 coding_system = AREF (eol_type, 1);
5040 else if (EQ (parent_eol_type, Qmac))
5041 coding_system = AREF (eol_type, 2);
5042 }
5043 return coding_system;
5044 }
5045
5046 /* Emacs has a mechanism to automatically detect a coding system if it
5047 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5048 it's impossible to distinguish some coding systems accurately
5049 because they use the same range of codes. So, at first, coding
5050 systems are categorized into 7, those are:
5051
5052 o coding-category-emacs-mule
5053
5054 The category for a coding system which has the same code range
5055 as Emacs' internal format. Assigned the coding-system (Lisp
5056 symbol) `emacs-mule' by default.
5057
5058 o coding-category-sjis
5059
5060 The category for a coding system which has the same code range
5061 as SJIS. Assigned the coding-system (Lisp
5062 symbol) `japanese-shift-jis' by default.
5063
5064 o coding-category-iso-7
5065
5066 The category for a coding system which has the same code range
5067 as ISO2022 of 7-bit environment. This doesn't use any locking
5068 shift and single shift functions. This can encode/decode all
5069 charsets. Assigned the coding-system (Lisp symbol)
5070 `iso-2022-7bit' by default.
5071
5072 o coding-category-iso-7-tight
5073
5074 Same as coding-category-iso-7 except that this can
5075 encode/decode only the specified charsets.
5076
5077 o coding-category-iso-8-1
5078
5079 The category for a coding system which has the same code range
5080 as ISO2022 of 8-bit environment and graphic plane 1 used only
5081 for DIMENSION1 charset. This doesn't use any locking shift
5082 and single shift functions. Assigned the coding-system (Lisp
5083 symbol) `iso-latin-1' by default.
5084
5085 o coding-category-iso-8-2
5086
5087 The category for a coding system which has the same code range
5088 as ISO2022 of 8-bit environment and graphic plane 1 used only
5089 for DIMENSION2 charset. This doesn't use any locking shift
5090 and single shift functions. Assigned the coding-system (Lisp
5091 symbol) `japanese-iso-8bit' by default.
5092
5093 o coding-category-iso-7-else
5094
5095 The category for a coding system which has the same code range
5096 as ISO2022 of 7-bit environemnt but uses locking shift or
5097 single shift functions. Assigned the coding-system (Lisp
5098 symbol) `iso-2022-7bit-lock' by default.
5099
5100 o coding-category-iso-8-else
5101
5102 The category for a coding system which has the same code range
5103 as ISO2022 of 8-bit environemnt but uses locking shift or
5104 single shift functions. Assigned the coding-system (Lisp
5105 symbol) `iso-2022-8bit-ss2' by default.
5106
5107 o coding-category-big5
5108
5109 The category for a coding system which has the same code range
5110 as BIG5. Assigned the coding-system (Lisp symbol)
5111 `cn-big5' by default.
5112
5113 o coding-category-utf-8
5114
5115 The category for a coding system which has the same code range
5116 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5117 symbol) `utf-8' by default.
5118
5119 o coding-category-utf-16-be
5120
5121 The category for a coding system in which a text has an
5122 Unicode signature (cf. Unicode Standard) in the order of BIG
5123 endian at the head. Assigned the coding-system (Lisp symbol)
5124 `utf-16-be' by default.
5125
5126 o coding-category-utf-16-le
5127
5128 The category for a coding system in which a text has an
5129 Unicode signature (cf. Unicode Standard) in the order of
5130 LITTLE endian at the head. Assigned the coding-system (Lisp
5131 symbol) `utf-16-le' by default.
5132
5133 o coding-category-ccl
5134
5135 The category for a coding system of which encoder/decoder is
5136 written in CCL programs. The default value is nil, i.e., no
5137 coding system is assigned.
5138
5139 o coding-category-binary
5140
5141 The category for a coding system not categorized in any of the
5142 above. Assigned the coding-system (Lisp symbol)
5143 `no-conversion' by default.
5144
5145 Each of them is a Lisp symbol and the value is an actual
5146 `coding-system's (this is also a Lisp symbol) assigned by a user.
5147 What Emacs does actually is to detect a category of coding system.
5148 Then, it uses a `coding-system' assigned to it. If Emacs can't
5149 decide only one possible category, it selects a category of the
5150 highest priority. Priorities of categories are also specified by a
5151 user in a Lisp variable `coding-category-list'.
5152
5153 */
5154
5155 #define EOL_SEEN_NONE 0
5156 #define EOL_SEEN_LF 1
5157 #define EOL_SEEN_CR 2
5158 #define EOL_SEEN_CRLF 4
5159
5160 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5161 SOURCE is encoded. If CATEGORY is one of
5162 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5163 two-byte, else they are encoded by one-byte.
5164
5165 Return one of EOL_SEEN_XXX. */
5166
5167 #define MAX_EOL_CHECK_COUNT 3
5168
5169 static int
5170 detect_eol (source, src_bytes, category)
5171 unsigned char *source;
5172 EMACS_INT src_bytes;
5173 enum coding_category category;
5174 {
5175 unsigned char *src = source, *src_end = src + src_bytes;
5176 unsigned char c;
5177 int total = 0;
5178 int eol_seen = EOL_SEEN_NONE;
5179
5180 if ((1 << category) & CATEGORY_MASK_UTF_16)
5181 {
5182 int msb, lsb;
5183
5184 msb = category == (coding_category_utf_16_le
5185 | coding_category_utf_16_le_nosig);
5186 lsb = 1 - msb;
5187
5188 while (src + 1 < src_end)
5189 {
5190 c = src[lsb];
5191 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5192 {
5193 int this_eol;
5194
5195 if (c == '\n')
5196 this_eol = EOL_SEEN_LF;
5197 else if (src + 3 >= src_end
5198 || src[msb + 2] != 0
5199 || src[lsb + 2] != '\n')
5200 this_eol = EOL_SEEN_CR;
5201 else
5202 this_eol = EOL_SEEN_CRLF;
5203
5204 if (eol_seen == EOL_SEEN_NONE)
5205 /* This is the first end-of-line. */
5206 eol_seen = this_eol;
5207 else if (eol_seen != this_eol)
5208 {
5209 /* The found type is different from what found before. */
5210 eol_seen = EOL_SEEN_LF;
5211 break;
5212 }
5213 if (++total == MAX_EOL_CHECK_COUNT)
5214 break;
5215 }
5216 src += 2;
5217 }
5218 }
5219 else
5220 {
5221 while (src < src_end)
5222 {
5223 c = *src++;
5224 if (c == '\n' || c == '\r')
5225 {
5226 int this_eol;
5227
5228 if (c == '\n')
5229 this_eol = EOL_SEEN_LF;
5230 else if (src >= src_end || *src != '\n')
5231 this_eol = EOL_SEEN_CR;
5232 else
5233 this_eol = EOL_SEEN_CRLF, src++;
5234
5235 if (eol_seen == EOL_SEEN_NONE)
5236 /* This is the first end-of-line. */
5237 eol_seen = this_eol;
5238 else if (eol_seen != this_eol)
5239 {
5240 /* The found type is different from what found before. */
5241 eol_seen = EOL_SEEN_LF;
5242 break;
5243 }
5244 if (++total == MAX_EOL_CHECK_COUNT)
5245 break;
5246 }
5247 }
5248 }
5249 return eol_seen;
5250 }
5251
5252
5253 static void
5254 adjust_coding_eol_type (coding, eol_seen)
5255 struct coding_system *coding;
5256 int eol_seen;
5257 {
5258 Lisp_Object eol_type;
5259
5260 eol_type = CODING_ID_EOL_TYPE (coding->id);
5261 if (eol_seen & EOL_SEEN_LF)
5262 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5263 else if (eol_seen & EOL_SEEN_CRLF)
5264 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5265 else if (eol_seen & EOL_SEEN_CR)
5266 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5267 }
5268
5269 /* Detect how a text specified in CODING is encoded. If a coding
5270 system is detected, update fields of CODING by the detected coding
5271 system. */
5272
5273 void
5274 detect_coding (coding)
5275 struct coding_system *coding;
5276 {
5277 const unsigned char *src, *src_end;
5278 Lisp_Object attrs, coding_type;
5279
5280 coding->consumed = coding->consumed_char = 0;
5281 coding->produced = coding->produced_char = 0;
5282 coding_set_source (coding);
5283
5284 src_end = coding->source + coding->src_bytes;
5285
5286 /* If we have not yet decided the text encoding type, detect it
5287 now. */
5288 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5289 {
5290 int c, i;
5291
5292 for (src = coding->source; src < src_end; src++)
5293 {
5294 c = *src;
5295 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5296 || c == ISO_CODE_SI
5297 || c == ISO_CODE_SO)))
5298 break;
5299 }
5300 coding->head_ascii = src - (coding->source + coding->consumed);
5301
5302 if (coding->head_ascii < coding->src_bytes)
5303 {
5304 struct coding_detection_info detect_info;
5305 enum coding_category category;
5306 struct coding_system *this;
5307
5308 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5309 for (i = 0; i < coding_category_raw_text; i++)
5310 {
5311 category = coding_priorities[i];
5312 this = coding_categories + category;
5313 if (this->id < 0)
5314 {
5315 /* No coding system of this category is defined. */
5316 detect_info.rejected |= (1 << category);
5317 }
5318 else if (category >= coding_category_raw_text)
5319 continue;
5320 else if (detect_info.checked & (1 << category))
5321 {
5322 if (detect_info.found & (1 << category))
5323 break;
5324 }
5325 else if ((*(this->detector)) (coding, &detect_info)
5326 && detect_info.found & (1 << category))
5327 break;
5328 }
5329 if (i < coding_category_raw_text)
5330 setup_coding_system (CODING_ID_NAME (this->id), coding);
5331 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5332 setup_coding_system (Qraw_text, coding);
5333 else if (detect_info.rejected)
5334 for (i = 0; i < coding_category_raw_text; i++)
5335 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5336 {
5337 this = coding_categories + coding_priorities[i];
5338 setup_coding_system (CODING_ID_NAME (this->id), coding);
5339 break;
5340 }
5341 }
5342 }
5343 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5344 {
5345 Lisp_Object coding_systems;
5346 struct coding_detection_info detect_info;
5347
5348 coding_systems
5349 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5350 detect_info.found = detect_info.rejected = 0;
5351 if (CONSP (coding_systems)
5352 && detect_coding_utf_16 (coding, &detect_info)
5353 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5354 | CATEGORY_MASK_UTF_16_BE)))
5355 {
5356 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5357 setup_coding_system (XCAR (coding_systems), coding);
5358 else
5359 setup_coding_system (XCDR (coding_systems), coding);
5360 }
5361 }
5362
5363 attrs = CODING_ID_ATTRS (coding->id);
5364 coding_type = CODING_ATTR_TYPE (attrs);
5365
5366 /* If we have not yet decided the EOL type, detect it now. But, the
5367 detection is impossible for a CCL based coding system, in which
5368 case, we detct the EOL type after decoding. */
5369 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5370 && ! EQ (coding_type, Qccl))
5371 {
5372 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5373 (enum coding_category) XINT (CODING_ATTR_CATEGORY (attrs)));
5374
5375 if (eol_seen != EOL_SEEN_NONE)
5376 adjust_coding_eol_type (coding, eol_seen);
5377 }
5378 }
5379
5380
5381 static void
5382 decode_eol (coding)
5383 struct coding_system *coding;
5384 {
5385 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5386 {
5387 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5388 unsigned char *pend = p + coding->produced;
5389 int eol_seen = EOL_SEEN_NONE;
5390
5391 for (; p < pend; p++)
5392 {
5393 if (*p == '\n')
5394 eol_seen |= EOL_SEEN_LF;
5395 else if (*p == '\r')
5396 {
5397 if (p + 1 < pend && *(p + 1) == '\n')
5398 {
5399 eol_seen |= EOL_SEEN_CRLF;
5400 p++;
5401 }
5402 else
5403 eol_seen |= EOL_SEEN_CR;
5404 }
5405 }
5406 if (eol_seen != EOL_SEEN_NONE)
5407 adjust_coding_eol_type (coding, eol_seen);
5408 }
5409
5410 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5411 {
5412 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5413 unsigned char *pend = p + coding->produced;
5414
5415 for (; p < pend; p++)
5416 if (*p == '\r')
5417 *p = '\n';
5418 }
5419 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5420 {
5421 unsigned char *p, *pbeg, *pend;
5422 Lisp_Object undo_list;
5423
5424 move_gap_both (coding->dst_pos + coding->produced_char,
5425 coding->dst_pos_byte + coding->produced);
5426 undo_list = current_buffer->undo_list;
5427 current_buffer->undo_list = Qt;
5428 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5429 current_buffer->undo_list = undo_list;
5430 pbeg = GPT_ADDR;
5431 pend = pbeg + coding->produced;
5432
5433 for (p = pend - 1; p >= pbeg; p--)
5434 if (*p == '\r')
5435 {
5436 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5437 pend--;
5438 }
5439 coding->produced_char -= coding->produced - (pend - pbeg);
5440 coding->produced = pend - pbeg;
5441 insert_from_gap (coding->produced_char, coding->produced);
5442 }
5443 }
5444
5445 static void
5446 translate_chars (coding, table)
5447 struct coding_system *coding;
5448 Lisp_Object table;
5449 {
5450 int *charbuf = coding->charbuf;
5451 int *charbuf_end = charbuf + coding->charbuf_used;
5452 int c;
5453
5454 if (coding->chars_at_source)
5455 return;
5456
5457 while (charbuf < charbuf_end)
5458 {
5459 c = *charbuf;
5460 if (c < 0)
5461 charbuf += c;
5462 else
5463 *charbuf++ = translate_char (table, c);
5464 }
5465 }
5466
5467 static int
5468 produce_chars (coding)
5469 struct coding_system *coding;
5470 {
5471 unsigned char *dst = coding->destination + coding->produced;
5472 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5473 int produced;
5474 int produced_chars = 0;
5475
5476 if (! coding->chars_at_source)
5477 {
5478 /* Characters are in coding->charbuf. */
5479 int *buf = coding->charbuf;
5480 int *buf_end = buf + coding->charbuf_used;
5481 unsigned char *adjusted_dst_end;
5482
5483 if (BUFFERP (coding->src_object)
5484 && EQ (coding->src_object, coding->dst_object))
5485 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5486 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5487
5488 while (buf < buf_end)
5489 {
5490 int c = *buf++;
5491
5492 if (dst >= adjusted_dst_end)
5493 {
5494 dst = alloc_destination (coding,
5495 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5496 dst);
5497 dst_end = coding->destination + coding->dst_bytes;
5498 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5499 }
5500 if (c >= 0)
5501 {
5502 if (coding->dst_multibyte
5503 || ! CHAR_BYTE8_P (c))
5504 CHAR_STRING_ADVANCE (c, dst);
5505 else
5506 *dst++ = CHAR_TO_BYTE8 (c);
5507 produced_chars++;
5508 }
5509 else
5510 /* This is an annotation datum. (-C) is the length of
5511 it. */
5512 buf += -c - 1;
5513 }
5514 }
5515 else
5516 {
5517 const unsigned char *src = coding->source;
5518 const unsigned char *src_end = src + coding->src_bytes;
5519 Lisp_Object eol_type;
5520
5521 eol_type = CODING_ID_EOL_TYPE (coding->id);
5522
5523 if (coding->src_multibyte != coding->dst_multibyte)
5524 {
5525 if (coding->src_multibyte)
5526 {
5527 int multibytep = 1;
5528 int consumed_chars;
5529
5530 while (1)
5531 {
5532 const unsigned char *src_base = src;
5533 int c;
5534
5535 ONE_MORE_BYTE (c);
5536 if (c == '\r')
5537 {
5538 if (EQ (eol_type, Qdos))
5539 {
5540 if (src == src_end)
5541 {
5542 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5543 goto no_more_source;
5544 }
5545 if (*src == '\n')
5546 c = *src++;
5547 }
5548 else if (EQ (eol_type, Qmac))
5549 c = '\n';
5550 }
5551 if (dst == dst_end)
5552 {
5553 coding->consumed = src - coding->source;
5554
5555 if (EQ (coding->src_object, coding->dst_object))
5556 dst_end = (unsigned char *) src;
5557 if (dst == dst_end)
5558 {
5559 dst = alloc_destination (coding, src_end - src + 1,
5560 dst);
5561 dst_end = coding->destination + coding->dst_bytes;
5562 coding_set_source (coding);
5563 src = coding->source + coding->consumed;
5564 src_end = coding->source + coding->src_bytes;
5565 }
5566 }
5567 *dst++ = c;
5568 produced_chars++;
5569 }
5570 no_more_source:
5571 ;
5572 }
5573 else
5574 while (src < src_end)
5575 {
5576 int multibytep = 1;
5577 int c = *src++;
5578
5579 if (c == '\r')
5580 {
5581 if (EQ (eol_type, Qdos))
5582 {
5583 if (src < src_end
5584 && *src == '\n')
5585 c = *src++;
5586 }
5587 else if (EQ (eol_type, Qmac))
5588 c = '\n';
5589 }
5590 if (dst >= dst_end - 1)
5591 {
5592 coding->consumed = src - coding->source;
5593
5594 if (EQ (coding->src_object, coding->dst_object))
5595 dst_end = (unsigned char *) src;
5596 if (dst >= dst_end - 1)
5597 {
5598 dst = alloc_destination (coding, src_end - src + 2,
5599 dst);
5600 dst_end = coding->destination + coding->dst_bytes;
5601 coding_set_source (coding);
5602 src = coding->source + coding->consumed;
5603 src_end = coding->source + coding->src_bytes;
5604 }
5605 }
5606 EMIT_ONE_BYTE (c);
5607 }
5608 }
5609 else
5610 {
5611 if (!EQ (coding->src_object, coding->dst_object))
5612 {
5613 int require = coding->src_bytes - coding->dst_bytes;
5614
5615 if (require > 0)
5616 {
5617 EMACS_INT offset = src - coding->source;
5618
5619 dst = alloc_destination (coding, require, dst);
5620 coding_set_source (coding);
5621 src = coding->source + offset;
5622 src_end = coding->source + coding->src_bytes;
5623 }
5624 }
5625 produced_chars = coding->src_chars;
5626 while (src < src_end)
5627 {
5628 int c = *src++;
5629
5630 if (c == '\r')
5631 {
5632 if (EQ (eol_type, Qdos))
5633 {
5634 if (src < src_end
5635 && *src == '\n')
5636 c = *src++;
5637 produced_chars--;
5638 }
5639 else if (EQ (eol_type, Qmac))
5640 c = '\n';
5641 }
5642 *dst++ = c;
5643 }
5644 }
5645 coding->consumed = coding->src_bytes;
5646 coding->consumed_char = coding->src_chars;
5647 }
5648
5649 produced = dst - (coding->destination + coding->produced);
5650 if (BUFFERP (coding->dst_object))
5651 insert_from_gap (produced_chars, produced);
5652 coding->produced += produced;
5653 coding->produced_char += produced_chars;
5654 return produced_chars;
5655 }
5656
5657 /* Compose text in CODING->object according to the annotation data at
5658 CHARBUF. CHARBUF is an array:
5659 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5660 */
5661
5662 static INLINE void
5663 produce_composition (coding, charbuf)
5664 struct coding_system *coding;
5665 int *charbuf;
5666 {
5667 int len;
5668 EMACS_INT from, to;
5669 enum composition_method method;
5670 Lisp_Object components;
5671
5672 len = -charbuf[0];
5673 from = coding->dst_pos + charbuf[2];
5674 to = coding->dst_pos + charbuf[3];
5675 method = (enum composition_method) (charbuf[4]);
5676
5677 if (method == COMPOSITION_RELATIVE)
5678 components = Qnil;
5679 else
5680 {
5681 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5682 int i;
5683
5684 len -= 5;
5685 charbuf += 5;
5686 for (i = 0; i < len; i++)
5687 args[i] = make_number (charbuf[i]);
5688 components = (method == COMPOSITION_WITH_ALTCHARS
5689 ? Fstring (len, args) : Fvector (len, args));
5690 }
5691 compose_text (from, to, components, Qnil, coding->dst_object);
5692 }
5693
5694
5695 /* Put `charset' property on text in CODING->object according to
5696 the annotation data at CHARBUF. CHARBUF is an array:
5697 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5698 */
5699
5700 static INLINE void
5701 produce_charset (coding, charbuf)
5702 struct coding_system *coding;
5703 int *charbuf;
5704 {
5705 EMACS_INT from = coding->dst_pos + charbuf[2];
5706 EMACS_INT to = coding->dst_pos + charbuf[3];
5707 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5708
5709 Fput_text_property (make_number (from), make_number (to),
5710 Qcharset, CHARSET_NAME (charset),
5711 coding->dst_object);
5712 }
5713
5714
5715 #define CHARBUF_SIZE 0x4000
5716
5717 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5718 do { \
5719 int size = CHARBUF_SIZE;; \
5720 \
5721 coding->charbuf = NULL; \
5722 while (size > 1024) \
5723 { \
5724 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5725 if (coding->charbuf) \
5726 break; \
5727 size >>= 1; \
5728 } \
5729 if (! coding->charbuf) \
5730 { \
5731 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5732 return coding->result; \
5733 } \
5734 coding->charbuf_size = size; \
5735 } while (0)
5736
5737
5738 static void
5739 produce_annotation (coding)
5740 struct coding_system *coding;
5741 {
5742 int *charbuf = coding->charbuf;
5743 int *charbuf_end = charbuf + coding->charbuf_used;
5744
5745 if (NILP (coding->dst_object))
5746 return;
5747
5748 while (charbuf < charbuf_end)
5749 {
5750 if (*charbuf >= 0)
5751 charbuf++;
5752 else
5753 {
5754 int len = -*charbuf;
5755 switch (charbuf[1])
5756 {
5757 case CODING_ANNOTATE_COMPOSITION_MASK:
5758 produce_composition (coding, charbuf);
5759 break;
5760 case CODING_ANNOTATE_CHARSET_MASK:
5761 produce_charset (coding, charbuf);
5762 break;
5763 default:
5764 abort ();
5765 }
5766 charbuf += len;
5767 }
5768 }
5769 }
5770
5771 /* Decode the data at CODING->src_object into CODING->dst_object.
5772 CODING->src_object is a buffer, a string, or nil.
5773 CODING->dst_object is a buffer.
5774
5775 If CODING->src_object is a buffer, it must be the current buffer.
5776 In this case, if CODING->src_pos is positive, it is a position of
5777 the source text in the buffer, otherwise, the source text is in the
5778 gap area of the buffer, and CODING->src_pos specifies the offset of
5779 the text from GPT (which must be the same as PT). If this is the
5780 same buffer as CODING->dst_object, CODING->src_pos must be
5781 negative.
5782
5783 If CODING->src_object is a string, CODING->src_pos in an index to
5784 that string.
5785
5786 If CODING->src_object is nil, CODING->source must already point to
5787 the non-relocatable memory area. In this case, CODING->src_pos is
5788 an offset from CODING->source.
5789
5790 The decoded data is inserted at the current point of the buffer
5791 CODING->dst_object.
5792 */
5793
5794 static int
5795 decode_coding (coding)
5796 struct coding_system *coding;
5797 {
5798 Lisp_Object attrs;
5799
5800 if (BUFFERP (coding->src_object)
5801 && coding->src_pos > 0
5802 && coding->src_pos < GPT
5803 && coding->src_pos + coding->src_chars > GPT)
5804 move_gap_both (coding->src_pos, coding->src_pos_byte);
5805
5806 if (BUFFERP (coding->dst_object))
5807 {
5808 if (current_buffer != XBUFFER (coding->dst_object))
5809 set_buffer_internal (XBUFFER (coding->dst_object));
5810 if (GPT != PT)
5811 move_gap_both (PT, PT_BYTE);
5812 }
5813
5814 coding->consumed = coding->consumed_char = 0;
5815 coding->produced = coding->produced_char = 0;
5816 coding->chars_at_source = 0;
5817 coding->result = CODING_RESULT_SUCCESS;
5818 coding->errors = 0;
5819
5820 ALLOC_CONVERSION_WORK_AREA (coding);
5821
5822 attrs = CODING_ID_ATTRS (coding->id);
5823
5824 do
5825 {
5826 coding_set_source (coding);
5827 coding->annotated = 0;
5828 (*(coding->decoder)) (coding);
5829 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5830 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5831 else if (!NILP (Vstandard_translation_table_for_decode))
5832 translate_chars (coding, Vstandard_translation_table_for_decode);
5833 coding_set_destination (coding);
5834 produce_chars (coding);
5835 if (coding->annotated)
5836 produce_annotation (coding);
5837 }
5838 while (coding->consumed < coding->src_bytes
5839 && ! coding->result);
5840
5841 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5842 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5843 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5844 decode_eol (coding);
5845
5846 coding->carryover_bytes = 0;
5847 if (coding->consumed < coding->src_bytes)
5848 {
5849 int nbytes = coding->src_bytes - coding->consumed;
5850 const unsigned char *src;
5851
5852 coding_set_source (coding);
5853 coding_set_destination (coding);
5854 src = coding->source + coding->consumed;
5855
5856 if (coding->mode & CODING_MODE_LAST_BLOCK)
5857 {
5858 /* Flush out unprocessed data as binary chars. We are sure
5859 that the number of data is less than the size of
5860 coding->charbuf. */
5861 while (nbytes-- > 0)
5862 {
5863 int c = *src++;
5864
5865 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5866 }
5867 produce_chars (coding);
5868 }
5869 else
5870 {
5871 /* Record unprocessed bytes in coding->carryover. We are
5872 sure that the number of data is less than the size of
5873 coding->carryover. */
5874 unsigned char *p = coding->carryover;
5875
5876 coding->carryover_bytes = nbytes;
5877 while (nbytes-- > 0)
5878 *p++ = *src++;
5879 }
5880 coding->consumed = coding->src_bytes;
5881 }
5882
5883 return coding->result;
5884 }
5885
5886
5887 /* Extract an annotation datum from a composition starting at POS and
5888 ending before LIMIT of CODING->src_object (buffer or string), store
5889 the data in BUF, set *STOP to a starting position of the next
5890 composition (if any) or to LIMIT, and return the address of the
5891 next element of BUF.
5892
5893 If such an annotation is not found, set *STOP to a starting
5894 position of a composition after POS (if any) or to LIMIT, and
5895 return BUF. */
5896
5897 static INLINE int *
5898 handle_composition_annotation (pos, limit, coding, buf, stop)
5899 EMACS_INT pos, limit;
5900 struct coding_system *coding;
5901 int *buf;
5902 EMACS_INT *stop;
5903 {
5904 EMACS_INT start, end;
5905 Lisp_Object prop;
5906
5907 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5908 || end > limit)
5909 *stop = limit;
5910 else if (start > pos)
5911 *stop = start;
5912 else
5913 {
5914 if (start == pos)
5915 {
5916 /* We found a composition. Store the corresponding
5917 annotation data in BUF. */
5918 int *head = buf;
5919 enum composition_method method = COMPOSITION_METHOD (prop);
5920 int nchars = COMPOSITION_LENGTH (prop);
5921
5922 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5923 if (method != COMPOSITION_RELATIVE)
5924 {
5925 Lisp_Object components;
5926 int len, i, i_byte;
5927
5928 components = COMPOSITION_COMPONENTS (prop);
5929 if (VECTORP (components))
5930 {
5931 len = XVECTOR (components)->size;
5932 for (i = 0; i < len; i++)
5933 *buf++ = XINT (AREF (components, i));
5934 }
5935 else if (STRINGP (components))
5936 {
5937 len = SCHARS (components);
5938 i = i_byte = 0;
5939 while (i < len)
5940 {
5941 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5942 buf++;
5943 }
5944 }
5945 else if (INTEGERP (components))
5946 {
5947 len = 1;
5948 *buf++ = XINT (components);
5949 }
5950 else if (CONSP (components))
5951 {
5952 for (len = 0; CONSP (components);
5953 len++, components = XCDR (components))
5954 *buf++ = XINT (XCAR (components));
5955 }
5956 else
5957 abort ();
5958 *head -= len;
5959 }
5960 }
5961
5962 if (find_composition (end, limit, &start, &end, &prop,
5963 coding->src_object)
5964 && end <= limit)
5965 *stop = start;
5966 else
5967 *stop = limit;
5968 }
5969 return buf;
5970 }
5971
5972
5973 /* Extract an annotation datum from a text property `charset' at POS of
5974 CODING->src_object (buffer of string), store the data in BUF, set
5975 *STOP to the position where the value of `charset' property changes
5976 (limiting by LIMIT), and return the address of the next element of
5977 BUF.
5978
5979 If the property value is nil, set *STOP to the position where the
5980 property value is non-nil (limiting by LIMIT), and return BUF. */
5981
5982 static INLINE int *
5983 handle_charset_annotation (pos, limit, coding, buf, stop)
5984 EMACS_INT pos, limit;
5985 struct coding_system *coding;
5986 int *buf;
5987 EMACS_INT *stop;
5988 {
5989 Lisp_Object val, next;
5990 int id;
5991
5992 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5993 if (! NILP (val) && CHARSETP (val))
5994 id = XINT (CHARSET_SYMBOL_ID (val));
5995 else
5996 id = -1;
5997 ADD_CHARSET_DATA (buf, 0, 0, id);
5998 next = Fnext_single_property_change (make_number (pos), Qcharset,
5999 coding->src_object,
6000 make_number (limit));
6001 *stop = XINT (next);
6002 return buf;
6003 }
6004
6005
6006 static void
6007 consume_chars (coding)
6008 struct coding_system *coding;
6009 {
6010 int *buf = coding->charbuf;
6011 int *buf_end = coding->charbuf + coding->charbuf_size;
6012 const unsigned char *src = coding->source + coding->consumed;
6013 const unsigned char *src_end = coding->source + coding->src_bytes;
6014 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6015 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6016 int multibytep = coding->src_multibyte;
6017 Lisp_Object eol_type;
6018 int c;
6019 EMACS_INT stop, stop_composition, stop_charset;
6020
6021 eol_type = CODING_ID_EOL_TYPE (coding->id);
6022 if (VECTORP (eol_type))
6023 eol_type = Qunix;
6024
6025 /* Note: composition handling is not yet implemented. */
6026 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6027
6028 if (NILP (coding->src_object))
6029 stop = stop_composition = stop_charset = end_pos;
6030 else
6031 {
6032 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6033 stop = stop_composition = pos;
6034 else
6035 stop = stop_composition = end_pos;
6036 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6037 stop = stop_charset = pos;
6038 else
6039 stop_charset = end_pos;
6040 }
6041
6042 /* Compensate for CRLF and annotation. */
6043 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6044 while (buf < buf_end)
6045 {
6046 if (pos == stop)
6047 {
6048 if (pos == end_pos)
6049 break;
6050 if (pos == stop_composition)
6051 buf = handle_composition_annotation (pos, end_pos, coding,
6052 buf, &stop_composition);
6053 if (pos == stop_charset)
6054 buf = handle_charset_annotation (pos, end_pos, coding,
6055 buf, &stop_charset);
6056 stop = (stop_composition < stop_charset
6057 ? stop_composition : stop_charset);
6058 }
6059
6060 if (! multibytep)
6061 {
6062 EMACS_INT bytes;
6063
6064 if (! CODING_FOR_UNIBYTE (coding)
6065 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6066 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6067 else
6068 c = *src++, pos++;
6069 }
6070 else
6071 c = STRING_CHAR_ADVANCE (src), pos++;
6072 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6073 c = '\n';
6074 if (! EQ (eol_type, Qunix))
6075 {
6076 if (c == '\n')
6077 {
6078 if (EQ (eol_type, Qdos))
6079 *buf++ = '\r';
6080 else
6081 c = '\r';
6082 }
6083 }
6084 *buf++ = c;
6085 }
6086
6087 coding->consumed = src - coding->source;
6088 coding->consumed_char = pos - coding->src_pos;
6089 coding->charbuf_used = buf - coding->charbuf;
6090 coding->chars_at_source = 0;
6091 }
6092
6093
6094 /* Encode the text at CODING->src_object into CODING->dst_object.
6095 CODING->src_object is a buffer or a string.
6096 CODING->dst_object is a buffer or nil.
6097
6098 If CODING->src_object is a buffer, it must be the current buffer.
6099 In this case, if CODING->src_pos is positive, it is a position of
6100 the source text in the buffer, otherwise. the source text is in the
6101 gap area of the buffer, and coding->src_pos specifies the offset of
6102 the text from GPT (which must be the same as PT). If this is the
6103 same buffer as CODING->dst_object, CODING->src_pos must be
6104 negative and CODING should not have `pre-write-conversion'.
6105
6106 If CODING->src_object is a string, CODING should not have
6107 `pre-write-conversion'.
6108
6109 If CODING->dst_object is a buffer, the encoded data is inserted at
6110 the current point of that buffer.
6111
6112 If CODING->dst_object is nil, the encoded data is placed at the
6113 memory area specified by CODING->destination. */
6114
6115 static int
6116 encode_coding (coding)
6117 struct coding_system *coding;
6118 {
6119 Lisp_Object attrs;
6120
6121 attrs = CODING_ID_ATTRS (coding->id);
6122
6123 if (BUFFERP (coding->dst_object))
6124 {
6125 set_buffer_internal (XBUFFER (coding->dst_object));
6126 coding->dst_multibyte
6127 = ! NILP (current_buffer->enable_multibyte_characters);
6128 }
6129
6130 coding->consumed = coding->consumed_char = 0;
6131 coding->produced = coding->produced_char = 0;
6132 coding->result = CODING_RESULT_SUCCESS;
6133 coding->errors = 0;
6134
6135 ALLOC_CONVERSION_WORK_AREA (coding);
6136
6137 do {
6138 coding_set_source (coding);
6139 consume_chars (coding);
6140
6141 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6142 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6143 else if (!NILP (Vstandard_translation_table_for_encode))
6144 translate_chars (coding, Vstandard_translation_table_for_encode);
6145
6146 coding_set_destination (coding);
6147 (*(coding->encoder)) (coding);
6148 } while (coding->consumed_char < coding->src_chars);
6149
6150 if (BUFFERP (coding->dst_object))
6151 insert_from_gap (coding->produced_char, coding->produced);
6152
6153 return (coding->result);
6154 }
6155
6156
6157 /* Stack of working buffers used in code conversion. An nil element
6158 means that the code conversion of that level is not using a working
6159 buffer. */
6160 Lisp_Object Vcode_conversion_work_buf_list;
6161
6162 /* A working buffer used by the top level conversion. */
6163 Lisp_Object Vcode_conversion_reused_work_buf;
6164
6165
6166 /* Return a working buffer that can be freely used by the following
6167 code conversion. MULTIBYTEP specifies the multibyteness of the
6168 buffer. */
6169
6170 Lisp_Object
6171 make_conversion_work_buffer (multibytep, depth)
6172 int multibytep, depth;
6173 {
6174 struct buffer *current = current_buffer;
6175 Lisp_Object buf, name;
6176
6177 if (depth == 0)
6178 {
6179 if (NILP (Vcode_conversion_reused_work_buf))
6180 Vcode_conversion_reused_work_buf
6181 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6182 buf = Vcode_conversion_reused_work_buf;
6183 }
6184 else
6185 {
6186 if (depth < 0)
6187 {
6188 name = build_string (" *code-converting-work*");
6189 name = Fgenerate_new_buffer_name (name, Qnil);
6190 }
6191 else
6192 {
6193 char str[128];
6194
6195 sprintf (str, " *code-converting-work*<%d>", depth);
6196 name = build_string (str);
6197 }
6198 buf = Fget_buffer_create (name);
6199 }
6200 set_buffer_internal (XBUFFER (buf));
6201 current_buffer->undo_list = Qt;
6202 Ferase_buffer ();
6203 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
6204 set_buffer_internal (current);
6205 return buf;
6206 }
6207
6208 static Lisp_Object
6209 code_conversion_restore (buffer)
6210 Lisp_Object buffer;
6211 {
6212 Lisp_Object workbuf;
6213
6214 workbuf = XCAR (Vcode_conversion_work_buf_list);
6215 if (! NILP (workbuf)
6216 && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6217 && ! NILP (Fbuffer_live_p (workbuf)))
6218 Fkill_buffer (workbuf);
6219 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6220 set_buffer_internal (XBUFFER (buffer));
6221 return Qnil;
6222 }
6223
6224 static Lisp_Object
6225 code_conversion_save (buffer, with_work_buf, multibyte)
6226 Lisp_Object buffer;
6227 int with_work_buf, multibyte;
6228 {
6229 Lisp_Object workbuf;
6230
6231 if (with_work_buf)
6232 {
6233 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6234
6235 workbuf = make_conversion_work_buffer (multibyte, depth);
6236 }
6237 else
6238 workbuf = Qnil;
6239 Vcode_conversion_work_buf_list
6240 = Fcons (workbuf, Vcode_conversion_work_buf_list);
6241 record_unwind_protect (code_conversion_restore, buffer);
6242 return workbuf;
6243 }
6244
6245 int
6246 decode_coding_gap (coding, chars, bytes)
6247 struct coding_system *coding;
6248 EMACS_INT chars, bytes;
6249 {
6250 int count = specpdl_ptr - specpdl;
6251 Lisp_Object attrs;
6252 Lisp_Object buffer;
6253
6254 buffer = Fcurrent_buffer ();
6255 code_conversion_save (buffer, 0, 0);
6256
6257 coding->src_object = buffer;
6258 coding->src_chars = chars;
6259 coding->src_bytes = bytes;
6260 coding->src_pos = -chars;
6261 coding->src_pos_byte = -bytes;
6262 coding->src_multibyte = chars < bytes;
6263 coding->dst_object = buffer;
6264 coding->dst_pos = PT;
6265 coding->dst_pos_byte = PT_BYTE;
6266 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6267 coding->mode |= CODING_MODE_LAST_BLOCK;
6268
6269 if (CODING_REQUIRE_DETECTION (coding))
6270 detect_coding (coding);
6271
6272 decode_coding (coding);
6273
6274 attrs = CODING_ID_ATTRS (coding->id);
6275 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6276 {
6277 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6278 Lisp_Object val;
6279
6280 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6281 val = call1 (CODING_ATTR_POST_READ (attrs),
6282 make_number (coding->produced_char));
6283 CHECK_NATNUM (val);
6284 coding->produced_char += Z - prev_Z;
6285 coding->produced += Z_BYTE - prev_Z_BYTE;
6286 }
6287
6288 unbind_to (count, Qnil);
6289 return coding->result;
6290 }
6291
6292 int
6293 encode_coding_gap (coding, chars, bytes)
6294 struct coding_system *coding;
6295 EMACS_INT chars, bytes;
6296 {
6297 int count = specpdl_ptr - specpdl;
6298 Lisp_Object buffer;
6299
6300 buffer = Fcurrent_buffer ();
6301 code_conversion_save (buffer, 0, 0);
6302
6303 coding->src_object = buffer;
6304 coding->src_chars = chars;
6305 coding->src_bytes = bytes;
6306 coding->src_pos = -chars;
6307 coding->src_pos_byte = -bytes;
6308 coding->src_multibyte = chars < bytes;
6309 coding->dst_object = coding->src_object;
6310 coding->dst_pos = PT;
6311 coding->dst_pos_byte = PT_BYTE;
6312
6313 encode_coding (coding);
6314
6315 unbind_to (count, Qnil);
6316 return coding->result;
6317 }
6318
6319
6320 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6321 SRC_OBJECT into DST_OBJECT by coding context CODING.
6322
6323 SRC_OBJECT is a buffer, a string, or Qnil.
6324
6325 If it is a buffer, the text is at point of the buffer. FROM and TO
6326 are positions in the buffer.
6327
6328 If it is a string, the text is at the beginning of the string.
6329 FROM and TO are indices to the string.
6330
6331 If it is nil, the text is at coding->source. FROM and TO are
6332 indices to coding->source.
6333
6334 DST_OBJECT is a buffer, Qt, or Qnil.
6335
6336 If it is a buffer, the decoded text is inserted at point of the
6337 buffer. If the buffer is the same as SRC_OBJECT, the source text
6338 is deleted.
6339
6340 If it is Qt, a string is made from the decoded text, and
6341 set in CODING->dst_object.
6342
6343 If it is Qnil, the decoded text is stored at CODING->destination.
6344 The caller must allocate CODING->dst_bytes bytes at
6345 CODING->destination by xmalloc. If the decoded text is longer than
6346 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6347 */
6348
6349 void
6350 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6351 dst_object)
6352 struct coding_system *coding;
6353 Lisp_Object src_object;
6354 EMACS_INT from, from_byte, to, to_byte;
6355 Lisp_Object dst_object;
6356 {
6357 int count = specpdl_ptr - specpdl;
6358 unsigned char *destination;
6359 EMACS_INT dst_bytes;
6360 EMACS_INT chars = to - from;
6361 EMACS_INT bytes = to_byte - from_byte;
6362 Lisp_Object attrs;
6363 Lisp_Object buffer;
6364 int saved_pt = -1, saved_pt_byte;
6365
6366 buffer = Fcurrent_buffer ();
6367
6368 if (NILP (dst_object))
6369 {
6370 destination = coding->destination;
6371 dst_bytes = coding->dst_bytes;
6372 }
6373
6374 coding->src_object = src_object;
6375 coding->src_chars = chars;
6376 coding->src_bytes = bytes;
6377 coding->src_multibyte = chars < bytes;
6378
6379 if (STRINGP (src_object))
6380 {
6381 coding->src_pos = from;
6382 coding->src_pos_byte = from_byte;
6383 }
6384 else if (BUFFERP (src_object))
6385 {
6386 set_buffer_internal (XBUFFER (src_object));
6387 if (from != GPT)
6388 move_gap_both (from, from_byte);
6389 if (EQ (src_object, dst_object))
6390 {
6391 saved_pt = PT, saved_pt_byte = PT_BYTE;
6392 TEMP_SET_PT_BOTH (from, from_byte);
6393 del_range_both (from, from_byte, to, to_byte, 1);
6394 coding->src_pos = -chars;
6395 coding->src_pos_byte = -bytes;
6396 }
6397 else
6398 {
6399 coding->src_pos = from;
6400 coding->src_pos_byte = from_byte;
6401 }
6402 }
6403
6404 if (CODING_REQUIRE_DETECTION (coding))
6405 detect_coding (coding);
6406 attrs = CODING_ID_ATTRS (coding->id);
6407
6408 if (EQ (dst_object, Qt)
6409 || (! NILP (CODING_ATTR_POST_READ (attrs))
6410 && NILP (dst_object)))
6411 {
6412 coding->dst_object = code_conversion_save (buffer, 1, 1);
6413 coding->dst_pos = BEG;
6414 coding->dst_pos_byte = BEG_BYTE;
6415 coding->dst_multibyte = 1;
6416 }
6417 else if (BUFFERP (dst_object))
6418 {
6419 code_conversion_save (buffer, 0, 0);
6420 coding->dst_object = dst_object;
6421 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6422 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6423 coding->dst_multibyte
6424 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6425 }
6426 else
6427 {
6428 code_conversion_save (buffer, 0, 0);
6429 coding->dst_object = Qnil;
6430 coding->dst_multibyte = 1;
6431 }
6432
6433 decode_coding (coding);
6434
6435 if (BUFFERP (coding->dst_object))
6436 set_buffer_internal (XBUFFER (coding->dst_object));
6437
6438 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6439 {
6440 struct gcpro gcpro1, gcpro2;
6441 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6442 Lisp_Object val;
6443
6444 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6445 GCPRO2 (coding->src_object, coding->dst_object);
6446 val = call1 (CODING_ATTR_POST_READ (attrs),
6447 make_number (coding->produced_char));
6448 UNGCPRO;
6449 CHECK_NATNUM (val);
6450 coding->produced_char += Z - prev_Z;
6451 coding->produced += Z_BYTE - prev_Z_BYTE;
6452 }
6453
6454 if (EQ (dst_object, Qt))
6455 {
6456 coding->dst_object = Fbuffer_string ();
6457 }
6458 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6459 {
6460 set_buffer_internal (XBUFFER (coding->dst_object));
6461 if (dst_bytes < coding->produced)
6462 {
6463 destination
6464 = (unsigned char *) xrealloc (destination, coding->produced);
6465 if (! destination)
6466 {
6467 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6468 unbind_to (count, Qnil);
6469 return;
6470 }
6471 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6472 move_gap_both (BEGV, BEGV_BYTE);
6473 bcopy (BEGV_ADDR, destination, coding->produced);
6474 coding->destination = destination;
6475 }
6476 }
6477
6478 if (saved_pt >= 0)
6479 {
6480 /* This is the case of:
6481 (BUFFERP (src_object) && EQ (src_object, dst_object))
6482 As we have moved PT while replacing the original buffer
6483 contents, we must recover it now. */
6484 set_buffer_internal (XBUFFER (src_object));
6485 if (saved_pt < from)
6486 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6487 else if (saved_pt < from + chars)
6488 TEMP_SET_PT_BOTH (from, from_byte);
6489 else if (! NILP (current_buffer->enable_multibyte_characters))
6490 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6491 saved_pt_byte + (coding->produced - bytes));
6492 else
6493 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6494 saved_pt_byte + (coding->produced - bytes));
6495 }
6496
6497 unbind_to (count, Qnil);
6498 }
6499
6500
6501 void
6502 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6503 dst_object)
6504 struct coding_system *coding;
6505 Lisp_Object src_object;
6506 EMACS_INT from, from_byte, to, to_byte;
6507 Lisp_Object dst_object;
6508 {
6509 int count = specpdl_ptr - specpdl;
6510 EMACS_INT chars = to - from;
6511 EMACS_INT bytes = to_byte - from_byte;
6512 Lisp_Object attrs;
6513 Lisp_Object buffer;
6514 int saved_pt = -1, saved_pt_byte;
6515
6516 buffer = Fcurrent_buffer ();
6517
6518 coding->src_object = src_object;
6519 coding->src_chars = chars;
6520 coding->src_bytes = bytes;
6521 coding->src_multibyte = chars < bytes;
6522
6523 attrs = CODING_ID_ATTRS (coding->id);
6524
6525 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6526 {
6527 coding->src_object = code_conversion_save (buffer, 1,
6528 coding->src_multibyte);
6529 set_buffer_internal (XBUFFER (coding->src_object));
6530 if (STRINGP (src_object))
6531 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6532 else if (BUFFERP (src_object))
6533 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6534 else
6535 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6536
6537 if (EQ (src_object, dst_object))
6538 {
6539 set_buffer_internal (XBUFFER (src_object));
6540 saved_pt = PT, saved_pt_byte = PT_BYTE;
6541 del_range_both (from, from_byte, to, to_byte, 1);
6542 set_buffer_internal (XBUFFER (coding->src_object));
6543 }
6544
6545 call2 (CODING_ATTR_PRE_WRITE (attrs),
6546 make_number (BEG), make_number (Z));
6547 coding->src_object = Fcurrent_buffer ();
6548 if (BEG != GPT)
6549 move_gap_both (BEG, BEG_BYTE);
6550 coding->src_chars = Z - BEG;
6551 coding->src_bytes = Z_BYTE - BEG_BYTE;
6552 coding->src_pos = BEG;
6553 coding->src_pos_byte = BEG_BYTE;
6554 coding->src_multibyte = Z < Z_BYTE;
6555 }
6556 else if (STRINGP (src_object))
6557 {
6558 code_conversion_save (buffer, 0, 0);
6559 coding->src_pos = from;
6560 coding->src_pos_byte = from_byte;
6561 }
6562 else if (BUFFERP (src_object))
6563 {
6564 code_conversion_save (buffer, 0, 0);
6565 set_buffer_internal (XBUFFER (src_object));
6566 if (EQ (src_object, dst_object))
6567 {
6568 saved_pt = PT, saved_pt_byte = PT_BYTE;
6569 coding->src_object = del_range_1 (from, to, 1, 1);
6570 coding->src_pos = 0;
6571 coding->src_pos_byte = 0;
6572 }
6573 else
6574 {
6575 if (from < GPT && to >= GPT)
6576 move_gap_both (from, from_byte);
6577 coding->src_pos = from;
6578 coding->src_pos_byte = from_byte;
6579 }
6580 }
6581 else
6582 code_conversion_save (buffer, 0, 0);
6583
6584 if (BUFFERP (dst_object))
6585 {
6586 coding->dst_object = dst_object;
6587 if (EQ (src_object, dst_object))
6588 {
6589 coding->dst_pos = from;
6590 coding->dst_pos_byte = from_byte;
6591 }
6592 else
6593 {
6594 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6595 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6596 }
6597 coding->dst_multibyte
6598 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6599 }
6600 else if (EQ (dst_object, Qt))
6601 {
6602 coding->dst_object = Qnil;
6603 coding->dst_bytes = coding->src_chars;
6604 if (coding->dst_bytes == 0)
6605 coding->dst_bytes = 1;
6606 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6607 coding->dst_multibyte = 0;
6608 }
6609 else
6610 {
6611 coding->dst_object = Qnil;
6612 coding->dst_multibyte = 0;
6613 }
6614
6615 encode_coding (coding);
6616
6617 if (EQ (dst_object, Qt))
6618 {
6619 if (BUFFERP (coding->dst_object))
6620 coding->dst_object = Fbuffer_string ();
6621 else
6622 {
6623 coding->dst_object
6624 = make_unibyte_string ((char *) coding->destination,
6625 coding->produced);
6626 xfree (coding->destination);
6627 }
6628 }
6629
6630 if (saved_pt >= 0)
6631 {
6632 /* This is the case of:
6633 (BUFFERP (src_object) && EQ (src_object, dst_object))
6634 As we have moved PT while replacing the original buffer
6635 contents, we must recover it now. */
6636 set_buffer_internal (XBUFFER (src_object));
6637 if (saved_pt < from)
6638 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6639 else if (saved_pt < from + chars)
6640 TEMP_SET_PT_BOTH (from, from_byte);
6641 else if (! NILP (current_buffer->enable_multibyte_characters))
6642 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6643 saved_pt_byte + (coding->produced - bytes));
6644 else
6645 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6646 saved_pt_byte + (coding->produced - bytes));
6647 }
6648
6649 unbind_to (count, Qnil);
6650 }
6651
6652
6653 Lisp_Object
6654 preferred_coding_system ()
6655 {
6656 int id = coding_categories[coding_priorities[0]].id;
6657
6658 return CODING_ID_NAME (id);
6659 }
6660
6661 \f
6662 #ifdef emacs
6663 /*** 8. Emacs Lisp library functions ***/
6664
6665 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6666 doc: /* Return t if OBJECT is nil or a coding-system.
6667 See the documentation of `define-coding-system' for information
6668 about coding-system objects. */)
6669 (obj)
6670 Lisp_Object obj;
6671 {
6672 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6673 }
6674
6675 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6676 Sread_non_nil_coding_system, 1, 1, 0,
6677 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6678 (prompt)
6679 Lisp_Object prompt;
6680 {
6681 Lisp_Object val;
6682 do
6683 {
6684 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6685 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6686 }
6687 while (SCHARS (val) == 0);
6688 return (Fintern (val, Qnil));
6689 }
6690
6691 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6692 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6693 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6694 (prompt, default_coding_system)
6695 Lisp_Object prompt, default_coding_system;
6696 {
6697 Lisp_Object val;
6698 if (SYMBOLP (default_coding_system))
6699 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6700 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6701 Qt, Qnil, Qcoding_system_history,
6702 default_coding_system, Qnil);
6703 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6704 }
6705
6706 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6707 1, 1, 0,
6708 doc: /* Check validity of CODING-SYSTEM.
6709 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6710 (coding_system)
6711 Lisp_Object coding_system;
6712 {
6713 CHECK_SYMBOL (coding_system);
6714 if (!NILP (Fcoding_system_p (coding_system)))
6715 return coding_system;
6716 while (1)
6717 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6718 }
6719
6720 \f
6721 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6722 HIGHEST is nonzero, return the coding system of the highest
6723 priority among the detected coding systems. Otherwize return a
6724 list of detected coding systems sorted by their priorities. If
6725 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6726 multibyte form but contains only ASCII and eight-bit chars.
6727 Otherwise, the bytes are raw bytes.
6728
6729 CODING-SYSTEM controls the detection as below:
6730
6731 If it is nil, detect both text-format and eol-format. If the
6732 text-format part of CODING-SYSTEM is already specified
6733 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6734 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6735 detect only text-format. */
6736
6737 Lisp_Object
6738 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6739 const unsigned char *src;
6740 int src_bytes, highest;
6741 int multibytep;
6742 Lisp_Object coding_system;
6743 {
6744 const unsigned char *src_end = src + src_bytes;
6745 Lisp_Object attrs, eol_type;
6746 Lisp_Object val;
6747 struct coding_system coding;
6748 int id;
6749 struct coding_detection_info detect_info;
6750
6751 if (NILP (coding_system))
6752 coding_system = Qundecided;
6753 setup_coding_system (coding_system, &coding);
6754 attrs = CODING_ID_ATTRS (coding.id);
6755 eol_type = CODING_ID_EOL_TYPE (coding.id);
6756 coding_system = CODING_ATTR_BASE_NAME (attrs);
6757
6758 coding.source = src;
6759 coding.src_bytes = src_bytes;
6760 coding.src_multibyte = multibytep;
6761 coding.consumed = 0;
6762 coding.mode |= CODING_MODE_LAST_BLOCK;
6763
6764 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6765
6766 /* At first, detect text-format if necessary. */
6767 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6768 {
6769 enum coding_category category;
6770 struct coding_system *this;
6771 int c, i;
6772
6773 for (; src < src_end; src++)
6774 {
6775 c = *src;
6776 if (c & 0x80
6777 || (c < 0x20 && (c == ISO_CODE_ESC
6778 || c == ISO_CODE_SI
6779 || c == ISO_CODE_SO)))
6780 break;
6781 }
6782 coding.head_ascii = src - coding.source;
6783
6784 if (src < src_end)
6785 for (i = 0; i < coding_category_raw_text; i++)
6786 {
6787 category = coding_priorities[i];
6788 this = coding_categories + category;
6789
6790 if (this->id < 0)
6791 {
6792 /* No coding system of this category is defined. */
6793 detect_info.rejected |= (1 << category);
6794 }
6795 else if (category >= coding_category_raw_text)
6796 continue;
6797 else if (detect_info.checked & (1 << category))
6798 {
6799 if (highest
6800 && (detect_info.found & (1 << category)))
6801 break;
6802 }
6803 else
6804 {
6805 if ((*(this->detector)) (&coding, &detect_info)
6806 && highest
6807 && (detect_info.found & (1 << category)))
6808 break;
6809 }
6810 }
6811
6812
6813 if (detect_info.rejected == CATEGORY_MASK_ANY)
6814 {
6815 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6816 id = coding_categories[coding_category_raw_text].id;
6817 val = Fcons (make_number (id), Qnil);
6818 }
6819 else if (! detect_info.rejected && ! detect_info.found)
6820 {
6821 detect_info.found = CATEGORY_MASK_ANY;
6822 id = coding_categories[coding_category_undecided].id;
6823 val = Fcons (make_number (id), Qnil);
6824 }
6825 else if (highest)
6826 {
6827 if (detect_info.found)
6828 {
6829 detect_info.found = 1 << category;
6830 val = Fcons (make_number (this->id), Qnil);
6831 }
6832 else
6833 for (i = 0; i < coding_category_raw_text; i++)
6834 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6835 {
6836 detect_info.found = 1 << coding_priorities[i];
6837 id = coding_categories[coding_priorities[i]].id;
6838 val = Fcons (make_number (id), Qnil);
6839 break;
6840 }
6841 }
6842 else
6843 {
6844 int mask = detect_info.rejected | detect_info.found;
6845 int found = 0;
6846 val = Qnil;
6847
6848 for (i = coding_category_raw_text - 1; i >= 0; i--)
6849 {
6850 category = coding_priorities[i];
6851 if (! (mask & (1 << category)))
6852 {
6853 found |= 1 << category;
6854 id = coding_categories[category].id;
6855 val = Fcons (make_number (id), val);
6856 }
6857 }
6858 for (i = coding_category_raw_text - 1; i >= 0; i--)
6859 {
6860 category = coding_priorities[i];
6861 if (detect_info.found & (1 << category))
6862 {
6863 id = coding_categories[category].id;
6864 val = Fcons (make_number (id), val);
6865 }
6866 }
6867 detect_info.found |= found;
6868 }
6869 }
6870 else
6871 {
6872 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6873 val = Fcons (make_number (coding.id), Qnil);
6874 }
6875
6876 /* Then, detect eol-format if necessary. */
6877 {
6878 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6879 Lisp_Object tail;
6880
6881 if (VECTORP (eol_type))
6882 {
6883 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6884 normal_eol = detect_eol (coding.source, src_bytes,
6885 coding_category_raw_text);
6886 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6887 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6888 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6889 coding_category_utf_16_be);
6890 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6891 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6892 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6893 coding_category_utf_16_le);
6894 }
6895 else
6896 {
6897 if (EQ (eol_type, Qunix))
6898 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6899 else if (EQ (eol_type, Qdos))
6900 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6901 else
6902 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6903 }
6904
6905 for (tail = val; CONSP (tail); tail = XCDR (tail))
6906 {
6907 enum coding_category category;
6908 int this_eol;
6909
6910 id = XINT (XCAR (tail));
6911 attrs = CODING_ID_ATTRS (id);
6912 category = XINT (CODING_ATTR_CATEGORY (attrs));
6913 eol_type = CODING_ID_EOL_TYPE (id);
6914 if (VECTORP (eol_type))
6915 {
6916 if (category == coding_category_utf_16_be
6917 || category == coding_category_utf_16_be_nosig)
6918 this_eol = utf_16_be_eol;
6919 else if (category == coding_category_utf_16_le
6920 || category == coding_category_utf_16_le_nosig)
6921 this_eol = utf_16_le_eol;
6922 else
6923 this_eol = normal_eol;
6924
6925 if (this_eol == EOL_SEEN_LF)
6926 XSETCAR (tail, AREF (eol_type, 0));
6927 else if (this_eol == EOL_SEEN_CRLF)
6928 XSETCAR (tail, AREF (eol_type, 1));
6929 else if (this_eol == EOL_SEEN_CR)
6930 XSETCAR (tail, AREF (eol_type, 2));
6931 else
6932 XSETCAR (tail, CODING_ID_NAME (id));
6933 }
6934 else
6935 XSETCAR (tail, CODING_ID_NAME (id));
6936 }
6937 }
6938
6939 return (highest ? XCAR (val) : val);
6940 }
6941
6942
6943 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6944 2, 3, 0,
6945 doc: /* Detect coding system of the text in the region between START and END.
6946 Return a list of possible coding systems ordered by priority.
6947
6948 If only ASCII characters are found, it returns a list of single element
6949 `undecided' or its subsidiary coding system according to a detected
6950 end-of-line format.
6951
6952 If optional argument HIGHEST is non-nil, return the coding system of
6953 highest priority. */)
6954 (start, end, highest)
6955 Lisp_Object start, end, highest;
6956 {
6957 int from, to;
6958 int from_byte, to_byte;
6959
6960 CHECK_NUMBER_COERCE_MARKER (start);
6961 CHECK_NUMBER_COERCE_MARKER (end);
6962
6963 validate_region (&start, &end);
6964 from = XINT (start), to = XINT (end);
6965 from_byte = CHAR_TO_BYTE (from);
6966 to_byte = CHAR_TO_BYTE (to);
6967
6968 if (from < GPT && to >= GPT)
6969 move_gap_both (to, to_byte);
6970
6971 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6972 to_byte - from_byte,
6973 !NILP (highest),
6974 !NILP (current_buffer
6975 ->enable_multibyte_characters),
6976 Qnil);
6977 }
6978
6979 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6980 1, 2, 0,
6981 doc: /* Detect coding system of the text in STRING.
6982 Return a list of possible coding systems ordered by priority.
6983
6984 If only ASCII characters are found, it returns a list of single element
6985 `undecided' or its subsidiary coding system according to a detected
6986 end-of-line format.
6987
6988 If optional argument HIGHEST is non-nil, return the coding system of
6989 highest priority. */)
6990 (string, highest)
6991 Lisp_Object string, highest;
6992 {
6993 CHECK_STRING (string);
6994
6995 return detect_coding_system (SDATA (string), SBYTES (string),
6996 !NILP (highest), STRING_MULTIBYTE (string),
6997 Qnil);
6998 }
6999
7000
7001 static INLINE int
7002 char_encodable_p (c, attrs)
7003 int c;
7004 Lisp_Object attrs;
7005 {
7006 Lisp_Object tail;
7007 struct charset *charset;
7008
7009 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7010 CONSP (tail); tail = XCDR (tail))
7011 {
7012 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7013 if (CHAR_CHARSET_P (c, charset))
7014 break;
7015 }
7016 return (! NILP (tail));
7017 }
7018
7019
7020 /* Return a list of coding systems that safely encode the text between
7021 START and END. If EXCLUDE is non-nil, it is a list of coding
7022 systems not to check. The returned list doesn't contain any such
7023 coding systems. In any case, if the text contains only ASCII or is
7024 unibyte, return t. */
7025
7026 DEFUN ("find-coding-systems-region-internal",
7027 Ffind_coding_systems_region_internal,
7028 Sfind_coding_systems_region_internal, 2, 3, 0,
7029 doc: /* Internal use only. */)
7030 (start, end, exclude)
7031 Lisp_Object start, end, exclude;
7032 {
7033 Lisp_Object coding_attrs_list, safe_codings;
7034 EMACS_INT start_byte, end_byte;
7035 const unsigned char *p, *pbeg, *pend;
7036 int c;
7037 Lisp_Object tail, elt;
7038
7039 if (STRINGP (start))
7040 {
7041 if (!STRING_MULTIBYTE (start)
7042 || SCHARS (start) == SBYTES (start))
7043 return Qt;
7044 start_byte = 0;
7045 end_byte = SBYTES (start);
7046 }
7047 else
7048 {
7049 CHECK_NUMBER_COERCE_MARKER (start);
7050 CHECK_NUMBER_COERCE_MARKER (end);
7051 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7052 args_out_of_range (start, end);
7053 if (NILP (current_buffer->enable_multibyte_characters))
7054 return Qt;
7055 start_byte = CHAR_TO_BYTE (XINT (start));
7056 end_byte = CHAR_TO_BYTE (XINT (end));
7057 if (XINT (end) - XINT (start) == end_byte - start_byte)
7058 return Qt;
7059
7060 if (XINT (start) < GPT && XINT (end) > GPT)
7061 {
7062 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7063 move_gap_both (XINT (start), start_byte);
7064 else
7065 move_gap_both (XINT (end), end_byte);
7066 }
7067 }
7068
7069 coding_attrs_list = Qnil;
7070 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7071 if (NILP (exclude)
7072 || NILP (Fmemq (XCAR (tail), exclude)))
7073 {
7074 Lisp_Object attrs;
7075
7076 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7077 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7078 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7079 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7080 }
7081
7082 if (STRINGP (start))
7083 p = pbeg = SDATA (start);
7084 else
7085 p = pbeg = BYTE_POS_ADDR (start_byte);
7086 pend = p + (end_byte - start_byte);
7087
7088 while (p < pend && ASCII_BYTE_P (*p)) p++;
7089 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7090
7091 while (p < pend)
7092 {
7093 if (ASCII_BYTE_P (*p))
7094 p++;
7095 else
7096 {
7097 c = STRING_CHAR_ADVANCE (p);
7098
7099 charset_map_loaded = 0;
7100 for (tail = coding_attrs_list; CONSP (tail);)
7101 {
7102 elt = XCAR (tail);
7103 if (NILP (elt))
7104 tail = XCDR (tail);
7105 else if (char_encodable_p (c, elt))
7106 tail = XCDR (tail);
7107 else if (CONSP (XCDR (tail)))
7108 {
7109 XSETCAR (tail, XCAR (XCDR (tail)));
7110 XSETCDR (tail, XCDR (XCDR (tail)));
7111 }
7112 else
7113 {
7114 XSETCAR (tail, Qnil);
7115 tail = XCDR (tail);
7116 }
7117 }
7118 if (charset_map_loaded)
7119 {
7120 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7121
7122 if (STRINGP (start))
7123 pbeg = SDATA (start);
7124 else
7125 pbeg = BYTE_POS_ADDR (start_byte);
7126 p = pbeg + p_offset;
7127 pend = pbeg + pend_offset;
7128 }
7129 }
7130 }
7131
7132 safe_codings = Qnil;
7133 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7134 if (! NILP (XCAR (tail)))
7135 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7136
7137 return safe_codings;
7138 }
7139
7140
7141 DEFUN ("unencodable-char-position", Funencodable_char_position,
7142 Sunencodable_char_position, 3, 5, 0,
7143 doc: /*
7144 Return position of first un-encodable character in a region.
7145 START and END specfiy the region and CODING-SYSTEM specifies the
7146 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7147
7148 If optional 4th argument COUNT is non-nil, it specifies at most how
7149 many un-encodable characters to search. In this case, the value is a
7150 list of positions.
7151
7152 If optional 5th argument STRING is non-nil, it is a string to search
7153 for un-encodable characters. In that case, START and END are indexes
7154 to the string. */)
7155 (start, end, coding_system, count, string)
7156 Lisp_Object start, end, coding_system, count, string;
7157 {
7158 int n;
7159 struct coding_system coding;
7160 Lisp_Object attrs, charset_list;
7161 Lisp_Object positions;
7162 int from, to;
7163 const unsigned char *p, *stop, *pend;
7164 int ascii_compatible;
7165
7166 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7167 attrs = CODING_ID_ATTRS (coding.id);
7168 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7169 return Qnil;
7170 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7171 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7172
7173 if (NILP (string))
7174 {
7175 validate_region (&start, &end);
7176 from = XINT (start);
7177 to = XINT (end);
7178 if (NILP (current_buffer->enable_multibyte_characters)
7179 || (ascii_compatible
7180 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7181 return Qnil;
7182 p = CHAR_POS_ADDR (from);
7183 pend = CHAR_POS_ADDR (to);
7184 if (from < GPT && to >= GPT)
7185 stop = GPT_ADDR;
7186 else
7187 stop = pend;
7188 }
7189 else
7190 {
7191 CHECK_STRING (string);
7192 CHECK_NATNUM (start);
7193 CHECK_NATNUM (end);
7194 from = XINT (start);
7195 to = XINT (end);
7196 if (from > to
7197 || to > SCHARS (string))
7198 args_out_of_range_3 (string, start, end);
7199 if (! STRING_MULTIBYTE (string))
7200 return Qnil;
7201 p = SDATA (string) + string_char_to_byte (string, from);
7202 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7203 if (ascii_compatible && (to - from) == (pend - p))
7204 return Qnil;
7205 }
7206
7207 if (NILP (count))
7208 n = 1;
7209 else
7210 {
7211 CHECK_NATNUM (count);
7212 n = XINT (count);
7213 }
7214
7215 positions = Qnil;
7216 while (1)
7217 {
7218 int c;
7219
7220 if (ascii_compatible)
7221 while (p < stop && ASCII_BYTE_P (*p))
7222 p++, from++;
7223 if (p >= stop)
7224 {
7225 if (p >= pend)
7226 break;
7227 stop = pend;
7228 p = GAP_END_ADDR;
7229 }
7230
7231 c = STRING_CHAR_ADVANCE (p);
7232 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7233 && ! char_charset (c, charset_list, NULL))
7234 {
7235 positions = Fcons (make_number (from), positions);
7236 n--;
7237 if (n == 0)
7238 break;
7239 }
7240
7241 from++;
7242 }
7243
7244 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7245 }
7246
7247
7248 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7249 Scheck_coding_systems_region, 3, 3, 0,
7250 doc: /* Check if the region is encodable by coding systems.
7251
7252 START and END are buffer positions specifying the region.
7253 CODING-SYSTEM-LIST is a list of coding systems to check.
7254
7255 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7256 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7257 whole region, POS0, POS1, ... are buffer positions where non-encodable
7258 characters are found.
7259
7260 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7261 value is nil.
7262
7263 START may be a string. In that case, check if the string is
7264 encodable, and the value contains indices to the string instead of
7265 buffer positions. END is ignored. */)
7266 (start, end, coding_system_list)
7267 Lisp_Object start, end, coding_system_list;
7268 {
7269 Lisp_Object list;
7270 EMACS_INT start_byte, end_byte;
7271 int pos;
7272 const unsigned char *p, *pbeg, *pend;
7273 int c;
7274 Lisp_Object tail, elt;
7275
7276 if (STRINGP (start))
7277 {
7278 if (!STRING_MULTIBYTE (start)
7279 && SCHARS (start) != SBYTES (start))
7280 return Qnil;
7281 start_byte = 0;
7282 end_byte = SBYTES (start);
7283 pos = 0;
7284 }
7285 else
7286 {
7287 CHECK_NUMBER_COERCE_MARKER (start);
7288 CHECK_NUMBER_COERCE_MARKER (end);
7289 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7290 args_out_of_range (start, end);
7291 if (NILP (current_buffer->enable_multibyte_characters))
7292 return Qnil;
7293 start_byte = CHAR_TO_BYTE (XINT (start));
7294 end_byte = CHAR_TO_BYTE (XINT (end));
7295 if (XINT (end) - XINT (start) == end_byte - start_byte)
7296 return Qt;
7297
7298 if (XINT (start) < GPT && XINT (end) > GPT)
7299 {
7300 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7301 move_gap_both (XINT (start), start_byte);
7302 else
7303 move_gap_both (XINT (end), end_byte);
7304 }
7305 pos = XINT (start);
7306 }
7307
7308 list = Qnil;
7309 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7310 {
7311 elt = XCAR (tail);
7312 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7313 Qnil)),
7314 list);
7315 }
7316
7317 if (STRINGP (start))
7318 p = pbeg = SDATA (start);
7319 else
7320 p = pbeg = BYTE_POS_ADDR (start_byte);
7321 pend = p + (end_byte - start_byte);
7322
7323 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7324 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7325
7326 while (p < pend)
7327 {
7328 if (ASCII_BYTE_P (*p))
7329 p++;
7330 else
7331 {
7332 c = STRING_CHAR_ADVANCE (p);
7333
7334 charset_map_loaded = 0;
7335 for (tail = list; CONSP (tail); tail = XCDR (tail))
7336 {
7337 elt = XCDR (XCAR (tail));
7338 if (! char_encodable_p (c, XCAR (elt)))
7339 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7340 }
7341 if (charset_map_loaded)
7342 {
7343 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7344
7345 if (STRINGP (start))
7346 pbeg = SDATA (start);
7347 else
7348 pbeg = BYTE_POS_ADDR (start_byte);
7349 p = pbeg + p_offset;
7350 pend = pbeg + pend_offset;
7351 }
7352 }
7353 pos++;
7354 }
7355
7356 tail = list;
7357 list = Qnil;
7358 for (; CONSP (tail); tail = XCDR (tail))
7359 {
7360 elt = XCAR (tail);
7361 if (CONSP (XCDR (XCDR (elt))))
7362 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7363 list);
7364 }
7365
7366 return list;
7367 }
7368
7369
7370
7371 Lisp_Object
7372 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7373 Lisp_Object start, end, coding_system, dst_object;
7374 int encodep, norecord;
7375 {
7376 struct coding_system coding;
7377 EMACS_INT from, from_byte, to, to_byte;
7378 Lisp_Object src_object;
7379
7380 CHECK_NUMBER_COERCE_MARKER (start);
7381 CHECK_NUMBER_COERCE_MARKER (end);
7382 if (NILP (coding_system))
7383 coding_system = Qno_conversion;
7384 else
7385 CHECK_CODING_SYSTEM (coding_system);
7386 src_object = Fcurrent_buffer ();
7387 if (NILP (dst_object))
7388 dst_object = src_object;
7389 else if (! EQ (dst_object, Qt))
7390 CHECK_BUFFER (dst_object);
7391
7392 validate_region (&start, &end);
7393 from = XFASTINT (start);
7394 from_byte = CHAR_TO_BYTE (from);
7395 to = XFASTINT (end);
7396 to_byte = CHAR_TO_BYTE (to);
7397
7398 setup_coding_system (coding_system, &coding);
7399 coding.mode |= CODING_MODE_LAST_BLOCK;
7400
7401 if (encodep)
7402 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7403 dst_object);
7404 else
7405 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7406 dst_object);
7407 if (! norecord)
7408 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7409
7410 if (coding.result != CODING_RESULT_SUCCESS)
7411 error ("Code conversion error: %d", coding.result);
7412
7413 return (BUFFERP (dst_object)
7414 ? make_number (coding.produced_char)
7415 : coding.dst_object);
7416 }
7417
7418
7419 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7420 3, 4, "r\nzCoding system: ",
7421 doc: /* Decode the current region from the specified coding system.
7422 When called from a program, takes four arguments:
7423 START, END, CODING-SYSTEM, and DESTINATION.
7424 START and END are buffer positions.
7425
7426 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7427 If nil, the region between START and END is replace by the decoded text.
7428 If buffer, the decoded text is inserted in the buffer.
7429 If t, the decoded text is returned.
7430
7431 This function sets `last-coding-system-used' to the precise coding system
7432 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7433 not fully specified.)
7434 It returns the length of the decoded text. */)
7435 (start, end, coding_system, destination)
7436 Lisp_Object start, end, coding_system, destination;
7437 {
7438 return code_convert_region (start, end, coding_system, destination, 0, 0);
7439 }
7440
7441 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7442 3, 4, "r\nzCoding system: ",
7443 doc: /* Encode the current region by specified coding system.
7444 When called from a program, takes three arguments:
7445 START, END, and CODING-SYSTEM. START and END are buffer positions.
7446
7447 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7448 If nil, the region between START and END is replace by the encoded text.
7449 If buffer, the encoded text is inserted in the buffer.
7450 If t, the encoded text is returned.
7451
7452 This function sets `last-coding-system-used' to the precise coding system
7453 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7454 not fully specified.)
7455 It returns the length of the encoded text. */)
7456 (start, end, coding_system, destination)
7457 Lisp_Object start, end, coding_system, destination;
7458 {
7459 return code_convert_region (start, end, coding_system, destination, 1, 0);
7460 }
7461
7462 Lisp_Object
7463 code_convert_string (string, coding_system, dst_object,
7464 encodep, nocopy, norecord)
7465 Lisp_Object string, coding_system, dst_object;
7466 int encodep, nocopy, norecord;
7467 {
7468 struct coding_system coding;
7469 EMACS_INT chars, bytes;
7470
7471 CHECK_STRING (string);
7472 if (NILP (coding_system))
7473 {
7474 if (! norecord)
7475 Vlast_coding_system_used = Qno_conversion;
7476 if (NILP (dst_object))
7477 return (nocopy ? Fcopy_sequence (string) : string);
7478 }
7479
7480 if (NILP (coding_system))
7481 coding_system = Qno_conversion;
7482 else
7483 CHECK_CODING_SYSTEM (coding_system);
7484 if (NILP (dst_object))
7485 dst_object = Qt;
7486 else if (! EQ (dst_object, Qt))
7487 CHECK_BUFFER (dst_object);
7488
7489 setup_coding_system (coding_system, &coding);
7490 coding.mode |= CODING_MODE_LAST_BLOCK;
7491 chars = SCHARS (string);
7492 bytes = SBYTES (string);
7493 if (encodep)
7494 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7495 else
7496 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7497 if (! norecord)
7498 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7499
7500 if (coding.result != CODING_RESULT_SUCCESS)
7501 error ("Code conversion error: %d", coding.result);
7502
7503 return (BUFFERP (dst_object)
7504 ? make_number (coding.produced_char)
7505 : coding.dst_object);
7506 }
7507
7508
7509 /* Encode or decode STRING according to CODING_SYSTEM.
7510 Do not set Vlast_coding_system_used.
7511
7512 This function is called only from macros DECODE_FILE and
7513 ENCODE_FILE, thus we ignore character composition. */
7514
7515 Lisp_Object
7516 code_convert_string_norecord (string, coding_system, encodep)
7517 Lisp_Object string, coding_system;
7518 int encodep;
7519 {
7520 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7521 }
7522
7523
7524 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7525 2, 4, 0,
7526 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7527
7528 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7529 if the decoding operation is trivial.
7530
7531 Optional fourth arg BUFFER non-nil meant that the decoded text is
7532 inserted in BUFFER instead of returned as a string. In this case,
7533 the return value is BUFFER.
7534
7535 This function sets `last-coding-system-used' to the precise coding system
7536 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7537 not fully specified. */)
7538 (string, coding_system, nocopy, buffer)
7539 Lisp_Object string, coding_system, nocopy, buffer;
7540 {
7541 return code_convert_string (string, coding_system, buffer,
7542 0, ! NILP (nocopy), 0);
7543 }
7544
7545 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7546 2, 4, 0,
7547 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7548
7549 Optional third arg NOCOPY non-nil means it is OK to return STRING
7550 itself if the encoding operation is trivial.
7551
7552 Optional fourth arg BUFFER non-nil meant that the encoded text is
7553 inserted in BUFFER instead of returned as a string. In this case,
7554 the return value is BUFFER.
7555
7556 This function sets `last-coding-system-used' to the precise coding system
7557 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7558 not fully specified.) */)
7559 (string, coding_system, nocopy, buffer)
7560 Lisp_Object string, coding_system, nocopy, buffer;
7561 {
7562 return code_convert_string (string, coding_system, buffer,
7563 1, ! NILP (nocopy), 1);
7564 }
7565
7566 \f
7567 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7568 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7569 Return the corresponding character. */)
7570 (code)
7571 Lisp_Object code;
7572 {
7573 Lisp_Object spec, attrs, val;
7574 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7575 int c;
7576
7577 CHECK_NATNUM (code);
7578 c = XFASTINT (code);
7579 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7580 attrs = AREF (spec, 0);
7581
7582 if (ASCII_BYTE_P (c)
7583 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7584 return code;
7585
7586 val = CODING_ATTR_CHARSET_LIST (attrs);
7587 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7588 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7589 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7590
7591 if (c <= 0x7F)
7592 charset = charset_roman;
7593 else if (c >= 0xA0 && c < 0xDF)
7594 {
7595 charset = charset_kana;
7596 c -= 0x80;
7597 }
7598 else
7599 {
7600 int s1 = c >> 8, s2 = c & 0xFF;
7601
7602 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7603 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7604 error ("Invalid code: %d", code);
7605 SJIS_TO_JIS (c);
7606 charset = charset_kanji;
7607 }
7608 c = DECODE_CHAR (charset, c);
7609 if (c < 0)
7610 error ("Invalid code: %d", code);
7611 return make_number (c);
7612 }
7613
7614
7615 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7616 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7617 Return the corresponding code in SJIS. */)
7618 (ch)
7619 Lisp_Object ch;
7620 {
7621 Lisp_Object spec, attrs, charset_list;
7622 int c;
7623 struct charset *charset;
7624 unsigned code;
7625
7626 CHECK_CHARACTER (ch);
7627 c = XFASTINT (ch);
7628 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7629 attrs = AREF (spec, 0);
7630
7631 if (ASCII_CHAR_P (c)
7632 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7633 return ch;
7634
7635 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7636 charset = char_charset (c, charset_list, &code);
7637 if (code == CHARSET_INVALID_CODE (charset))
7638 error ("Can't encode by shift_jis encoding: %d", c);
7639 JIS_TO_SJIS (code);
7640
7641 return make_number (code);
7642 }
7643
7644 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7645 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7646 Return the corresponding character. */)
7647 (code)
7648 Lisp_Object code;
7649 {
7650 Lisp_Object spec, attrs, val;
7651 struct charset *charset_roman, *charset_big5, *charset;
7652 int c;
7653
7654 CHECK_NATNUM (code);
7655 c = XFASTINT (code);
7656 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7657 attrs = AREF (spec, 0);
7658
7659 if (ASCII_BYTE_P (c)
7660 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7661 return code;
7662
7663 val = CODING_ATTR_CHARSET_LIST (attrs);
7664 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7665 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7666
7667 if (c <= 0x7F)
7668 charset = charset_roman;
7669 else
7670 {
7671 int b1 = c >> 8, b2 = c & 0x7F;
7672 if (b1 < 0xA1 || b1 > 0xFE
7673 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7674 error ("Invalid code: %d", code);
7675 charset = charset_big5;
7676 }
7677 c = DECODE_CHAR (charset, (unsigned )c);
7678 if (c < 0)
7679 error ("Invalid code: %d", code);
7680 return make_number (c);
7681 }
7682
7683 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7684 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7685 Return the corresponding character code in Big5. */)
7686 (ch)
7687 Lisp_Object ch;
7688 {
7689 Lisp_Object spec, attrs, charset_list;
7690 struct charset *charset;
7691 int c;
7692 unsigned code;
7693
7694 CHECK_CHARACTER (ch);
7695 c = XFASTINT (ch);
7696 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7697 attrs = AREF (spec, 0);
7698 if (ASCII_CHAR_P (c)
7699 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7700 return ch;
7701
7702 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7703 charset = char_charset (c, charset_list, &code);
7704 if (code == CHARSET_INVALID_CODE (charset))
7705 error ("Can't encode by Big5 encoding: %d", c);
7706
7707 return make_number (code);
7708 }
7709
7710 \f
7711 DEFUN ("set-terminal-coding-system-internal",
7712 Fset_terminal_coding_system_internal,
7713 Sset_terminal_coding_system_internal, 1, 1, 0,
7714 doc: /* Internal use only. */)
7715 (coding_system)
7716 Lisp_Object coding_system;
7717 {
7718 CHECK_SYMBOL (coding_system);
7719 setup_coding_system (Fcheck_coding_system (coding_system),
7720 &terminal_coding);
7721
7722 /* We had better not send unsafe characters to terminal. */
7723 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7724 /* Characer composition should be disabled. */
7725 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7726 terminal_coding.src_multibyte = 1;
7727 terminal_coding.dst_multibyte = 0;
7728 return Qnil;
7729 }
7730
7731 DEFUN ("set-safe-terminal-coding-system-internal",
7732 Fset_safe_terminal_coding_system_internal,
7733 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7734 doc: /* Internal use only. */)
7735 (coding_system)
7736 Lisp_Object coding_system;
7737 {
7738 CHECK_SYMBOL (coding_system);
7739 setup_coding_system (Fcheck_coding_system (coding_system),
7740 &safe_terminal_coding);
7741 /* Characer composition should be disabled. */
7742 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7743 safe_terminal_coding.src_multibyte = 1;
7744 safe_terminal_coding.dst_multibyte = 0;
7745 return Qnil;
7746 }
7747
7748 DEFUN ("terminal-coding-system",
7749 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7750 doc: /* Return coding system specified for terminal output. */)
7751 ()
7752 {
7753 return CODING_ID_NAME (terminal_coding.id);
7754 }
7755
7756 DEFUN ("set-keyboard-coding-system-internal",
7757 Fset_keyboard_coding_system_internal,
7758 Sset_keyboard_coding_system_internal, 1, 1, 0,
7759 doc: /* Internal use only. */)
7760 (coding_system)
7761 Lisp_Object coding_system;
7762 {
7763 CHECK_SYMBOL (coding_system);
7764 setup_coding_system (Fcheck_coding_system (coding_system),
7765 &keyboard_coding);
7766 /* Characer composition should be disabled. */
7767 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7768 return Qnil;
7769 }
7770
7771 DEFUN ("keyboard-coding-system",
7772 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7773 doc: /* Return coding system specified for decoding keyboard input. */)
7774 ()
7775 {
7776 return CODING_ID_NAME (keyboard_coding.id);
7777 }
7778
7779 \f
7780 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7781 Sfind_operation_coding_system, 1, MANY, 0,
7782 doc: /* Choose a coding system for an operation based on the target name.
7783 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7784 DECODING-SYSTEM is the coding system to use for decoding
7785 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7786 for encoding (in case OPERATION does encoding).
7787
7788 The first argument OPERATION specifies an I/O primitive:
7789 For file I/O, `insert-file-contents' or `write-region'.
7790 For process I/O, `call-process', `call-process-region', or `start-process'.
7791 For network I/O, `open-network-stream'.
7792
7793 The remaining arguments should be the same arguments that were passed
7794 to the primitive. Depending on which primitive, one of those arguments
7795 is selected as the TARGET. For example, if OPERATION does file I/O,
7796 whichever argument specifies the file name is TARGET.
7797
7798 TARGET has a meaning which depends on OPERATION:
7799 For file I/O, TARGET is a file name.
7800 For process I/O, TARGET is a process name.
7801 For network I/O, TARGET is a service name or a port number
7802
7803 This function looks up what specified for TARGET in,
7804 `file-coding-system-alist', `process-coding-system-alist',
7805 or `network-coding-system-alist' depending on OPERATION.
7806 They may specify a coding system, a cons of coding systems,
7807 or a function symbol to call.
7808 In the last case, we call the function with one argument,
7809 which is a list of all the arguments given to this function.
7810
7811 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7812 (nargs, args)
7813 int nargs;
7814 Lisp_Object *args;
7815 {
7816 Lisp_Object operation, target_idx, target, val;
7817 register Lisp_Object chain;
7818
7819 if (nargs < 2)
7820 error ("Too few arguments");
7821 operation = args[0];
7822 if (!SYMBOLP (operation)
7823 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7824 error ("Invalid first arguement");
7825 if (nargs < 1 + XINT (target_idx))
7826 error ("Too few arguments for operation: %s",
7827 SDATA (SYMBOL_NAME (operation)));
7828 target = args[XINT (target_idx) + 1];
7829 if (!(STRINGP (target)
7830 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7831 error ("Invalid %dth argument", XINT (target_idx) + 1);
7832
7833 chain = ((EQ (operation, Qinsert_file_contents)
7834 || EQ (operation, Qwrite_region))
7835 ? Vfile_coding_system_alist
7836 : (EQ (operation, Qopen_network_stream)
7837 ? Vnetwork_coding_system_alist
7838 : Vprocess_coding_system_alist));
7839 if (NILP (chain))
7840 return Qnil;
7841
7842 for (; CONSP (chain); chain = XCDR (chain))
7843 {
7844 Lisp_Object elt;
7845
7846 elt = XCAR (chain);
7847 if (CONSP (elt)
7848 && ((STRINGP (target)
7849 && STRINGP (XCAR (elt))
7850 && fast_string_match (XCAR (elt), target) >= 0)
7851 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7852 {
7853 val = XCDR (elt);
7854 /* Here, if VAL is both a valid coding system and a valid
7855 function symbol, we return VAL as a coding system. */
7856 if (CONSP (val))
7857 return val;
7858 if (! SYMBOLP (val))
7859 return Qnil;
7860 if (! NILP (Fcoding_system_p (val)))
7861 return Fcons (val, val);
7862 if (! NILP (Ffboundp (val)))
7863 {
7864 val = call1 (val, Flist (nargs, args));
7865 if (CONSP (val))
7866 return val;
7867 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7868 return Fcons (val, val);
7869 }
7870 return Qnil;
7871 }
7872 }
7873 return Qnil;
7874 }
7875
7876 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7877 Sset_coding_system_priority, 0, MANY, 0,
7878 doc: /* Assign higher priority to the coding systems given as arguments.
7879 If multiple coding systems belongs to the same category,
7880 all but the first one are ignored.
7881
7882 usage: (set-coding-system-priority ...) */)
7883 (nargs, args)
7884 int nargs;
7885 Lisp_Object *args;
7886 {
7887 int i, j;
7888 int changed[coding_category_max];
7889 enum coding_category priorities[coding_category_max];
7890
7891 bzero (changed, sizeof changed);
7892
7893 for (i = j = 0; i < nargs; i++)
7894 {
7895 enum coding_category category;
7896 Lisp_Object spec, attrs;
7897
7898 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7899 attrs = AREF (spec, 0);
7900 category = XINT (CODING_ATTR_CATEGORY (attrs));
7901 if (changed[category])
7902 /* Ignore this coding system because a coding system of the
7903 same category already had a higher priority. */
7904 continue;
7905 changed[category] = 1;
7906 priorities[j++] = category;
7907 if (coding_categories[category].id >= 0
7908 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7909 setup_coding_system (args[i], &coding_categories[category]);
7910 Fset (AREF (Vcoding_category_table, category), args[i]);
7911 }
7912
7913 /* Now we have decided top J priorities. Reflect the order of the
7914 original priorities to the remaining priorities. */
7915
7916 for (i = j, j = 0; i < coding_category_max; i++, j++)
7917 {
7918 while (j < coding_category_max
7919 && changed[coding_priorities[j]])
7920 j++;
7921 if (j == coding_category_max)
7922 abort ();
7923 priorities[i] = coding_priorities[j];
7924 }
7925
7926 bcopy (priorities, coding_priorities, sizeof priorities);
7927
7928 /* Update `coding-category-list'. */
7929 Vcoding_category_list = Qnil;
7930 for (i = coding_category_max - 1; i >= 0; i--)
7931 Vcoding_category_list
7932 = Fcons (AREF (Vcoding_category_table, priorities[i]),
7933 Vcoding_category_list);
7934
7935 return Qnil;
7936 }
7937
7938 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7939 Scoding_system_priority_list, 0, 1, 0,
7940 doc: /* Return a list of coding systems ordered by their priorities.
7941 HIGHESTP non-nil means just return the highest priority one. */)
7942 (highestp)
7943 Lisp_Object highestp;
7944 {
7945 int i;
7946 Lisp_Object val;
7947
7948 for (i = 0, val = Qnil; i < coding_category_max; i++)
7949 {
7950 enum coding_category category = coding_priorities[i];
7951 int id = coding_categories[category].id;
7952 Lisp_Object attrs;
7953
7954 if (id < 0)
7955 continue;
7956 attrs = CODING_ID_ATTRS (id);
7957 if (! NILP (highestp))
7958 return CODING_ATTR_BASE_NAME (attrs);
7959 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7960 }
7961 return Fnreverse (val);
7962 }
7963
7964 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7965
7966 static Lisp_Object
7967 make_subsidiaries (base)
7968 Lisp_Object base;
7969 {
7970 Lisp_Object subsidiaries;
7971 int base_name_len = SBYTES (SYMBOL_NAME (base));
7972 char *buf = (char *) alloca (base_name_len + 6);
7973 int i;
7974
7975 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
7976 subsidiaries = Fmake_vector (make_number (3), Qnil);
7977 for (i = 0; i < 3; i++)
7978 {
7979 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7980 ASET (subsidiaries, i, intern (buf));
7981 }
7982 return subsidiaries;
7983 }
7984
7985
7986 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7987 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7988 doc: /* For internal use only.
7989 usage: (define-coding-system-internal ...) */)
7990 (nargs, args)
7991 int nargs;
7992 Lisp_Object *args;
7993 {
7994 Lisp_Object name;
7995 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7996 Lisp_Object attrs; /* Vector of attributes. */
7997 Lisp_Object eol_type;
7998 Lisp_Object aliases;
7999 Lisp_Object coding_type, charset_list, safe_charsets;
8000 enum coding_category category;
8001 Lisp_Object tail, val;
8002 int max_charset_id = 0;
8003 int i;
8004
8005 if (nargs < coding_arg_max)
8006 goto short_args;
8007
8008 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8009
8010 name = args[coding_arg_name];
8011 CHECK_SYMBOL (name);
8012 CODING_ATTR_BASE_NAME (attrs) = name;
8013
8014 val = args[coding_arg_mnemonic];
8015 if (! STRINGP (val))
8016 CHECK_CHARACTER (val);
8017 CODING_ATTR_MNEMONIC (attrs) = val;
8018
8019 coding_type = args[coding_arg_coding_type];
8020 CHECK_SYMBOL (coding_type);
8021 CODING_ATTR_TYPE (attrs) = coding_type;
8022
8023 charset_list = args[coding_arg_charset_list];
8024 if (SYMBOLP (charset_list))
8025 {
8026 if (EQ (charset_list, Qiso_2022))
8027 {
8028 if (! EQ (coding_type, Qiso_2022))
8029 error ("Invalid charset-list");
8030 charset_list = Viso_2022_charset_list;
8031 }
8032 else if (EQ (charset_list, Qemacs_mule))
8033 {
8034 if (! EQ (coding_type, Qemacs_mule))
8035 error ("Invalid charset-list");
8036 charset_list = Vemacs_mule_charset_list;
8037 }
8038 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8039 if (max_charset_id < XFASTINT (XCAR (tail)))
8040 max_charset_id = XFASTINT (XCAR (tail));
8041 }
8042 else
8043 {
8044 charset_list = Fcopy_sequence (charset_list);
8045 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8046 {
8047 struct charset *charset;
8048
8049 val = Fcar (tail);
8050 CHECK_CHARSET_GET_CHARSET (val, charset);
8051 if (EQ (coding_type, Qiso_2022)
8052 ? CHARSET_ISO_FINAL (charset) < 0
8053 : EQ (coding_type, Qemacs_mule)
8054 ? CHARSET_EMACS_MULE_ID (charset) < 0
8055 : 0)
8056 error ("Can't handle charset `%s'",
8057 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8058
8059 XSETCAR (tail, make_number (charset->id));
8060 if (max_charset_id < charset->id)
8061 max_charset_id = charset->id;
8062 }
8063 }
8064 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8065
8066 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8067 make_number (255));
8068 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8069 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8070 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8071
8072 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8073
8074 val = args[coding_arg_decode_translation_table];
8075 if (! NILP (val))
8076 CHECK_CHAR_TABLE (val);
8077 CODING_ATTR_DECODE_TBL (attrs) = val;
8078
8079 val = args[coding_arg_encode_translation_table];
8080 if (! NILP (val))
8081 CHECK_CHAR_TABLE (val);
8082 CODING_ATTR_ENCODE_TBL (attrs) = val;
8083
8084 val = args[coding_arg_post_read_conversion];
8085 CHECK_SYMBOL (val);
8086 CODING_ATTR_POST_READ (attrs) = val;
8087
8088 val = args[coding_arg_pre_write_conversion];
8089 CHECK_SYMBOL (val);
8090 CODING_ATTR_PRE_WRITE (attrs) = val;
8091
8092 val = args[coding_arg_default_char];
8093 if (NILP (val))
8094 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8095 else
8096 {
8097 CHECK_CHARACTER (val);
8098 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8099 }
8100
8101 val = args[coding_arg_for_unibyte];
8102 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8103
8104 val = args[coding_arg_plist];
8105 CHECK_LIST (val);
8106 CODING_ATTR_PLIST (attrs) = val;
8107
8108 if (EQ (coding_type, Qcharset))
8109 {
8110 /* Generate a lisp vector of 256 elements. Each element is nil,
8111 integer, or a list of charset IDs.
8112
8113 If Nth element is nil, the byte code N is invalid in this
8114 coding system.
8115
8116 If Nth element is a number NUM, N is the first byte of a
8117 charset whose ID is NUM.
8118
8119 If Nth element is a list of charset IDs, N is the first byte
8120 of one of them. The list is sorted by dimensions of the
8121 charsets. A charset of smaller dimension comes firtst. */
8122 Lisp_Object list;
8123 int maybe_ascii_compatible = 1;
8124
8125 for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
8126 {
8127 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8128
8129 if (charset->method == CHARSET_METHOD_SUPERSET)
8130 {
8131 val = CHARSET_SUPERSET (charset);
8132 for (; CONSP (val); val = XCDR (val))
8133 list = Fcons (XCAR (XCAR (val)), list);
8134 maybe_ascii_compatible = 0;
8135 }
8136 else
8137 list = Fcons (XCAR (tail), list);
8138 }
8139
8140 val = Fmake_vector (make_number (256), Qnil);
8141
8142 for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
8143 {
8144 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8145 int dim = CHARSET_DIMENSION (charset);
8146 int idx = (dim - 1) * 4;
8147
8148 if (CHARSET_ASCII_COMPATIBLE_P (charset)
8149 && maybe_ascii_compatible)
8150 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8151
8152 for (i = charset->code_space[idx];
8153 i <= charset->code_space[idx + 1]; i++)
8154 {
8155 Lisp_Object tmp, tmp2;
8156 int dim2;
8157
8158 tmp = AREF (val, i);
8159 if (NILP (tmp))
8160 tmp = XCAR (tail);
8161 else if (NUMBERP (tmp))
8162 {
8163 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8164 if (dim < dim2)
8165 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8166 else
8167 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8168 }
8169 else
8170 {
8171 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8172 {
8173 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8174 if (dim < dim2)
8175 break;
8176 }
8177 if (NILP (tmp2))
8178 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8179 else
8180 {
8181 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8182 XSETCAR (tmp2, XCAR (tail));
8183 }
8184 }
8185 ASET (val, i, tmp);
8186 }
8187 }
8188 ASET (attrs, coding_attr_charset_valids, val);
8189 category = coding_category_charset;
8190 }
8191 else if (EQ (coding_type, Qccl))
8192 {
8193 Lisp_Object valids;
8194
8195 if (nargs < coding_arg_ccl_max)
8196 goto short_args;
8197
8198 val = args[coding_arg_ccl_decoder];
8199 CHECK_CCL_PROGRAM (val);
8200 if (VECTORP (val))
8201 val = Fcopy_sequence (val);
8202 ASET (attrs, coding_attr_ccl_decoder, val);
8203
8204 val = args[coding_arg_ccl_encoder];
8205 CHECK_CCL_PROGRAM (val);
8206 if (VECTORP (val))
8207 val = Fcopy_sequence (val);
8208 ASET (attrs, coding_attr_ccl_encoder, val);
8209
8210 val = args[coding_arg_ccl_valids];
8211 valids = Fmake_string (make_number (256), make_number (0));
8212 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8213 {
8214 int from, to;
8215
8216 val = Fcar (tail);
8217 if (INTEGERP (val))
8218 {
8219 from = to = XINT (val);
8220 if (from < 0 || from > 255)
8221 args_out_of_range_3 (val, make_number (0), make_number (255));
8222 }
8223 else
8224 {
8225 CHECK_CONS (val);
8226 CHECK_NATNUM_CAR (val);
8227 CHECK_NATNUM_CDR (val);
8228 from = XINT (XCAR (val));
8229 if (from > 255)
8230 args_out_of_range_3 (XCAR (val),
8231 make_number (0), make_number (255));
8232 to = XINT (XCDR (val));
8233 if (to < from || to > 255)
8234 args_out_of_range_3 (XCDR (val),
8235 XCAR (val), make_number (255));
8236 }
8237 for (i = from; i <= to; i++)
8238 SSET (valids, i, 1);
8239 }
8240 ASET (attrs, coding_attr_ccl_valids, valids);
8241
8242 category = coding_category_ccl;
8243 }
8244 else if (EQ (coding_type, Qutf_16))
8245 {
8246 Lisp_Object bom, endian;
8247
8248 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8249
8250 if (nargs < coding_arg_utf16_max)
8251 goto short_args;
8252
8253 bom = args[coding_arg_utf16_bom];
8254 if (! NILP (bom) && ! EQ (bom, Qt))
8255 {
8256 CHECK_CONS (bom);
8257 val = XCAR (bom);
8258 CHECK_CODING_SYSTEM (val);
8259 val = XCDR (bom);
8260 CHECK_CODING_SYSTEM (val);
8261 }
8262 ASET (attrs, coding_attr_utf_16_bom, bom);
8263
8264 endian = args[coding_arg_utf16_endian];
8265 CHECK_SYMBOL (endian);
8266 if (NILP (endian))
8267 endian = Qbig;
8268 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8269 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8270 ASET (attrs, coding_attr_utf_16_endian, endian);
8271
8272 category = (CONSP (bom)
8273 ? coding_category_utf_16_auto
8274 : NILP (bom)
8275 ? (EQ (endian, Qbig)
8276 ? coding_category_utf_16_be_nosig
8277 : coding_category_utf_16_le_nosig)
8278 : (EQ (endian, Qbig)
8279 ? coding_category_utf_16_be
8280 : coding_category_utf_16_le));
8281 }
8282 else if (EQ (coding_type, Qiso_2022))
8283 {
8284 Lisp_Object initial, reg_usage, request, flags;
8285 int i;
8286
8287 if (nargs < coding_arg_iso2022_max)
8288 goto short_args;
8289
8290 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8291 CHECK_VECTOR (initial);
8292 for (i = 0; i < 4; i++)
8293 {
8294 val = Faref (initial, make_number (i));
8295 if (! NILP (val))
8296 {
8297 struct charset *charset;
8298
8299 CHECK_CHARSET_GET_CHARSET (val, charset);
8300 ASET (initial, i, make_number (CHARSET_ID (charset)));
8301 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8302 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8303 }
8304 else
8305 ASET (initial, i, make_number (-1));
8306 }
8307
8308 reg_usage = args[coding_arg_iso2022_reg_usage];
8309 CHECK_CONS (reg_usage);
8310 CHECK_NUMBER_CAR (reg_usage);
8311 CHECK_NUMBER_CDR (reg_usage);
8312
8313 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8314 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8315 {
8316 int id;
8317 Lisp_Object tmp;
8318
8319 val = Fcar (tail);
8320 CHECK_CONS (val);
8321 tmp = XCAR (val);
8322 CHECK_CHARSET_GET_ID (tmp, id);
8323 CHECK_NATNUM_CDR (val);
8324 if (XINT (XCDR (val)) >= 4)
8325 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8326 XSETCAR (val, make_number (id));
8327 }
8328
8329 flags = args[coding_arg_iso2022_flags];
8330 CHECK_NATNUM (flags);
8331 i = XINT (flags);
8332 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8333 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8334
8335 ASET (attrs, coding_attr_iso_initial, initial);
8336 ASET (attrs, coding_attr_iso_usage, reg_usage);
8337 ASET (attrs, coding_attr_iso_request, request);
8338 ASET (attrs, coding_attr_iso_flags, flags);
8339 setup_iso_safe_charsets (attrs);
8340
8341 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8342 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8343 | CODING_ISO_FLAG_SINGLE_SHIFT))
8344 ? coding_category_iso_7_else
8345 : EQ (args[coding_arg_charset_list], Qiso_2022)
8346 ? coding_category_iso_7
8347 : coding_category_iso_7_tight);
8348 else
8349 {
8350 int id = XINT (AREF (initial, 1));
8351
8352 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8353 || EQ (args[coding_arg_charset_list], Qiso_2022)
8354 || id < 0)
8355 ? coding_category_iso_8_else
8356 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8357 ? coding_category_iso_8_1
8358 : coding_category_iso_8_2);
8359 }
8360 if (category != coding_category_iso_8_1
8361 && category != coding_category_iso_8_2)
8362 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8363 }
8364 else if (EQ (coding_type, Qemacs_mule))
8365 {
8366 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8367 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8368 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8369 category = coding_category_emacs_mule;
8370 }
8371 else if (EQ (coding_type, Qshift_jis))
8372 {
8373
8374 struct charset *charset;
8375
8376 if (XINT (Flength (charset_list)) != 3)
8377 error ("There should be just three charsets");
8378
8379 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8380 if (CHARSET_DIMENSION (charset) != 1)
8381 error ("Dimension of charset %s is not one",
8382 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8383 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8384 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8385
8386 charset_list = XCDR (charset_list);
8387 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8388 if (CHARSET_DIMENSION (charset) != 1)
8389 error ("Dimension of charset %s is not one",
8390 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8391
8392 charset_list = XCDR (charset_list);
8393 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8394 if (CHARSET_DIMENSION (charset) != 2)
8395 error ("Dimension of charset %s is not two",
8396 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8397
8398 category = coding_category_sjis;
8399 Vsjis_coding_system = name;
8400 }
8401 else if (EQ (coding_type, Qbig5))
8402 {
8403 struct charset *charset;
8404
8405 if (XINT (Flength (charset_list)) != 2)
8406 error ("There should be just two charsets");
8407
8408 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8409 if (CHARSET_DIMENSION (charset) != 1)
8410 error ("Dimension of charset %s is not one",
8411 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8412 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8413 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8414
8415 charset_list = XCDR (charset_list);
8416 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8417 if (CHARSET_DIMENSION (charset) != 2)
8418 error ("Dimension of charset %s is not two",
8419 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8420
8421 category = coding_category_big5;
8422 Vbig5_coding_system = name;
8423 }
8424 else if (EQ (coding_type, Qraw_text))
8425 {
8426 category = coding_category_raw_text;
8427 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8428 }
8429 else if (EQ (coding_type, Qutf_8))
8430 {
8431 category = coding_category_utf_8;
8432 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8433 }
8434 else if (EQ (coding_type, Qundecided))
8435 category = coding_category_undecided;
8436 else
8437 error ("Invalid coding system type: %s",
8438 SDATA (SYMBOL_NAME (coding_type)));
8439
8440 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8441 CODING_ATTR_PLIST (attrs)
8442 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8443 CODING_ATTR_PLIST (attrs)));
8444
8445 eol_type = args[coding_arg_eol_type];
8446 if (! NILP (eol_type)
8447 && ! EQ (eol_type, Qunix)
8448 && ! EQ (eol_type, Qdos)
8449 && ! EQ (eol_type, Qmac))
8450 error ("Invalid eol-type");
8451
8452 aliases = Fcons (name, Qnil);
8453
8454 if (NILP (eol_type))
8455 {
8456 eol_type = make_subsidiaries (name);
8457 for (i = 0; i < 3; i++)
8458 {
8459 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8460
8461 this_name = AREF (eol_type, i);
8462 this_aliases = Fcons (this_name, Qnil);
8463 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8464 this_spec = Fmake_vector (make_number (3), attrs);
8465 ASET (this_spec, 1, this_aliases);
8466 ASET (this_spec, 2, this_eol_type);
8467 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8468 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8469 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8470 Vcoding_system_alist);
8471 }
8472 }
8473
8474 spec_vec = Fmake_vector (make_number (3), attrs);
8475 ASET (spec_vec, 1, aliases);
8476 ASET (spec_vec, 2, eol_type);
8477
8478 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8479 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8480 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8481 Vcoding_system_alist);
8482
8483 {
8484 int id = coding_categories[category].id;
8485
8486 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8487 setup_coding_system (name, &coding_categories[category]);
8488 }
8489
8490 return Qnil;
8491
8492 short_args:
8493 return Fsignal (Qwrong_number_of_arguments,
8494 Fcons (intern ("define-coding-system-internal"),
8495 make_number (nargs)));
8496 }
8497
8498
8499 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8500 Sdefine_coding_system_alias, 2, 2, 0,
8501 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8502 (alias, coding_system)
8503 Lisp_Object alias, coding_system;
8504 {
8505 Lisp_Object spec, aliases, eol_type;
8506
8507 CHECK_SYMBOL (alias);
8508 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8509 aliases = AREF (spec, 1);
8510 /* ALISES should be a list of length more than zero, and the first
8511 element is a base coding system. Append ALIAS at the tail of the
8512 list. */
8513 while (!NILP (XCDR (aliases)))
8514 aliases = XCDR (aliases);
8515 XSETCDR (aliases, Fcons (alias, Qnil));
8516
8517 eol_type = AREF (spec, 2);
8518 if (VECTORP (eol_type))
8519 {
8520 Lisp_Object subsidiaries;
8521 int i;
8522
8523 subsidiaries = make_subsidiaries (alias);
8524 for (i = 0; i < 3; i++)
8525 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8526 AREF (eol_type, i));
8527 }
8528
8529 Fputhash (alias, spec, Vcoding_system_hash_table);
8530 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8531 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8532 Vcoding_system_alist);
8533
8534 return Qnil;
8535 }
8536
8537 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8538 1, 1, 0,
8539 doc: /* Return the base of CODING-SYSTEM.
8540 Any alias or subsidiary coding system is not a base coding system. */)
8541 (coding_system)
8542 Lisp_Object coding_system;
8543 {
8544 Lisp_Object spec, attrs;
8545
8546 if (NILP (coding_system))
8547 return (Qno_conversion);
8548 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8549 attrs = AREF (spec, 0);
8550 return CODING_ATTR_BASE_NAME (attrs);
8551 }
8552
8553 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8554 1, 1, 0,
8555 doc: "Return the property list of CODING-SYSTEM.")
8556 (coding_system)
8557 Lisp_Object coding_system;
8558 {
8559 Lisp_Object spec, attrs;
8560
8561 if (NILP (coding_system))
8562 coding_system = Qno_conversion;
8563 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8564 attrs = AREF (spec, 0);
8565 return CODING_ATTR_PLIST (attrs);
8566 }
8567
8568
8569 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8570 1, 1, 0,
8571 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8572 (coding_system)
8573 Lisp_Object coding_system;
8574 {
8575 Lisp_Object spec;
8576
8577 if (NILP (coding_system))
8578 coding_system = Qno_conversion;
8579 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8580 return AREF (spec, 1);
8581 }
8582
8583 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8584 Scoding_system_eol_type, 1, 1, 0,
8585 doc: /* Return eol-type of CODING-SYSTEM.
8586 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8587
8588 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8589 and CR respectively.
8590
8591 A vector value indicates that a format of end-of-line should be
8592 detected automatically. Nth element of the vector is the subsidiary
8593 coding system whose eol-type is N. */)
8594 (coding_system)
8595 Lisp_Object coding_system;
8596 {
8597 Lisp_Object spec, eol_type;
8598 int n;
8599
8600 if (NILP (coding_system))
8601 coding_system = Qno_conversion;
8602 if (! CODING_SYSTEM_P (coding_system))
8603 return Qnil;
8604 spec = CODING_SYSTEM_SPEC (coding_system);
8605 eol_type = AREF (spec, 2);
8606 if (VECTORP (eol_type))
8607 return Fcopy_sequence (eol_type);
8608 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8609 return make_number (n);
8610 }
8611
8612 #endif /* emacs */
8613
8614 \f
8615 /*** 9. Post-amble ***/
8616
8617 void
8618 init_coding_once ()
8619 {
8620 int i;
8621
8622 for (i = 0; i < coding_category_max; i++)
8623 {
8624 coding_categories[i].id = -1;
8625 coding_priorities[i] = i;
8626 }
8627
8628 /* ISO2022 specific initialize routine. */
8629 for (i = 0; i < 0x20; i++)
8630 iso_code_class[i] = ISO_control_0;
8631 for (i = 0x21; i < 0x7F; i++)
8632 iso_code_class[i] = ISO_graphic_plane_0;
8633 for (i = 0x80; i < 0xA0; i++)
8634 iso_code_class[i] = ISO_control_1;
8635 for (i = 0xA1; i < 0xFF; i++)
8636 iso_code_class[i] = ISO_graphic_plane_1;
8637 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8638 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8639 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8640 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8641 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8642 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8643 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8644 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8645 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8646 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8647
8648 for (i = 0; i < 256; i++)
8649 {
8650 emacs_mule_bytes[i] = 1;
8651 }
8652 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8653 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8654 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8655 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8656 }
8657
8658 #ifdef emacs
8659
8660 void
8661 syms_of_coding ()
8662 {
8663 staticpro (&Vcoding_system_hash_table);
8664 {
8665 Lisp_Object args[2];
8666 args[0] = QCtest;
8667 args[1] = Qeq;
8668 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8669 }
8670
8671 staticpro (&Vsjis_coding_system);
8672 Vsjis_coding_system = Qnil;
8673
8674 staticpro (&Vbig5_coding_system);
8675 Vbig5_coding_system = Qnil;
8676
8677 staticpro (&Vcode_conversion_work_buf_list);
8678 Vcode_conversion_work_buf_list = Qnil;
8679
8680 staticpro (&Vcode_conversion_reused_work_buf);
8681 Vcode_conversion_reused_work_buf = Qnil;
8682
8683 DEFSYM (Qcharset, "charset");
8684 DEFSYM (Qtarget_idx, "target-idx");
8685 DEFSYM (Qcoding_system_history, "coding-system-history");
8686 Fset (Qcoding_system_history, Qnil);
8687
8688 /* Target FILENAME is the first argument. */
8689 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8690 /* Target FILENAME is the third argument. */
8691 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8692
8693 DEFSYM (Qcall_process, "call-process");
8694 /* Target PROGRAM is the first argument. */
8695 Fput (Qcall_process, Qtarget_idx, make_number (0));
8696
8697 DEFSYM (Qcall_process_region, "call-process-region");
8698 /* Target PROGRAM is the third argument. */
8699 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8700
8701 DEFSYM (Qstart_process, "start-process");
8702 /* Target PROGRAM is the third argument. */
8703 Fput (Qstart_process, Qtarget_idx, make_number (2));
8704
8705 DEFSYM (Qopen_network_stream, "open-network-stream");
8706 /* Target SERVICE is the fourth argument. */
8707 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8708
8709 DEFSYM (Qcoding_system, "coding-system");
8710 DEFSYM (Qcoding_aliases, "coding-aliases");
8711
8712 DEFSYM (Qeol_type, "eol-type");
8713 DEFSYM (Qunix, "unix");
8714 DEFSYM (Qdos, "dos");
8715
8716 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8717 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8718 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8719 DEFSYM (Qdefault_char, "default-char");
8720 DEFSYM (Qundecided, "undecided");
8721 DEFSYM (Qno_conversion, "no-conversion");
8722 DEFSYM (Qraw_text, "raw-text");
8723
8724 DEFSYM (Qiso_2022, "iso-2022");
8725
8726 DEFSYM (Qutf_8, "utf-8");
8727 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
8728
8729 DEFSYM (Qutf_16, "utf-16");
8730 DEFSYM (Qbig, "big");
8731 DEFSYM (Qlittle, "little");
8732
8733 DEFSYM (Qshift_jis, "shift-jis");
8734 DEFSYM (Qbig5, "big5");
8735
8736 DEFSYM (Qcoding_system_p, "coding-system-p");
8737
8738 DEFSYM (Qcoding_system_error, "coding-system-error");
8739 Fput (Qcoding_system_error, Qerror_conditions,
8740 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8741 Fput (Qcoding_system_error, Qerror_message,
8742 build_string ("Invalid coding system"));
8743
8744 /* Intern this now in case it isn't already done.
8745 Setting this variable twice is harmless.
8746 But don't staticpro it here--that is done in alloc.c. */
8747 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8748
8749 DEFSYM (Qtranslation_table, "translation-table");
8750 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8751 DEFSYM (Qtranslation_table_id, "translation-table-id");
8752 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8753 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8754
8755 DEFSYM (Qvalid_codes, "valid-codes");
8756
8757 DEFSYM (Qemacs_mule, "emacs-mule");
8758
8759 DEFSYM (QCcategory, ":category");
8760
8761 Vcoding_category_table
8762 = Fmake_vector (make_number (coding_category_max), Qnil);
8763 staticpro (&Vcoding_category_table);
8764 /* Followings are target of code detection. */
8765 ASET (Vcoding_category_table, coding_category_iso_7,
8766 intern ("coding-category-iso-7"));
8767 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8768 intern ("coding-category-iso-7-tight"));
8769 ASET (Vcoding_category_table, coding_category_iso_8_1,
8770 intern ("coding-category-iso-8-1"));
8771 ASET (Vcoding_category_table, coding_category_iso_8_2,
8772 intern ("coding-category-iso-8-2"));
8773 ASET (Vcoding_category_table, coding_category_iso_7_else,
8774 intern ("coding-category-iso-7-else"));
8775 ASET (Vcoding_category_table, coding_category_iso_8_else,
8776 intern ("coding-category-iso-8-else"));
8777 ASET (Vcoding_category_table, coding_category_utf_8,
8778 intern ("coding-category-utf-8"));
8779 ASET (Vcoding_category_table, coding_category_utf_16_be,
8780 intern ("coding-category-utf-16-be"));
8781 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8782 intern ("coding-category-utf-16-auto"));
8783 ASET (Vcoding_category_table, coding_category_utf_16_le,
8784 intern ("coding-category-utf-16-le"));
8785 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8786 intern ("coding-category-utf-16-be-nosig"));
8787 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8788 intern ("coding-category-utf-16-le-nosig"));
8789 ASET (Vcoding_category_table, coding_category_charset,
8790 intern ("coding-category-charset"));
8791 ASET (Vcoding_category_table, coding_category_sjis,
8792 intern ("coding-category-sjis"));
8793 ASET (Vcoding_category_table, coding_category_big5,
8794 intern ("coding-category-big5"));
8795 ASET (Vcoding_category_table, coding_category_ccl,
8796 intern ("coding-category-ccl"));
8797 ASET (Vcoding_category_table, coding_category_emacs_mule,
8798 intern ("coding-category-emacs-mule"));
8799 /* Followings are NOT target of code detection. */
8800 ASET (Vcoding_category_table, coding_category_raw_text,
8801 intern ("coding-category-raw-text"));
8802 ASET (Vcoding_category_table, coding_category_undecided,
8803 intern ("coding-category-undecided"));
8804
8805 defsubr (&Scoding_system_p);
8806 defsubr (&Sread_coding_system);
8807 defsubr (&Sread_non_nil_coding_system);
8808 defsubr (&Scheck_coding_system);
8809 defsubr (&Sdetect_coding_region);
8810 defsubr (&Sdetect_coding_string);
8811 defsubr (&Sfind_coding_systems_region_internal);
8812 defsubr (&Sunencodable_char_position);
8813 defsubr (&Scheck_coding_systems_region);
8814 defsubr (&Sdecode_coding_region);
8815 defsubr (&Sencode_coding_region);
8816 defsubr (&Sdecode_coding_string);
8817 defsubr (&Sencode_coding_string);
8818 defsubr (&Sdecode_sjis_char);
8819 defsubr (&Sencode_sjis_char);
8820 defsubr (&Sdecode_big5_char);
8821 defsubr (&Sencode_big5_char);
8822 defsubr (&Sset_terminal_coding_system_internal);
8823 defsubr (&Sset_safe_terminal_coding_system_internal);
8824 defsubr (&Sterminal_coding_system);
8825 defsubr (&Sset_keyboard_coding_system_internal);
8826 defsubr (&Skeyboard_coding_system);
8827 defsubr (&Sfind_operation_coding_system);
8828 defsubr (&Sset_coding_system_priority);
8829 defsubr (&Sdefine_coding_system_internal);
8830 defsubr (&Sdefine_coding_system_alias);
8831 defsubr (&Scoding_system_base);
8832 defsubr (&Scoding_system_plist);
8833 defsubr (&Scoding_system_aliases);
8834 defsubr (&Scoding_system_eol_type);
8835 defsubr (&Scoding_system_priority_list);
8836
8837 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8838 doc: /* List of coding systems.
8839
8840 Do not alter the value of this variable manually. This variable should be
8841 updated by the functions `define-coding-system' and
8842 `define-coding-system-alias'. */);
8843 Vcoding_system_list = Qnil;
8844
8845 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8846 doc: /* Alist of coding system names.
8847 Each element is one element list of coding system name.
8848 This variable is given to `completing-read' as TABLE argument.
8849
8850 Do not alter the value of this variable manually. This variable should be
8851 updated by the functions `make-coding-system' and
8852 `define-coding-system-alias'. */);
8853 Vcoding_system_alist = Qnil;
8854
8855 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8856 doc: /* List of coding-categories (symbols) ordered by priority.
8857
8858 On detecting a coding system, Emacs tries code detection algorithms
8859 associated with each coding-category one by one in this order. When
8860 one algorithm agrees with a byte sequence of source text, the coding
8861 system bound to the corresponding coding-category is selected. */);
8862 {
8863 int i;
8864
8865 Vcoding_category_list = Qnil;
8866 for (i = coding_category_max - 1; i >= 0; i--)
8867 Vcoding_category_list
8868 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8869 Vcoding_category_list);
8870 }
8871
8872 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8873 doc: /* Specify the coding system for read operations.
8874 It is useful to bind this variable with `let', but do not set it globally.
8875 If the value is a coding system, it is used for decoding on read operation.
8876 If not, an appropriate element is used from one of the coding system alists:
8877 There are three such tables, `file-coding-system-alist',
8878 `process-coding-system-alist', and `network-coding-system-alist'. */);
8879 Vcoding_system_for_read = Qnil;
8880
8881 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8882 doc: /* Specify the coding system for write operations.
8883 Programs bind this variable with `let', but you should not set it globally.
8884 If the value is a coding system, it is used for encoding of output,
8885 when writing it to a file and when sending it to a file or subprocess.
8886
8887 If this does not specify a coding system, an appropriate element
8888 is used from one of the coding system alists:
8889 There are three such tables, `file-coding-system-alist',
8890 `process-coding-system-alist', and `network-coding-system-alist'.
8891 For output to files, if the above procedure does not specify a coding system,
8892 the value of `buffer-file-coding-system' is used. */);
8893 Vcoding_system_for_write = Qnil;
8894
8895 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8896 doc: /*
8897 Coding system used in the latest file or process I/O. */);
8898 Vlast_coding_system_used = Qnil;
8899
8900 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8901 doc: /*
8902 *Non-nil means always inhibit code conversion of end-of-line format.
8903 See info node `Coding Systems' and info node `Text and Binary' concerning
8904 such conversion. */);
8905 inhibit_eol_conversion = 0;
8906
8907 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8908 doc: /*
8909 Non-nil means process buffer inherits coding system of process output.
8910 Bind it to t if the process output is to be treated as if it were a file
8911 read from some filesystem. */);
8912 inherit_process_coding_system = 0;
8913
8914 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8915 doc: /*
8916 Alist to decide a coding system to use for a file I/O operation.
8917 The format is ((PATTERN . VAL) ...),
8918 where PATTERN is a regular expression matching a file name,
8919 VAL is a coding system, a cons of coding systems, or a function symbol.
8920 If VAL is a coding system, it is used for both decoding and encoding
8921 the file contents.
8922 If VAL is a cons of coding systems, the car part is used for decoding,
8923 and the cdr part is used for encoding.
8924 If VAL is a function symbol, the function must return a coding system
8925 or a cons of coding systems which are used as above. The function gets
8926 the arguments with which `find-operation-coding-systems' was called.
8927
8928 See also the function `find-operation-coding-system'
8929 and the variable `auto-coding-alist'. */);
8930 Vfile_coding_system_alist = Qnil;
8931
8932 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8933 doc: /*
8934 Alist to decide a coding system to use for a process I/O operation.
8935 The format is ((PATTERN . VAL) ...),
8936 where PATTERN is a regular expression matching a program name,
8937 VAL is a coding system, a cons of coding systems, or a function symbol.
8938 If VAL is a coding system, it is used for both decoding what received
8939 from the program and encoding what sent to the program.
8940 If VAL is a cons of coding systems, the car part is used for decoding,
8941 and the cdr part is used for encoding.
8942 If VAL is a function symbol, the function must return a coding system
8943 or a cons of coding systems which are used as above.
8944
8945 See also the function `find-operation-coding-system'. */);
8946 Vprocess_coding_system_alist = Qnil;
8947
8948 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8949 doc: /*
8950 Alist to decide a coding system to use for a network I/O operation.
8951 The format is ((PATTERN . VAL) ...),
8952 where PATTERN is a regular expression matching a network service name
8953 or is a port number to connect to,
8954 VAL is a coding system, a cons of coding systems, or a function symbol.
8955 If VAL is a coding system, it is used for both decoding what received
8956 from the network stream and encoding what sent to the network stream.
8957 If VAL is a cons of coding systems, the car part is used for decoding,
8958 and the cdr part is used for encoding.
8959 If VAL is a function symbol, the function must return a coding system
8960 or a cons of coding systems which are used as above.
8961
8962 See also the function `find-operation-coding-system'. */);
8963 Vnetwork_coding_system_alist = Qnil;
8964
8965 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8966 doc: /* Coding system to use with system messages.
8967 Also used for decoding keyboard input on X Window system. */);
8968 Vlocale_coding_system = Qnil;
8969
8970 /* The eol mnemonics are reset in startup.el system-dependently. */
8971 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8972 doc: /*
8973 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8974 eol_mnemonic_unix = build_string (":");
8975
8976 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8977 doc: /*
8978 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8979 eol_mnemonic_dos = build_string ("\\");
8980
8981 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8982 doc: /*
8983 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8984 eol_mnemonic_mac = build_string ("/");
8985
8986 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8987 doc: /*
8988 *String displayed in mode line when end-of-line format is not yet determined. */);
8989 eol_mnemonic_undecided = build_string (":");
8990
8991 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8992 doc: /*
8993 *Non-nil enables character translation while encoding and decoding. */);
8994 Venable_character_translation = Qt;
8995
8996 DEFVAR_LISP ("standard-translation-table-for-decode",
8997 &Vstandard_translation_table_for_decode,
8998 doc: /* Table for translating characters while decoding. */);
8999 Vstandard_translation_table_for_decode = Qnil;
9000
9001 DEFVAR_LISP ("standard-translation-table-for-encode",
9002 &Vstandard_translation_table_for_encode,
9003 doc: /* Table for translating characters while encoding. */);
9004 Vstandard_translation_table_for_encode = Qnil;
9005
9006 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9007 doc: /* Alist of charsets vs revision numbers.
9008 While encoding, if a charset (car part of an element) is found,
9009 designate it with the escape sequence identifying revision (cdr part
9010 of the element). */);
9011 Vcharset_revision_table = Qnil;
9012
9013 DEFVAR_LISP ("default-process-coding-system",
9014 &Vdefault_process_coding_system,
9015 doc: /* Cons of coding systems used for process I/O by default.
9016 The car part is used for decoding a process output,
9017 the cdr part is used for encoding a text to be sent to a process. */);
9018 Vdefault_process_coding_system = Qnil;
9019
9020 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9021 doc: /*
9022 Table of extra Latin codes in the range 128..159 (inclusive).
9023 This is a vector of length 256.
9024 If Nth element is non-nil, the existence of code N in a file
9025 \(or output of subprocess) doesn't prevent it to be detected as
9026 a coding system of ISO 2022 variant which has a flag
9027 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9028 or reading output of a subprocess.
9029 Only 128th through 159th elements has a meaning. */);
9030 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9031
9032 DEFVAR_LISP ("select-safe-coding-system-function",
9033 &Vselect_safe_coding_system_function,
9034 doc: /*
9035 Function to call to select safe coding system for encoding a text.
9036
9037 If set, this function is called to force a user to select a proper
9038 coding system which can encode the text in the case that a default
9039 coding system used in each operation can't encode the text.
9040
9041 The default value is `select-safe-coding-system' (which see). */);
9042 Vselect_safe_coding_system_function = Qnil;
9043
9044 DEFVAR_BOOL ("coding-system-require-warning",
9045 &coding_system_require_warning,
9046 doc: /* Internal use only.
9047 If non-nil, on writing a file, `select-safe-coding-system-function' is
9048 called even if `coding-system-for-write' is non-nil. The command
9049 `universal-coding-system-argument' binds this variable to t temporarily. */);
9050 coding_system_require_warning = 0;
9051
9052
9053 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9054 &inhibit_iso_escape_detection,
9055 doc: /*
9056 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9057
9058 By default, on reading a file, Emacs tries to detect how the text is
9059 encoded. This code detection is sensitive to escape sequences. If
9060 the sequence is valid as ISO2022, the code is determined as one of
9061 the ISO2022 encodings, and the file is decoded by the corresponding
9062 coding system (e.g. `iso-2022-7bit').
9063
9064 However, there may be a case that you want to read escape sequences in
9065 a file as is. In such a case, you can set this variable to non-nil.
9066 Then, as the code detection ignores any escape sequences, no file is
9067 detected as encoded in some ISO2022 encoding. The result is that all
9068 escape sequences become visible in a buffer.
9069
9070 The default value is nil, and it is strongly recommended not to change
9071 it. That is because many Emacs Lisp source files that contain
9072 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9073 in Emacs's distribution, and they won't be decoded correctly on
9074 reading if you suppress escape sequence detection.
9075
9076 The other way to read escape sequences in a file without decoding is
9077 to explicitly specify some coding system that doesn't use ISO2022's
9078 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9079 inhibit_iso_escape_detection = 0;
9080
9081 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9082 doc: /* Char table for translating self-inserting characters.
9083 This is applied to the result of input methods, not their input. See also
9084 `keyboard-translate-table'. */);
9085 Vtranslation_table_for_input = Qnil;
9086
9087 {
9088 Lisp_Object args[coding_arg_max];
9089 Lisp_Object plist[16];
9090 int i;
9091
9092 for (i = 0; i < coding_arg_max; i++)
9093 args[i] = Qnil;
9094
9095 plist[0] = intern (":name");
9096 plist[1] = args[coding_arg_name] = Qno_conversion;
9097 plist[2] = intern (":mnemonic");
9098 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9099 plist[4] = intern (":coding-type");
9100 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9101 plist[6] = intern (":ascii-compatible-p");
9102 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9103 plist[8] = intern (":default-char");
9104 plist[9] = args[coding_arg_default_char] = make_number (0);
9105 plist[10] = intern (":for-unibyte");
9106 plist[11] = args[coding_arg_for_unibyte] = Qt;
9107 plist[12] = intern (":docstring");
9108 plist[13] = build_string ("Do no conversion.\n\
9109 \n\
9110 When you visit a file with this coding, the file is read into a\n\
9111 unibyte buffer as is, thus each byte of a file is treated as a\n\
9112 character.");
9113 plist[14] = intern (":eol-type");
9114 plist[15] = args[coding_arg_eol_type] = Qunix;
9115 args[coding_arg_plist] = Flist (16, plist);
9116 Fdefine_coding_system_internal (coding_arg_max, args);
9117 }
9118
9119 setup_coding_system (Qno_conversion, &keyboard_coding);
9120 setup_coding_system (Qno_conversion, &terminal_coding);
9121 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9122
9123 {
9124 int i;
9125
9126 for (i = 0; i < coding_category_max; i++)
9127 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9128 }
9129 }
9130
9131 char *
9132 emacs_strerror (error_number)
9133 int error_number;
9134 {
9135 char *str;
9136
9137 synchronize_system_messages_locale ();
9138 str = strerror (error_number);
9139
9140 if (! NILP (Vlocale_coding_system))
9141 {
9142 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9143 Vlocale_coding_system,
9144 0);
9145 str = (char *) SDATA (dec);
9146 }
9147
9148 return str;
9149 }
9150
9151 #endif /* emacs */