Add many prototypes for static functions.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf + coding->charbuf_used;
216 int *charbuf_end = coding->charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
317
318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
320 Lisp_Object Qstart_process, Qopen_network_stream;
321 Lisp_Object Qtarget_idx;
322
323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
324 Lisp_Object Qinterrupted, Qinsufficient_memory;
325
326 int coding_system_require_warning;
327
328 Lisp_Object Vselect_safe_coding_system_function;
329
330 /* Mnemonic string for each format of end-of-line. */
331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
332 /* Mnemonic string to indicate format of end-of-line is not yet
333 decided. */
334 Lisp_Object eol_mnemonic_undecided;
335
336 #ifdef emacs
337
338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
339
340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
341
342 /* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344 Lisp_Object Qemacs_mule, Qraw_text;
345 Lisp_Object Qutf_8_emacs;
346
347 /* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */
349 /* Coding-system for reading files and receiving data from process. */
350 Lisp_Object Vcoding_system_for_read;
351 /* Coding-system for writing files and sending data to process. */
352 Lisp_Object Vcoding_system_for_write;
353 /* Coding-system actually used in the latest I/O. */
354 Lisp_Object Vlast_coding_system_used;
355 /* Set to non-nil when an error is detected while code conversion. */
356 Lisp_Object Vlast_code_conversion_error;
357 /* A vector of length 256 which contains information about special
358 Latin codes (especially for dealing with Microsoft codes). */
359 Lisp_Object Vlatin_extra_code_table;
360
361 /* Flag to inhibit code conversion of end-of-line format. */
362 int inhibit_eol_conversion;
363
364 /* Flag to inhibit ISO2022 escape sequence detection. */
365 int inhibit_iso_escape_detection;
366
367 /* Flag to make buffer-file-coding-system inherit from process-coding. */
368 int inherit_process_coding_system;
369
370 /* Coding system to be used to encode text for terminal display. */
371 struct coding_system terminal_coding;
372
373 /* Coding system to be used to encode text for terminal display when
374 terminal coding system is nil. */
375 struct coding_system safe_terminal_coding;
376
377 /* Coding system of what is sent from terminal keyboard. */
378 struct coding_system keyboard_coding;
379
380 Lisp_Object Vfile_coding_system_alist;
381 Lisp_Object Vprocess_coding_system_alist;
382 Lisp_Object Vnetwork_coding_system_alist;
383
384 Lisp_Object Vlocale_coding_system;
385
386 #endif /* emacs */
387
388 /* Flag to tell if we look up translation table on character code
389 conversion. */
390 Lisp_Object Venable_character_translation;
391 /* Standard translation table to look up on decoding (reading). */
392 Lisp_Object Vstandard_translation_table_for_decode;
393 /* Standard translation table to look up on encoding (writing). */
394 Lisp_Object Vstandard_translation_table_for_encode;
395
396 Lisp_Object Qtranslation_table;
397 Lisp_Object Qtranslation_table_id;
398 Lisp_Object Qtranslation_table_for_decode;
399 Lisp_Object Qtranslation_table_for_encode;
400
401 /* Alist of charsets vs revision number. */
402 static Lisp_Object Vcharset_revision_table;
403
404 /* Default coding systems used for process I/O. */
405 Lisp_Object Vdefault_process_coding_system;
406
407 /* Char table for translating Quail and self-inserting input. */
408 Lisp_Object Vtranslation_table_for_input;
409
410 /* Two special coding systems. */
411 Lisp_Object Vsjis_coding_system;
412 Lisp_Object Vbig5_coding_system;
413
414 /* ISO2022 section */
415
416 #define CODING_ISO_INITIAL(coding, reg) \
417 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
418 coding_attr_iso_initial), \
419 reg)))
420
421
422 #define CODING_ISO_REQUEST(coding, charset_id) \
423 ((charset_id <= (coding)->max_charset_id \
424 ? (coding)->safe_charsets[charset_id] \
425 : -1))
426
427
428 #define CODING_ISO_FLAGS(coding) \
429 ((coding)->spec.iso_2022.flags)
430 #define CODING_ISO_DESIGNATION(coding, reg) \
431 ((coding)->spec.iso_2022.current_designation[reg])
432 #define CODING_ISO_INVOCATION(coding, plane) \
433 ((coding)->spec.iso_2022.current_invocation[plane])
434 #define CODING_ISO_SINGLE_SHIFTING(coding) \
435 ((coding)->spec.iso_2022.single_shifting)
436 #define CODING_ISO_BOL(coding) \
437 ((coding)->spec.iso_2022.bol)
438 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
439 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
440
441 /* Control characters of ISO2022. */
442 /* code */ /* function */
443 #define ISO_CODE_LF 0x0A /* line-feed */
444 #define ISO_CODE_CR 0x0D /* carriage-return */
445 #define ISO_CODE_SO 0x0E /* shift-out */
446 #define ISO_CODE_SI 0x0F /* shift-in */
447 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
448 #define ISO_CODE_ESC 0x1B /* escape */
449 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
450 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
451 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
452
453 /* All code (1-byte) of ISO2022 is classified into one of the
454 followings. */
455 enum iso_code_class_type
456 {
457 ISO_control_0, /* Control codes in the range
458 0x00..0x1F and 0x7F, except for the
459 following 5 codes. */
460 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
461 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
462 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
463 ISO_escape, /* ISO_CODE_SO (0x1B) */
464 ISO_control_1, /* Control codes in the range
465 0x80..0x9F, except for the
466 following 3 codes. */
467 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
468 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
469 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
470 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
471 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
472 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
473 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
474 };
475
476 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
477 `iso-flags' attribute of an iso2022 coding system. */
478
479 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
480 instead of the correct short-form sequence (e.g. ESC $ A). */
481 #define CODING_ISO_FLAG_LONG_FORM 0x0001
482
483 /* If set, reset graphic planes and registers at end-of-line to the
484 initial state. */
485 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
486
487 /* If set, reset graphic planes and registers before any control
488 characters to the initial state. */
489 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
490
491 /* If set, encode by 7-bit environment. */
492 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
493
494 /* If set, use locking-shift function. */
495 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
496
497 /* If set, use single-shift function. Overwrite
498 CODING_ISO_FLAG_LOCKING_SHIFT. */
499 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
500
501 /* If set, use designation escape sequence. */
502 #define CODING_ISO_FLAG_DESIGNATION 0x0040
503
504 /* If set, produce revision number sequence. */
505 #define CODING_ISO_FLAG_REVISION 0x0080
506
507 /* If set, produce ISO6429's direction specifying sequence. */
508 #define CODING_ISO_FLAG_DIRECTION 0x0100
509
510 /* If set, assume designation states are reset at beginning of line on
511 output. */
512 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
513
514 /* If set, designation sequence should be placed at beginning of line
515 on output. */
516 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
517
518 /* If set, do not encode unsafe charactes on output. */
519 #define CODING_ISO_FLAG_SAFE 0x0800
520
521 /* If set, extra latin codes (128..159) are accepted as a valid code
522 on input. */
523 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
524
525 #define CODING_ISO_FLAG_COMPOSITION 0x2000
526
527 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
528
529 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
530
531 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
532
533 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
534
535 /* A character to be produced on output if encoding of the original
536 character is prohibited by CODING_ISO_FLAG_SAFE. */
537 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
538
539
540 /* UTF-16 section */
541 #define CODING_UTF_16_BOM(coding) \
542 ((coding)->spec.utf_16.bom)
543
544 #define CODING_UTF_16_ENDIAN(coding) \
545 ((coding)->spec.utf_16.endian)
546
547 #define CODING_UTF_16_SURROGATE(coding) \
548 ((coding)->spec.utf_16.surrogate)
549
550
551 /* CCL section */
552 #define CODING_CCL_DECODER(coding) \
553 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
554 #define CODING_CCL_ENCODER(coding) \
555 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
556 #define CODING_CCL_VALIDS(coding) \
557 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
558
559 /* Index for each coding category in `coding_categories' */
560
561 enum coding_category
562 {
563 coding_category_iso_7,
564 coding_category_iso_7_tight,
565 coding_category_iso_8_1,
566 coding_category_iso_8_2,
567 coding_category_iso_7_else,
568 coding_category_iso_8_else,
569 coding_category_utf_8,
570 coding_category_utf_16_auto,
571 coding_category_utf_16_be,
572 coding_category_utf_16_le,
573 coding_category_utf_16_be_nosig,
574 coding_category_utf_16_le_nosig,
575 coding_category_charset,
576 coding_category_sjis,
577 coding_category_big5,
578 coding_category_ccl,
579 coding_category_emacs_mule,
580 /* All above are targets of code detection. */
581 coding_category_raw_text,
582 coding_category_undecided,
583 coding_category_max
584 };
585
586 /* Definitions of flag bits used in detect_coding_XXXX. */
587 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
588 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
589 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
590 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
591 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
592 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
593 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
594 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
595 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
596 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
597 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
598 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
599 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
600 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
601 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
602 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
603 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
604 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
605
606 /* This value is returned if detect_coding_mask () find nothing other
607 than ASCII characters. */
608 #define CATEGORY_MASK_ANY \
609 (CATEGORY_MASK_ISO_7 \
610 | CATEGORY_MASK_ISO_7_TIGHT \
611 | CATEGORY_MASK_ISO_8_1 \
612 | CATEGORY_MASK_ISO_8_2 \
613 | CATEGORY_MASK_ISO_7_ELSE \
614 | CATEGORY_MASK_ISO_8_ELSE \
615 | CATEGORY_MASK_UTF_8 \
616 | CATEGORY_MASK_UTF_16_BE \
617 | CATEGORY_MASK_UTF_16_LE \
618 | CATEGORY_MASK_UTF_16_BE_NOSIG \
619 | CATEGORY_MASK_UTF_16_LE_NOSIG \
620 | CATEGORY_MASK_CHARSET \
621 | CATEGORY_MASK_SJIS \
622 | CATEGORY_MASK_BIG5 \
623 | CATEGORY_MASK_CCL \
624 | CATEGORY_MASK_EMACS_MULE)
625
626
627 #define CATEGORY_MASK_ISO_7BIT \
628 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
629
630 #define CATEGORY_MASK_ISO_8BIT \
631 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
632
633 #define CATEGORY_MASK_ISO_ELSE \
634 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
635
636 #define CATEGORY_MASK_ISO_ESCAPE \
637 (CATEGORY_MASK_ISO_7 \
638 | CATEGORY_MASK_ISO_7_TIGHT \
639 | CATEGORY_MASK_ISO_7_ELSE \
640 | CATEGORY_MASK_ISO_8_ELSE)
641
642 #define CATEGORY_MASK_ISO \
643 ( CATEGORY_MASK_ISO_7BIT \
644 | CATEGORY_MASK_ISO_8BIT \
645 | CATEGORY_MASK_ISO_ELSE)
646
647 #define CATEGORY_MASK_UTF_16 \
648 (CATEGORY_MASK_UTF_16_BE \
649 | CATEGORY_MASK_UTF_16_LE \
650 | CATEGORY_MASK_UTF_16_BE_NOSIG \
651 | CATEGORY_MASK_UTF_16_LE_NOSIG)
652
653
654 /* List of symbols `coding-category-xxx' ordered by priority. This
655 variable is exposed to Emacs Lisp. */
656 static Lisp_Object Vcoding_category_list;
657
658 /* Table of coding categories (Lisp symbols). This variable is for
659 internal use oly. */
660 static Lisp_Object Vcoding_category_table;
661
662 /* Table of coding-categories ordered by priority. */
663 static enum coding_category coding_priorities[coding_category_max];
664
665 /* Nth element is a coding context for the coding system bound to the
666 Nth coding category. */
667 static struct coding_system coding_categories[coding_category_max];
668
669 /*** Commonly used macros and functions ***/
670
671 #ifndef min
672 #define min(a, b) ((a) < (b) ? (a) : (b))
673 #endif
674 #ifndef max
675 #define max(a, b) ((a) > (b) ? (a) : (b))
676 #endif
677
678 #define CODING_GET_INFO(coding, attrs, charset_list) \
679 do { \
680 (attrs) = CODING_ID_ATTRS ((coding)->id); \
681 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
682 } while (0)
683
684
685 /* Safely get one byte from the source text pointed by SRC which ends
686 at SRC_END, and set C to that byte. If there are not enough bytes
687 in the source, it jumps to `no_more_source'. If multibytep is
688 nonzero, and a multibyte character is found at SRC, set C to the
689 negative value of the character code. The caller should declare
690 and set these variables appropriately in advance:
691 src, src_end, multibytep */
692
693 #define ONE_MORE_BYTE(c) \
694 do { \
695 if (src == src_end) \
696 { \
697 if (src_base < src) \
698 record_conversion_result \
699 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
700 goto no_more_source; \
701 } \
702 c = *src++; \
703 if (multibytep && (c & 0x80)) \
704 { \
705 if ((c & 0xFE) == 0xC0) \
706 c = ((c & 1) << 6) | *src++; \
707 else \
708 { \
709 c = - string_char (--src, &src, NULL); \
710 record_conversion_result \
711 (coding, CODING_RESULT_INVALID_SRC); \
712 } \
713 } \
714 consumed_chars++; \
715 } while (0)
716
717
718 #define ONE_MORE_BYTE_NO_CHECK(c) \
719 do { \
720 c = *src++; \
721 if (multibytep && (c & 0x80)) \
722 { \
723 if ((c & 0xFE) == 0xC0) \
724 c = ((c & 1) << 6) | *src++; \
725 else \
726 { \
727 c = - string_char (--src, &src, NULL); \
728 record_conversion_result \
729 (coding, CODING_RESULT_INVALID_SRC); \
730 } \
731 } \
732 consumed_chars++; \
733 } while (0)
734
735
736 /* Store a byte C in the place pointed by DST and increment DST to the
737 next free point, and increment PRODUCED_CHARS. The caller should
738 assure that C is 0..127, and declare and set the variable `dst'
739 appropriately in advance.
740 */
741
742
743 #define EMIT_ONE_ASCII_BYTE(c) \
744 do { \
745 produced_chars++; \
746 *dst++ = (c); \
747 } while (0)
748
749
750 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
751
752 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
753 do { \
754 produced_chars += 2; \
755 *dst++ = (c1), *dst++ = (c2); \
756 } while (0)
757
758
759 /* Store a byte C in the place pointed by DST and increment DST to the
760 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
761 nonzero, store in an appropriate multibyte from. The caller should
762 declare and set the variables `dst' and `multibytep' appropriately
763 in advance. */
764
765 #define EMIT_ONE_BYTE(c) \
766 do { \
767 produced_chars++; \
768 if (multibytep) \
769 { \
770 int ch = (c); \
771 if (ch >= 0x80) \
772 ch = BYTE8_TO_CHAR (ch); \
773 CHAR_STRING_ADVANCE (ch, dst); \
774 } \
775 else \
776 *dst++ = (c); \
777 } while (0)
778
779
780 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
781
782 #define EMIT_TWO_BYTES(c1, c2) \
783 do { \
784 produced_chars += 2; \
785 if (multibytep) \
786 { \
787 int ch; \
788 \
789 ch = (c1); \
790 if (ch >= 0x80) \
791 ch = BYTE8_TO_CHAR (ch); \
792 CHAR_STRING_ADVANCE (ch, dst); \
793 ch = (c2); \
794 if (ch >= 0x80) \
795 ch = BYTE8_TO_CHAR (ch); \
796 CHAR_STRING_ADVANCE (ch, dst); \
797 } \
798 else \
799 { \
800 *dst++ = (c1); \
801 *dst++ = (c2); \
802 } \
803 } while (0)
804
805
806 #define EMIT_THREE_BYTES(c1, c2, c3) \
807 do { \
808 EMIT_ONE_BYTE (c1); \
809 EMIT_TWO_BYTES (c2, c3); \
810 } while (0)
811
812
813 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
814 do { \
815 EMIT_TWO_BYTES (c1, c2); \
816 EMIT_TWO_BYTES (c3, c4); \
817 } while (0)
818
819
820 /* Prototypes for static functions. */
821 static void record_conversion_result P_ ((struct coding_system *coding,
822 enum coding_result_code result));
823 static int detect_coding_utf_8 P_ ((struct coding_system *,
824 struct coding_detection_info *info));
825 static void decode_coding_utf_8 P_ ((struct coding_system *));
826 static int encode_coding_utf_8 P_ ((struct coding_system *));
827
828 static int detect_coding_utf_16 P_ ((struct coding_system *,
829 struct coding_detection_info *info));
830 static void decode_coding_utf_16 P_ ((struct coding_system *));
831 static int encode_coding_utf_16 P_ ((struct coding_system *));
832
833 static int detect_coding_iso_2022 P_ ((struct coding_system *,
834 struct coding_detection_info *info));
835 static void decode_coding_iso_2022 P_ ((struct coding_system *));
836 static int encode_coding_iso_2022 P_ ((struct coding_system *));
837
838 static int detect_coding_emacs_mule P_ ((struct coding_system *,
839 struct coding_detection_info *info));
840 static void decode_coding_emacs_mule P_ ((struct coding_system *));
841 static int encode_coding_emacs_mule P_ ((struct coding_system *));
842
843 static int detect_coding_sjis P_ ((struct coding_system *,
844 struct coding_detection_info *info));
845 static void decode_coding_sjis P_ ((struct coding_system *));
846 static int encode_coding_sjis P_ ((struct coding_system *));
847
848 static int detect_coding_big5 P_ ((struct coding_system *,
849 struct coding_detection_info *info));
850 static void decode_coding_big5 P_ ((struct coding_system *));
851 static int encode_coding_big5 P_ ((struct coding_system *));
852
853 static int detect_coding_ccl P_ ((struct coding_system *,
854 struct coding_detection_info *info));
855 static void decode_coding_ccl P_ ((struct coding_system *));
856 static int encode_coding_ccl P_ ((struct coding_system *));
857
858 static void decode_coding_raw_text P_ ((struct coding_system *));
859 static int encode_coding_raw_text P_ ((struct coding_system *));
860
861 static void coding_set_source P_ ((struct coding_system *));
862 static void coding_set_destination P_ ((struct coding_system *));
863 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
864 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
865 EMACS_INT));
866 static unsigned char *alloc_destination P_ ((struct coding_system *,
867 EMACS_INT, unsigned char *));
868 static void setup_iso_safe_charsets P_ ((Lisp_Object));
869 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
870 int *, int *,
871 unsigned char *));
872 static int detect_eol P_ ((const unsigned char *,
873 EMACS_INT, enum coding_category));
874 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
875 static void decode_eol P_ ((struct coding_system *));
876 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
877 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
878 int, int *, int *));
879 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
880 static INLINE void produce_composition P_ ((struct coding_system *, int *,
881 EMACS_INT));
882 static INLINE void produce_charset P_ ((struct coding_system *, int *,
883 EMACS_INT));
884 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
885 static int decode_coding P_ ((struct coding_system *));
886 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
887 struct coding_system *,
888 int *, EMACS_INT *));
889 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
890 struct coding_system *,
891 int *, EMACS_INT *));
892 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
893 static int encode_coding P_ ((struct coding_system *));
894 static Lisp_Object make_conversion_work_buffer P_ ((int));
895 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
896 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
897 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
898
899 static void
900 record_conversion_result (struct coding_system *coding,
901 enum coding_result_code result)
902 {
903 coding->result = result;
904 switch (result)
905 {
906 case CODING_RESULT_INSUFFICIENT_SRC:
907 Vlast_code_conversion_error = Qinsufficient_source;
908 break;
909 case CODING_RESULT_INCONSISTENT_EOL:
910 Vlast_code_conversion_error = Qinconsistent_eol;
911 break;
912 case CODING_RESULT_INVALID_SRC:
913 Vlast_code_conversion_error = Qinvalid_source;
914 break;
915 case CODING_RESULT_INTERRUPT:
916 Vlast_code_conversion_error = Qinterrupted;
917 break;
918 case CODING_RESULT_INSUFFICIENT_MEM:
919 Vlast_code_conversion_error = Qinsufficient_memory;
920 break;
921 }
922 }
923
924 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
925 do { \
926 charset_map_loaded = 0; \
927 c = DECODE_CHAR (charset, code); \
928 if (charset_map_loaded) \
929 { \
930 const unsigned char *orig = coding->source; \
931 EMACS_INT offset; \
932 \
933 coding_set_source (coding); \
934 offset = coding->source - orig; \
935 src += offset; \
936 src_base += offset; \
937 src_end += offset; \
938 } \
939 } while (0)
940
941
942 #define ASSURE_DESTINATION(bytes) \
943 do { \
944 if (dst + (bytes) >= dst_end) \
945 { \
946 int more_bytes = charbuf_end - charbuf + (bytes); \
947 \
948 dst = alloc_destination (coding, more_bytes, dst); \
949 dst_end = coding->destination + coding->dst_bytes; \
950 } \
951 } while (0)
952
953
954
955 static void
956 coding_set_source (coding)
957 struct coding_system *coding;
958 {
959 if (BUFFERP (coding->src_object))
960 {
961 struct buffer *buf = XBUFFER (coding->src_object);
962
963 if (coding->src_pos < 0)
964 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
965 else
966 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
967 }
968 else if (STRINGP (coding->src_object))
969 {
970 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
971 }
972 else
973 /* Otherwise, the source is C string and is never relocated
974 automatically. Thus we don't have to update anything. */
975 ;
976 }
977
978 static void
979 coding_set_destination (coding)
980 struct coding_system *coding;
981 {
982 if (BUFFERP (coding->dst_object))
983 {
984 if (coding->src_pos < 0)
985 {
986 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
987 coding->dst_bytes = (GAP_END_ADDR
988 - (coding->src_bytes - coding->consumed)
989 - coding->destination);
990 }
991 else
992 {
993 /* We are sure that coding->dst_pos_byte is before the gap
994 of the buffer. */
995 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
996 + coding->dst_pos_byte - 1);
997 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
998 - coding->destination);
999 }
1000 }
1001 else
1002 /* Otherwise, the destination is C string and is never relocated
1003 automatically. Thus we don't have to update anything. */
1004 ;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (coding, bytes)
1010 struct coding_system *coding;
1011 EMACS_INT bytes;
1012 {
1013 coding->destination = (unsigned char *) xrealloc (coding->destination,
1014 coding->dst_bytes + bytes);
1015 coding->dst_bytes += bytes;
1016 }
1017
1018 static void
1019 coding_alloc_by_making_gap (coding, bytes)
1020 struct coding_system *coding;
1021 EMACS_INT bytes;
1022 {
1023 if (BUFFERP (coding->dst_object)
1024 && EQ (coding->src_object, coding->dst_object))
1025 {
1026 EMACS_INT add = coding->src_bytes - coding->consumed;
1027
1028 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1029 make_gap (bytes);
1030 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1031 }
1032 else
1033 {
1034 Lisp_Object this_buffer;
1035
1036 this_buffer = Fcurrent_buffer ();
1037 set_buffer_internal (XBUFFER (coding->dst_object));
1038 make_gap (bytes);
1039 set_buffer_internal (XBUFFER (this_buffer));
1040 }
1041 }
1042
1043
1044 static unsigned char *
1045 alloc_destination (coding, nbytes, dst)
1046 struct coding_system *coding;
1047 EMACS_INT nbytes;
1048 unsigned char *dst;
1049 {
1050 EMACS_INT offset = dst - coding->destination;
1051
1052 if (BUFFERP (coding->dst_object))
1053 coding_alloc_by_making_gap (coding, nbytes);
1054 else
1055 coding_alloc_by_realloc (coding, nbytes);
1056 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1057 coding_set_destination (coding);
1058 dst = coding->destination + offset;
1059 return dst;
1060 }
1061
1062 /** Macros for annotations. */
1063
1064 /* Maximum length of annotation data (sum of annotations for
1065 composition and charset). */
1066 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1067
1068 /* An annotation data is stored in the array coding->charbuf in this
1069 format:
1070 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1071 LENGTH is the number of elements in the annotation.
1072 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1073 NCHARS is the number of characters in the text annotated.
1074
1075 The format of the following elements depend on ANNOTATION_MASK.
1076
1077 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1078 follows:
1079 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1080 METHOD is one of enum composition_method.
1081 Optionnal COMPOSITION-COMPONENTS are characters and composition
1082 rules.
1083
1084 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1085 follows. */
1086
1087 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
1088 do { \
1089 *(buf)++ = -(len); \
1090 *(buf)++ = (mask); \
1091 *(buf)++ = (nchars); \
1092 coding->annotated = 1; \
1093 } while (0);
1094
1095 #define ADD_COMPOSITION_DATA(buf, nchars, method) \
1096 do { \
1097 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1098 *buf++ = method; \
1099 } while (0)
1100
1101
1102 #define ADD_CHARSET_DATA(buf, nchars, id) \
1103 do { \
1104 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1105 *buf++ = id; \
1106 } while (0)
1107
1108 \f
1109 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1110
1111
1112
1113 \f
1114 /*** 3. UTF-8 ***/
1115
1116 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1117 Check if a text is encoded in UTF-8. If it is, return 1, else
1118 return 0. */
1119
1120 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1121 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1122 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1123 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1124 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1125 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1126
1127 static int
1128 detect_coding_utf_8 (coding, detect_info)
1129 struct coding_system *coding;
1130 struct coding_detection_info *detect_info;
1131 {
1132 const unsigned char *src = coding->source, *src_base;
1133 const unsigned char *src_end = coding->source + coding->src_bytes;
1134 int multibytep = coding->src_multibyte;
1135 int consumed_chars = 0;
1136 int found = 0;
1137
1138 detect_info->checked |= CATEGORY_MASK_UTF_8;
1139 /* A coding system of this category is always ASCII compatible. */
1140 src += coding->head_ascii;
1141
1142 while (1)
1143 {
1144 int c, c1, c2, c3, c4;
1145
1146 src_base = src;
1147 ONE_MORE_BYTE (c);
1148 if (c < 0 || UTF_8_1_OCTET_P (c))
1149 continue;
1150 ONE_MORE_BYTE (c1);
1151 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1152 break;
1153 if (UTF_8_2_OCTET_LEADING_P (c))
1154 {
1155 found = CATEGORY_MASK_UTF_8;
1156 continue;
1157 }
1158 ONE_MORE_BYTE (c2);
1159 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1160 break;
1161 if (UTF_8_3_OCTET_LEADING_P (c))
1162 {
1163 found = CATEGORY_MASK_UTF_8;
1164 continue;
1165 }
1166 ONE_MORE_BYTE (c3);
1167 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1168 break;
1169 if (UTF_8_4_OCTET_LEADING_P (c))
1170 {
1171 found = CATEGORY_MASK_UTF_8;
1172 continue;
1173 }
1174 ONE_MORE_BYTE (c4);
1175 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1176 break;
1177 if (UTF_8_5_OCTET_LEADING_P (c))
1178 {
1179 found = CATEGORY_MASK_UTF_8;
1180 continue;
1181 }
1182 break;
1183 }
1184 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1185 return 0;
1186
1187 no_more_source:
1188 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1189 {
1190 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1191 return 0;
1192 }
1193 detect_info->found |= found;
1194 return 1;
1195 }
1196
1197
1198 static void
1199 decode_coding_utf_8 (coding)
1200 struct coding_system *coding;
1201 {
1202 const unsigned char *src = coding->source + coding->consumed;
1203 const unsigned char *src_end = coding->source + coding->src_bytes;
1204 const unsigned char *src_base;
1205 int *charbuf = coding->charbuf + coding->charbuf_used;
1206 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1207 int consumed_chars = 0, consumed_chars_base;
1208 int multibytep = coding->src_multibyte;
1209 Lisp_Object attr, charset_list;
1210
1211 CODING_GET_INFO (coding, attr, charset_list);
1212
1213 while (1)
1214 {
1215 int c, c1, c2, c3, c4, c5;
1216
1217 src_base = src;
1218 consumed_chars_base = consumed_chars;
1219
1220 if (charbuf >= charbuf_end)
1221 break;
1222
1223 ONE_MORE_BYTE (c1);
1224 if (c1 < 0)
1225 {
1226 c = - c1;
1227 }
1228 else if (UTF_8_1_OCTET_P(c1))
1229 {
1230 c = c1;
1231 }
1232 else
1233 {
1234 ONE_MORE_BYTE (c2);
1235 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1236 goto invalid_code;
1237 if (UTF_8_2_OCTET_LEADING_P (c1))
1238 {
1239 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1240 /* Reject overlong sequences here and below. Encoders
1241 producing them are incorrect, they can be misleading,
1242 and they mess up read/write invariance. */
1243 if (c < 128)
1244 goto invalid_code;
1245 }
1246 else
1247 {
1248 ONE_MORE_BYTE (c3);
1249 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1250 goto invalid_code;
1251 if (UTF_8_3_OCTET_LEADING_P (c1))
1252 {
1253 c = (((c1 & 0xF) << 12)
1254 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1255 if (c < 0x800
1256 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1257 goto invalid_code;
1258 }
1259 else
1260 {
1261 ONE_MORE_BYTE (c4);
1262 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1263 goto invalid_code;
1264 if (UTF_8_4_OCTET_LEADING_P (c1))
1265 {
1266 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1267 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1268 if (c < 0x10000)
1269 goto invalid_code;
1270 }
1271 else
1272 {
1273 ONE_MORE_BYTE (c5);
1274 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1275 goto invalid_code;
1276 if (UTF_8_5_OCTET_LEADING_P (c1))
1277 {
1278 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1279 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1280 | (c5 & 0x3F));
1281 if ((c > MAX_CHAR) || (c < 0x200000))
1282 goto invalid_code;
1283 }
1284 else
1285 goto invalid_code;
1286 }
1287 }
1288 }
1289 }
1290
1291 *charbuf++ = c;
1292 continue;
1293
1294 invalid_code:
1295 src = src_base;
1296 consumed_chars = consumed_chars_base;
1297 ONE_MORE_BYTE (c);
1298 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1299 coding->errors++;
1300 }
1301
1302 no_more_source:
1303 coding->consumed_char += consumed_chars_base;
1304 coding->consumed = src_base - coding->source;
1305 coding->charbuf_used = charbuf - coding->charbuf;
1306 }
1307
1308
1309 static int
1310 encode_coding_utf_8 (coding)
1311 struct coding_system *coding;
1312 {
1313 int multibytep = coding->dst_multibyte;
1314 int *charbuf = coding->charbuf;
1315 int *charbuf_end = charbuf + coding->charbuf_used;
1316 unsigned char *dst = coding->destination + coding->produced;
1317 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1318 int produced_chars = 0;
1319 int c;
1320
1321 if (multibytep)
1322 {
1323 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1324
1325 while (charbuf < charbuf_end)
1326 {
1327 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1328
1329 ASSURE_DESTINATION (safe_room);
1330 c = *charbuf++;
1331 if (CHAR_BYTE8_P (c))
1332 {
1333 c = CHAR_TO_BYTE8 (c);
1334 EMIT_ONE_BYTE (c);
1335 }
1336 else
1337 {
1338 CHAR_STRING_ADVANCE (c, pend);
1339 for (p = str; p < pend; p++)
1340 EMIT_ONE_BYTE (*p);
1341 }
1342 }
1343 }
1344 else
1345 {
1346 int safe_room = MAX_MULTIBYTE_LENGTH;
1347
1348 while (charbuf < charbuf_end)
1349 {
1350 ASSURE_DESTINATION (safe_room);
1351 c = *charbuf++;
1352 dst += CHAR_STRING (c, dst);
1353 produced_chars++;
1354 }
1355 }
1356 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1357 coding->produced_char += produced_chars;
1358 coding->produced = dst - coding->destination;
1359 return 0;
1360 }
1361
1362
1363 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1364 Check if a text is encoded in one of UTF-16 based coding systems.
1365 If it is, return 1, else return 0. */
1366
1367 #define UTF_16_HIGH_SURROGATE_P(val) \
1368 (((val) & 0xFC00) == 0xD800)
1369
1370 #define UTF_16_LOW_SURROGATE_P(val) \
1371 (((val) & 0xFC00) == 0xDC00)
1372
1373 #define UTF_16_INVALID_P(val) \
1374 (((val) == 0xFFFE) \
1375 || ((val) == 0xFFFF) \
1376 || UTF_16_LOW_SURROGATE_P (val))
1377
1378
1379 static int
1380 detect_coding_utf_16 (coding, detect_info)
1381 struct coding_system *coding;
1382 struct coding_detection_info *detect_info;
1383 {
1384 const unsigned char *src = coding->source, *src_base = src;
1385 const unsigned char *src_end = coding->source + coding->src_bytes;
1386 int multibytep = coding->src_multibyte;
1387 int consumed_chars = 0;
1388 int c1, c2;
1389
1390 detect_info->checked |= CATEGORY_MASK_UTF_16;
1391 if (coding->mode & CODING_MODE_LAST_BLOCK
1392 && (coding->src_chars & 1))
1393 {
1394 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1395 return 0;
1396 }
1397
1398 ONE_MORE_BYTE (c1);
1399 ONE_MORE_BYTE (c2);
1400 if ((c1 == 0xFF) && (c2 == 0xFE))
1401 {
1402 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1403 | CATEGORY_MASK_UTF_16_AUTO);
1404 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1405 | CATEGORY_MASK_UTF_16_BE_NOSIG
1406 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1407 }
1408 else if ((c1 == 0xFE) && (c2 == 0xFF))
1409 {
1410 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1411 | CATEGORY_MASK_UTF_16_AUTO);
1412 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1413 | CATEGORY_MASK_UTF_16_BE_NOSIG
1414 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1415 }
1416 else if (c1 >= 0 && c2 >= 0)
1417 {
1418 unsigned char b1[256], b2[256];
1419 int b1_variants = 1, b2_variants = 1;
1420 int n;
1421
1422 bzero (b1, 256), bzero (b2, 256);
1423 b1[c1]++, b2[c2]++;
1424 for (n = 0; n < 256 && src < src_end; n++)
1425 {
1426 src_base = src;
1427 ONE_MORE_BYTE (c1);
1428 ONE_MORE_BYTE (c2);
1429 if (c1 < 0 || c2 < 0)
1430 break;
1431 if (! b1[c1++]) b1_variants++;
1432 if (! b2[c2++]) b2_variants++;
1433 }
1434 if (b1_variants < b2_variants)
1435 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1436 else
1437 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1438 detect_info->rejected
1439 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1440 }
1441 no_more_source:
1442 return 1;
1443 }
1444
1445 static void
1446 decode_coding_utf_16 (coding)
1447 struct coding_system *coding;
1448 {
1449 const unsigned char *src = coding->source + coding->consumed;
1450 const unsigned char *src_end = coding->source + coding->src_bytes;
1451 const unsigned char *src_base;
1452 int *charbuf = coding->charbuf + coding->charbuf_used;
1453 int *charbuf_end = coding->charbuf + coding->charbuf_size;
1454 int consumed_chars = 0, consumed_chars_base;
1455 int multibytep = coding->src_multibyte;
1456 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1457 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1458 int surrogate = CODING_UTF_16_SURROGATE (coding);
1459 Lisp_Object attr, charset_list;
1460
1461 CODING_GET_INFO (coding, attr, charset_list);
1462
1463 if (bom == utf_16_with_bom)
1464 {
1465 int c, c1, c2;
1466
1467 src_base = src;
1468 ONE_MORE_BYTE (c1);
1469 ONE_MORE_BYTE (c2);
1470 c = (c1 << 8) | c2;
1471
1472 if (endian == utf_16_big_endian
1473 ? c != 0xFEFF : c != 0xFFFE)
1474 {
1475 /* The first two bytes are not BOM. Treat them as bytes
1476 for a normal character. */
1477 src = src_base;
1478 coding->errors++;
1479 }
1480 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1481 }
1482 else if (bom == utf_16_detect_bom)
1483 {
1484 /* We have already tried to detect BOM and failed in
1485 detect_coding. */
1486 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1487 }
1488
1489 while (1)
1490 {
1491 int c, c1, c2;
1492
1493 src_base = src;
1494 consumed_chars_base = consumed_chars;
1495
1496 if (charbuf + 2 >= charbuf_end)
1497 break;
1498
1499 ONE_MORE_BYTE (c1);
1500 if (c1 < 0)
1501 {
1502 *charbuf++ = -c1;
1503 continue;
1504 }
1505 ONE_MORE_BYTE (c2);
1506 if (c2 < 0)
1507 {
1508 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1509 *charbuf++ = -c2;
1510 continue;
1511 }
1512 c = (endian == utf_16_big_endian
1513 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1514 if (surrogate)
1515 {
1516 if (! UTF_16_LOW_SURROGATE_P (c))
1517 {
1518 if (endian == utf_16_big_endian)
1519 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1520 else
1521 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1522 *charbuf++ = c1;
1523 *charbuf++ = c2;
1524 coding->errors++;
1525 if (UTF_16_HIGH_SURROGATE_P (c))
1526 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1527 else
1528 *charbuf++ = c;
1529 }
1530 else
1531 {
1532 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1533 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1534 *charbuf++ = 0x10000 + c;
1535 }
1536 }
1537 else
1538 {
1539 if (UTF_16_HIGH_SURROGATE_P (c))
1540 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1541 else
1542 *charbuf++ = c;
1543 }
1544 }
1545
1546 no_more_source:
1547 coding->consumed_char += consumed_chars_base;
1548 coding->consumed = src_base - coding->source;
1549 coding->charbuf_used = charbuf - coding->charbuf;
1550 }
1551
1552 static int
1553 encode_coding_utf_16 (coding)
1554 struct coding_system *coding;
1555 {
1556 int multibytep = coding->dst_multibyte;
1557 int *charbuf = coding->charbuf;
1558 int *charbuf_end = charbuf + coding->charbuf_used;
1559 unsigned char *dst = coding->destination + coding->produced;
1560 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1561 int safe_room = 8;
1562 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1563 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1564 int produced_chars = 0;
1565 Lisp_Object attrs, charset_list;
1566 int c;
1567
1568 CODING_GET_INFO (coding, attrs, charset_list);
1569
1570 if (bom != utf_16_without_bom)
1571 {
1572 ASSURE_DESTINATION (safe_room);
1573 if (big_endian)
1574 EMIT_TWO_BYTES (0xFE, 0xFF);
1575 else
1576 EMIT_TWO_BYTES (0xFF, 0xFE);
1577 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1578 }
1579
1580 while (charbuf < charbuf_end)
1581 {
1582 ASSURE_DESTINATION (safe_room);
1583 c = *charbuf++;
1584 if (c >= MAX_UNICODE_CHAR)
1585 c = coding->default_char;
1586
1587 if (c < 0x10000)
1588 {
1589 if (big_endian)
1590 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1591 else
1592 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1593 }
1594 else
1595 {
1596 int c1, c2;
1597
1598 c -= 0x10000;
1599 c1 = (c >> 10) + 0xD800;
1600 c2 = (c & 0x3FF) + 0xDC00;
1601 if (big_endian)
1602 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1603 else
1604 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1605 }
1606 }
1607 record_conversion_result (coding, CODING_RESULT_SUCCESS);
1608 coding->produced = dst - coding->destination;
1609 coding->produced_char += produced_chars;
1610 return 0;
1611 }
1612
1613 \f
1614 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1615
1616 /* Emacs' internal format for representation of multiple character
1617 sets is a kind of multi-byte encoding, i.e. characters are
1618 represented by variable-length sequences of one-byte codes.
1619
1620 ASCII characters and control characters (e.g. `tab', `newline') are
1621 represented by one-byte sequences which are their ASCII codes, in
1622 the range 0x00 through 0x7F.
1623
1624 8-bit characters of the range 0x80..0x9F are represented by
1625 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1626 code + 0x20).
1627
1628 8-bit characters of the range 0xA0..0xFF are represented by
1629 one-byte sequences which are their 8-bit code.
1630
1631 The other characters are represented by a sequence of `base
1632 leading-code', optional `extended leading-code', and one or two
1633 `position-code's. The length of the sequence is determined by the
1634 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1635 whereas extended leading-code and position-code take the range 0xA0
1636 through 0xFF. See `charset.h' for more details about leading-code
1637 and position-code.
1638
1639 --- CODE RANGE of Emacs' internal format ---
1640 character set range
1641 ------------- -----
1642 ascii 0x00..0x7F
1643 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1644 eight-bit-graphic 0xA0..0xBF
1645 ELSE 0x81..0x9D + [0xA0..0xFF]+
1646 ---------------------------------------------
1647
1648 As this is the internal character representation, the format is
1649 usually not used externally (i.e. in a file or in a data sent to a
1650 process). But, it is possible to have a text externally in this
1651 format (i.e. by encoding by the coding system `emacs-mule').
1652
1653 In that case, a sequence of one-byte codes has a slightly different
1654 form.
1655
1656 At first, all characters in eight-bit-control are represented by
1657 one-byte sequences which are their 8-bit code.
1658
1659 Next, character composition data are represented by the byte
1660 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1661 where,
1662 METHOD is 0xF0 plus one of composition method (enum
1663 composition_method),
1664
1665 BYTES is 0xA0 plus a byte length of this composition data,
1666
1667 CHARS is 0x20 plus a number of characters composed by this
1668 data,
1669
1670 COMPONENTs are characters of multibye form or composition
1671 rules encoded by two-byte of ASCII codes.
1672
1673 In addition, for backward compatibility, the following formats are
1674 also recognized as composition data on decoding.
1675
1676 0x80 MSEQ ...
1677 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1678
1679 Here,
1680 MSEQ is a multibyte form but in these special format:
1681 ASCII: 0xA0 ASCII_CODE+0x80,
1682 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1683 RULE is a one byte code of the range 0xA0..0xF0 that
1684 represents a composition rule.
1685 */
1686
1687 char emacs_mule_bytes[256];
1688
1689 int
1690 emacs_mule_char (coding, src, nbytes, nchars, id)
1691 struct coding_system *coding;
1692 const unsigned char *src;
1693 int *nbytes, *nchars, *id;
1694 {
1695 const unsigned char *src_end = coding->source + coding->src_bytes;
1696 const unsigned char *src_base = src;
1697 int multibytep = coding->src_multibyte;
1698 struct charset *charset;
1699 unsigned code;
1700 int c;
1701 int consumed_chars = 0;
1702
1703 ONE_MORE_BYTE (c);
1704 if (c < 0)
1705 {
1706 c = -c;
1707 charset = emacs_mule_charset[0];
1708 }
1709 else
1710 {
1711 switch (emacs_mule_bytes[c])
1712 {
1713 case 2:
1714 if (! (charset = emacs_mule_charset[c]))
1715 goto invalid_code;
1716 ONE_MORE_BYTE (c);
1717 if (c < 0)
1718 goto invalid_code;
1719 code = c & 0x7F;
1720 break;
1721
1722 case 3:
1723 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1724 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1725 {
1726 ONE_MORE_BYTE (c);
1727 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1728 goto invalid_code;
1729 ONE_MORE_BYTE (c);
1730 if (c < 0)
1731 goto invalid_code;
1732 code = c & 0x7F;
1733 }
1734 else
1735 {
1736 if (! (charset = emacs_mule_charset[c]))
1737 goto invalid_code;
1738 ONE_MORE_BYTE (c);
1739 if (c < 0)
1740 goto invalid_code;
1741 code = (c & 0x7F) << 8;
1742 ONE_MORE_BYTE (c);
1743 if (c < 0)
1744 goto invalid_code;
1745 code |= c & 0x7F;
1746 }
1747 break;
1748
1749 case 4:
1750 ONE_MORE_BYTE (c);
1751 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1752 goto invalid_code;
1753 ONE_MORE_BYTE (c);
1754 if (c < 0)
1755 goto invalid_code;
1756 code = (c & 0x7F) << 8;
1757 ONE_MORE_BYTE (c);
1758 if (c < 0)
1759 goto invalid_code;
1760 code |= c & 0x7F;
1761 break;
1762
1763 case 1:
1764 code = c;
1765 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1766 ? charset_ascii : charset_eight_bit);
1767 break;
1768
1769 default:
1770 abort ();
1771 }
1772 c = DECODE_CHAR (charset, code);
1773 if (c < 0)
1774 goto invalid_code;
1775 }
1776 *nbytes = src - src_base;
1777 *nchars = consumed_chars;
1778 if (id)
1779 *id = charset->id;
1780 return c;
1781
1782 no_more_source:
1783 return -2;
1784
1785 invalid_code:
1786 return -1;
1787 }
1788
1789
1790 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1791 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1792 else return 0. */
1793
1794 static int
1795 detect_coding_emacs_mule (coding, detect_info)
1796 struct coding_system *coding;
1797 struct coding_detection_info *detect_info;
1798 {
1799 const unsigned char *src = coding->source, *src_base;
1800 const unsigned char *src_end = coding->source + coding->src_bytes;
1801 int multibytep = coding->src_multibyte;
1802 int consumed_chars = 0;
1803 int c;
1804 int found = 0;
1805
1806 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1807 /* A coding system of this category is always ASCII compatible. */
1808 src += coding->head_ascii;
1809
1810 while (1)
1811 {
1812 src_base = src;
1813 ONE_MORE_BYTE (c);
1814 if (c < 0)
1815 continue;
1816 if (c == 0x80)
1817 {
1818 /* Perhaps the start of composite character. We simple skip
1819 it because analyzing it is too heavy for detecting. But,
1820 at least, we check that the composite character
1821 constitues of more than 4 bytes. */
1822 const unsigned char *src_base;
1823
1824 repeat:
1825 src_base = src;
1826 do
1827 {
1828 ONE_MORE_BYTE (c);
1829 }
1830 while (c >= 0xA0);
1831
1832 if (src - src_base <= 4)
1833 break;
1834 found = CATEGORY_MASK_EMACS_MULE;
1835 if (c == 0x80)
1836 goto repeat;
1837 }
1838
1839 if (c < 0x80)
1840 {
1841 if (c < 0x20
1842 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1843 break;
1844 }
1845 else
1846 {
1847 int more_bytes = emacs_mule_bytes[*src_base] - 1;
1848
1849 while (more_bytes > 0)
1850 {
1851 ONE_MORE_BYTE (c);
1852 if (c < 0xA0)
1853 {
1854 src--; /* Unread the last byte. */
1855 break;
1856 }
1857 more_bytes--;
1858 }
1859 if (more_bytes != 0)
1860 break;
1861 found = CATEGORY_MASK_EMACS_MULE;
1862 }
1863 }
1864 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1865 return 0;
1866
1867 no_more_source:
1868 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1869 {
1870 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1871 return 0;
1872 }
1873 detect_info->found |= found;
1874 return 1;
1875 }
1876
1877
1878 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1879
1880 /* Decode a character represented as a component of composition
1881 sequence of Emacs 20/21 style at SRC. Set C to that character and
1882 update SRC to the head of next character (or an encoded composition
1883 rule). If SRC doesn't points a composition component, set C to -1.
1884 If SRC points an invalid byte sequence, global exit by a return
1885 value 0. */
1886
1887 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1888 if (1) \
1889 { \
1890 int c; \
1891 int nbytes, nchars; \
1892 \
1893 if (src == src_end) \
1894 break; \
1895 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1896 if (c < 0) \
1897 { \
1898 if (c == -2) \
1899 break; \
1900 goto invalid_code; \
1901 } \
1902 *buf++ = c; \
1903 src += nbytes; \
1904 consumed_chars += nchars; \
1905 } \
1906 else
1907
1908
1909 /* Decode a composition rule represented as a component of composition
1910 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1911 and increment BUF. If SRC points an invalid byte sequence, set C
1912 to -1. */
1913
1914 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1915 do { \
1916 int c, gref, nref; \
1917 \
1918 if (src >= src_end) \
1919 goto invalid_code; \
1920 ONE_MORE_BYTE_NO_CHECK (c); \
1921 c -= 0x20; \
1922 if (c < 0 || c >= 81) \
1923 goto invalid_code; \
1924 \
1925 gref = c / 9, nref = c % 9; \
1926 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1927 } while (0)
1928
1929
1930 /* Decode a composition rule represented as a component of composition
1931 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1932 and increment BUF. If SRC points an invalid byte sequence, set C
1933 to -1. */
1934
1935 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1936 do { \
1937 int gref, nref; \
1938 \
1939 if (src + 1>= src_end) \
1940 goto invalid_code; \
1941 ONE_MORE_BYTE_NO_CHECK (gref); \
1942 gref -= 0x20; \
1943 ONE_MORE_BYTE_NO_CHECK (nref); \
1944 nref -= 0x20; \
1945 if (gref < 0 || gref >= 81 \
1946 || nref < 0 || nref >= 81) \
1947 goto invalid_code; \
1948 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1949 } while (0)
1950
1951
1952 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1953 do { \
1954 /* Emacs 21 style format. The first three bytes at SRC are \
1955 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1956 the byte length of this composition information, CHARS is the \
1957 number of characters composed by this composition. */ \
1958 enum composition_method method = c - 0xF2; \
1959 int *charbuf_base = charbuf; \
1960 int consumed_chars_limit; \
1961 int nbytes, nchars; \
1962 \
1963 ONE_MORE_BYTE (c); \
1964 if (c < 0) \
1965 goto invalid_code; \
1966 nbytes = c - 0xA0; \
1967 if (nbytes < 3) \
1968 goto invalid_code; \
1969 ONE_MORE_BYTE (c); \
1970 if (c < 0) \
1971 goto invalid_code; \
1972 nchars = c - 0xA0; \
1973 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
1974 consumed_chars_limit = consumed_chars_base + nbytes; \
1975 if (method != COMPOSITION_RELATIVE) \
1976 { \
1977 int i = 0; \
1978 while (consumed_chars < consumed_chars_limit) \
1979 { \
1980 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1981 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1982 else \
1983 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1984 i++; \
1985 } \
1986 if (consumed_chars < consumed_chars_limit) \
1987 goto invalid_code; \
1988 charbuf_base[0] -= i; \
1989 } \
1990 } while (0)
1991
1992
1993 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1994 do { \
1995 /* Emacs 20 style format for relative composition. */ \
1996 /* Store multibyte form of characters to be composed. */ \
1997 enum composition_method method = COMPOSITION_RELATIVE; \
1998 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1999 int *buf = components; \
2000 int i, j; \
2001 \
2002 src = src_base; \
2003 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2004 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
2005 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2006 if (i < 2) \
2007 goto invalid_code; \
2008 ADD_COMPOSITION_DATA (charbuf, i, method); \
2009 for (j = 0; j < i; j++) \
2010 *charbuf++ = components[j]; \
2011 } while (0)
2012
2013
2014 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2015 do { \
2016 /* Emacs 20 style format for rule-base composition. */ \
2017 /* Store multibyte form of characters to be composed. */ \
2018 enum composition_method method = COMPOSITION_WITH_RULE; \
2019 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2020 int *buf = components; \
2021 int i, j; \
2022 \
2023 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2024 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
2025 { \
2026 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
2027 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2028 } \
2029 if (i < 1 || (buf - components) % 2 == 0) \
2030 goto invalid_code; \
2031 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
2032 goto no_more_source; \
2033 ADD_COMPOSITION_DATA (buf, i, method); \
2034 for (j = 0; j < i; j++) \
2035 *charbuf++ = components[j]; \
2036 for (j = 0; j < i; j += 2) \
2037 *charbuf++ = components[j]; \
2038 } while (0)
2039
2040
2041 static void
2042 decode_coding_emacs_mule (coding)
2043 struct coding_system *coding;
2044 {
2045 const unsigned char *src = coding->source + coding->consumed;
2046 const unsigned char *src_end = coding->source + coding->src_bytes;
2047 const unsigned char *src_base;
2048 int *charbuf = coding->charbuf + coding->charbuf_used;
2049 int *charbuf_end
2050 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2051 int consumed_chars = 0, consumed_chars_base;
2052 int multibytep = coding->src_multibyte;
2053 Lisp_Object attrs, charset_list;
2054 int char_offset = coding->produced_char;
2055 int last_offset = char_offset;
2056 int last_id = charset_ascii;
2057
2058 CODING_GET_INFO (coding, attrs, charset_list);
2059
2060 while (1)
2061 {
2062 int c;
2063
2064 src_base = src;
2065 consumed_chars_base = consumed_chars;
2066
2067 if (charbuf >= charbuf_end)
2068 break;
2069
2070 ONE_MORE_BYTE (c);
2071 if (c < 0)
2072 {
2073 *charbuf++ = -c;
2074 char_offset++;
2075 }
2076 else if (c < 0x80)
2077 {
2078 *charbuf++ = c;
2079 char_offset++;
2080 }
2081 else if (c == 0x80)
2082 {
2083 ONE_MORE_BYTE (c);
2084 if (c < 0)
2085 goto invalid_code;
2086 if (c - 0xF2 >= COMPOSITION_RELATIVE
2087 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2088 DECODE_EMACS_MULE_21_COMPOSITION (c);
2089 else if (c < 0xC0)
2090 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2091 else if (c == 0xFF)
2092 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2093 else
2094 goto invalid_code;
2095 }
2096 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2097 {
2098 int nbytes, nchars;
2099 int id;
2100
2101 src = src_base;
2102 consumed_chars = consumed_chars_base;
2103 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2104 if (c < 0)
2105 {
2106 if (c == -2)
2107 break;
2108 goto invalid_code;
2109 }
2110 if (last_id != id)
2111 {
2112 if (last_id != charset_ascii)
2113 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2114 last_id = id;
2115 last_offset = char_offset;
2116 }
2117 *charbuf++ = c;
2118 src += nbytes;
2119 consumed_chars += nchars;
2120 char_offset++;
2121 }
2122 continue;
2123
2124 invalid_code:
2125 src = src_base;
2126 consumed_chars = consumed_chars_base;
2127 ONE_MORE_BYTE (c);
2128 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2129 char_offset++;
2130 coding->errors++;
2131 }
2132
2133 no_more_source:
2134 if (last_id != charset_ascii)
2135 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2136 coding->consumed_char += consumed_chars_base;
2137 coding->consumed = src_base - coding->source;
2138 coding->charbuf_used = charbuf - coding->charbuf;
2139 }
2140
2141
2142 #define EMACS_MULE_LEADING_CODES(id, codes) \
2143 do { \
2144 if (id < 0xA0) \
2145 codes[0] = id, codes[1] = 0; \
2146 else if (id < 0xE0) \
2147 codes[0] = 0x9A, codes[1] = id; \
2148 else if (id < 0xF0) \
2149 codes[0] = 0x9B, codes[1] = id; \
2150 else if (id < 0xF5) \
2151 codes[0] = 0x9C, codes[1] = id; \
2152 else \
2153 codes[0] = 0x9D, codes[1] = id; \
2154 } while (0);
2155
2156
2157 static int
2158 encode_coding_emacs_mule (coding)
2159 struct coding_system *coding;
2160 {
2161 int multibytep = coding->dst_multibyte;
2162 int *charbuf = coding->charbuf;
2163 int *charbuf_end = charbuf + coding->charbuf_used;
2164 unsigned char *dst = coding->destination + coding->produced;
2165 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2166 int safe_room = 8;
2167 int produced_chars = 0;
2168 Lisp_Object attrs, charset_list;
2169 int c;
2170 int preferred_charset_id = -1;
2171
2172 CODING_GET_INFO (coding, attrs, charset_list);
2173 if (! EQ (charset_list, Vemacs_mule_charset_list))
2174 {
2175 CODING_ATTR_CHARSET_LIST (attrs)
2176 = charset_list = Vemacs_mule_charset_list;
2177 }
2178
2179 while (charbuf < charbuf_end)
2180 {
2181 ASSURE_DESTINATION (safe_room);
2182 c = *charbuf++;
2183
2184 if (c < 0)
2185 {
2186 /* Handle an annotation. */
2187 switch (*charbuf)
2188 {
2189 case CODING_ANNOTATE_COMPOSITION_MASK:
2190 /* Not yet implemented. */
2191 break;
2192 case CODING_ANNOTATE_CHARSET_MASK:
2193 preferred_charset_id = charbuf[3];
2194 if (preferred_charset_id >= 0
2195 && NILP (Fmemq (make_number (preferred_charset_id),
2196 charset_list)))
2197 preferred_charset_id = -1;
2198 break;
2199 default:
2200 abort ();
2201 }
2202 charbuf += -c - 1;
2203 continue;
2204 }
2205
2206 if (ASCII_CHAR_P (c))
2207 EMIT_ONE_ASCII_BYTE (c);
2208 else if (CHAR_BYTE8_P (c))
2209 {
2210 c = CHAR_TO_BYTE8 (c);
2211 EMIT_ONE_BYTE (c);
2212 }
2213 else
2214 {
2215 struct charset *charset;
2216 unsigned code;
2217 int dimension;
2218 int emacs_mule_id;
2219 unsigned char leading_codes[2];
2220
2221 if (preferred_charset_id >= 0)
2222 {
2223 charset = CHARSET_FROM_ID (preferred_charset_id);
2224 if (! CHAR_CHARSET_P (c, charset))
2225 charset = char_charset (c, charset_list, NULL);
2226 }
2227 else
2228 charset = char_charset (c, charset_list, &code);
2229 if (! charset)
2230 {
2231 c = coding->default_char;
2232 if (ASCII_CHAR_P (c))
2233 {
2234 EMIT_ONE_ASCII_BYTE (c);
2235 continue;
2236 }
2237 charset = char_charset (c, charset_list, &code);
2238 }
2239 dimension = CHARSET_DIMENSION (charset);
2240 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2241 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2242 EMIT_ONE_BYTE (leading_codes[0]);
2243 if (leading_codes[1])
2244 EMIT_ONE_BYTE (leading_codes[1]);
2245 if (dimension == 1)
2246 EMIT_ONE_BYTE (code | 0x80);
2247 else
2248 {
2249 code |= 0x8080;
2250 EMIT_ONE_BYTE (code >> 8);
2251 EMIT_ONE_BYTE (code & 0xFF);
2252 }
2253 }
2254 }
2255 record_conversion_result (coding, CODING_RESULT_SUCCESS);
2256 coding->produced_char += produced_chars;
2257 coding->produced = dst - coding->destination;
2258 return 0;
2259 }
2260
2261 \f
2262 /*** 7. ISO2022 handlers ***/
2263
2264 /* The following note describes the coding system ISO2022 briefly.
2265 Since the intention of this note is to help understand the
2266 functions in this file, some parts are NOT ACCURATE or are OVERLY
2267 SIMPLIFIED. For thorough understanding, please refer to the
2268 original document of ISO2022. This is equivalent to the standard
2269 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2270
2271 ISO2022 provides many mechanisms to encode several character sets
2272 in 7-bit and 8-bit environments. For 7-bit environments, all text
2273 is encoded using bytes less than 128. This may make the encoded
2274 text a little bit longer, but the text passes more easily through
2275 several types of gateway, some of which strip off the MSB (Most
2276 Significant Bit).
2277
2278 There are two kinds of character sets: control character sets and
2279 graphic character sets. The former contain control characters such
2280 as `newline' and `escape' to provide control functions (control
2281 functions are also provided by escape sequences). The latter
2282 contain graphic characters such as 'A' and '-'. Emacs recognizes
2283 two control character sets and many graphic character sets.
2284
2285 Graphic character sets are classified into one of the following
2286 four classes, according to the number of bytes (DIMENSION) and
2287 number of characters in one dimension (CHARS) of the set:
2288 - DIMENSION1_CHARS94
2289 - DIMENSION1_CHARS96
2290 - DIMENSION2_CHARS94
2291 - DIMENSION2_CHARS96
2292
2293 In addition, each character set is assigned an identification tag,
2294 unique for each set, called the "final character" (denoted as <F>
2295 hereafter). The <F> of each character set is decided by ECMA(*)
2296 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2297 (0x30..0x3F are for private use only).
2298
2299 Note (*): ECMA = European Computer Manufacturers Association
2300
2301 Here are examples of graphic character sets [NAME(<F>)]:
2302 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2303 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2304 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2305 o DIMENSION2_CHARS96 -- none for the moment
2306
2307 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2308 C0 [0x00..0x1F] -- control character plane 0
2309 GL [0x20..0x7F] -- graphic character plane 0
2310 C1 [0x80..0x9F] -- control character plane 1
2311 GR [0xA0..0xFF] -- graphic character plane 1
2312
2313 A control character set is directly designated and invoked to C0 or
2314 C1 by an escape sequence. The most common case is that:
2315 - ISO646's control character set is designated/invoked to C0, and
2316 - ISO6429's control character set is designated/invoked to C1,
2317 and usually these designations/invocations are omitted in encoded
2318 text. In a 7-bit environment, only C0 can be used, and a control
2319 character for C1 is encoded by an appropriate escape sequence to
2320 fit into the environment. All control characters for C1 are
2321 defined to have corresponding escape sequences.
2322
2323 A graphic character set is at first designated to one of four
2324 graphic registers (G0 through G3), then these graphic registers are
2325 invoked to GL or GR. These designations and invocations can be
2326 done independently. The most common case is that G0 is invoked to
2327 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2328 these invocations and designations are omitted in encoded text.
2329 In a 7-bit environment, only GL can be used.
2330
2331 When a graphic character set of CHARS94 is invoked to GL, codes
2332 0x20 and 0x7F of the GL area work as control characters SPACE and
2333 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2334 be used.
2335
2336 There are two ways of invocation: locking-shift and single-shift.
2337 With locking-shift, the invocation lasts until the next different
2338 invocation, whereas with single-shift, the invocation affects the
2339 following character only and doesn't affect the locking-shift
2340 state. Invocations are done by the following control characters or
2341 escape sequences:
2342
2343 ----------------------------------------------------------------------
2344 abbrev function cntrl escape seq description
2345 ----------------------------------------------------------------------
2346 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2347 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2348 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2349 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2350 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2351 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2352 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2353 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2354 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2355 ----------------------------------------------------------------------
2356 (*) These are not used by any known coding system.
2357
2358 Control characters for these functions are defined by macros
2359 ISO_CODE_XXX in `coding.h'.
2360
2361 Designations are done by the following escape sequences:
2362 ----------------------------------------------------------------------
2363 escape sequence description
2364 ----------------------------------------------------------------------
2365 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2366 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2367 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2368 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2369 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2370 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2371 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2372 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2373 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2374 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2375 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2376 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2377 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2378 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2379 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2380 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2381 ----------------------------------------------------------------------
2382
2383 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2384 of dimension 1, chars 94, and final character <F>, etc...
2385
2386 Note (*): Although these designations are not allowed in ISO2022,
2387 Emacs accepts them on decoding, and produces them on encoding
2388 CHARS96 character sets in a coding system which is characterized as
2389 7-bit environment, non-locking-shift, and non-single-shift.
2390
2391 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2392 '(' must be omitted. We refer to this as "short-form" hereafter.
2393
2394 Now you may notice that there are a lot of ways of encoding the
2395 same multilingual text in ISO2022. Actually, there exist many
2396 coding systems such as Compound Text (used in X11's inter client
2397 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2398 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2399 localized platforms), and all of these are variants of ISO2022.
2400
2401 In addition to the above, Emacs handles two more kinds of escape
2402 sequences: ISO6429's direction specification and Emacs' private
2403 sequence for specifying character composition.
2404
2405 ISO6429's direction specification takes the following form:
2406 o CSI ']' -- end of the current direction
2407 o CSI '0' ']' -- end of the current direction
2408 o CSI '1' ']' -- start of left-to-right text
2409 o CSI '2' ']' -- start of right-to-left text
2410 The control character CSI (0x9B: control sequence introducer) is
2411 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2412
2413 Character composition specification takes the following form:
2414 o ESC '0' -- start relative composition
2415 o ESC '1' -- end composition
2416 o ESC '2' -- start rule-base composition (*)
2417 o ESC '3' -- start relative composition with alternate chars (**)
2418 o ESC '4' -- start rule-base composition with alternate chars (**)
2419 Since these are not standard escape sequences of any ISO standard,
2420 the use of them with these meanings is restricted to Emacs only.
2421
2422 (*) This form is used only in Emacs 20.7 and older versions,
2423 but newer versions can safely decode it.
2424 (**) This form is used only in Emacs 21.1 and newer versions,
2425 and older versions can't decode it.
2426
2427 Here's a list of example usages of these composition escape
2428 sequences (categorized by `enum composition_method').
2429
2430 COMPOSITION_RELATIVE:
2431 ESC 0 CHAR [ CHAR ] ESC 1
2432 COMPOSITION_WITH_RULE:
2433 ESC 2 CHAR [ RULE CHAR ] ESC 1
2434 COMPOSITION_WITH_ALTCHARS:
2435 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2436 COMPOSITION_WITH_RULE_ALTCHARS:
2437 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2438
2439 enum iso_code_class_type iso_code_class[256];
2440
2441 #define SAFE_CHARSET_P(coding, id) \
2442 ((id) <= (coding)->max_charset_id \
2443 && (coding)->safe_charsets[id] >= 0)
2444
2445
2446 #define SHIFT_OUT_OK(category) \
2447 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2448
2449 static void
2450 setup_iso_safe_charsets (attrs)
2451 Lisp_Object attrs;
2452 {
2453 Lisp_Object charset_list, safe_charsets;
2454 Lisp_Object request;
2455 Lisp_Object reg_usage;
2456 Lisp_Object tail;
2457 int reg94, reg96;
2458 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2459 int max_charset_id;
2460
2461 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2462 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2463 && ! EQ (charset_list, Viso_2022_charset_list))
2464 {
2465 CODING_ATTR_CHARSET_LIST (attrs)
2466 = charset_list = Viso_2022_charset_list;
2467 ASET (attrs, coding_attr_safe_charsets, Qnil);
2468 }
2469
2470 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2471 return;
2472
2473 max_charset_id = 0;
2474 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2475 {
2476 int id = XINT (XCAR (tail));
2477 if (max_charset_id < id)
2478 max_charset_id = id;
2479 }
2480
2481 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2482 make_number (255));
2483 request = AREF (attrs, coding_attr_iso_request);
2484 reg_usage = AREF (attrs, coding_attr_iso_usage);
2485 reg94 = XINT (XCAR (reg_usage));
2486 reg96 = XINT (XCDR (reg_usage));
2487
2488 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2489 {
2490 Lisp_Object id;
2491 Lisp_Object reg;
2492 struct charset *charset;
2493
2494 id = XCAR (tail);
2495 charset = CHARSET_FROM_ID (XINT (id));
2496 reg = Fcdr (Fassq (id, request));
2497 if (! NILP (reg))
2498 SSET (safe_charsets, XINT (id), XINT (reg));
2499 else if (charset->iso_chars_96)
2500 {
2501 if (reg96 < 4)
2502 SSET (safe_charsets, XINT (id), reg96);
2503 }
2504 else
2505 {
2506 if (reg94 < 4)
2507 SSET (safe_charsets, XINT (id), reg94);
2508 }
2509 }
2510 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2511 }
2512
2513
2514 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2515 Check if a text is encoded in one of ISO-2022 based codig systems.
2516 If it is, return 1, else return 0. */
2517
2518 static int
2519 detect_coding_iso_2022 (coding, detect_info)
2520 struct coding_system *coding;
2521 struct coding_detection_info *detect_info;
2522 {
2523 const unsigned char *src = coding->source, *src_base = src;
2524 const unsigned char *src_end = coding->source + coding->src_bytes;
2525 int multibytep = coding->src_multibyte;
2526 int single_shifting = 0;
2527 int id;
2528 int c, c1;
2529 int consumed_chars = 0;
2530 int i;
2531 int rejected = 0;
2532 int found = 0;
2533
2534 detect_info->checked |= CATEGORY_MASK_ISO;
2535
2536 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2537 {
2538 struct coding_system *this = &(coding_categories[i]);
2539 Lisp_Object attrs, val;
2540
2541 attrs = CODING_ID_ATTRS (this->id);
2542 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2543 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2544 setup_iso_safe_charsets (attrs);
2545 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2546 this->max_charset_id = SCHARS (val) - 1;
2547 this->safe_charsets = (char *) SDATA (val);
2548 }
2549
2550 /* A coding system of this category is always ASCII compatible. */
2551 src += coding->head_ascii;
2552
2553 while (rejected != CATEGORY_MASK_ISO)
2554 {
2555 src_base = src;
2556 ONE_MORE_BYTE (c);
2557 switch (c)
2558 {
2559 case ISO_CODE_ESC:
2560 if (inhibit_iso_escape_detection)
2561 break;
2562 single_shifting = 0;
2563 ONE_MORE_BYTE (c);
2564 if (c >= '(' && c <= '/')
2565 {
2566 /* Designation sequence for a charset of dimension 1. */
2567 ONE_MORE_BYTE (c1);
2568 if (c1 < ' ' || c1 >= 0x80
2569 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2570 /* Invalid designation sequence. Just ignore. */
2571 break;
2572 }
2573 else if (c == '$')
2574 {
2575 /* Designation sequence for a charset of dimension 2. */
2576 ONE_MORE_BYTE (c);
2577 if (c >= '@' && c <= 'B')
2578 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2579 id = iso_charset_table[1][0][c];
2580 else if (c >= '(' && c <= '/')
2581 {
2582 ONE_MORE_BYTE (c1);
2583 if (c1 < ' ' || c1 >= 0x80
2584 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2585 /* Invalid designation sequence. Just ignore. */
2586 break;
2587 }
2588 else
2589 /* Invalid designation sequence. Just ignore it. */
2590 break;
2591 }
2592 else if (c == 'N' || c == 'O')
2593 {
2594 /* ESC <Fe> for SS2 or SS3. */
2595 single_shifting = 1;
2596 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2597 break;
2598 }
2599 else if (c >= '0' && c <= '4')
2600 {
2601 /* ESC <Fp> for start/end composition. */
2602 found |= CATEGORY_MASK_ISO;
2603 break;
2604 }
2605 else
2606 {
2607 /* Invalid escape sequence. Just ignore it. */
2608 break;
2609 }
2610
2611 /* We found a valid designation sequence for CHARSET. */
2612 rejected |= CATEGORY_MASK_ISO_8BIT;
2613 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2614 id))
2615 found |= CATEGORY_MASK_ISO_7;
2616 else
2617 rejected |= CATEGORY_MASK_ISO_7;
2618 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2619 id))
2620 found |= CATEGORY_MASK_ISO_7_TIGHT;
2621 else
2622 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2623 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2624 id))
2625 found |= CATEGORY_MASK_ISO_7_ELSE;
2626 else
2627 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2628 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2629 id))
2630 found |= CATEGORY_MASK_ISO_8_ELSE;
2631 else
2632 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2633 break;
2634
2635 case ISO_CODE_SO:
2636 case ISO_CODE_SI:
2637 /* Locking shift out/in. */
2638 if (inhibit_iso_escape_detection)
2639 break;
2640 single_shifting = 0;
2641 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2642 found |= CATEGORY_MASK_ISO_ELSE;
2643 break;
2644
2645 case ISO_CODE_CSI:
2646 /* Control sequence introducer. */
2647 single_shifting = 0;
2648 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2649 found |= CATEGORY_MASK_ISO_8_ELSE;
2650 goto check_extra_latin;
2651
2652 case ISO_CODE_SS2:
2653 case ISO_CODE_SS3:
2654 /* Single shift. */
2655 if (inhibit_iso_escape_detection)
2656 break;
2657 single_shifting = 0;
2658 rejected |= CATEGORY_MASK_ISO_7BIT;
2659 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2660 & CODING_ISO_FLAG_SINGLE_SHIFT)
2661 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2662 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2663 & CODING_ISO_FLAG_SINGLE_SHIFT)
2664 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2665 if (single_shifting)
2666 break;
2667 goto check_extra_latin;
2668
2669 default:
2670 if (c < 0)
2671 continue;
2672 if (c < 0x80)
2673 {
2674 single_shifting = 0;
2675 break;
2676 }
2677 if (c >= 0xA0)
2678 {
2679 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2680 found |= CATEGORY_MASK_ISO_8_1;
2681 /* Check the length of succeeding codes of the range
2682 0xA0..0FF. If the byte length is even, we include
2683 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2684 only when we are not single shifting. */
2685 if (! single_shifting
2686 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2687 {
2688 int i = 1;
2689 while (src < src_end)
2690 {
2691 ONE_MORE_BYTE (c);
2692 if (c < 0xA0)
2693 break;
2694 i++;
2695 }
2696
2697 if (i & 1 && src < src_end)
2698 rejected |= CATEGORY_MASK_ISO_8_2;
2699 else
2700 found |= CATEGORY_MASK_ISO_8_2;
2701 }
2702 break;
2703 }
2704 check_extra_latin:
2705 single_shifting = 0;
2706 if (! VECTORP (Vlatin_extra_code_table)
2707 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2708 {
2709 rejected = CATEGORY_MASK_ISO;
2710 break;
2711 }
2712 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2713 & CODING_ISO_FLAG_LATIN_EXTRA)
2714 found |= CATEGORY_MASK_ISO_8_1;
2715 else
2716 rejected |= CATEGORY_MASK_ISO_8_1;
2717 rejected |= CATEGORY_MASK_ISO_8_2;
2718 }
2719 }
2720 detect_info->rejected |= CATEGORY_MASK_ISO;
2721 return 0;
2722
2723 no_more_source:
2724 detect_info->rejected |= rejected;
2725 detect_info->found |= (found & ~rejected);
2726 return 1;
2727 }
2728
2729
2730 /* Set designation state into CODING. */
2731 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2732 do { \
2733 int id, prev; \
2734 \
2735 if (final < '0' || final >= 128 \
2736 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2737 || !SAFE_CHARSET_P (coding, id)) \
2738 { \
2739 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2740 goto invalid_code; \
2741 } \
2742 prev = CODING_ISO_DESIGNATION (coding, reg); \
2743 if (id == charset_jisx0201_roman) \
2744 { \
2745 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2746 id = charset_ascii; \
2747 } \
2748 else if (id == charset_jisx0208_1978) \
2749 { \
2750 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2751 id = charset_jisx0208; \
2752 } \
2753 CODING_ISO_DESIGNATION (coding, reg) = id; \
2754 /* If there was an invalid designation to REG previously, and this \
2755 designation is ASCII to REG, we should keep this designation \
2756 sequence. */ \
2757 if (prev == -2 && id == charset_ascii) \
2758 goto invalid_code; \
2759 } while (0)
2760
2761
2762 #define MAYBE_FINISH_COMPOSITION() \
2763 do { \
2764 int i; \
2765 if (composition_state == COMPOSING_NO) \
2766 break; \
2767 /* It is assured that we have enough room for producing \
2768 characters stored in the table `components'. */ \
2769 if (charbuf + component_idx > charbuf_end) \
2770 goto no_more_source; \
2771 composition_state = COMPOSING_NO; \
2772 if (method == COMPOSITION_RELATIVE \
2773 || method == COMPOSITION_WITH_ALTCHARS) \
2774 { \
2775 for (i = 0; i < component_idx; i++) \
2776 *charbuf++ = components[i]; \
2777 char_offset += component_idx; \
2778 } \
2779 else \
2780 { \
2781 for (i = 0; i < component_idx; i += 2) \
2782 *charbuf++ = components[i]; \
2783 char_offset += (component_idx / 2) + 1; \
2784 } \
2785 } while (0)
2786
2787
2788 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2789 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2790 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2791 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2792 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2793 */
2794
2795 #define DECODE_COMPOSITION_START(c1) \
2796 do { \
2797 if (c1 == '0' \
2798 && composition_state == COMPOSING_COMPONENT_RULE) \
2799 { \
2800 component_len = component_idx; \
2801 composition_state = COMPOSING_CHAR; \
2802 } \
2803 else \
2804 { \
2805 const unsigned char *p; \
2806 \
2807 MAYBE_FINISH_COMPOSITION (); \
2808 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2809 goto no_more_source; \
2810 for (p = src; p < src_end - 1; p++) \
2811 if (*p == ISO_CODE_ESC && p[1] == '1') \
2812 break; \
2813 if (p == src_end - 1) \
2814 { \
2815 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2816 goto invalid_code; \
2817 goto no_more_source; \
2818 } \
2819 \
2820 /* This is surely the start of a composition. */ \
2821 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2822 : c1 == '2' ? COMPOSITION_WITH_RULE \
2823 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2824 : COMPOSITION_WITH_RULE_ALTCHARS); \
2825 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2826 : COMPOSING_COMPONENT_CHAR); \
2827 component_idx = component_len = 0; \
2828 } \
2829 } while (0)
2830
2831
2832 /* Handle compositoin end sequence ESC 1. */
2833
2834 #define DECODE_COMPOSITION_END() \
2835 do { \
2836 int nchars = (component_len > 0 ? component_idx - component_len \
2837 : method == COMPOSITION_RELATIVE ? component_idx \
2838 : (component_idx + 1) / 2); \
2839 int i; \
2840 int *saved_charbuf = charbuf; \
2841 \
2842 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
2843 if (method != COMPOSITION_RELATIVE) \
2844 { \
2845 if (component_len == 0) \
2846 for (i = 0; i < component_idx; i++) \
2847 *charbuf++ = components[i]; \
2848 else \
2849 for (i = 0; i < component_len; i++) \
2850 *charbuf++ = components[i]; \
2851 *saved_charbuf = saved_charbuf - charbuf; \
2852 } \
2853 if (method == COMPOSITION_WITH_RULE) \
2854 for (i = 0; i < component_idx; i += 2, char_offset++) \
2855 *charbuf++ = components[i]; \
2856 else \
2857 for (i = component_len; i < component_idx; i++, char_offset++) \
2858 *charbuf++ = components[i]; \
2859 coding->annotated = 1; \
2860 composition_state = COMPOSING_NO; \
2861 } while (0)
2862
2863
2864 /* Decode a composition rule from the byte C1 (and maybe one more byte
2865 from SRC) and store one encoded composition rule in
2866 coding->cmp_data. */
2867
2868 #define DECODE_COMPOSITION_RULE(c1) \
2869 do { \
2870 (c1) -= 32; \
2871 if (c1 < 81) /* old format (before ver.21) */ \
2872 { \
2873 int gref = (c1) / 9; \
2874 int nref = (c1) % 9; \
2875 if (gref == 4) gref = 10; \
2876 if (nref == 4) nref = 10; \
2877 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2878 } \
2879 else if (c1 < 93) /* new format (after ver.21) */ \
2880 { \
2881 ONE_MORE_BYTE (c2); \
2882 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2883 } \
2884 else \
2885 c1 = 0; \
2886 } while (0)
2887
2888
2889 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2890
2891 static void
2892 decode_coding_iso_2022 (coding)
2893 struct coding_system *coding;
2894 {
2895 const unsigned char *src = coding->source + coding->consumed;
2896 const unsigned char *src_end = coding->source + coding->src_bytes;
2897 const unsigned char *src_base;
2898 int *charbuf = coding->charbuf + coding->charbuf_used;
2899 int *charbuf_end
2900 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2901 int consumed_chars = 0, consumed_chars_base;
2902 int multibytep = coding->src_multibyte;
2903 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2904 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2905 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2906 struct charset *charset;
2907 int c;
2908 /* For handling composition sequence. */
2909 #define COMPOSING_NO 0
2910 #define COMPOSING_CHAR 1
2911 #define COMPOSING_RULE 2
2912 #define COMPOSING_COMPONENT_CHAR 3
2913 #define COMPOSING_COMPONENT_RULE 4
2914
2915 int composition_state = COMPOSING_NO;
2916 enum composition_method method;
2917 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2918 int component_idx;
2919 int component_len;
2920 Lisp_Object attrs, charset_list;
2921 int char_offset = coding->produced_char;
2922 int last_offset = char_offset;
2923 int last_id = charset_ascii;
2924
2925 CODING_GET_INFO (coding, attrs, charset_list);
2926 setup_iso_safe_charsets (attrs);
2927
2928 while (1)
2929 {
2930 int c1, c2;
2931
2932 src_base = src;
2933 consumed_chars_base = consumed_chars;
2934
2935 if (charbuf >= charbuf_end)
2936 break;
2937
2938 ONE_MORE_BYTE (c1);
2939 if (c1 < 0)
2940 goto invalid_code;
2941
2942 /* We produce at most one character. */
2943 switch (iso_code_class [c1])
2944 {
2945 case ISO_0x20_or_0x7F:
2946 if (composition_state != COMPOSING_NO)
2947 {
2948 if (composition_state == COMPOSING_RULE
2949 || composition_state == COMPOSING_COMPONENT_RULE)
2950 {
2951 DECODE_COMPOSITION_RULE (c1);
2952 components[component_idx++] = c1;
2953 composition_state--;
2954 continue;
2955 }
2956 }
2957 if (charset_id_0 < 0
2958 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2959 /* This is SPACE or DEL. */
2960 charset = CHARSET_FROM_ID (charset_ascii);
2961 else
2962 charset = CHARSET_FROM_ID (charset_id_0);
2963 break;
2964
2965 case ISO_graphic_plane_0:
2966 if (composition_state != COMPOSING_NO)
2967 {
2968 if (composition_state == COMPOSING_RULE
2969 || composition_state == COMPOSING_COMPONENT_RULE)
2970 {
2971 DECODE_COMPOSITION_RULE (c1);
2972 components[component_idx++] = c1;
2973 composition_state--;
2974 continue;
2975 }
2976 }
2977 charset = CHARSET_FROM_ID (charset_id_0);
2978 break;
2979
2980 case ISO_0xA0_or_0xFF:
2981 if (charset_id_1 < 0
2982 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2983 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2984 goto invalid_code;
2985 /* This is a graphic character, we fall down ... */
2986
2987 case ISO_graphic_plane_1:
2988 if (charset_id_1 < 0)
2989 goto invalid_code;
2990 charset = CHARSET_FROM_ID (charset_id_1);
2991 break;
2992
2993 case ISO_control_0:
2994 MAYBE_FINISH_COMPOSITION ();
2995 charset = CHARSET_FROM_ID (charset_ascii);
2996 break;
2997
2998 case ISO_control_1:
2999 MAYBE_FINISH_COMPOSITION ();
3000 goto invalid_code;
3001
3002 case ISO_shift_out:
3003 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3004 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3005 goto invalid_code;
3006 CODING_ISO_INVOCATION (coding, 0) = 1;
3007 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3008 continue;
3009
3010 case ISO_shift_in:
3011 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3012 goto invalid_code;
3013 CODING_ISO_INVOCATION (coding, 0) = 0;
3014 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3015 continue;
3016
3017 case ISO_single_shift_2_7:
3018 case ISO_single_shift_2:
3019 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3020 goto invalid_code;
3021 /* SS2 is handled as an escape sequence of ESC 'N' */
3022 c1 = 'N';
3023 goto label_escape_sequence;
3024
3025 case ISO_single_shift_3:
3026 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3027 goto invalid_code;
3028 /* SS2 is handled as an escape sequence of ESC 'O' */
3029 c1 = 'O';
3030 goto label_escape_sequence;
3031
3032 case ISO_control_sequence_introducer:
3033 /* CSI is handled as an escape sequence of ESC '[' ... */
3034 c1 = '[';
3035 goto label_escape_sequence;
3036
3037 case ISO_escape:
3038 ONE_MORE_BYTE (c1);
3039 label_escape_sequence:
3040 /* Escape sequences handled here are invocation,
3041 designation, direction specification, and character
3042 composition specification. */
3043 switch (c1)
3044 {
3045 case '&': /* revision of following character set */
3046 ONE_MORE_BYTE (c1);
3047 if (!(c1 >= '@' && c1 <= '~'))
3048 goto invalid_code;
3049 ONE_MORE_BYTE (c1);
3050 if (c1 != ISO_CODE_ESC)
3051 goto invalid_code;
3052 ONE_MORE_BYTE (c1);
3053 goto label_escape_sequence;
3054
3055 case '$': /* designation of 2-byte character set */
3056 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3057 goto invalid_code;
3058 ONE_MORE_BYTE (c1);
3059 if (c1 >= '@' && c1 <= 'B')
3060 { /* designation of JISX0208.1978, GB2312.1980,
3061 or JISX0208.1980 */
3062 DECODE_DESIGNATION (0, 2, 0, c1);
3063 }
3064 else if (c1 >= 0x28 && c1 <= 0x2B)
3065 { /* designation of DIMENSION2_CHARS94 character set */
3066 ONE_MORE_BYTE (c2);
3067 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
3068 }
3069 else if (c1 >= 0x2C && c1 <= 0x2F)
3070 { /* designation of DIMENSION2_CHARS96 character set */
3071 ONE_MORE_BYTE (c2);
3072 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
3073 }
3074 else
3075 goto invalid_code;
3076 /* We must update these variables now. */
3077 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3078 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3079 continue;
3080
3081 case 'n': /* invocation of locking-shift-2 */
3082 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3083 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3084 goto invalid_code;
3085 CODING_ISO_INVOCATION (coding, 0) = 2;
3086 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3087 continue;
3088
3089 case 'o': /* invocation of locking-shift-3 */
3090 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3091 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3092 goto invalid_code;
3093 CODING_ISO_INVOCATION (coding, 0) = 3;
3094 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3095 continue;
3096
3097 case 'N': /* invocation of single-shift-2 */
3098 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3099 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3100 goto invalid_code;
3101 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3102 ONE_MORE_BYTE (c1);
3103 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3104 goto invalid_code;
3105 break;
3106
3107 case 'O': /* invocation of single-shift-3 */
3108 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3109 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3110 goto invalid_code;
3111 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3112 ONE_MORE_BYTE (c1);
3113 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3114 goto invalid_code;
3115 break;
3116
3117 case '0': case '2': case '3': case '4': /* start composition */
3118 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3119 goto invalid_code;
3120 DECODE_COMPOSITION_START (c1);
3121 continue;
3122
3123 case '1': /* end composition */
3124 if (composition_state == COMPOSING_NO)
3125 goto invalid_code;
3126 DECODE_COMPOSITION_END ();
3127 continue;
3128
3129 case '[': /* specification of direction */
3130 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3131 goto invalid_code;
3132 /* For the moment, nested direction is not supported.
3133 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3134 left-to-right, and nozero means right-to-left. */
3135 ONE_MORE_BYTE (c1);
3136 switch (c1)
3137 {
3138 case ']': /* end of the current direction */
3139 coding->mode &= ~CODING_MODE_DIRECTION;
3140
3141 case '0': /* end of the current direction */
3142 case '1': /* start of left-to-right direction */
3143 ONE_MORE_BYTE (c1);
3144 if (c1 == ']')
3145 coding->mode &= ~CODING_MODE_DIRECTION;
3146 else
3147 goto invalid_code;
3148 break;
3149
3150 case '2': /* start of right-to-left direction */
3151 ONE_MORE_BYTE (c1);
3152 if (c1 == ']')
3153 coding->mode |= CODING_MODE_DIRECTION;
3154 else
3155 goto invalid_code;
3156 break;
3157
3158 default:
3159 goto invalid_code;
3160 }
3161 continue;
3162
3163 case '%':
3164 ONE_MORE_BYTE (c1);
3165 if (c1 == '/')
3166 {
3167 /* CTEXT extended segment:
3168 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3169 We keep these bytes as is for the moment.
3170 They may be decoded by post-read-conversion. */
3171 int dim, M, L;
3172 int size;
3173
3174 ONE_MORE_BYTE (dim);
3175 ONE_MORE_BYTE (M);
3176 ONE_MORE_BYTE (L);
3177 size = ((M - 128) * 128) + (L - 128);
3178 if (charbuf + 8 + size > charbuf_end)
3179 goto break_loop;
3180 *charbuf++ = ISO_CODE_ESC;
3181 *charbuf++ = '%';
3182 *charbuf++ = '/';
3183 *charbuf++ = dim;
3184 *charbuf++ = BYTE8_TO_CHAR (M);
3185 *charbuf++ = BYTE8_TO_CHAR (L);
3186 while (size-- > 0)
3187 {
3188 ONE_MORE_BYTE (c1);
3189 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3190 }
3191 }
3192 else if (c1 == 'G')
3193 {
3194 /* XFree86 extension for embedding UTF-8 in CTEXT:
3195 ESC % G --UTF-8-BYTES-- ESC % @
3196 We keep these bytes as is for the moment.
3197 They may be decoded by post-read-conversion. */
3198 int *p = charbuf;
3199
3200 if (p + 6 > charbuf_end)
3201 goto break_loop;
3202 *p++ = ISO_CODE_ESC;
3203 *p++ = '%';
3204 *p++ = 'G';
3205 while (p < charbuf_end)
3206 {
3207 ONE_MORE_BYTE (c1);
3208 if (c1 == ISO_CODE_ESC
3209 && src + 1 < src_end
3210 && src[0] == '%'
3211 && src[1] == '@')
3212 break;
3213 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3214 }
3215 if (p + 3 > charbuf_end)
3216 goto break_loop;
3217 *p++ = ISO_CODE_ESC;
3218 *p++ = '%';
3219 *p++ = '@';
3220 charbuf = p;
3221 }
3222 else
3223 goto invalid_code;
3224 continue;
3225 break;
3226
3227 default:
3228 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3229 goto invalid_code;
3230 if (c1 >= 0x28 && c1 <= 0x2B)
3231 { /* designation of DIMENSION1_CHARS94 character set */
3232 ONE_MORE_BYTE (c2);
3233 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3234 }
3235 else if (c1 >= 0x2C && c1 <= 0x2F)
3236 { /* designation of DIMENSION1_CHARS96 character set */
3237 ONE_MORE_BYTE (c2);
3238 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3239 }
3240 else
3241 goto invalid_code;
3242 /* We must update these variables now. */
3243 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3244 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3245 continue;
3246 }
3247 }
3248
3249 if (charset->id != charset_ascii
3250 && last_id != charset->id)
3251 {
3252 if (last_id != charset_ascii)
3253 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3254 last_id = charset->id;
3255 last_offset = char_offset;
3256 }
3257
3258 /* Now we know CHARSET and 1st position code C1 of a character.
3259 Produce a decoded character while getting 2nd position code
3260 C2 if necessary. */
3261 c1 &= 0x7F;
3262 if (CHARSET_DIMENSION (charset) > 1)
3263 {
3264 ONE_MORE_BYTE (c2);
3265 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3266 /* C2 is not in a valid range. */
3267 goto invalid_code;
3268 c1 = (c1 << 8) | (c2 & 0x7F);
3269 if (CHARSET_DIMENSION (charset) > 2)
3270 {
3271 ONE_MORE_BYTE (c2);
3272 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3273 /* C2 is not in a valid range. */
3274 goto invalid_code;
3275 c1 = (c1 << 8) | (c2 & 0x7F);
3276 }
3277 }
3278
3279 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3280 if (c < 0)
3281 {
3282 MAYBE_FINISH_COMPOSITION ();
3283 for (; src_base < src; src_base++, char_offset++)
3284 {
3285 if (ASCII_BYTE_P (*src_base))
3286 *charbuf++ = *src_base;
3287 else
3288 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3289 }
3290 }
3291 else if (composition_state == COMPOSING_NO)
3292 {
3293 *charbuf++ = c;
3294 char_offset++;
3295 }
3296 else
3297 {
3298 components[component_idx++] = c;
3299 if (method == COMPOSITION_WITH_RULE
3300 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3301 && composition_state == COMPOSING_COMPONENT_CHAR))
3302 composition_state++;
3303 }
3304 continue;
3305
3306 invalid_code:
3307 MAYBE_FINISH_COMPOSITION ();
3308 src = src_base;
3309 consumed_chars = consumed_chars_base;
3310 ONE_MORE_BYTE (c);
3311 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3312 char_offset++;
3313 coding->errors++;
3314 continue;
3315
3316 break_loop:
3317 break;
3318 }
3319
3320 no_more_source:
3321 if (last_id != charset_ascii)
3322 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3323 coding->consumed_char += consumed_chars_base;
3324 coding->consumed = src_base - coding->source;
3325 coding->charbuf_used = charbuf - coding->charbuf;
3326 }
3327
3328
3329 /* ISO2022 encoding stuff. */
3330
3331 /*
3332 It is not enough to say just "ISO2022" on encoding, we have to
3333 specify more details. In Emacs, each coding system of ISO2022
3334 variant has the following specifications:
3335 1. Initial designation to G0 thru G3.
3336 2. Allows short-form designation?
3337 3. ASCII should be designated to G0 before control characters?
3338 4. ASCII should be designated to G0 at end of line?
3339 5. 7-bit environment or 8-bit environment?
3340 6. Use locking-shift?
3341 7. Use Single-shift?
3342 And the following two are only for Japanese:
3343 8. Use ASCII in place of JIS0201-1976-Roman?
3344 9. Use JISX0208-1983 in place of JISX0208-1978?
3345 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3346 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3347 details.
3348 */
3349
3350 /* Produce codes (escape sequence) for designating CHARSET to graphic
3351 register REG at DST, and increment DST. If <final-char> of CHARSET is
3352 '@', 'A', or 'B' and the coding system CODING allows, produce
3353 designation sequence of short-form. */
3354
3355 #define ENCODE_DESIGNATION(charset, reg, coding) \
3356 do { \
3357 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3358 char *intermediate_char_94 = "()*+"; \
3359 char *intermediate_char_96 = ",-./"; \
3360 int revision = -1; \
3361 int c; \
3362 \
3363 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3364 revision = CHARSET_ISO_REVISION (charset); \
3365 \
3366 if (revision >= 0) \
3367 { \
3368 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3369 EMIT_ONE_BYTE ('@' + revision); \
3370 } \
3371 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3372 if (CHARSET_DIMENSION (charset) == 1) \
3373 { \
3374 if (! CHARSET_ISO_CHARS_96 (charset)) \
3375 c = intermediate_char_94[reg]; \
3376 else \
3377 c = intermediate_char_96[reg]; \
3378 EMIT_ONE_ASCII_BYTE (c); \
3379 } \
3380 else \
3381 { \
3382 EMIT_ONE_ASCII_BYTE ('$'); \
3383 if (! CHARSET_ISO_CHARS_96 (charset)) \
3384 { \
3385 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3386 || reg != 0 \
3387 || final_char < '@' || final_char > 'B') \
3388 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3389 } \
3390 else \
3391 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3392 } \
3393 EMIT_ONE_ASCII_BYTE (final_char); \
3394 \
3395 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3396 } while (0)
3397
3398
3399 /* The following two macros produce codes (control character or escape
3400 sequence) for ISO2022 single-shift functions (single-shift-2 and
3401 single-shift-3). */
3402
3403 #define ENCODE_SINGLE_SHIFT_2 \
3404 do { \
3405 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3406 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3407 else \
3408 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3409 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3410 } while (0)
3411
3412
3413 #define ENCODE_SINGLE_SHIFT_3 \
3414 do { \
3415 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3416 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3417 else \
3418 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3419 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3420 } while (0)
3421
3422
3423 /* The following four macros produce codes (control character or
3424 escape sequence) for ISO2022 locking-shift functions (shift-in,
3425 shift-out, locking-shift-2, and locking-shift-3). */
3426
3427 #define ENCODE_SHIFT_IN \
3428 do { \
3429 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3430 CODING_ISO_INVOCATION (coding, 0) = 0; \
3431 } while (0)
3432
3433
3434 #define ENCODE_SHIFT_OUT \
3435 do { \
3436 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3437 CODING_ISO_INVOCATION (coding, 0) = 1; \
3438 } while (0)
3439
3440
3441 #define ENCODE_LOCKING_SHIFT_2 \
3442 do { \
3443 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3444 CODING_ISO_INVOCATION (coding, 0) = 2; \
3445 } while (0)
3446
3447
3448 #define ENCODE_LOCKING_SHIFT_3 \
3449 do { \
3450 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3451 CODING_ISO_INVOCATION (coding, 0) = 3; \
3452 } while (0)
3453
3454
3455 /* Produce codes for a DIMENSION1 character whose character set is
3456 CHARSET and whose position-code is C1. Designation and invocation
3457 sequences are also produced in advance if necessary. */
3458
3459 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3460 do { \
3461 int id = CHARSET_ID (charset); \
3462 \
3463 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3464 && id == charset_ascii) \
3465 { \
3466 id = charset_jisx0201_roman; \
3467 charset = CHARSET_FROM_ID (id); \
3468 } \
3469 \
3470 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3471 { \
3472 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3473 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3474 else \
3475 EMIT_ONE_BYTE (c1 | 0x80); \
3476 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3477 break; \
3478 } \
3479 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3480 { \
3481 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3482 break; \
3483 } \
3484 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3485 { \
3486 EMIT_ONE_BYTE (c1 | 0x80); \
3487 break; \
3488 } \
3489 else \
3490 /* Since CHARSET is not yet invoked to any graphic planes, we \
3491 must invoke it, or, at first, designate it to some graphic \
3492 register. Then repeat the loop to actually produce the \
3493 character. */ \
3494 dst = encode_invocation_designation (charset, coding, dst, \
3495 &produced_chars); \
3496 } while (1)
3497
3498
3499 /* Produce codes for a DIMENSION2 character whose character set is
3500 CHARSET and whose position-codes are C1 and C2. Designation and
3501 invocation codes are also produced in advance if necessary. */
3502
3503 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3504 do { \
3505 int id = CHARSET_ID (charset); \
3506 \
3507 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3508 && id == charset_jisx0208) \
3509 { \
3510 id = charset_jisx0208_1978; \
3511 charset = CHARSET_FROM_ID (id); \
3512 } \
3513 \
3514 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3515 { \
3516 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3517 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3518 else \
3519 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3520 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3521 break; \
3522 } \
3523 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3524 { \
3525 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3526 break; \
3527 } \
3528 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3529 { \
3530 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3531 break; \
3532 } \
3533 else \
3534 /* Since CHARSET is not yet invoked to any graphic planes, we \
3535 must invoke it, or, at first, designate it to some graphic \
3536 register. Then repeat the loop to actually produce the \
3537 character. */ \
3538 dst = encode_invocation_designation (charset, coding, dst, \
3539 &produced_chars); \
3540 } while (1)
3541
3542
3543 #define ENCODE_ISO_CHARACTER(charset, c) \
3544 do { \
3545 int code = ENCODE_CHAR ((charset),(c)); \
3546 \
3547 if (CHARSET_DIMENSION (charset) == 1) \
3548 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3549 else \
3550 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3551 } while (0)
3552
3553
3554 /* Produce designation and invocation codes at a place pointed by DST
3555 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3556 Return new DST. */
3557
3558 unsigned char *
3559 encode_invocation_designation (charset, coding, dst, p_nchars)
3560 struct charset *charset;
3561 struct coding_system *coding;
3562 unsigned char *dst;
3563 int *p_nchars;
3564 {
3565 int multibytep = coding->dst_multibyte;
3566 int produced_chars = *p_nchars;
3567 int reg; /* graphic register number */
3568 int id = CHARSET_ID (charset);
3569
3570 /* At first, check designations. */
3571 for (reg = 0; reg < 4; reg++)
3572 if (id == CODING_ISO_DESIGNATION (coding, reg))
3573 break;
3574
3575 if (reg >= 4)
3576 {
3577 /* CHARSET is not yet designated to any graphic registers. */
3578 /* At first check the requested designation. */
3579 reg = CODING_ISO_REQUEST (coding, id);
3580 if (reg < 0)
3581 /* Since CHARSET requests no special designation, designate it
3582 to graphic register 0. */
3583 reg = 0;
3584
3585 ENCODE_DESIGNATION (charset, reg, coding);
3586 }
3587
3588 if (CODING_ISO_INVOCATION (coding, 0) != reg
3589 && CODING_ISO_INVOCATION (coding, 1) != reg)
3590 {
3591 /* Since the graphic register REG is not invoked to any graphic
3592 planes, invoke it to graphic plane 0. */
3593 switch (reg)
3594 {
3595 case 0: /* graphic register 0 */
3596 ENCODE_SHIFT_IN;
3597 break;
3598
3599 case 1: /* graphic register 1 */
3600 ENCODE_SHIFT_OUT;
3601 break;
3602
3603 case 2: /* graphic register 2 */
3604 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3605 ENCODE_SINGLE_SHIFT_2;
3606 else
3607 ENCODE_LOCKING_SHIFT_2;
3608 break;
3609
3610 case 3: /* graphic register 3 */
3611 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3612 ENCODE_SINGLE_SHIFT_3;
3613 else
3614 ENCODE_LOCKING_SHIFT_3;
3615 break;
3616 }
3617 }
3618
3619 *p_nchars = produced_chars;
3620 return dst;
3621 }
3622
3623 /* The following three macros produce codes for indicating direction
3624 of text. */
3625 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3626 do { \
3627 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3628 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3629 else \
3630 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3631 } while (0)
3632
3633
3634 #define ENCODE_DIRECTION_R2L() \
3635 do { \
3636 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3637 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3638 } while (0)
3639
3640
3641 #define ENCODE_DIRECTION_L2R() \
3642 do { \
3643 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3644 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3645 } while (0)
3646
3647
3648 /* Produce codes for designation and invocation to reset the graphic
3649 planes and registers to initial state. */
3650 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3651 do { \
3652 int reg; \
3653 struct charset *charset; \
3654 \
3655 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3656 ENCODE_SHIFT_IN; \
3657 for (reg = 0; reg < 4; reg++) \
3658 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3659 && (CODING_ISO_DESIGNATION (coding, reg) \
3660 != CODING_ISO_INITIAL (coding, reg))) \
3661 { \
3662 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3663 ENCODE_DESIGNATION (charset, reg, coding); \
3664 } \
3665 } while (0)
3666
3667
3668 /* Produce designation sequences of charsets in the line started from
3669 SRC to a place pointed by DST, and return updated DST.
3670
3671 If the current block ends before any end-of-line, we may fail to
3672 find all the necessary designations. */
3673
3674 static unsigned char *
3675 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3676 struct coding_system *coding;
3677 int *charbuf, *charbuf_end;
3678 unsigned char *dst;
3679 {
3680 struct charset *charset;
3681 /* Table of charsets to be designated to each graphic register. */
3682 int r[4];
3683 int c, found = 0, reg;
3684 int produced_chars = 0;
3685 int multibytep = coding->dst_multibyte;
3686 Lisp_Object attrs;
3687 Lisp_Object charset_list;
3688
3689 attrs = CODING_ID_ATTRS (coding->id);
3690 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3691 if (EQ (charset_list, Qiso_2022))
3692 charset_list = Viso_2022_charset_list;
3693
3694 for (reg = 0; reg < 4; reg++)
3695 r[reg] = -1;
3696
3697 while (found < 4)
3698 {
3699 int id;
3700
3701 c = *charbuf++;
3702 if (c == '\n')
3703 break;
3704 charset = char_charset (c, charset_list, NULL);
3705 id = CHARSET_ID (charset);
3706 reg = CODING_ISO_REQUEST (coding, id);
3707 if (reg >= 0 && r[reg] < 0)
3708 {
3709 found++;
3710 r[reg] = id;
3711 }
3712 }
3713
3714 if (found)
3715 {
3716 for (reg = 0; reg < 4; reg++)
3717 if (r[reg] >= 0
3718 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3719 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3720 }
3721
3722 return dst;
3723 }
3724
3725 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3726
3727 static int
3728 encode_coding_iso_2022 (coding)
3729 struct coding_system *coding;
3730 {
3731 int multibytep = coding->dst_multibyte;
3732 int *charbuf = coding->charbuf;
3733 int *charbuf_end = charbuf + coding->charbuf_used;
3734 unsigned char *dst = coding->destination + coding->produced;
3735 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3736 int safe_room = 16;
3737 int bol_designation
3738 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3739 && CODING_ISO_BOL (coding));
3740 int produced_chars = 0;
3741 Lisp_Object attrs, eol_type, charset_list;
3742 int ascii_compatible;
3743 int c;
3744 int preferred_charset_id = -1;
3745
3746 CODING_GET_INFO (coding, attrs, charset_list);
3747 eol_type = CODING_ID_EOL_TYPE (coding->id);
3748 if (VECTORP (eol_type))
3749 eol_type = Qunix;
3750
3751 setup_iso_safe_charsets (attrs);
3752 /* Charset list may have been changed. */
3753 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3754 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3755
3756 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3757
3758 while (charbuf < charbuf_end)
3759 {
3760 ASSURE_DESTINATION (safe_room);
3761
3762 if (bol_designation)
3763 {
3764 unsigned char *dst_prev = dst;
3765
3766 /* We have to produce designation sequences if any now. */
3767 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3768 bol_designation = 0;
3769 /* We are sure that designation sequences are all ASCII bytes. */
3770 produced_chars += dst - dst_prev;
3771 }
3772
3773 c = *charbuf++;
3774
3775 if (c < 0)
3776 {
3777 /* Handle an annotation. */
3778 switch (*charbuf)
3779 {
3780 case CODING_ANNOTATE_COMPOSITION_MASK:
3781 /* Not yet implemented. */
3782 break;
3783 case CODING_ANNOTATE_CHARSET_MASK:
3784 preferred_charset_id = charbuf[3];
3785 if (preferred_charset_id >= 0
3786 && NILP (Fmemq (make_number (preferred_charset_id),
3787 charset_list)))
3788 preferred_charset_id = -1;
3789 break;
3790 default:
3791 abort ();
3792 }
3793 charbuf += -c - 1;
3794 continue;
3795 }
3796
3797 /* Now encode the character C. */
3798 if (c < 0x20 || c == 0x7F)
3799 {
3800 if (c == '\n'
3801 || (c == '\r' && EQ (eol_type, Qmac)))
3802 {
3803 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3804 ENCODE_RESET_PLANE_AND_REGISTER ();
3805 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3806 {
3807 int i;
3808
3809 for (i = 0; i < 4; i++)
3810 CODING_ISO_DESIGNATION (coding, i)
3811 = CODING_ISO_INITIAL (coding, i);
3812 }
3813 bol_designation
3814 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3815 }
3816 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3817 ENCODE_RESET_PLANE_AND_REGISTER ();
3818 EMIT_ONE_ASCII_BYTE (c);
3819 }
3820 else if (ASCII_CHAR_P (c))
3821 {
3822 if (ascii_compatible)
3823 EMIT_ONE_ASCII_BYTE (c);
3824 else
3825 {
3826 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3827 ENCODE_ISO_CHARACTER (charset, c);
3828 }
3829 }
3830 else if (CHAR_BYTE8_P (c))
3831 {
3832 c = CHAR_TO_BYTE8 (c);
3833 EMIT_ONE_BYTE (c);
3834 }
3835 else
3836 {
3837 struct charset *charset;
3838
3839 if (preferred_charset_id >= 0)
3840 {
3841 charset = CHARSET_FROM_ID (preferred_charset_id);
3842 if (! CHAR_CHARSET_P (c, charset))
3843 charset = char_charset (c, charset_list, NULL);
3844 }
3845 else
3846 charset = char_charset (c, charset_list, NULL);
3847 if (!charset)
3848 {
3849 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3850 {
3851 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3852 charset = CHARSET_FROM_ID (charset_ascii);
3853 }
3854 else
3855 {
3856 c = coding->default_char;
3857 charset = char_charset (c, charset_list, NULL);
3858 }
3859 }
3860 ENCODE_ISO_CHARACTER (charset, c);
3861 }
3862 }
3863
3864 if (coding->mode & CODING_MODE_LAST_BLOCK
3865 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3866 {
3867 ASSURE_DESTINATION (safe_room);
3868 ENCODE_RESET_PLANE_AND_REGISTER ();
3869 }
3870 record_conversion_result (coding, CODING_RESULT_SUCCESS);
3871 CODING_ISO_BOL (coding) = bol_designation;
3872 coding->produced_char += produced_chars;
3873 coding->produced = dst - coding->destination;
3874 return 0;
3875 }
3876
3877 \f
3878 /*** 8,9. SJIS and BIG5 handlers ***/
3879
3880 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3881 quite widely. So, for the moment, Emacs supports them in the bare
3882 C code. But, in the future, they may be supported only by CCL. */
3883
3884 /* SJIS is a coding system encoding three character sets: ASCII, right
3885 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3886 as is. A character of charset katakana-jisx0201 is encoded by
3887 "position-code + 0x80". A character of charset japanese-jisx0208
3888 is encoded in 2-byte but two position-codes are divided and shifted
3889 so that it fit in the range below.
3890
3891 --- CODE RANGE of SJIS ---
3892 (character set) (range)
3893 ASCII 0x00 .. 0x7F
3894 KATAKANA-JISX0201 0xA0 .. 0xDF
3895 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3896 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3897 -------------------------------
3898
3899 */
3900
3901 /* BIG5 is a coding system encoding two character sets: ASCII and
3902 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3903 character set and is encoded in two-byte.
3904
3905 --- CODE RANGE of BIG5 ---
3906 (character set) (range)
3907 ASCII 0x00 .. 0x7F
3908 Big5 (1st byte) 0xA1 .. 0xFE
3909 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3910 --------------------------
3911
3912 */
3913
3914 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3915 Check if a text is encoded in SJIS. If it is, return
3916 CATEGORY_MASK_SJIS, else return 0. */
3917
3918 static int
3919 detect_coding_sjis (coding, detect_info)
3920 struct coding_system *coding;
3921 struct coding_detection_info *detect_info;
3922 {
3923 const unsigned char *src = coding->source, *src_base;
3924 const unsigned char *src_end = coding->source + coding->src_bytes;
3925 int multibytep = coding->src_multibyte;
3926 int consumed_chars = 0;
3927 int found = 0;
3928 int c;
3929
3930 detect_info->checked |= CATEGORY_MASK_SJIS;
3931 /* A coding system of this category is always ASCII compatible. */
3932 src += coding->head_ascii;
3933
3934 while (1)
3935 {
3936 src_base = src;
3937 ONE_MORE_BYTE (c);
3938 if (c < 0x80)
3939 continue;
3940 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3941 {
3942 ONE_MORE_BYTE (c);
3943 if (c < 0x40 || c == 0x7F || c > 0xFC)
3944 break;
3945 found = CATEGORY_MASK_SJIS;
3946 }
3947 else if (c >= 0xA0 && c < 0xE0)
3948 found = CATEGORY_MASK_SJIS;
3949 else
3950 break;
3951 }
3952 detect_info->rejected |= CATEGORY_MASK_SJIS;
3953 return 0;
3954
3955 no_more_source:
3956 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3957 {
3958 detect_info->rejected |= CATEGORY_MASK_SJIS;
3959 return 0;
3960 }
3961 detect_info->found |= found;
3962 return 1;
3963 }
3964
3965 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3966 Check if a text is encoded in BIG5. If it is, return
3967 CATEGORY_MASK_BIG5, else return 0. */
3968
3969 static int
3970 detect_coding_big5 (coding, detect_info)
3971 struct coding_system *coding;
3972 struct coding_detection_info *detect_info;
3973 {
3974 const unsigned char *src = coding->source, *src_base;
3975 const unsigned char *src_end = coding->source + coding->src_bytes;
3976 int multibytep = coding->src_multibyte;
3977 int consumed_chars = 0;
3978 int found = 0;
3979 int c;
3980
3981 detect_info->checked |= CATEGORY_MASK_BIG5;
3982 /* A coding system of this category is always ASCII compatible. */
3983 src += coding->head_ascii;
3984
3985 while (1)
3986 {
3987 src_base = src;
3988 ONE_MORE_BYTE (c);
3989 if (c < 0x80)
3990 continue;
3991 if (c >= 0xA1)
3992 {
3993 ONE_MORE_BYTE (c);
3994 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3995 return 0;
3996 found = CATEGORY_MASK_BIG5;
3997 }
3998 else
3999 break;
4000 }
4001 detect_info->rejected |= CATEGORY_MASK_BIG5;
4002 return 0;
4003
4004 no_more_source:
4005 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4006 {
4007 detect_info->rejected |= CATEGORY_MASK_BIG5;
4008 return 0;
4009 }
4010 detect_info->found |= found;
4011 return 1;
4012 }
4013
4014 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4015 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
4016
4017 static void
4018 decode_coding_sjis (coding)
4019 struct coding_system *coding;
4020 {
4021 const unsigned char *src = coding->source + coding->consumed;
4022 const unsigned char *src_end = coding->source + coding->src_bytes;
4023 const unsigned char *src_base;
4024 int *charbuf = coding->charbuf + coding->charbuf_used;
4025 int *charbuf_end
4026 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4027 int consumed_chars = 0, consumed_chars_base;
4028 int multibytep = coding->src_multibyte;
4029 struct charset *charset_roman, *charset_kanji, *charset_kana;
4030 struct charset *charset_kanji2;
4031 Lisp_Object attrs, charset_list, val;
4032 int char_offset = coding->produced_char;
4033 int last_offset = char_offset;
4034 int last_id = charset_ascii;
4035
4036 CODING_GET_INFO (coding, attrs, charset_list);
4037
4038 val = charset_list;
4039 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4040 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4041 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4042 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4043
4044 while (1)
4045 {
4046 int c, c1;
4047 struct charset *charset;
4048
4049 src_base = src;
4050 consumed_chars_base = consumed_chars;
4051
4052 if (charbuf >= charbuf_end)
4053 break;
4054
4055 ONE_MORE_BYTE (c);
4056 if (c < 0)
4057 goto invalid_code;
4058 if (c < 0x80)
4059 charset = charset_roman;
4060 else if (c == 0x80 || c == 0xA0)
4061 goto invalid_code;
4062 else if (c >= 0xA1 && c <= 0xDF)
4063 {
4064 /* SJIS -> JISX0201-Kana */
4065 c &= 0x7F;
4066 charset = charset_kana;
4067 }
4068 else if (c <= 0xEF)
4069 {
4070 /* SJIS -> JISX0208 */
4071 ONE_MORE_BYTE (c1);
4072 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4073 goto invalid_code;
4074 c = (c << 8) | c1;
4075 SJIS_TO_JIS (c);
4076 charset = charset_kanji;
4077 }
4078 else if (c <= 0xFC && charset_kanji2)
4079 {
4080 /* SJIS -> JISX0213-2 */
4081 ONE_MORE_BYTE (c1);
4082 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4083 goto invalid_code;
4084 c = (c << 8) | c1;
4085 SJIS_TO_JIS2 (c);
4086 charset = charset_kanji2;
4087 }
4088 else
4089 goto invalid_code;
4090 if (charset->id != charset_ascii
4091 && last_id != charset->id)
4092 {
4093 if (last_id != charset_ascii)
4094 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4095 last_id = charset->id;
4096 last_offset = char_offset;
4097 }
4098 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4099 *charbuf++ = c;
4100 char_offset++;
4101 continue;
4102
4103 invalid_code:
4104 src = src_base;
4105 consumed_chars = consumed_chars_base;
4106 ONE_MORE_BYTE (c);
4107 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4108 char_offset++;
4109 coding->errors++;
4110 }
4111
4112 no_more_source:
4113 if (last_id != charset_ascii)
4114 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4115 coding->consumed_char += consumed_chars_base;
4116 coding->consumed = src_base - coding->source;
4117 coding->charbuf_used = charbuf - coding->charbuf;
4118 }
4119
4120 static void
4121 decode_coding_big5 (coding)
4122 struct coding_system *coding;
4123 {
4124 const unsigned char *src = coding->source + coding->consumed;
4125 const unsigned char *src_end = coding->source + coding->src_bytes;
4126 const unsigned char *src_base;
4127 int *charbuf = coding->charbuf + coding->charbuf_used;
4128 int *charbuf_end
4129 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4130 int consumed_chars = 0, consumed_chars_base;
4131 int multibytep = coding->src_multibyte;
4132 struct charset *charset_roman, *charset_big5;
4133 Lisp_Object attrs, charset_list, val;
4134 int char_offset = coding->produced_char;
4135 int last_offset = char_offset;
4136 int last_id = charset_ascii;
4137
4138 CODING_GET_INFO (coding, attrs, charset_list);
4139 val = charset_list;
4140 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4141 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4142
4143 while (1)
4144 {
4145 int c, c1;
4146 struct charset *charset;
4147
4148 src_base = src;
4149 consumed_chars_base = consumed_chars;
4150
4151 if (charbuf >= charbuf_end)
4152 break;
4153
4154 ONE_MORE_BYTE (c);
4155
4156 if (c < 0)
4157 goto invalid_code;
4158 if (c < 0x80)
4159 charset = charset_roman;
4160 else
4161 {
4162 /* BIG5 -> Big5 */
4163 if (c < 0xA1 || c > 0xFE)
4164 goto invalid_code;
4165 ONE_MORE_BYTE (c1);
4166 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4167 goto invalid_code;
4168 c = c << 8 | c1;
4169 charset = charset_big5;
4170 }
4171 if (charset->id != charset_ascii
4172 && last_id != charset->id)
4173 {
4174 if (last_id != charset_ascii)
4175 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4176 last_id = charset->id;
4177 last_offset = char_offset;
4178 }
4179 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4180 *charbuf++ = c;
4181 char_offset++;
4182 continue;
4183
4184 invalid_code:
4185 src = src_base;
4186 consumed_chars = consumed_chars_base;
4187 ONE_MORE_BYTE (c);
4188 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4189 char_offset++;
4190 coding->errors++;
4191 }
4192
4193 no_more_source:
4194 if (last_id != charset_ascii)
4195 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4196 coding->consumed_char += consumed_chars_base;
4197 coding->consumed = src_base - coding->source;
4198 coding->charbuf_used = charbuf - coding->charbuf;
4199 }
4200
4201 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4202 This function can encode charsets `ascii', `katakana-jisx0201',
4203 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4204 are sure that all these charsets are registered as official charset
4205 (i.e. do not have extended leading-codes). Characters of other
4206 charsets are produced without any encoding. If SJIS_P is 1, encode
4207 SJIS text, else encode BIG5 text. */
4208
4209 static int
4210 encode_coding_sjis (coding)
4211 struct coding_system *coding;
4212 {
4213 int multibytep = coding->dst_multibyte;
4214 int *charbuf = coding->charbuf;
4215 int *charbuf_end = charbuf + coding->charbuf_used;
4216 unsigned char *dst = coding->destination + coding->produced;
4217 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4218 int safe_room = 4;
4219 int produced_chars = 0;
4220 Lisp_Object attrs, charset_list, val;
4221 int ascii_compatible;
4222 struct charset *charset_roman, *charset_kanji, *charset_kana;
4223 struct charset *charset_kanji2;
4224 int c;
4225
4226 CODING_GET_INFO (coding, attrs, charset_list);
4227 val = charset_list;
4228 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4229 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4230 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4231 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4232
4233 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4234
4235 while (charbuf < charbuf_end)
4236 {
4237 ASSURE_DESTINATION (safe_room);
4238 c = *charbuf++;
4239 /* Now encode the character C. */
4240 if (ASCII_CHAR_P (c) && ascii_compatible)
4241 EMIT_ONE_ASCII_BYTE (c);
4242 else if (CHAR_BYTE8_P (c))
4243 {
4244 c = CHAR_TO_BYTE8 (c);
4245 EMIT_ONE_BYTE (c);
4246 }
4247 else
4248 {
4249 unsigned code;
4250 struct charset *charset = char_charset (c, charset_list, &code);
4251
4252 if (!charset)
4253 {
4254 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4255 {
4256 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4257 charset = CHARSET_FROM_ID (charset_ascii);
4258 }
4259 else
4260 {
4261 c = coding->default_char;
4262 charset = char_charset (c, charset_list, &code);
4263 }
4264 }
4265 if (code == CHARSET_INVALID_CODE (charset))
4266 abort ();
4267 if (charset == charset_kanji)
4268 {
4269 int c1, c2;
4270 JIS_TO_SJIS (code);
4271 c1 = code >> 8, c2 = code & 0xFF;
4272 EMIT_TWO_BYTES (c1, c2);
4273 }
4274 else if (charset == charset_kana)
4275 EMIT_ONE_BYTE (code | 0x80);
4276 else if (charset_kanji2 && charset == charset_kanji2)
4277 {
4278 int c1, c2;
4279
4280 c1 = code >> 8;
4281 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4282 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4283 {
4284 JIS_TO_SJIS2 (code);
4285 c1 = code >> 8, c2 = code & 0xFF;
4286 EMIT_TWO_BYTES (c1, c2);
4287 }
4288 else
4289 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4290 }
4291 else
4292 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4293 }
4294 }
4295 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4296 coding->produced_char += produced_chars;
4297 coding->produced = dst - coding->destination;
4298 return 0;
4299 }
4300
4301 static int
4302 encode_coding_big5 (coding)
4303 struct coding_system *coding;
4304 {
4305 int multibytep = coding->dst_multibyte;
4306 int *charbuf = coding->charbuf;
4307 int *charbuf_end = charbuf + coding->charbuf_used;
4308 unsigned char *dst = coding->destination + coding->produced;
4309 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4310 int safe_room = 4;
4311 int produced_chars = 0;
4312 Lisp_Object attrs, charset_list, val;
4313 int ascii_compatible;
4314 struct charset *charset_roman, *charset_big5;
4315 int c;
4316
4317 CODING_GET_INFO (coding, attrs, charset_list);
4318 val = charset_list;
4319 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4320 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4321 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4322
4323 while (charbuf < charbuf_end)
4324 {
4325 ASSURE_DESTINATION (safe_room);
4326 c = *charbuf++;
4327 /* Now encode the character C. */
4328 if (ASCII_CHAR_P (c) && ascii_compatible)
4329 EMIT_ONE_ASCII_BYTE (c);
4330 else if (CHAR_BYTE8_P (c))
4331 {
4332 c = CHAR_TO_BYTE8 (c);
4333 EMIT_ONE_BYTE (c);
4334 }
4335 else
4336 {
4337 unsigned code;
4338 struct charset *charset = char_charset (c, charset_list, &code);
4339
4340 if (! charset)
4341 {
4342 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4343 {
4344 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4345 charset = CHARSET_FROM_ID (charset_ascii);
4346 }
4347 else
4348 {
4349 c = coding->default_char;
4350 charset = char_charset (c, charset_list, &code);
4351 }
4352 }
4353 if (code == CHARSET_INVALID_CODE (charset))
4354 abort ();
4355 if (charset == charset_big5)
4356 {
4357 int c1, c2;
4358
4359 c1 = code >> 8, c2 = code & 0xFF;
4360 EMIT_TWO_BYTES (c1, c2);
4361 }
4362 else
4363 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4364 }
4365 }
4366 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4367 coding->produced_char += produced_chars;
4368 coding->produced = dst - coding->destination;
4369 return 0;
4370 }
4371
4372 \f
4373 /*** 10. CCL handlers ***/
4374
4375 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4376 Check if a text is encoded in a coding system of which
4377 encoder/decoder are written in CCL program. If it is, return
4378 CATEGORY_MASK_CCL, else return 0. */
4379
4380 static int
4381 detect_coding_ccl (coding, detect_info)
4382 struct coding_system *coding;
4383 struct coding_detection_info *detect_info;
4384 {
4385 const unsigned char *src = coding->source, *src_base;
4386 const unsigned char *src_end = coding->source + coding->src_bytes;
4387 int multibytep = coding->src_multibyte;
4388 int consumed_chars = 0;
4389 int found = 0;
4390 unsigned char *valids;
4391 int head_ascii = coding->head_ascii;
4392 Lisp_Object attrs;
4393
4394 detect_info->checked |= CATEGORY_MASK_CCL;
4395
4396 coding = &coding_categories[coding_category_ccl];
4397 valids = CODING_CCL_VALIDS (coding);
4398 attrs = CODING_ID_ATTRS (coding->id);
4399 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4400 src += head_ascii;
4401
4402 while (1)
4403 {
4404 int c;
4405
4406 src_base = src;
4407 ONE_MORE_BYTE (c);
4408 if (c < 0 || ! valids[c])
4409 break;
4410 if ((valids[c] > 1))
4411 found = CATEGORY_MASK_CCL;
4412 }
4413 detect_info->rejected |= CATEGORY_MASK_CCL;
4414 return 0;
4415
4416 no_more_source:
4417 detect_info->found |= found;
4418 return 1;
4419 }
4420
4421 static void
4422 decode_coding_ccl (coding)
4423 struct coding_system *coding;
4424 {
4425 const unsigned char *src = coding->source + coding->consumed;
4426 const unsigned char *src_end = coding->source + coding->src_bytes;
4427 int *charbuf = coding->charbuf + coding->charbuf_used;
4428 int *charbuf_end = coding->charbuf + coding->charbuf_size;
4429 int consumed_chars = 0;
4430 int multibytep = coding->src_multibyte;
4431 struct ccl_program ccl;
4432 int source_charbuf[1024];
4433 int source_byteidx[1024];
4434 Lisp_Object attrs, charset_list;
4435
4436 CODING_GET_INFO (coding, attrs, charset_list);
4437 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4438
4439 while (src < src_end)
4440 {
4441 const unsigned char *p = src;
4442 int *source, *source_end;
4443 int i = 0;
4444
4445 if (multibytep)
4446 while (i < 1024 && p < src_end)
4447 {
4448 source_byteidx[i] = p - src;
4449 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4450 }
4451 else
4452 while (i < 1024 && p < src_end)
4453 source_charbuf[i++] = *p++;
4454
4455 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4456 ccl.last_block = 1;
4457
4458 source = source_charbuf;
4459 source_end = source + i;
4460 while (source < source_end)
4461 {
4462 ccl_driver (&ccl, source, charbuf,
4463 source_end - source, charbuf_end - charbuf,
4464 charset_list);
4465 source += ccl.consumed;
4466 charbuf += ccl.produced;
4467 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4468 break;
4469 }
4470 if (source < source_end)
4471 src += source_byteidx[source - source_charbuf];
4472 else
4473 src = p;
4474 consumed_chars += source - source_charbuf;
4475
4476 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4477 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4478 break;
4479 }
4480
4481 switch (ccl.status)
4482 {
4483 case CCL_STAT_SUSPEND_BY_SRC:
4484 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4485 break;
4486 case CCL_STAT_SUSPEND_BY_DST:
4487 break;
4488 case CCL_STAT_QUIT:
4489 case CCL_STAT_INVALID_CMD:
4490 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4491 break;
4492 default:
4493 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4494 break;
4495 }
4496 coding->consumed_char += consumed_chars;
4497 coding->consumed = src - coding->source;
4498 coding->charbuf_used = charbuf - coding->charbuf;
4499 }
4500
4501 static int
4502 encode_coding_ccl (coding)
4503 struct coding_system *coding;
4504 {
4505 struct ccl_program ccl;
4506 int multibytep = coding->dst_multibyte;
4507 int *charbuf = coding->charbuf;
4508 int *charbuf_end = charbuf + coding->charbuf_used;
4509 unsigned char *dst = coding->destination + coding->produced;
4510 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4511 unsigned char *adjusted_dst_end = dst_end - 1;
4512 int destination_charbuf[1024];
4513 int i, produced_chars = 0;
4514 Lisp_Object attrs, charset_list;
4515
4516 CODING_GET_INFO (coding, attrs, charset_list);
4517 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4518
4519 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4520 ccl.dst_multibyte = coding->dst_multibyte;
4521
4522 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4523 {
4524 int dst_bytes = dst_end - dst;
4525 if (dst_bytes > 1024)
4526 dst_bytes = 1024;
4527
4528 ccl_driver (&ccl, charbuf, destination_charbuf,
4529 charbuf_end - charbuf, dst_bytes, charset_list);
4530 charbuf += ccl.consumed;
4531 if (multibytep)
4532 for (i = 0; i < ccl.produced; i++)
4533 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4534 else
4535 {
4536 for (i = 0; i < ccl.produced; i++)
4537 *dst++ = destination_charbuf[i] & 0xFF;
4538 produced_chars += ccl.produced;
4539 }
4540 }
4541
4542 switch (ccl.status)
4543 {
4544 case CCL_STAT_SUSPEND_BY_SRC:
4545 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4546 break;
4547 case CCL_STAT_SUSPEND_BY_DST:
4548 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4549 break;
4550 case CCL_STAT_QUIT:
4551 case CCL_STAT_INVALID_CMD:
4552 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4553 break;
4554 default:
4555 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4556 break;
4557 }
4558
4559 coding->produced_char += produced_chars;
4560 coding->produced = dst - coding->destination;
4561 return 0;
4562 }
4563
4564
4565 \f
4566 /*** 10, 11. no-conversion handlers ***/
4567
4568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4569
4570 static void
4571 decode_coding_raw_text (coding)
4572 struct coding_system *coding;
4573 {
4574 coding->chars_at_source = 1;
4575 coding->consumed_char = 0;
4576 coding->consumed = 0;
4577 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4578 }
4579
4580 static int
4581 encode_coding_raw_text (coding)
4582 struct coding_system *coding;
4583 {
4584 int multibytep = coding->dst_multibyte;
4585 int *charbuf = coding->charbuf;
4586 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4587 unsigned char *dst = coding->destination + coding->produced;
4588 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4589 int produced_chars = 0;
4590 int c;
4591
4592 if (multibytep)
4593 {
4594 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4595
4596 if (coding->src_multibyte)
4597 while (charbuf < charbuf_end)
4598 {
4599 ASSURE_DESTINATION (safe_room);
4600 c = *charbuf++;
4601 if (ASCII_CHAR_P (c))
4602 EMIT_ONE_ASCII_BYTE (c);
4603 else if (CHAR_BYTE8_P (c))
4604 {
4605 c = CHAR_TO_BYTE8 (c);
4606 EMIT_ONE_BYTE (c);
4607 }
4608 else
4609 {
4610 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4611
4612 CHAR_STRING_ADVANCE (c, p1);
4613 while (p0 < p1)
4614 {
4615 EMIT_ONE_BYTE (*p0);
4616 p0++;
4617 }
4618 }
4619 }
4620 else
4621 while (charbuf < charbuf_end)
4622 {
4623 ASSURE_DESTINATION (safe_room);
4624 c = *charbuf++;
4625 EMIT_ONE_BYTE (c);
4626 }
4627 }
4628 else
4629 {
4630 if (coding->src_multibyte)
4631 {
4632 int safe_room = MAX_MULTIBYTE_LENGTH;
4633
4634 while (charbuf < charbuf_end)
4635 {
4636 ASSURE_DESTINATION (safe_room);
4637 c = *charbuf++;
4638 if (ASCII_CHAR_P (c))
4639 *dst++ = c;
4640 else if (CHAR_BYTE8_P (c))
4641 *dst++ = CHAR_TO_BYTE8 (c);
4642 else
4643 CHAR_STRING_ADVANCE (c, dst);
4644 produced_chars++;
4645 }
4646 }
4647 else
4648 {
4649 ASSURE_DESTINATION (charbuf_end - charbuf);
4650 while (charbuf < charbuf_end && dst < dst_end)
4651 *dst++ = *charbuf++;
4652 produced_chars = dst - (coding->destination + coding->dst_bytes);
4653 }
4654 }
4655 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4656 coding->produced_char += produced_chars;
4657 coding->produced = dst - coding->destination;
4658 return 0;
4659 }
4660
4661 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4662 Check if a text is encoded in a charset-based coding system. If it
4663 is, return 1, else return 0. */
4664
4665 static int
4666 detect_coding_charset (coding, detect_info)
4667 struct coding_system *coding;
4668 struct coding_detection_info *detect_info;
4669 {
4670 const unsigned char *src = coding->source, *src_base;
4671 const unsigned char *src_end = coding->source + coding->src_bytes;
4672 int multibytep = coding->src_multibyte;
4673 int consumed_chars = 0;
4674 Lisp_Object attrs, valids;
4675 int found = 0;
4676
4677 detect_info->checked |= CATEGORY_MASK_CHARSET;
4678
4679 coding = &coding_categories[coding_category_charset];
4680 attrs = CODING_ID_ATTRS (coding->id);
4681 valids = AREF (attrs, coding_attr_charset_valids);
4682
4683 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4684 src += coding->head_ascii;
4685
4686 while (1)
4687 {
4688 int c;
4689
4690 src_base = src;
4691 ONE_MORE_BYTE (c);
4692 if (c < 0)
4693 continue;
4694 if (NILP (AREF (valids, c)))
4695 break;
4696 if (c >= 0x80)
4697 found = CATEGORY_MASK_CHARSET;
4698 }
4699 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4700 return 0;
4701
4702 no_more_source:
4703 detect_info->found |= found;
4704 return 1;
4705 }
4706
4707 static void
4708 decode_coding_charset (coding)
4709 struct coding_system *coding;
4710 {
4711 const unsigned char *src = coding->source + coding->consumed;
4712 const unsigned char *src_end = coding->source + coding->src_bytes;
4713 const unsigned char *src_base;
4714 int *charbuf = coding->charbuf + coding->charbuf_used;
4715 int *charbuf_end
4716 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4717 int consumed_chars = 0, consumed_chars_base;
4718 int multibytep = coding->src_multibyte;
4719 Lisp_Object attrs, charset_list, valids;
4720 int char_offset = coding->produced_char;
4721 int last_offset = char_offset;
4722 int last_id = charset_ascii;
4723
4724 CODING_GET_INFO (coding, attrs, charset_list);
4725 valids = AREF (attrs, coding_attr_charset_valids);
4726
4727 while (1)
4728 {
4729 int c;
4730 Lisp_Object val;
4731 struct charset *charset;
4732 int dim;
4733 int len = 1;
4734 unsigned code;
4735
4736 src_base = src;
4737 consumed_chars_base = consumed_chars;
4738
4739 if (charbuf >= charbuf_end)
4740 break;
4741
4742 ONE_MORE_BYTE (c);
4743 if (c < 0)
4744 goto invalid_code;
4745 code = c;
4746
4747 val = AREF (valids, c);
4748 if (NILP (val))
4749 goto invalid_code;
4750 if (INTEGERP (val))
4751 {
4752 charset = CHARSET_FROM_ID (XFASTINT (val));
4753 dim = CHARSET_DIMENSION (charset);
4754 while (len < dim)
4755 {
4756 ONE_MORE_BYTE (c);
4757 code = (code << 8) | c;
4758 len++;
4759 }
4760 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4761 charset, code, c);
4762 }
4763 else
4764 {
4765 /* VAL is a list of charset IDs. It is assured that the
4766 list is sorted by charset dimensions (smaller one
4767 comes first). */
4768 while (CONSP (val))
4769 {
4770 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4771 dim = CHARSET_DIMENSION (charset);
4772 while (len < dim)
4773 {
4774 ONE_MORE_BYTE (c);
4775 code = (code << 8) | c;
4776 len++;
4777 }
4778 CODING_DECODE_CHAR (coding, src, src_base,
4779 src_end, charset, code, c);
4780 if (c >= 0)
4781 break;
4782 val = XCDR (val);
4783 }
4784 }
4785 if (c < 0)
4786 goto invalid_code;
4787 if (charset->id != charset_ascii
4788 && last_id != charset->id)
4789 {
4790 if (last_id != charset_ascii)
4791 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4792 last_id = charset->id;
4793 last_offset = char_offset;
4794 }
4795
4796 *charbuf++ = c;
4797 char_offset++;
4798 continue;
4799
4800 invalid_code:
4801 src = src_base;
4802 consumed_chars = consumed_chars_base;
4803 ONE_MORE_BYTE (c);
4804 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4805 char_offset++;
4806 coding->errors++;
4807 }
4808
4809 no_more_source:
4810 if (last_id != charset_ascii)
4811 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4812 coding->consumed_char += consumed_chars_base;
4813 coding->consumed = src_base - coding->source;
4814 coding->charbuf_used = charbuf - coding->charbuf;
4815 }
4816
4817 static int
4818 encode_coding_charset (coding)
4819 struct coding_system *coding;
4820 {
4821 int multibytep = coding->dst_multibyte;
4822 int *charbuf = coding->charbuf;
4823 int *charbuf_end = charbuf + coding->charbuf_used;
4824 unsigned char *dst = coding->destination + coding->produced;
4825 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4826 int safe_room = MAX_MULTIBYTE_LENGTH;
4827 int produced_chars = 0;
4828 Lisp_Object attrs, charset_list;
4829 int ascii_compatible;
4830 int c;
4831
4832 CODING_GET_INFO (coding, attrs, charset_list);
4833 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4834
4835 while (charbuf < charbuf_end)
4836 {
4837 struct charset *charset;
4838 unsigned code;
4839
4840 ASSURE_DESTINATION (safe_room);
4841 c = *charbuf++;
4842 if (ascii_compatible && ASCII_CHAR_P (c))
4843 EMIT_ONE_ASCII_BYTE (c);
4844 else if (CHAR_BYTE8_P (c))
4845 {
4846 c = CHAR_TO_BYTE8 (c);
4847 EMIT_ONE_BYTE (c);
4848 }
4849 else
4850 {
4851 charset = char_charset (c, charset_list, &code);
4852 if (charset)
4853 {
4854 if (CHARSET_DIMENSION (charset) == 1)
4855 EMIT_ONE_BYTE (code);
4856 else if (CHARSET_DIMENSION (charset) == 2)
4857 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4858 else if (CHARSET_DIMENSION (charset) == 3)
4859 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4860 else
4861 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4862 (code >> 8) & 0xFF, code & 0xFF);
4863 }
4864 else
4865 {
4866 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4867 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4868 else
4869 c = coding->default_char;
4870 EMIT_ONE_BYTE (c);
4871 }
4872 }
4873 }
4874
4875 record_conversion_result (coding, CODING_RESULT_SUCCESS);
4876 coding->produced_char += produced_chars;
4877 coding->produced = dst - coding->destination;
4878 return 0;
4879 }
4880
4881 \f
4882 /*** 7. C library functions ***/
4883
4884 /* Setup coding context CODING from information about CODING_SYSTEM.
4885 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4886 CODING_SYSTEM is invalid, signal an error. */
4887
4888 void
4889 setup_coding_system (coding_system, coding)
4890 Lisp_Object coding_system;
4891 struct coding_system *coding;
4892 {
4893 Lisp_Object attrs;
4894 Lisp_Object eol_type;
4895 Lisp_Object coding_type;
4896 Lisp_Object val;
4897
4898 if (NILP (coding_system))
4899 coding_system = Qno_conversion;
4900
4901 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4902
4903 attrs = CODING_ID_ATTRS (coding->id);
4904 eol_type = CODING_ID_EOL_TYPE (coding->id);
4905
4906 coding->mode = 0;
4907 coding->head_ascii = -1;
4908 coding->common_flags
4909 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4910 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4911 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4912 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4913 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4914 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4915 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4916
4917 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4918 coding->max_charset_id = SCHARS (val) - 1;
4919 coding->safe_charsets = (char *) SDATA (val);
4920 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4921
4922 coding_type = CODING_ATTR_TYPE (attrs);
4923 if (EQ (coding_type, Qundecided))
4924 {
4925 coding->detector = NULL;
4926 coding->decoder = decode_coding_raw_text;
4927 coding->encoder = encode_coding_raw_text;
4928 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4929 }
4930 else if (EQ (coding_type, Qiso_2022))
4931 {
4932 int i;
4933 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4934
4935 /* Invoke graphic register 0 to plane 0. */
4936 CODING_ISO_INVOCATION (coding, 0) = 0;
4937 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4938 CODING_ISO_INVOCATION (coding, 1)
4939 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4940 /* Setup the initial status of designation. */
4941 for (i = 0; i < 4; i++)
4942 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4943 /* Not single shifting initially. */
4944 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4945 /* Beginning of buffer should also be regarded as bol. */
4946 CODING_ISO_BOL (coding) = 1;
4947 coding->detector = detect_coding_iso_2022;
4948 coding->decoder = decode_coding_iso_2022;
4949 coding->encoder = encode_coding_iso_2022;
4950 if (flags & CODING_ISO_FLAG_SAFE)
4951 coding->mode |= CODING_MODE_SAFE_ENCODING;
4952 coding->common_flags
4953 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4954 | CODING_REQUIRE_FLUSHING_MASK);
4955 if (flags & CODING_ISO_FLAG_COMPOSITION)
4956 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4957 if (flags & CODING_ISO_FLAG_DESIGNATION)
4958 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4959 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4960 {
4961 setup_iso_safe_charsets (attrs);
4962 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4963 coding->max_charset_id = SCHARS (val) - 1;
4964 coding->safe_charsets = (char *) SDATA (val);
4965 }
4966 CODING_ISO_FLAGS (coding) = flags;
4967 }
4968 else if (EQ (coding_type, Qcharset))
4969 {
4970 coding->detector = detect_coding_charset;
4971 coding->decoder = decode_coding_charset;
4972 coding->encoder = encode_coding_charset;
4973 coding->common_flags
4974 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4975 }
4976 else if (EQ (coding_type, Qutf_8))
4977 {
4978 coding->detector = detect_coding_utf_8;
4979 coding->decoder = decode_coding_utf_8;
4980 coding->encoder = encode_coding_utf_8;
4981 coding->common_flags
4982 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4983 }
4984 else if (EQ (coding_type, Qutf_16))
4985 {
4986 val = AREF (attrs, coding_attr_utf_16_bom);
4987 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4988 : EQ (val, Qt) ? utf_16_with_bom
4989 : utf_16_without_bom);
4990 val = AREF (attrs, coding_attr_utf_16_endian);
4991 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4992 : utf_16_little_endian);
4993 CODING_UTF_16_SURROGATE (coding) = 0;
4994 coding->detector = detect_coding_utf_16;
4995 coding->decoder = decode_coding_utf_16;
4996 coding->encoder = encode_coding_utf_16;
4997 coding->common_flags
4998 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4999 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5000 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5001 }
5002 else if (EQ (coding_type, Qccl))
5003 {
5004 coding->detector = detect_coding_ccl;
5005 coding->decoder = decode_coding_ccl;
5006 coding->encoder = encode_coding_ccl;
5007 coding->common_flags
5008 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5009 | CODING_REQUIRE_FLUSHING_MASK);
5010 }
5011 else if (EQ (coding_type, Qemacs_mule))
5012 {
5013 coding->detector = detect_coding_emacs_mule;
5014 coding->decoder = decode_coding_emacs_mule;
5015 coding->encoder = encode_coding_emacs_mule;
5016 coding->common_flags
5017 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5018 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5019 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5020 {
5021 Lisp_Object tail, safe_charsets;
5022 int max_charset_id = 0;
5023
5024 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5025 tail = XCDR (tail))
5026 if (max_charset_id < XFASTINT (XCAR (tail)))
5027 max_charset_id = XFASTINT (XCAR (tail));
5028 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5029 make_number (255));
5030 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5031 tail = XCDR (tail))
5032 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5033 coding->max_charset_id = max_charset_id;
5034 coding->safe_charsets = (char *) SDATA (safe_charsets);
5035 }
5036 }
5037 else if (EQ (coding_type, Qshift_jis))
5038 {
5039 coding->detector = detect_coding_sjis;
5040 coding->decoder = decode_coding_sjis;
5041 coding->encoder = encode_coding_sjis;
5042 coding->common_flags
5043 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5044 }
5045 else if (EQ (coding_type, Qbig5))
5046 {
5047 coding->detector = detect_coding_big5;
5048 coding->decoder = decode_coding_big5;
5049 coding->encoder = encode_coding_big5;
5050 coding->common_flags
5051 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5052 }
5053 else /* EQ (coding_type, Qraw_text) */
5054 {
5055 coding->detector = NULL;
5056 coding->decoder = decode_coding_raw_text;
5057 coding->encoder = encode_coding_raw_text;
5058 }
5059
5060 return;
5061 }
5062
5063 /* Return raw-text or one of its subsidiaries that has the same
5064 eol_type as CODING-SYSTEM. */
5065
5066 Lisp_Object
5067 raw_text_coding_system (coding_system)
5068 Lisp_Object coding_system;
5069 {
5070 Lisp_Object spec, attrs;
5071 Lisp_Object eol_type, raw_text_eol_type;
5072
5073 if (NILP (coding_system))
5074 return Qraw_text;
5075 spec = CODING_SYSTEM_SPEC (coding_system);
5076 attrs = AREF (spec, 0);
5077
5078 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5079 return coding_system;
5080
5081 eol_type = AREF (spec, 2);
5082 if (VECTORP (eol_type))
5083 return Qraw_text;
5084 spec = CODING_SYSTEM_SPEC (Qraw_text);
5085 raw_text_eol_type = AREF (spec, 2);
5086 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5087 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5088 : AREF (raw_text_eol_type, 2));
5089 }
5090
5091
5092 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5093 does, return one of the subsidiary that has the same eol-spec as
5094 PARENT. Otherwise, return CODING_SYSTEM. */
5095
5096 Lisp_Object
5097 coding_inherit_eol_type (coding_system, parent)
5098 Lisp_Object coding_system, parent;
5099 {
5100 Lisp_Object spec, eol_type;
5101
5102 if (NILP (coding_system))
5103 coding_system = Qraw_text;
5104 spec = CODING_SYSTEM_SPEC (coding_system);
5105 eol_type = AREF (spec, 2);
5106 if (VECTORP (eol_type)
5107 && ! NILP (parent))
5108 {
5109 Lisp_Object parent_spec;
5110 Lisp_Object parent_eol_type;
5111
5112 parent_spec
5113 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5114 parent_eol_type = AREF (parent_spec, 2);
5115 if (EQ (parent_eol_type, Qunix))
5116 coding_system = AREF (eol_type, 0);
5117 else if (EQ (parent_eol_type, Qdos))
5118 coding_system = AREF (eol_type, 1);
5119 else if (EQ (parent_eol_type, Qmac))
5120 coding_system = AREF (eol_type, 2);
5121 }
5122 return coding_system;
5123 }
5124
5125 /* Emacs has a mechanism to automatically detect a coding system if it
5126 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5127 it's impossible to distinguish some coding systems accurately
5128 because they use the same range of codes. So, at first, coding
5129 systems are categorized into 7, those are:
5130
5131 o coding-category-emacs-mule
5132
5133 The category for a coding system which has the same code range
5134 as Emacs' internal format. Assigned the coding-system (Lisp
5135 symbol) `emacs-mule' by default.
5136
5137 o coding-category-sjis
5138
5139 The category for a coding system which has the same code range
5140 as SJIS. Assigned the coding-system (Lisp
5141 symbol) `japanese-shift-jis' by default.
5142
5143 o coding-category-iso-7
5144
5145 The category for a coding system which has the same code range
5146 as ISO2022 of 7-bit environment. This doesn't use any locking
5147 shift and single shift functions. This can encode/decode all
5148 charsets. Assigned the coding-system (Lisp symbol)
5149 `iso-2022-7bit' by default.
5150
5151 o coding-category-iso-7-tight
5152
5153 Same as coding-category-iso-7 except that this can
5154 encode/decode only the specified charsets.
5155
5156 o coding-category-iso-8-1
5157
5158 The category for a coding system which has the same code range
5159 as ISO2022 of 8-bit environment and graphic plane 1 used only
5160 for DIMENSION1 charset. This doesn't use any locking shift
5161 and single shift functions. Assigned the coding-system (Lisp
5162 symbol) `iso-latin-1' by default.
5163
5164 o coding-category-iso-8-2
5165
5166 The category for a coding system which has the same code range
5167 as ISO2022 of 8-bit environment and graphic plane 1 used only
5168 for DIMENSION2 charset. This doesn't use any locking shift
5169 and single shift functions. Assigned the coding-system (Lisp
5170 symbol) `japanese-iso-8bit' by default.
5171
5172 o coding-category-iso-7-else
5173
5174 The category for a coding system which has the same code range
5175 as ISO2022 of 7-bit environemnt but uses locking shift or
5176 single shift functions. Assigned the coding-system (Lisp
5177 symbol) `iso-2022-7bit-lock' by default.
5178
5179 o coding-category-iso-8-else
5180
5181 The category for a coding system which has the same code range
5182 as ISO2022 of 8-bit environemnt but uses locking shift or
5183 single shift functions. Assigned the coding-system (Lisp
5184 symbol) `iso-2022-8bit-ss2' by default.
5185
5186 o coding-category-big5
5187
5188 The category for a coding system which has the same code range
5189 as BIG5. Assigned the coding-system (Lisp symbol)
5190 `cn-big5' by default.
5191
5192 o coding-category-utf-8
5193
5194 The category for a coding system which has the same code range
5195 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
5196 symbol) `utf-8' by default.
5197
5198 o coding-category-utf-16-be
5199
5200 The category for a coding system in which a text has an
5201 Unicode signature (cf. Unicode Standard) in the order of BIG
5202 endian at the head. Assigned the coding-system (Lisp symbol)
5203 `utf-16-be' by default.
5204
5205 o coding-category-utf-16-le
5206
5207 The category for a coding system in which a text has an
5208 Unicode signature (cf. Unicode Standard) in the order of
5209 LITTLE endian at the head. Assigned the coding-system (Lisp
5210 symbol) `utf-16-le' by default.
5211
5212 o coding-category-ccl
5213
5214 The category for a coding system of which encoder/decoder is
5215 written in CCL programs. The default value is nil, i.e., no
5216 coding system is assigned.
5217
5218 o coding-category-binary
5219
5220 The category for a coding system not categorized in any of the
5221 above. Assigned the coding-system (Lisp symbol)
5222 `no-conversion' by default.
5223
5224 Each of them is a Lisp symbol and the value is an actual
5225 `coding-system's (this is also a Lisp symbol) assigned by a user.
5226 What Emacs does actually is to detect a category of coding system.
5227 Then, it uses a `coding-system' assigned to it. If Emacs can't
5228 decide only one possible category, it selects a category of the
5229 highest priority. Priorities of categories are also specified by a
5230 user in a Lisp variable `coding-category-list'.
5231
5232 */
5233
5234 #define EOL_SEEN_NONE 0
5235 #define EOL_SEEN_LF 1
5236 #define EOL_SEEN_CR 2
5237 #define EOL_SEEN_CRLF 4
5238
5239 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5240 SOURCE is encoded. If CATEGORY is one of
5241 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5242 two-byte, else they are encoded by one-byte.
5243
5244 Return one of EOL_SEEN_XXX. */
5245
5246 #define MAX_EOL_CHECK_COUNT 3
5247
5248 static int
5249 detect_eol (source, src_bytes, category)
5250 const unsigned char *source;
5251 EMACS_INT src_bytes;
5252 enum coding_category category;
5253 {
5254 const unsigned char *src = source, *src_end = src + src_bytes;
5255 unsigned char c;
5256 int total = 0;
5257 int eol_seen = EOL_SEEN_NONE;
5258
5259 if ((1 << category) & CATEGORY_MASK_UTF_16)
5260 {
5261 int msb, lsb;
5262
5263 msb = category == (coding_category_utf_16_le
5264 | coding_category_utf_16_le_nosig);
5265 lsb = 1 - msb;
5266
5267 while (src + 1 < src_end)
5268 {
5269 c = src[lsb];
5270 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5271 {
5272 int this_eol;
5273
5274 if (c == '\n')
5275 this_eol = EOL_SEEN_LF;
5276 else if (src + 3 >= src_end
5277 || src[msb + 2] != 0
5278 || src[lsb + 2] != '\n')
5279 this_eol = EOL_SEEN_CR;
5280 else
5281 this_eol = EOL_SEEN_CRLF;
5282
5283 if (eol_seen == EOL_SEEN_NONE)
5284 /* This is the first end-of-line. */
5285 eol_seen = this_eol;
5286 else if (eol_seen != this_eol)
5287 {
5288 /* The found type is different from what found before. */
5289 eol_seen = EOL_SEEN_LF;
5290 break;
5291 }
5292 if (++total == MAX_EOL_CHECK_COUNT)
5293 break;
5294 }
5295 src += 2;
5296 }
5297 }
5298 else
5299 {
5300 while (src < src_end)
5301 {
5302 c = *src++;
5303 if (c == '\n' || c == '\r')
5304 {
5305 int this_eol;
5306
5307 if (c == '\n')
5308 this_eol = EOL_SEEN_LF;
5309 else if (src >= src_end || *src != '\n')
5310 this_eol = EOL_SEEN_CR;
5311 else
5312 this_eol = EOL_SEEN_CRLF, src++;
5313
5314 if (eol_seen == EOL_SEEN_NONE)
5315 /* This is the first end-of-line. */
5316 eol_seen = this_eol;
5317 else if (eol_seen != this_eol)
5318 {
5319 /* The found type is different from what found before. */
5320 eol_seen = EOL_SEEN_LF;
5321 break;
5322 }
5323 if (++total == MAX_EOL_CHECK_COUNT)
5324 break;
5325 }
5326 }
5327 }
5328 return eol_seen;
5329 }
5330
5331
5332 static Lisp_Object
5333 adjust_coding_eol_type (coding, eol_seen)
5334 struct coding_system *coding;
5335 int eol_seen;
5336 {
5337 Lisp_Object eol_type;
5338
5339 eol_type = CODING_ID_EOL_TYPE (coding->id);
5340 if (eol_seen & EOL_SEEN_LF)
5341 {
5342 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5343 eol_type = Qunix;
5344 }
5345 else if (eol_seen & EOL_SEEN_CRLF)
5346 {
5347 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5348 eol_type = Qdos;
5349 }
5350 else if (eol_seen & EOL_SEEN_CR)
5351 {
5352 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5353 eol_type = Qmac;
5354 }
5355 return eol_type;
5356 }
5357
5358 /* Detect how a text specified in CODING is encoded. If a coding
5359 system is detected, update fields of CODING by the detected coding
5360 system. */
5361
5362 void
5363 detect_coding (coding)
5364 struct coding_system *coding;
5365 {
5366 const unsigned char *src, *src_end;
5367 Lisp_Object attrs, coding_type;
5368
5369 coding->consumed = coding->consumed_char = 0;
5370 coding->produced = coding->produced_char = 0;
5371 coding_set_source (coding);
5372
5373 src_end = coding->source + coding->src_bytes;
5374
5375 /* If we have not yet decided the text encoding type, detect it
5376 now. */
5377 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5378 {
5379 int c, i;
5380
5381 for (i = 0, src = coding->source; src < src_end; i++, src++)
5382 {
5383 c = *src;
5384 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5385 || c == ISO_CODE_SI
5386 || c == ISO_CODE_SO)))
5387 break;
5388 }
5389 coding->head_ascii = src - (coding->source + coding->consumed);
5390
5391 if (coding->head_ascii < coding->src_bytes)
5392 {
5393 struct coding_detection_info detect_info;
5394 enum coding_category category;
5395 struct coding_system *this;
5396
5397 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5398 for (i = 0; i < coding_category_raw_text; i++)
5399 {
5400 category = coding_priorities[i];
5401 this = coding_categories + category;
5402 if (this->id < 0)
5403 {
5404 /* No coding system of this category is defined. */
5405 detect_info.rejected |= (1 << category);
5406 }
5407 else if (category >= coding_category_raw_text)
5408 continue;
5409 else if (detect_info.checked & (1 << category))
5410 {
5411 if (detect_info.found & (1 << category))
5412 break;
5413 }
5414 else if ((*(this->detector)) (coding, &detect_info)
5415 && detect_info.found & (1 << category))
5416 {
5417 if (category == coding_category_utf_16_auto)
5418 {
5419 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5420 category = coding_category_utf_16_le;
5421 else
5422 category = coding_category_utf_16_be;
5423 }
5424 break;
5425 }
5426 }
5427 if (i < coding_category_raw_text)
5428 setup_coding_system (CODING_ID_NAME (this->id), coding);
5429 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5430 setup_coding_system (Qraw_text, coding);
5431 else if (detect_info.rejected)
5432 for (i = 0; i < coding_category_raw_text; i++)
5433 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5434 {
5435 this = coding_categories + coding_priorities[i];
5436 setup_coding_system (CODING_ID_NAME (this->id), coding);
5437 break;
5438 }
5439 }
5440 }
5441 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5442 == coding_category_utf_16_auto)
5443 {
5444 Lisp_Object coding_systems;
5445 struct coding_detection_info detect_info;
5446
5447 coding_systems
5448 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5449 detect_info.found = detect_info.rejected = 0;
5450 if (CONSP (coding_systems)
5451 && detect_coding_utf_16 (coding, &detect_info))
5452 {
5453 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5454 setup_coding_system (XCAR (coding_systems), coding);
5455 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5456 setup_coding_system (XCDR (coding_systems), coding);
5457 }
5458 }
5459 }
5460
5461
5462 static void
5463 decode_eol (coding)
5464 struct coding_system *coding;
5465 {
5466 Lisp_Object eol_type;
5467 unsigned char *p, *pbeg, *pend;
5468
5469 eol_type = CODING_ID_EOL_TYPE (coding->id);
5470 if (EQ (eol_type, Qunix))
5471 return;
5472
5473 if (NILP (coding->dst_object))
5474 pbeg = coding->destination;
5475 else
5476 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5477 pend = pbeg + coding->produced;
5478
5479 if (VECTORP (eol_type))
5480 {
5481 int eol_seen = EOL_SEEN_NONE;
5482
5483 for (p = pbeg; p < pend; p++)
5484 {
5485 if (*p == '\n')
5486 eol_seen |= EOL_SEEN_LF;
5487 else if (*p == '\r')
5488 {
5489 if (p + 1 < pend && *(p + 1) == '\n')
5490 {
5491 eol_seen |= EOL_SEEN_CRLF;
5492 p++;
5493 }
5494 else
5495 eol_seen |= EOL_SEEN_CR;
5496 }
5497 }
5498 if (eol_seen != EOL_SEEN_NONE
5499 && eol_seen != EOL_SEEN_LF
5500 && eol_seen != EOL_SEEN_CRLF
5501 && eol_seen != EOL_SEEN_CR)
5502 eol_seen = EOL_SEEN_LF;
5503 if (eol_seen != EOL_SEEN_NONE)
5504 eol_type = adjust_coding_eol_type (coding, eol_seen);
5505 }
5506
5507 if (EQ (eol_type, Qmac))
5508 {
5509 for (p = pbeg; p < pend; p++)
5510 if (*p == '\r')
5511 *p = '\n';
5512 }
5513 else if (EQ (eol_type, Qdos))
5514 {
5515 int n = 0;
5516
5517 if (NILP (coding->dst_object))
5518 {
5519 for (p = pend - 2; p >= pbeg; p--)
5520 if (*p == '\r')
5521 {
5522 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5523 n++;
5524 }
5525 }
5526 else
5527 {
5528 for (p = pend - 2; p >= pbeg; p--)
5529 if (*p == '\r')
5530 {
5531 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5532 int pos = BYTE_TO_CHAR (pos_byte);
5533
5534 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5535 n++;
5536 }
5537 }
5538 coding->produced -= n;
5539 coding->produced_char -= n;
5540 }
5541 }
5542
5543
5544 /* Return a translation table (or list of them) from coding system
5545 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5546 decoding (ENCODEP is zero). */
5547
5548 static Lisp_Object
5549 get_translation_table (attrs, encodep, max_lookup)
5550 Lisp_Object attrs;
5551 int encodep, *max_lookup;
5552 {
5553 Lisp_Object standard, translation_table;
5554 Lisp_Object val;
5555
5556 if (encodep)
5557 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5558 standard = Vstandard_translation_table_for_encode;
5559 else
5560 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5561 standard = Vstandard_translation_table_for_decode;
5562 if (NILP (translation_table))
5563 translation_table = standard;
5564 else
5565 {
5566 if (SYMBOLP (translation_table))
5567 translation_table = Fget (translation_table, Qtranslation_table);
5568 else if (CONSP (translation_table))
5569 {
5570 translation_table = Fcopy_sequence (translation_table);
5571 for (val = translation_table; CONSP (val); val = XCDR (val))
5572 if (SYMBOLP (XCAR (val)))
5573 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5574 }
5575 if (CHAR_TABLE_P (standard))
5576 {
5577 if (CONSP (translation_table))
5578 translation_table = nconc2 (translation_table,
5579 Fcons (standard, Qnil));
5580 else
5581 translation_table = Fcons (translation_table,
5582 Fcons (standard, Qnil));
5583 }
5584 }
5585
5586 if (max_lookup)
5587 {
5588 *max_lookup = 1;
5589 if (CHAR_TABLE_P (translation_table)
5590 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5591 {
5592 val = XCHAR_TABLE (translation_table)->extras[1];
5593 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5594 *max_lookup = XFASTINT (val);
5595 }
5596 else if (CONSP (translation_table))
5597 {
5598 Lisp_Object tail, val;
5599
5600 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5601 if (CHAR_TABLE_P (XCAR (tail))
5602 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5603 {
5604 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5605 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5606 *max_lookup = XFASTINT (val);
5607 }
5608 }
5609 }
5610 return translation_table;
5611 }
5612
5613 #define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
5614 do { \
5615 trans = Qnil; \
5616 if (CHAR_TABLE_P (table)) \
5617 { \
5618 trans = CHAR_TABLE_REF (table, c); \
5619 if (CHARACTERP (trans)) \
5620 c = XFASTINT (trans), trans = Qnil; \
5621 } \
5622 else if (CONSP (table)) \
5623 { \
5624 Lisp_Object tail; \
5625 \
5626 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
5627 if (CHAR_TABLE_P (XCAR (tail))) \
5628 { \
5629 trans = CHAR_TABLE_REF (XCAR (tail), c); \
5630 if (CHARACTERP (trans)) \
5631 c = XFASTINT (trans), trans = Qnil; \
5632 else if (! NILP (trans)) \
5633 break; \
5634 } \
5635 } \
5636 } while (0)
5637
5638
5639 static Lisp_Object
5640 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5641 Lisp_Object val;
5642 int *buf, *buf_end;
5643 int last_block;
5644 int *from_nchars, *to_nchars;
5645 {
5646 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
5647 [TO-CHAR ...]. */
5648 if (CONSP (val))
5649 {
5650 Lisp_Object from, tail;
5651 int i, len;
5652
5653 for (tail = val; CONSP (tail); tail = XCDR (tail))
5654 {
5655 val = XCAR (tail);
5656 from = XCAR (val);
5657 len = ASIZE (from);
5658 for (i = 0; i < len; i++)
5659 {
5660 if (buf + i == buf_end)
5661 {
5662 if (! last_block)
5663 return Qt;
5664 break;
5665 }
5666 if (XINT (AREF (from, i)) != buf[i])
5667 break;
5668 }
5669 if (i == len)
5670 {
5671 val = XCDR (val);
5672 *from_nchars = len;
5673 break;
5674 }
5675 }
5676 if (! CONSP (tail))
5677 return Qnil;
5678 }
5679 if (VECTORP (val))
5680 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5681 else
5682 *buf = XINT (val);
5683 return val;
5684 }
5685
5686
5687 static int
5688 produce_chars (coding, translation_table, last_block)
5689 struct coding_system *coding;
5690 Lisp_Object translation_table;
5691 int last_block;
5692 {
5693 unsigned char *dst = coding->destination + coding->produced;
5694 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5695 int produced;
5696 int produced_chars = 0;
5697 int carryover = 0;
5698
5699 if (! coding->chars_at_source)
5700 {
5701 /* Characters are in coding->charbuf. */
5702 int *buf = coding->charbuf;
5703 int *buf_end = buf + coding->charbuf_used;
5704
5705 if (BUFFERP (coding->src_object)
5706 && EQ (coding->src_object, coding->dst_object))
5707 dst_end = ((unsigned char *) coding->source) + coding->consumed;
5708
5709 while (buf < buf_end)
5710 {
5711 int c = *buf, i;
5712
5713 if (c >= 0)
5714 {
5715 int from_nchars = 1, to_nchars = 1;
5716 Lisp_Object trans = Qnil;
5717
5718 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5719 if (! NILP (trans))
5720 {
5721 trans = get_translation (trans, buf, buf_end, last_block,
5722 &from_nchars, &to_nchars);
5723 if (EQ (trans, Qt))
5724 break;
5725 c = *buf;
5726 }
5727
5728 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5729 {
5730 dst = alloc_destination (coding,
5731 buf_end - buf
5732 + MAX_MULTIBYTE_LENGTH * to_nchars,
5733 dst);
5734 dst_end = coding->destination + coding->dst_bytes;
5735 }
5736
5737 for (i = 0; i < to_nchars; i++)
5738 {
5739 if (i > 0)
5740 c = XINT (AREF (trans, i));
5741 if (coding->dst_multibyte
5742 || ! CHAR_BYTE8_P (c))
5743 CHAR_STRING_ADVANCE (c, dst);
5744 else
5745 *dst++ = CHAR_TO_BYTE8 (c);
5746 }
5747 produced_chars += to_nchars;
5748 *buf++ = to_nchars;
5749 while (--from_nchars > 0)
5750 *buf++ = 0;
5751 }
5752 else
5753 /* This is an annotation datum. (-C) is the length. */
5754 buf += -c;
5755 }
5756 carryover = buf_end - buf;
5757 }
5758 else
5759 {
5760 const unsigned char *src = coding->source;
5761 const unsigned char *src_end = src + coding->src_bytes;
5762 Lisp_Object eol_type;
5763
5764 eol_type = CODING_ID_EOL_TYPE (coding->id);
5765
5766 if (coding->src_multibyte != coding->dst_multibyte)
5767 {
5768 if (coding->src_multibyte)
5769 {
5770 int multibytep = 1;
5771 int consumed_chars;
5772
5773 while (1)
5774 {
5775 const unsigned char *src_base = src;
5776 int c;
5777
5778 ONE_MORE_BYTE (c);
5779 if (c == '\r')
5780 {
5781 if (EQ (eol_type, Qdos))
5782 {
5783 if (src == src_end)
5784 {
5785 record_conversion_result
5786 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5787 goto no_more_source;
5788 }
5789 if (*src == '\n')
5790 c = *src++;
5791 }
5792 else if (EQ (eol_type, Qmac))
5793 c = '\n';
5794 }
5795 if (dst == dst_end)
5796 {
5797 coding->consumed = src - coding->source;
5798
5799 if (EQ (coding->src_object, coding->dst_object))
5800 dst_end = (unsigned char *) src;
5801 if (dst == dst_end)
5802 {
5803 dst = alloc_destination (coding, src_end - src + 1,
5804 dst);
5805 dst_end = coding->destination + coding->dst_bytes;
5806 coding_set_source (coding);
5807 src = coding->source + coding->consumed;
5808 src_end = coding->source + coding->src_bytes;
5809 }
5810 }
5811 *dst++ = c;
5812 produced_chars++;
5813 }
5814 no_more_source:
5815 ;
5816 }
5817 else
5818 while (src < src_end)
5819 {
5820 int multibytep = 1;
5821 int c = *src++;
5822
5823 if (c == '\r')
5824 {
5825 if (EQ (eol_type, Qdos))
5826 {
5827 if (src < src_end
5828 && *src == '\n')
5829 c = *src++;
5830 }
5831 else if (EQ (eol_type, Qmac))
5832 c = '\n';
5833 }
5834 if (dst >= dst_end - 1)
5835 {
5836 coding->consumed = src - coding->source;
5837
5838 if (EQ (coding->src_object, coding->dst_object))
5839 dst_end = (unsigned char *) src;
5840 if (dst >= dst_end - 1)
5841 {
5842 dst = alloc_destination (coding, src_end - src + 2,
5843 dst);
5844 dst_end = coding->destination + coding->dst_bytes;
5845 coding_set_source (coding);
5846 src = coding->source + coding->consumed;
5847 src_end = coding->source + coding->src_bytes;
5848 }
5849 }
5850 EMIT_ONE_BYTE (c);
5851 }
5852 }
5853 else
5854 {
5855 if (!EQ (coding->src_object, coding->dst_object))
5856 {
5857 int require = coding->src_bytes - coding->dst_bytes;
5858
5859 if (require > 0)
5860 {
5861 EMACS_INT offset = src - coding->source;
5862
5863 dst = alloc_destination (coding, require, dst);
5864 coding_set_source (coding);
5865 src = coding->source + offset;
5866 src_end = coding->source + coding->src_bytes;
5867 }
5868 }
5869 produced_chars = coding->src_chars;
5870 while (src < src_end)
5871 {
5872 int c = *src++;
5873
5874 if (c == '\r')
5875 {
5876 if (EQ (eol_type, Qdos))
5877 {
5878 if (src < src_end
5879 && *src == '\n')
5880 c = *src++;
5881 produced_chars--;
5882 }
5883 else if (EQ (eol_type, Qmac))
5884 c = '\n';
5885 }
5886 *dst++ = c;
5887 }
5888 }
5889 coding->consumed = coding->src_bytes;
5890 coding->consumed_char = coding->src_chars;
5891 }
5892
5893 produced = dst - (coding->destination + coding->produced);
5894 if (BUFFERP (coding->dst_object))
5895 insert_from_gap (produced_chars, produced);
5896 coding->produced += produced;
5897 coding->produced_char += produced_chars;
5898 return carryover;
5899 }
5900
5901 /* Compose text in CODING->object according to the annotation data at
5902 CHARBUF. CHARBUF is an array:
5903 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5904 */
5905
5906 static INLINE void
5907 produce_composition (coding, charbuf, pos)
5908 struct coding_system *coding;
5909 int *charbuf;
5910 EMACS_INT pos;
5911 {
5912 int len;
5913 EMACS_INT to;
5914 enum composition_method method;
5915 Lisp_Object components;
5916
5917 len = -charbuf[0];
5918 to = pos + charbuf[2];
5919 method = (enum composition_method) (charbuf[3]);
5920
5921 if (method == COMPOSITION_RELATIVE)
5922 components = Qnil;
5923 else
5924 {
5925 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5926 int i;
5927
5928 len -= 4;
5929 charbuf += 4;
5930 for (i = 0; i < len; i++)
5931 args[i] = make_number (charbuf[i]);
5932 components = (method == COMPOSITION_WITH_ALTCHARS
5933 ? Fstring (len, args) : Fvector (len, args));
5934 }
5935 compose_text (pos, to, components, Qnil, coding->dst_object);
5936 }
5937
5938
5939 /* Put `charset' property on text in CODING->object according to
5940 the annotation data at CHARBUF. CHARBUF is an array:
5941 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
5942 */
5943
5944 static INLINE void
5945 produce_charset (coding, charbuf, pos)
5946 struct coding_system *coding;
5947 int *charbuf;
5948 EMACS_INT pos;
5949 {
5950 EMACS_INT from = pos - charbuf[2];
5951 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
5952
5953 Fput_text_property (make_number (from), make_number (pos),
5954 Qcharset, CHARSET_NAME (charset),
5955 coding->dst_object);
5956 }
5957
5958
5959 #define CHARBUF_SIZE 0x4000
5960
5961 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5962 do { \
5963 int size = CHARBUF_SIZE;; \
5964 \
5965 coding->charbuf = NULL; \
5966 while (size > 1024) \
5967 { \
5968 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5969 if (coding->charbuf) \
5970 break; \
5971 size >>= 1; \
5972 } \
5973 if (! coding->charbuf) \
5974 { \
5975 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
5976 return coding->result; \
5977 } \
5978 coding->charbuf_size = size; \
5979 } while (0)
5980
5981
5982 static void
5983 produce_annotation (coding, pos)
5984 struct coding_system *coding;
5985 EMACS_INT pos;
5986 {
5987 int *charbuf = coding->charbuf;
5988 int *charbuf_end = charbuf + coding->charbuf_used;
5989
5990 if (NILP (coding->dst_object))
5991 return;
5992
5993 while (charbuf < charbuf_end)
5994 {
5995 if (*charbuf >= 0)
5996 pos += *charbuf++;
5997 else
5998 {
5999 int len = -*charbuf;
6000 switch (charbuf[1])
6001 {
6002 case CODING_ANNOTATE_COMPOSITION_MASK:
6003 produce_composition (coding, charbuf, pos);
6004 break;
6005 case CODING_ANNOTATE_CHARSET_MASK:
6006 produce_charset (coding, charbuf, pos);
6007 break;
6008 default:
6009 abort ();
6010 }
6011 charbuf += len;
6012 }
6013 }
6014 }
6015
6016 /* Decode the data at CODING->src_object into CODING->dst_object.
6017 CODING->src_object is a buffer, a string, or nil.
6018 CODING->dst_object is a buffer.
6019
6020 If CODING->src_object is a buffer, it must be the current buffer.
6021 In this case, if CODING->src_pos is positive, it is a position of
6022 the source text in the buffer, otherwise, the source text is in the
6023 gap area of the buffer, and CODING->src_pos specifies the offset of
6024 the text from GPT (which must be the same as PT). If this is the
6025 same buffer as CODING->dst_object, CODING->src_pos must be
6026 negative.
6027
6028 If CODING->src_object is a string, CODING->src_pos in an index to
6029 that string.
6030
6031 If CODING->src_object is nil, CODING->source must already point to
6032 the non-relocatable memory area. In this case, CODING->src_pos is
6033 an offset from CODING->source.
6034
6035 The decoded data is inserted at the current point of the buffer
6036 CODING->dst_object.
6037 */
6038
6039 static int
6040 decode_coding (coding)
6041 struct coding_system *coding;
6042 {
6043 Lisp_Object attrs;
6044 Lisp_Object undo_list;
6045 Lisp_Object translation_table;
6046 int carryover;
6047 int i;
6048
6049 if (BUFFERP (coding->src_object)
6050 && coding->src_pos > 0
6051 && coding->src_pos < GPT
6052 && coding->src_pos + coding->src_chars > GPT)
6053 move_gap_both (coding->src_pos, coding->src_pos_byte);
6054
6055 undo_list = Qt;
6056 if (BUFFERP (coding->dst_object))
6057 {
6058 if (current_buffer != XBUFFER (coding->dst_object))
6059 set_buffer_internal (XBUFFER (coding->dst_object));
6060 if (GPT != PT)
6061 move_gap_both (PT, PT_BYTE);
6062 undo_list = current_buffer->undo_list;
6063 current_buffer->undo_list = Qt;
6064 }
6065
6066 coding->consumed = coding->consumed_char = 0;
6067 coding->produced = coding->produced_char = 0;
6068 coding->chars_at_source = 0;
6069 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6070 coding->errors = 0;
6071
6072 ALLOC_CONVERSION_WORK_AREA (coding);
6073
6074 attrs = CODING_ID_ATTRS (coding->id);
6075 translation_table = get_translation_table (attrs, 0, NULL);
6076
6077 carryover = 0;
6078 do
6079 {
6080 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6081
6082 coding_set_source (coding);
6083 coding->annotated = 0;
6084 coding->charbuf_used = carryover;
6085 (*(coding->decoder)) (coding);
6086 coding_set_destination (coding);
6087 carryover = produce_chars (coding, translation_table, 0);
6088 if (coding->annotated)
6089 produce_annotation (coding, pos);
6090 for (i = 0; i < carryover; i++)
6091 coding->charbuf[i]
6092 = coding->charbuf[coding->charbuf_used - carryover + i];
6093 }
6094 while (coding->consumed < coding->src_bytes
6095 && ! coding->result);
6096
6097 if (carryover > 0)
6098 {
6099 coding_set_destination (coding);
6100 coding->charbuf_used = carryover;
6101 produce_chars (coding, translation_table, 1);
6102 }
6103
6104 coding->carryover_bytes = 0;
6105 if (coding->consumed < coding->src_bytes)
6106 {
6107 int nbytes = coding->src_bytes - coding->consumed;
6108 const unsigned char *src;
6109
6110 coding_set_source (coding);
6111 coding_set_destination (coding);
6112 src = coding->source + coding->consumed;
6113
6114 if (coding->mode & CODING_MODE_LAST_BLOCK)
6115 {
6116 /* Flush out unprocessed data as binary chars. We are sure
6117 that the number of data is less than the size of
6118 coding->charbuf. */
6119 coding->charbuf_used = 0;
6120 while (nbytes-- > 0)
6121 {
6122 int c = *src++;
6123
6124 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
6125 }
6126 produce_chars (coding, Qnil, 1);
6127 }
6128 else
6129 {
6130 /* Record unprocessed bytes in coding->carryover. We are
6131 sure that the number of data is less than the size of
6132 coding->carryover. */
6133 unsigned char *p = coding->carryover;
6134
6135 coding->carryover_bytes = nbytes;
6136 while (nbytes-- > 0)
6137 *p++ = *src++;
6138 }
6139 coding->consumed = coding->src_bytes;
6140 }
6141
6142 if (BUFFERP (coding->dst_object))
6143 {
6144 current_buffer->undo_list = undo_list;
6145 record_insert (coding->dst_pos, coding->produced_char);
6146 }
6147 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6148 decode_eol (coding);
6149 return coding->result;
6150 }
6151
6152
6153 /* Extract an annotation datum from a composition starting at POS and
6154 ending before LIMIT of CODING->src_object (buffer or string), store
6155 the data in BUF, set *STOP to a starting position of the next
6156 composition (if any) or to LIMIT, and return the address of the
6157 next element of BUF.
6158
6159 If such an annotation is not found, set *STOP to a starting
6160 position of a composition after POS (if any) or to LIMIT, and
6161 return BUF. */
6162
6163 static INLINE int *
6164 handle_composition_annotation (pos, limit, coding, buf, stop)
6165 EMACS_INT pos, limit;
6166 struct coding_system *coding;
6167 int *buf;
6168 EMACS_INT *stop;
6169 {
6170 EMACS_INT start, end;
6171 Lisp_Object prop;
6172
6173 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6174 || end > limit)
6175 *stop = limit;
6176 else if (start > pos)
6177 *stop = start;
6178 else
6179 {
6180 if (start == pos)
6181 {
6182 /* We found a composition. Store the corresponding
6183 annotation data in BUF. */
6184 int *head = buf;
6185 enum composition_method method = COMPOSITION_METHOD (prop);
6186 int nchars = COMPOSITION_LENGTH (prop);
6187
6188 ADD_COMPOSITION_DATA (buf, nchars, method);
6189 if (method != COMPOSITION_RELATIVE)
6190 {
6191 Lisp_Object components;
6192 int len, i, i_byte;
6193
6194 components = COMPOSITION_COMPONENTS (prop);
6195 if (VECTORP (components))
6196 {
6197 len = XVECTOR (components)->size;
6198 for (i = 0; i < len; i++)
6199 *buf++ = XINT (AREF (components, i));
6200 }
6201 else if (STRINGP (components))
6202 {
6203 len = SCHARS (components);
6204 i = i_byte = 0;
6205 while (i < len)
6206 {
6207 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6208 buf++;
6209 }
6210 }
6211 else if (INTEGERP (components))
6212 {
6213 len = 1;
6214 *buf++ = XINT (components);
6215 }
6216 else if (CONSP (components))
6217 {
6218 for (len = 0; CONSP (components);
6219 len++, components = XCDR (components))
6220 *buf++ = XINT (XCAR (components));
6221 }
6222 else
6223 abort ();
6224 *head -= len;
6225 }
6226 }
6227
6228 if (find_composition (end, limit, &start, &end, &prop,
6229 coding->src_object)
6230 && end <= limit)
6231 *stop = start;
6232 else
6233 *stop = limit;
6234 }
6235 return buf;
6236 }
6237
6238
6239 /* Extract an annotation datum from a text property `charset' at POS of
6240 CODING->src_object (buffer of string), store the data in BUF, set
6241 *STOP to the position where the value of `charset' property changes
6242 (limiting by LIMIT), and return the address of the next element of
6243 BUF.
6244
6245 If the property value is nil, set *STOP to the position where the
6246 property value is non-nil (limiting by LIMIT), and return BUF. */
6247
6248 static INLINE int *
6249 handle_charset_annotation (pos, limit, coding, buf, stop)
6250 EMACS_INT pos, limit;
6251 struct coding_system *coding;
6252 int *buf;
6253 EMACS_INT *stop;
6254 {
6255 Lisp_Object val, next;
6256 int id;
6257
6258 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6259 if (! NILP (val) && CHARSETP (val))
6260 id = XINT (CHARSET_SYMBOL_ID (val));
6261 else
6262 id = -1;
6263 ADD_CHARSET_DATA (buf, 0, id);
6264 next = Fnext_single_property_change (make_number (pos), Qcharset,
6265 coding->src_object,
6266 make_number (limit));
6267 *stop = XINT (next);
6268 return buf;
6269 }
6270
6271
6272 static void
6273 consume_chars (coding, translation_table, max_lookup)
6274 struct coding_system *coding;
6275 Lisp_Object translation_table;
6276 int max_lookup;
6277 {
6278 int *buf = coding->charbuf;
6279 int *buf_end = coding->charbuf + coding->charbuf_size;
6280 const unsigned char *src = coding->source + coding->consumed;
6281 const unsigned char *src_end = coding->source + coding->src_bytes;
6282 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6283 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6284 int multibytep = coding->src_multibyte;
6285 Lisp_Object eol_type;
6286 int c;
6287 EMACS_INT stop, stop_composition, stop_charset;
6288 int *lookup_buf = NULL;
6289
6290 if (! NILP (translation_table))
6291 lookup_buf = alloca (sizeof (int) * max_lookup);
6292
6293 eol_type = CODING_ID_EOL_TYPE (coding->id);
6294 if (VECTORP (eol_type))
6295 eol_type = Qunix;
6296
6297 /* Note: composition handling is not yet implemented. */
6298 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6299
6300 if (NILP (coding->src_object))
6301 stop = stop_composition = stop_charset = end_pos;
6302 else
6303 {
6304 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6305 stop = stop_composition = pos;
6306 else
6307 stop = stop_composition = end_pos;
6308 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6309 stop = stop_charset = pos;
6310 else
6311 stop_charset = end_pos;
6312 }
6313
6314 /* Compensate for CRLF and conversion. */
6315 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6316 while (buf < buf_end)
6317 {
6318 Lisp_Object trans;
6319
6320 if (pos == stop)
6321 {
6322 if (pos == end_pos)
6323 break;
6324 if (pos == stop_composition)
6325 buf = handle_composition_annotation (pos, end_pos, coding,
6326 buf, &stop_composition);
6327 if (pos == stop_charset)
6328 buf = handle_charset_annotation (pos, end_pos, coding,
6329 buf, &stop_charset);
6330 stop = (stop_composition < stop_charset
6331 ? stop_composition : stop_charset);
6332 }
6333
6334 if (! multibytep)
6335 {
6336 EMACS_INT bytes;
6337
6338 if (! CODING_FOR_UNIBYTE (coding)
6339 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6340 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6341 else
6342 c = *src++, pos++;
6343 }
6344 else
6345 c = STRING_CHAR_ADVANCE (src), pos++;
6346 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6347 c = '\n';
6348 if (! EQ (eol_type, Qunix))
6349 {
6350 if (c == '\n')
6351 {
6352 if (EQ (eol_type, Qdos))
6353 *buf++ = '\r';
6354 else
6355 c = '\r';
6356 }
6357 }
6358
6359 trans = Qnil;
6360 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6361 if (NILP (trans))
6362 *buf++ = c;
6363 else
6364 {
6365 int from_nchars = 1, to_nchars = 1;
6366 int *lookup_buf_end;
6367 const unsigned char *p = src;
6368 int i;
6369
6370 lookup_buf[0] = c;
6371 for (i = 1; i < max_lookup && p < src_end; i++)
6372 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6373 lookup_buf_end = lookup_buf + i;
6374 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6375 &from_nchars, &to_nchars);
6376 if (EQ (trans, Qt)
6377 || buf + to_nchars > buf_end)
6378 break;
6379 *buf++ = *lookup_buf;
6380 for (i = 1; i < to_nchars; i++)
6381 *buf++ = XINT (AREF (trans, i));
6382 for (i = 1; i < from_nchars; i++, pos++)
6383 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6384 }
6385 }
6386
6387 coding->consumed = src - coding->source;
6388 coding->consumed_char = pos - coding->src_pos;
6389 coding->charbuf_used = buf - coding->charbuf;
6390 coding->chars_at_source = 0;
6391 }
6392
6393
6394 /* Encode the text at CODING->src_object into CODING->dst_object.
6395 CODING->src_object is a buffer or a string.
6396 CODING->dst_object is a buffer or nil.
6397
6398 If CODING->src_object is a buffer, it must be the current buffer.
6399 In this case, if CODING->src_pos is positive, it is a position of
6400 the source text in the buffer, otherwise. the source text is in the
6401 gap area of the buffer, and coding->src_pos specifies the offset of
6402 the text from GPT (which must be the same as PT). If this is the
6403 same buffer as CODING->dst_object, CODING->src_pos must be
6404 negative and CODING should not have `pre-write-conversion'.
6405
6406 If CODING->src_object is a string, CODING should not have
6407 `pre-write-conversion'.
6408
6409 If CODING->dst_object is a buffer, the encoded data is inserted at
6410 the current point of that buffer.
6411
6412 If CODING->dst_object is nil, the encoded data is placed at the
6413 memory area specified by CODING->destination. */
6414
6415 static int
6416 encode_coding (coding)
6417 struct coding_system *coding;
6418 {
6419 Lisp_Object attrs;
6420 Lisp_Object translation_table;
6421 int max_lookup;
6422
6423 attrs = CODING_ID_ATTRS (coding->id);
6424 translation_table = get_translation_table (attrs, 1, &max_lookup);
6425
6426 if (BUFFERP (coding->dst_object))
6427 {
6428 set_buffer_internal (XBUFFER (coding->dst_object));
6429 coding->dst_multibyte
6430 = ! NILP (current_buffer->enable_multibyte_characters);
6431 }
6432
6433 coding->consumed = coding->consumed_char = 0;
6434 coding->produced = coding->produced_char = 0;
6435 record_conversion_result (coding, CODING_RESULT_SUCCESS);
6436 coding->errors = 0;
6437
6438 ALLOC_CONVERSION_WORK_AREA (coding);
6439
6440 do {
6441 coding_set_source (coding);
6442 consume_chars (coding, translation_table, max_lookup);
6443 coding_set_destination (coding);
6444 (*(coding->encoder)) (coding);
6445 } while (coding->consumed_char < coding->src_chars);
6446
6447 if (BUFFERP (coding->dst_object))
6448 insert_from_gap (coding->produced_char, coding->produced);
6449
6450 return (coding->result);
6451 }
6452
6453
6454 /* Name (or base name) of work buffer for code conversion. */
6455 static Lisp_Object Vcode_conversion_workbuf_name;
6456
6457 /* A working buffer used by the top level conversion. Once it is
6458 created, it is never destroyed. It has the name
6459 Vcode_conversion_workbuf_name. The other working buffers are
6460 destroyed after the use is finished, and their names are modified
6461 versions of Vcode_conversion_workbuf_name. */
6462 static Lisp_Object Vcode_conversion_reused_workbuf;
6463
6464 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6465 static int reused_workbuf_in_use;
6466
6467
6468 /* Return a working buffer of code convesion. MULTIBYTE specifies the
6469 multibyteness of returning buffer. */
6470
6471 static Lisp_Object
6472 make_conversion_work_buffer (multibyte)
6473 int multibyte;
6474 {
6475 Lisp_Object name, workbuf;
6476 struct buffer *current;
6477
6478 if (reused_workbuf_in_use++)
6479 {
6480 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6481 workbuf = Fget_buffer_create (name);
6482 }
6483 else
6484 {
6485 name = Vcode_conversion_workbuf_name;
6486 workbuf = Fget_buffer_create (name);
6487 if (NILP (Vcode_conversion_reused_workbuf))
6488 Vcode_conversion_reused_workbuf = workbuf;
6489 }
6490 current = current_buffer;
6491 set_buffer_internal (XBUFFER (workbuf));
6492 Ferase_buffer ();
6493 current_buffer->undo_list = Qt;
6494 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6495 set_buffer_internal (current);
6496 return workbuf;
6497 }
6498
6499
6500 static Lisp_Object
6501 code_conversion_restore (arg)
6502 Lisp_Object arg;
6503 {
6504 Lisp_Object current, workbuf;
6505
6506 current = XCAR (arg);
6507 workbuf = XCDR (arg);
6508 if (! NILP (workbuf))
6509 {
6510 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6511 reused_workbuf_in_use = 0;
6512 else if (! NILP (Fbuffer_live_p (workbuf)))
6513 Fkill_buffer (workbuf);
6514 }
6515 set_buffer_internal (XBUFFER (current));
6516 return Qnil;
6517 }
6518
6519 Lisp_Object
6520 code_conversion_save (with_work_buf, multibyte)
6521 int with_work_buf, multibyte;
6522 {
6523 Lisp_Object workbuf = Qnil;
6524
6525 if (with_work_buf)
6526 workbuf = make_conversion_work_buffer (multibyte);
6527 record_unwind_protect (code_conversion_restore,
6528 Fcons (Fcurrent_buffer (), workbuf));
6529 return workbuf;
6530 }
6531
6532 int
6533 decode_coding_gap (coding, chars, bytes)
6534 struct coding_system *coding;
6535 EMACS_INT chars, bytes;
6536 {
6537 int count = specpdl_ptr - specpdl;
6538 Lisp_Object attrs;
6539
6540 code_conversion_save (0, 0);
6541
6542 coding->src_object = Fcurrent_buffer ();
6543 coding->src_chars = chars;
6544 coding->src_bytes = bytes;
6545 coding->src_pos = -chars;
6546 coding->src_pos_byte = -bytes;
6547 coding->src_multibyte = chars < bytes;
6548 coding->dst_object = coding->src_object;
6549 coding->dst_pos = PT;
6550 coding->dst_pos_byte = PT_BYTE;
6551 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6552 coding->mode |= CODING_MODE_LAST_BLOCK;
6553
6554 if (CODING_REQUIRE_DETECTION (coding))
6555 detect_coding (coding);
6556
6557 decode_coding (coding);
6558
6559 attrs = CODING_ID_ATTRS (coding->id);
6560 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6561 {
6562 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6563 Lisp_Object val;
6564
6565 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6566 val = call1 (CODING_ATTR_POST_READ (attrs),
6567 make_number (coding->produced_char));
6568 CHECK_NATNUM (val);
6569 coding->produced_char += Z - prev_Z;
6570 coding->produced += Z_BYTE - prev_Z_BYTE;
6571 }
6572
6573 unbind_to (count, Qnil);
6574 return coding->result;
6575 }
6576
6577 int
6578 encode_coding_gap (coding, chars, bytes)
6579 struct coding_system *coding;
6580 EMACS_INT chars, bytes;
6581 {
6582 int count = specpdl_ptr - specpdl;
6583
6584 code_conversion_save (0, 0);
6585
6586 coding->src_object = Fcurrent_buffer ();
6587 coding->src_chars = chars;
6588 coding->src_bytes = bytes;
6589 coding->src_pos = -chars;
6590 coding->src_pos_byte = -bytes;
6591 coding->src_multibyte = chars < bytes;
6592 coding->dst_object = coding->src_object;
6593 coding->dst_pos = PT;
6594 coding->dst_pos_byte = PT_BYTE;
6595
6596 encode_coding (coding);
6597
6598 unbind_to (count, Qnil);
6599 return coding->result;
6600 }
6601
6602
6603 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6604 SRC_OBJECT into DST_OBJECT by coding context CODING.
6605
6606 SRC_OBJECT is a buffer, a string, or Qnil.
6607
6608 If it is a buffer, the text is at point of the buffer. FROM and TO
6609 are positions in the buffer.
6610
6611 If it is a string, the text is at the beginning of the string.
6612 FROM and TO are indices to the string.
6613
6614 If it is nil, the text is at coding->source. FROM and TO are
6615 indices to coding->source.
6616
6617 DST_OBJECT is a buffer, Qt, or Qnil.
6618
6619 If it is a buffer, the decoded text is inserted at point of the
6620 buffer. If the buffer is the same as SRC_OBJECT, the source text
6621 is deleted.
6622
6623 If it is Qt, a string is made from the decoded text, and
6624 set in CODING->dst_object.
6625
6626 If it is Qnil, the decoded text is stored at CODING->destination.
6627 The caller must allocate CODING->dst_bytes bytes at
6628 CODING->destination by xmalloc. If the decoded text is longer than
6629 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6630 */
6631
6632 void
6633 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6634 dst_object)
6635 struct coding_system *coding;
6636 Lisp_Object src_object;
6637 EMACS_INT from, from_byte, to, to_byte;
6638 Lisp_Object dst_object;
6639 {
6640 int count = specpdl_ptr - specpdl;
6641 unsigned char *destination;
6642 EMACS_INT dst_bytes;
6643 EMACS_INT chars = to - from;
6644 EMACS_INT bytes = to_byte - from_byte;
6645 Lisp_Object attrs;
6646 Lisp_Object buffer;
6647 int saved_pt = -1, saved_pt_byte;
6648
6649 buffer = Fcurrent_buffer ();
6650
6651 if (NILP (dst_object))
6652 {
6653 destination = coding->destination;
6654 dst_bytes = coding->dst_bytes;
6655 }
6656
6657 coding->src_object = src_object;
6658 coding->src_chars = chars;
6659 coding->src_bytes = bytes;
6660 coding->src_multibyte = chars < bytes;
6661
6662 if (STRINGP (src_object))
6663 {
6664 coding->src_pos = from;
6665 coding->src_pos_byte = from_byte;
6666 }
6667 else if (BUFFERP (src_object))
6668 {
6669 set_buffer_internal (XBUFFER (src_object));
6670 if (from != GPT)
6671 move_gap_both (from, from_byte);
6672 if (EQ (src_object, dst_object))
6673 {
6674 saved_pt = PT, saved_pt_byte = PT_BYTE;
6675 TEMP_SET_PT_BOTH (from, from_byte);
6676 del_range_both (from, from_byte, to, to_byte, 1);
6677 coding->src_pos = -chars;
6678 coding->src_pos_byte = -bytes;
6679 }
6680 else
6681 {
6682 coding->src_pos = from;
6683 coding->src_pos_byte = from_byte;
6684 }
6685 }
6686
6687 if (CODING_REQUIRE_DETECTION (coding))
6688 detect_coding (coding);
6689 attrs = CODING_ID_ATTRS (coding->id);
6690
6691 if (EQ (dst_object, Qt)
6692 || (! NILP (CODING_ATTR_POST_READ (attrs))
6693 && NILP (dst_object)))
6694 {
6695 coding->dst_object = code_conversion_save (1, 1);
6696 coding->dst_pos = BEG;
6697 coding->dst_pos_byte = BEG_BYTE;
6698 coding->dst_multibyte = 1;
6699 }
6700 else if (BUFFERP (dst_object))
6701 {
6702 code_conversion_save (0, 0);
6703 coding->dst_object = dst_object;
6704 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6705 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6706 coding->dst_multibyte
6707 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6708 }
6709 else
6710 {
6711 code_conversion_save (0, 0);
6712 coding->dst_object = Qnil;
6713 coding->dst_multibyte = 1;
6714 }
6715
6716 decode_coding (coding);
6717
6718 if (BUFFERP (coding->dst_object))
6719 set_buffer_internal (XBUFFER (coding->dst_object));
6720
6721 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6722 {
6723 struct gcpro gcpro1, gcpro2;
6724 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6725 Lisp_Object val;
6726
6727 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6728 GCPRO2 (coding->src_object, coding->dst_object);
6729 val = call1 (CODING_ATTR_POST_READ (attrs),
6730 make_number (coding->produced_char));
6731 UNGCPRO;
6732 CHECK_NATNUM (val);
6733 coding->produced_char += Z - prev_Z;
6734 coding->produced += Z_BYTE - prev_Z_BYTE;
6735 }
6736
6737 if (EQ (dst_object, Qt))
6738 {
6739 coding->dst_object = Fbuffer_string ();
6740 }
6741 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6742 {
6743 set_buffer_internal (XBUFFER (coding->dst_object));
6744 if (dst_bytes < coding->produced)
6745 {
6746 destination
6747 = (unsigned char *) xrealloc (destination, coding->produced);
6748 if (! destination)
6749 {
6750 record_conversion_result (coding,
6751 CODING_RESULT_INSUFFICIENT_DST);
6752 unbind_to (count, Qnil);
6753 return;
6754 }
6755 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6756 move_gap_both (BEGV, BEGV_BYTE);
6757 bcopy (BEGV_ADDR, destination, coding->produced);
6758 coding->destination = destination;
6759 }
6760 }
6761
6762 if (saved_pt >= 0)
6763 {
6764 /* This is the case of:
6765 (BUFFERP (src_object) && EQ (src_object, dst_object))
6766 As we have moved PT while replacing the original buffer
6767 contents, we must recover it now. */
6768 set_buffer_internal (XBUFFER (src_object));
6769 if (saved_pt < from)
6770 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6771 else if (saved_pt < from + chars)
6772 TEMP_SET_PT_BOTH (from, from_byte);
6773 else if (! NILP (current_buffer->enable_multibyte_characters))
6774 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6775 saved_pt_byte + (coding->produced - bytes));
6776 else
6777 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6778 saved_pt_byte + (coding->produced - bytes));
6779 }
6780
6781 unbind_to (count, coding->dst_object);
6782 }
6783
6784
6785 void
6786 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6787 dst_object)
6788 struct coding_system *coding;
6789 Lisp_Object src_object;
6790 EMACS_INT from, from_byte, to, to_byte;
6791 Lisp_Object dst_object;
6792 {
6793 int count = specpdl_ptr - specpdl;
6794 EMACS_INT chars = to - from;
6795 EMACS_INT bytes = to_byte - from_byte;
6796 Lisp_Object attrs;
6797 Lisp_Object buffer;
6798 int saved_pt = -1, saved_pt_byte;
6799
6800 buffer = Fcurrent_buffer ();
6801
6802 coding->src_object = src_object;
6803 coding->src_chars = chars;
6804 coding->src_bytes = bytes;
6805 coding->src_multibyte = chars < bytes;
6806
6807 attrs = CODING_ID_ATTRS (coding->id);
6808
6809 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6810 {
6811 coding->src_object = code_conversion_save (1, coding->src_multibyte);
6812 set_buffer_internal (XBUFFER (coding->src_object));
6813 if (STRINGP (src_object))
6814 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6815 else if (BUFFERP (src_object))
6816 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6817 else
6818 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6819
6820 if (EQ (src_object, dst_object))
6821 {
6822 set_buffer_internal (XBUFFER (src_object));
6823 saved_pt = PT, saved_pt_byte = PT_BYTE;
6824 del_range_both (from, from_byte, to, to_byte, 1);
6825 set_buffer_internal (XBUFFER (coding->src_object));
6826 }
6827
6828 call2 (CODING_ATTR_PRE_WRITE (attrs),
6829 make_number (BEG), make_number (Z));
6830 coding->src_object = Fcurrent_buffer ();
6831 if (BEG != GPT)
6832 move_gap_both (BEG, BEG_BYTE);
6833 coding->src_chars = Z - BEG;
6834 coding->src_bytes = Z_BYTE - BEG_BYTE;
6835 coding->src_pos = BEG;
6836 coding->src_pos_byte = BEG_BYTE;
6837 coding->src_multibyte = Z < Z_BYTE;
6838 }
6839 else if (STRINGP (src_object))
6840 {
6841 code_conversion_save (0, 0);
6842 coding->src_pos = from;
6843 coding->src_pos_byte = from_byte;
6844 }
6845 else if (BUFFERP (src_object))
6846 {
6847 code_conversion_save (0, 0);
6848 set_buffer_internal (XBUFFER (src_object));
6849 if (EQ (src_object, dst_object))
6850 {
6851 saved_pt = PT, saved_pt_byte = PT_BYTE;
6852 coding->src_object = del_range_1 (from, to, 1, 1);
6853 coding->src_pos = 0;
6854 coding->src_pos_byte = 0;
6855 }
6856 else
6857 {
6858 if (from < GPT && to >= GPT)
6859 move_gap_both (from, from_byte);
6860 coding->src_pos = from;
6861 coding->src_pos_byte = from_byte;
6862 }
6863 }
6864 else
6865 code_conversion_save (0, 0);
6866
6867 if (BUFFERP (dst_object))
6868 {
6869 coding->dst_object = dst_object;
6870 if (EQ (src_object, dst_object))
6871 {
6872 coding->dst_pos = from;
6873 coding->dst_pos_byte = from_byte;
6874 }
6875 else
6876 {
6877 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6878 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6879 }
6880 coding->dst_multibyte
6881 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6882 }
6883 else if (EQ (dst_object, Qt))
6884 {
6885 coding->dst_object = Qnil;
6886 coding->dst_bytes = coding->src_chars;
6887 if (coding->dst_bytes == 0)
6888 coding->dst_bytes = 1;
6889 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6890 coding->dst_multibyte = 0;
6891 }
6892 else
6893 {
6894 coding->dst_object = Qnil;
6895 coding->dst_multibyte = 0;
6896 }
6897
6898 encode_coding (coding);
6899
6900 if (EQ (dst_object, Qt))
6901 {
6902 if (BUFFERP (coding->dst_object))
6903 coding->dst_object = Fbuffer_string ();
6904 else
6905 {
6906 coding->dst_object
6907 = make_unibyte_string ((char *) coding->destination,
6908 coding->produced);
6909 xfree (coding->destination);
6910 }
6911 }
6912
6913 if (saved_pt >= 0)
6914 {
6915 /* This is the case of:
6916 (BUFFERP (src_object) && EQ (src_object, dst_object))
6917 As we have moved PT while replacing the original buffer
6918 contents, we must recover it now. */
6919 set_buffer_internal (XBUFFER (src_object));
6920 if (saved_pt < from)
6921 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6922 else if (saved_pt < from + chars)
6923 TEMP_SET_PT_BOTH (from, from_byte);
6924 else if (! NILP (current_buffer->enable_multibyte_characters))
6925 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6926 saved_pt_byte + (coding->produced - bytes));
6927 else
6928 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6929 saved_pt_byte + (coding->produced - bytes));
6930 }
6931
6932 unbind_to (count, Qnil);
6933 }
6934
6935
6936 Lisp_Object
6937 preferred_coding_system ()
6938 {
6939 int id = coding_categories[coding_priorities[0]].id;
6940
6941 return CODING_ID_NAME (id);
6942 }
6943
6944 \f
6945 #ifdef emacs
6946 /*** 8. Emacs Lisp library functions ***/
6947
6948 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6949 doc: /* Return t if OBJECT is nil or a coding-system.
6950 See the documentation of `define-coding-system' for information
6951 about coding-system objects. */)
6952 (obj)
6953 Lisp_Object obj;
6954 {
6955 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6956 }
6957
6958 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6959 Sread_non_nil_coding_system, 1, 1, 0,
6960 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6961 (prompt)
6962 Lisp_Object prompt;
6963 {
6964 Lisp_Object val;
6965 do
6966 {
6967 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6968 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6969 }
6970 while (SCHARS (val) == 0);
6971 return (Fintern (val, Qnil));
6972 }
6973
6974 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6975 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6976 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6977 (prompt, default_coding_system)
6978 Lisp_Object prompt, default_coding_system;
6979 {
6980 Lisp_Object val;
6981 if (SYMBOLP (default_coding_system))
6982 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
6983 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6984 Qt, Qnil, Qcoding_system_history,
6985 default_coding_system, Qnil);
6986 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6987 }
6988
6989 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6990 1, 1, 0,
6991 doc: /* Check validity of CODING-SYSTEM.
6992 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6993 (coding_system)
6994 Lisp_Object coding_system;
6995 {
6996 CHECK_SYMBOL (coding_system);
6997 if (!NILP (Fcoding_system_p (coding_system)))
6998 return coding_system;
6999 while (1)
7000 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7001 }
7002
7003 \f
7004 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7005 HIGHEST is nonzero, return the coding system of the highest
7006 priority among the detected coding systems. Otherwize return a
7007 list of detected coding systems sorted by their priorities. If
7008 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7009 multibyte form but contains only ASCII and eight-bit chars.
7010 Otherwise, the bytes are raw bytes.
7011
7012 CODING-SYSTEM controls the detection as below:
7013
7014 If it is nil, detect both text-format and eol-format. If the
7015 text-format part of CODING-SYSTEM is already specified
7016 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7017 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7018 detect only text-format. */
7019
7020 Lisp_Object
7021 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7022 coding_system)
7023 const unsigned char *src;
7024 int src_chars, src_bytes, highest;
7025 int multibytep;
7026 Lisp_Object coding_system;
7027 {
7028 const unsigned char *src_end = src + src_bytes;
7029 Lisp_Object attrs, eol_type;
7030 Lisp_Object val;
7031 struct coding_system coding;
7032 int id;
7033 struct coding_detection_info detect_info;
7034 enum coding_category base_category;
7035
7036 if (NILP (coding_system))
7037 coding_system = Qundecided;
7038 setup_coding_system (coding_system, &coding);
7039 attrs = CODING_ID_ATTRS (coding.id);
7040 eol_type = CODING_ID_EOL_TYPE (coding.id);
7041 coding_system = CODING_ATTR_BASE_NAME (attrs);
7042
7043 coding.source = src;
7044 coding.src_chars = src_chars;
7045 coding.src_bytes = src_bytes;
7046 coding.src_multibyte = multibytep;
7047 coding.consumed = 0;
7048 coding.mode |= CODING_MODE_LAST_BLOCK;
7049
7050 detect_info.checked = detect_info.found = detect_info.rejected = 0;
7051
7052 /* At first, detect text-format if necessary. */
7053 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7054 if (base_category == coding_category_undecided)
7055 {
7056 enum coding_category category;
7057 struct coding_system *this;
7058 int c, i;
7059
7060 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7061 for (i = 0; src < src_end; i++, src++)
7062 {
7063 c = *src;
7064 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
7065 || c == ISO_CODE_SI
7066 || c == ISO_CODE_SO)))
7067 break;
7068 }
7069 coding.head_ascii = src - coding.source;
7070
7071 if (src < src_end)
7072 for (i = 0; i < coding_category_raw_text; i++)
7073 {
7074 category = coding_priorities[i];
7075 this = coding_categories + category;
7076
7077 if (this->id < 0)
7078 {
7079 /* No coding system of this category is defined. */
7080 detect_info.rejected |= (1 << category);
7081 }
7082 else if (category >= coding_category_raw_text)
7083 continue;
7084 else if (detect_info.checked & (1 << category))
7085 {
7086 if (highest
7087 && (detect_info.found & (1 << category)))
7088 break;
7089 }
7090 else
7091 {
7092 if ((*(this->detector)) (&coding, &detect_info)
7093 && highest
7094 && (detect_info.found & (1 << category)))
7095 {
7096 if (category == coding_category_utf_16_auto)
7097 {
7098 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7099 category = coding_category_utf_16_le;
7100 else
7101 category = coding_category_utf_16_be;
7102 }
7103 break;
7104 }
7105 }
7106 }
7107
7108 if (detect_info.rejected == CATEGORY_MASK_ANY)
7109 {
7110 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7111 id = coding_categories[coding_category_raw_text].id;
7112 val = Fcons (make_number (id), Qnil);
7113 }
7114 else if (! detect_info.rejected && ! detect_info.found)
7115 {
7116 detect_info.found = CATEGORY_MASK_ANY;
7117 id = coding_categories[coding_category_undecided].id;
7118 val = Fcons (make_number (id), Qnil);
7119 }
7120 else if (highest)
7121 {
7122 if (detect_info.found)
7123 {
7124 detect_info.found = 1 << category;
7125 val = Fcons (make_number (this->id), Qnil);
7126 }
7127 else
7128 for (i = 0; i < coding_category_raw_text; i++)
7129 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7130 {
7131 detect_info.found = 1 << coding_priorities[i];
7132 id = coding_categories[coding_priorities[i]].id;
7133 val = Fcons (make_number (id), Qnil);
7134 break;
7135 }
7136 }
7137 else
7138 {
7139 int mask = detect_info.rejected | detect_info.found;
7140 int found = 0;
7141 val = Qnil;
7142
7143 for (i = coding_category_raw_text - 1; i >= 0; i--)
7144 {
7145 category = coding_priorities[i];
7146 if (! (mask & (1 << category)))
7147 {
7148 found |= 1 << category;
7149 id = coding_categories[category].id;
7150 val = Fcons (make_number (id), val);
7151 }
7152 }
7153 for (i = coding_category_raw_text - 1; i >= 0; i--)
7154 {
7155 category = coding_priorities[i];
7156 if (detect_info.found & (1 << category))
7157 {
7158 id = coding_categories[category].id;
7159 val = Fcons (make_number (id), val);
7160 }
7161 }
7162 detect_info.found |= found;
7163 }
7164 }
7165 else if (base_category == coding_category_utf_16_auto)
7166 {
7167 if (detect_coding_utf_16 (&coding, &detect_info))
7168 {
7169 enum coding_category category;
7170 struct coding_system *this;
7171
7172 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7173 this = coding_categories + coding_category_utf_16_le;
7174 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7175 this = coding_categories + coding_category_utf_16_be;
7176 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7177 this = coding_categories + coding_category_utf_16_be_nosig;
7178 else
7179 this = coding_categories + coding_category_utf_16_le_nosig;
7180 val = Fcons (make_number (this->id), Qnil);
7181 }
7182 }
7183 else
7184 {
7185 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7186 val = Fcons (make_number (coding.id), Qnil);
7187 }
7188
7189 /* Then, detect eol-format if necessary. */
7190 {
7191 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7192 Lisp_Object tail;
7193
7194 if (VECTORP (eol_type))
7195 {
7196 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7197 normal_eol = detect_eol (coding.source, src_bytes,
7198 coding_category_raw_text);
7199 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7200 | CATEGORY_MASK_UTF_16_BE_NOSIG))
7201 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7202 coding_category_utf_16_be);
7203 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7204 | CATEGORY_MASK_UTF_16_LE_NOSIG))
7205 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7206 coding_category_utf_16_le);
7207 }
7208 else
7209 {
7210 if (EQ (eol_type, Qunix))
7211 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7212 else if (EQ (eol_type, Qdos))
7213 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7214 else
7215 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7216 }
7217
7218 for (tail = val; CONSP (tail); tail = XCDR (tail))
7219 {
7220 enum coding_category category;
7221 int this_eol;
7222
7223 id = XINT (XCAR (tail));
7224 attrs = CODING_ID_ATTRS (id);
7225 category = XINT (CODING_ATTR_CATEGORY (attrs));
7226 eol_type = CODING_ID_EOL_TYPE (id);
7227 if (VECTORP (eol_type))
7228 {
7229 if (category == coding_category_utf_16_be
7230 || category == coding_category_utf_16_be_nosig)
7231 this_eol = utf_16_be_eol;
7232 else if (category == coding_category_utf_16_le
7233 || category == coding_category_utf_16_le_nosig)
7234 this_eol = utf_16_le_eol;
7235 else
7236 this_eol = normal_eol;
7237
7238 if (this_eol == EOL_SEEN_LF)
7239 XSETCAR (tail, AREF (eol_type, 0));
7240 else if (this_eol == EOL_SEEN_CRLF)
7241 XSETCAR (tail, AREF (eol_type, 1));
7242 else if (this_eol == EOL_SEEN_CR)
7243 XSETCAR (tail, AREF (eol_type, 2));
7244 else
7245 XSETCAR (tail, CODING_ID_NAME (id));
7246 }
7247 else
7248 XSETCAR (tail, CODING_ID_NAME (id));
7249 }
7250 }
7251
7252 return (highest ? XCAR (val) : val);
7253 }
7254
7255
7256 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7257 2, 3, 0,
7258 doc: /* Detect coding system of the text in the region between START and END.
7259 Return a list of possible coding systems ordered by priority.
7260
7261 If only ASCII characters are found, it returns a list of single element
7262 `undecided' or its subsidiary coding system according to a detected
7263 end-of-line format.
7264
7265 If optional argument HIGHEST is non-nil, return the coding system of
7266 highest priority. */)
7267 (start, end, highest)
7268 Lisp_Object start, end, highest;
7269 {
7270 int from, to;
7271 int from_byte, to_byte;
7272
7273 CHECK_NUMBER_COERCE_MARKER (start);
7274 CHECK_NUMBER_COERCE_MARKER (end);
7275
7276 validate_region (&start, &end);
7277 from = XINT (start), to = XINT (end);
7278 from_byte = CHAR_TO_BYTE (from);
7279 to_byte = CHAR_TO_BYTE (to);
7280
7281 if (from < GPT && to >= GPT)
7282 move_gap_both (to, to_byte);
7283
7284 return detect_coding_system (BYTE_POS_ADDR (from_byte),
7285 to - from, to_byte - from_byte,
7286 !NILP (highest),
7287 !NILP (current_buffer
7288 ->enable_multibyte_characters),
7289 Qnil);
7290 }
7291
7292 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7293 1, 2, 0,
7294 doc: /* Detect coding system of the text in STRING.
7295 Return a list of possible coding systems ordered by priority.
7296
7297 If only ASCII characters are found, it returns a list of single element
7298 `undecided' or its subsidiary coding system according to a detected
7299 end-of-line format.
7300
7301 If optional argument HIGHEST is non-nil, return the coding system of
7302 highest priority. */)
7303 (string, highest)
7304 Lisp_Object string, highest;
7305 {
7306 CHECK_STRING (string);
7307
7308 return detect_coding_system (SDATA (string),
7309 SCHARS (string), SBYTES (string),
7310 !NILP (highest), STRING_MULTIBYTE (string),
7311 Qnil);
7312 }
7313
7314
7315 static INLINE int
7316 char_encodable_p (c, attrs)
7317 int c;
7318 Lisp_Object attrs;
7319 {
7320 Lisp_Object tail;
7321 struct charset *charset;
7322 Lisp_Object translation_table;
7323
7324 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7325 if (! NILP (translation_table))
7326 c = translate_char (translation_table, c);
7327 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7328 CONSP (tail); tail = XCDR (tail))
7329 {
7330 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7331 if (CHAR_CHARSET_P (c, charset))
7332 break;
7333 }
7334 return (! NILP (tail));
7335 }
7336
7337
7338 /* Return a list of coding systems that safely encode the text between
7339 START and END. If EXCLUDE is non-nil, it is a list of coding
7340 systems not to check. The returned list doesn't contain any such
7341 coding systems. In any case, if the text contains only ASCII or is
7342 unibyte, return t. */
7343
7344 DEFUN ("find-coding-systems-region-internal",
7345 Ffind_coding_systems_region_internal,
7346 Sfind_coding_systems_region_internal, 2, 3, 0,
7347 doc: /* Internal use only. */)
7348 (start, end, exclude)
7349 Lisp_Object start, end, exclude;
7350 {
7351 Lisp_Object coding_attrs_list, safe_codings;
7352 EMACS_INT start_byte, end_byte;
7353 const unsigned char *p, *pbeg, *pend;
7354 int c;
7355 Lisp_Object tail, elt;
7356
7357 if (STRINGP (start))
7358 {
7359 if (!STRING_MULTIBYTE (start)
7360 || SCHARS (start) == SBYTES (start))
7361 return Qt;
7362 start_byte = 0;
7363 end_byte = SBYTES (start);
7364 }
7365 else
7366 {
7367 CHECK_NUMBER_COERCE_MARKER (start);
7368 CHECK_NUMBER_COERCE_MARKER (end);
7369 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7370 args_out_of_range (start, end);
7371 if (NILP (current_buffer->enable_multibyte_characters))
7372 return Qt;
7373 start_byte = CHAR_TO_BYTE (XINT (start));
7374 end_byte = CHAR_TO_BYTE (XINT (end));
7375 if (XINT (end) - XINT (start) == end_byte - start_byte)
7376 return Qt;
7377
7378 if (XINT (start) < GPT && XINT (end) > GPT)
7379 {
7380 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7381 move_gap_both (XINT (start), start_byte);
7382 else
7383 move_gap_both (XINT (end), end_byte);
7384 }
7385 }
7386
7387 coding_attrs_list = Qnil;
7388 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7389 if (NILP (exclude)
7390 || NILP (Fmemq (XCAR (tail), exclude)))
7391 {
7392 Lisp_Object attrs;
7393
7394 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7395 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7396 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7397 {
7398 ASET (attrs, coding_attr_trans_tbl,
7399 get_translation_table (attrs, 1, NULL));
7400 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7401 }
7402 }
7403
7404 if (STRINGP (start))
7405 p = pbeg = SDATA (start);
7406 else
7407 p = pbeg = BYTE_POS_ADDR (start_byte);
7408 pend = p + (end_byte - start_byte);
7409
7410 while (p < pend && ASCII_BYTE_P (*p)) p++;
7411 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7412
7413 while (p < pend)
7414 {
7415 if (ASCII_BYTE_P (*p))
7416 p++;
7417 else
7418 {
7419 c = STRING_CHAR_ADVANCE (p);
7420
7421 charset_map_loaded = 0;
7422 for (tail = coding_attrs_list; CONSP (tail);)
7423 {
7424 elt = XCAR (tail);
7425 if (NILP (elt))
7426 tail = XCDR (tail);
7427 else if (char_encodable_p (c, elt))
7428 tail = XCDR (tail);
7429 else if (CONSP (XCDR (tail)))
7430 {
7431 XSETCAR (tail, XCAR (XCDR (tail)));
7432 XSETCDR (tail, XCDR (XCDR (tail)));
7433 }
7434 else
7435 {
7436 XSETCAR (tail, Qnil);
7437 tail = XCDR (tail);
7438 }
7439 }
7440 if (charset_map_loaded)
7441 {
7442 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7443
7444 if (STRINGP (start))
7445 pbeg = SDATA (start);
7446 else
7447 pbeg = BYTE_POS_ADDR (start_byte);
7448 p = pbeg + p_offset;
7449 pend = pbeg + pend_offset;
7450 }
7451 }
7452 }
7453
7454 safe_codings = Qnil;
7455 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7456 if (! NILP (XCAR (tail)))
7457 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7458
7459 return safe_codings;
7460 }
7461
7462
7463 DEFUN ("unencodable-char-position", Funencodable_char_position,
7464 Sunencodable_char_position, 3, 5, 0,
7465 doc: /*
7466 Return position of first un-encodable character in a region.
7467 START and END specfiy the region and CODING-SYSTEM specifies the
7468 encoding to check. Return nil if CODING-SYSTEM does encode the region.
7469
7470 If optional 4th argument COUNT is non-nil, it specifies at most how
7471 many un-encodable characters to search. In this case, the value is a
7472 list of positions.
7473
7474 If optional 5th argument STRING is non-nil, it is a string to search
7475 for un-encodable characters. In that case, START and END are indexes
7476 to the string. */)
7477 (start, end, coding_system, count, string)
7478 Lisp_Object start, end, coding_system, count, string;
7479 {
7480 int n;
7481 struct coding_system coding;
7482 Lisp_Object attrs, charset_list, translation_table;
7483 Lisp_Object positions;
7484 int from, to;
7485 const unsigned char *p, *stop, *pend;
7486 int ascii_compatible;
7487
7488 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7489 attrs = CODING_ID_ATTRS (coding.id);
7490 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7491 return Qnil;
7492 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7493 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7494 translation_table = get_translation_table (attrs, 1, NULL);
7495
7496 if (NILP (string))
7497 {
7498 validate_region (&start, &end);
7499 from = XINT (start);
7500 to = XINT (end);
7501 if (NILP (current_buffer->enable_multibyte_characters)
7502 || (ascii_compatible
7503 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7504 return Qnil;
7505 p = CHAR_POS_ADDR (from);
7506 pend = CHAR_POS_ADDR (to);
7507 if (from < GPT && to >= GPT)
7508 stop = GPT_ADDR;
7509 else
7510 stop = pend;
7511 }
7512 else
7513 {
7514 CHECK_STRING (string);
7515 CHECK_NATNUM (start);
7516 CHECK_NATNUM (end);
7517 from = XINT (start);
7518 to = XINT (end);
7519 if (from > to
7520 || to > SCHARS (string))
7521 args_out_of_range_3 (string, start, end);
7522 if (! STRING_MULTIBYTE (string))
7523 return Qnil;
7524 p = SDATA (string) + string_char_to_byte (string, from);
7525 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7526 if (ascii_compatible && (to - from) == (pend - p))
7527 return Qnil;
7528 }
7529
7530 if (NILP (count))
7531 n = 1;
7532 else
7533 {
7534 CHECK_NATNUM (count);
7535 n = XINT (count);
7536 }
7537
7538 positions = Qnil;
7539 while (1)
7540 {
7541 int c;
7542
7543 if (ascii_compatible)
7544 while (p < stop && ASCII_BYTE_P (*p))
7545 p++, from++;
7546 if (p >= stop)
7547 {
7548 if (p >= pend)
7549 break;
7550 stop = pend;
7551 p = GAP_END_ADDR;
7552 }
7553
7554 c = STRING_CHAR_ADVANCE (p);
7555 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7556 && ! char_charset (translate_char (translation_table, c),
7557 charset_list, NULL))
7558 {
7559 positions = Fcons (make_number (from), positions);
7560 n--;
7561 if (n == 0)
7562 break;
7563 }
7564
7565 from++;
7566 }
7567
7568 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7569 }
7570
7571
7572 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7573 Scheck_coding_systems_region, 3, 3, 0,
7574 doc: /* Check if the region is encodable by coding systems.
7575
7576 START and END are buffer positions specifying the region.
7577 CODING-SYSTEM-LIST is a list of coding systems to check.
7578
7579 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7580 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7581 whole region, POS0, POS1, ... are buffer positions where non-encodable
7582 characters are found.
7583
7584 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7585 value is nil.
7586
7587 START may be a string. In that case, check if the string is
7588 encodable, and the value contains indices to the string instead of
7589 buffer positions. END is ignored. */)
7590 (start, end, coding_system_list)
7591 Lisp_Object start, end, coding_system_list;
7592 {
7593 Lisp_Object list;
7594 EMACS_INT start_byte, end_byte;
7595 int pos;
7596 const unsigned char *p, *pbeg, *pend;
7597 int c;
7598 Lisp_Object tail, elt, attrs;
7599
7600 if (STRINGP (start))
7601 {
7602 if (!STRING_MULTIBYTE (start)
7603 && SCHARS (start) != SBYTES (start))
7604 return Qnil;
7605 start_byte = 0;
7606 end_byte = SBYTES (start);
7607 pos = 0;
7608 }
7609 else
7610 {
7611 CHECK_NUMBER_COERCE_MARKER (start);
7612 CHECK_NUMBER_COERCE_MARKER (end);
7613 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7614 args_out_of_range (start, end);
7615 if (NILP (current_buffer->enable_multibyte_characters))
7616 return Qnil;
7617 start_byte = CHAR_TO_BYTE (XINT (start));
7618 end_byte = CHAR_TO_BYTE (XINT (end));
7619 if (XINT (end) - XINT (start) == end_byte - start_byte)
7620 return Qt;
7621
7622 if (XINT (start) < GPT && XINT (end) > GPT)
7623 {
7624 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7625 move_gap_both (XINT (start), start_byte);
7626 else
7627 move_gap_both (XINT (end), end_byte);
7628 }
7629 pos = XINT (start);
7630 }
7631
7632 list = Qnil;
7633 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7634 {
7635 elt = XCAR (tail);
7636 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7637 ASET (attrs, coding_attr_trans_tbl,
7638 get_translation_table (attrs, 1, NULL));
7639 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7640 }
7641
7642 if (STRINGP (start))
7643 p = pbeg = SDATA (start);
7644 else
7645 p = pbeg = BYTE_POS_ADDR (start_byte);
7646 pend = p + (end_byte - start_byte);
7647
7648 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7649 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7650
7651 while (p < pend)
7652 {
7653 if (ASCII_BYTE_P (*p))
7654 p++;
7655 else
7656 {
7657 c = STRING_CHAR_ADVANCE (p);
7658
7659 charset_map_loaded = 0;
7660 for (tail = list; CONSP (tail); tail = XCDR (tail))
7661 {
7662 elt = XCDR (XCAR (tail));
7663 if (! char_encodable_p (c, XCAR (elt)))
7664 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7665 }
7666 if (charset_map_loaded)
7667 {
7668 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7669
7670 if (STRINGP (start))
7671 pbeg = SDATA (start);
7672 else
7673 pbeg = BYTE_POS_ADDR (start_byte);
7674 p = pbeg + p_offset;
7675 pend = pbeg + pend_offset;
7676 }
7677 }
7678 pos++;
7679 }
7680
7681 tail = list;
7682 list = Qnil;
7683 for (; CONSP (tail); tail = XCDR (tail))
7684 {
7685 elt = XCAR (tail);
7686 if (CONSP (XCDR (XCDR (elt))))
7687 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7688 list);
7689 }
7690
7691 return list;
7692 }
7693
7694
7695 Lisp_Object
7696 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7697 Lisp_Object start, end, coding_system, dst_object;
7698 int encodep, norecord;
7699 {
7700 struct coding_system coding;
7701 EMACS_INT from, from_byte, to, to_byte;
7702 Lisp_Object src_object;
7703
7704 CHECK_NUMBER_COERCE_MARKER (start);
7705 CHECK_NUMBER_COERCE_MARKER (end);
7706 if (NILP (coding_system))
7707 coding_system = Qno_conversion;
7708 else
7709 CHECK_CODING_SYSTEM (coding_system);
7710 src_object = Fcurrent_buffer ();
7711 if (NILP (dst_object))
7712 dst_object = src_object;
7713 else if (! EQ (dst_object, Qt))
7714 CHECK_BUFFER (dst_object);
7715
7716 validate_region (&start, &end);
7717 from = XFASTINT (start);
7718 from_byte = CHAR_TO_BYTE (from);
7719 to = XFASTINT (end);
7720 to_byte = CHAR_TO_BYTE (to);
7721
7722 setup_coding_system (coding_system, &coding);
7723 coding.mode |= CODING_MODE_LAST_BLOCK;
7724
7725 if (encodep)
7726 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7727 dst_object);
7728 else
7729 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7730 dst_object);
7731 if (! norecord)
7732 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7733
7734 return (BUFFERP (dst_object)
7735 ? make_number (coding.produced_char)
7736 : coding.dst_object);
7737 }
7738
7739
7740 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7741 3, 4, "r\nzCoding system: ",
7742 doc: /* Decode the current region from the specified coding system.
7743 When called from a program, takes four arguments:
7744 START, END, CODING-SYSTEM, and DESTINATION.
7745 START and END are buffer positions.
7746
7747 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7748 If nil, the region between START and END is replace by the decoded text.
7749 If buffer, the decoded text is inserted in the buffer.
7750 If t, the decoded text is returned.
7751
7752 This function sets `last-coding-system-used' to the precise coding system
7753 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7754 not fully specified.)
7755 It returns the length of the decoded text. */)
7756 (start, end, coding_system, destination)
7757 Lisp_Object start, end, coding_system, destination;
7758 {
7759 return code_convert_region (start, end, coding_system, destination, 0, 0);
7760 }
7761
7762 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7763 3, 4, "r\nzCoding system: ",
7764 doc: /* Encode the current region by specified coding system.
7765 When called from a program, takes three arguments:
7766 START, END, and CODING-SYSTEM. START and END are buffer positions.
7767
7768 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7769 If nil, the region between START and END is replace by the encoded text.
7770 If buffer, the encoded text is inserted in the buffer.
7771 If t, the encoded text is returned.
7772
7773 This function sets `last-coding-system-used' to the precise coding system
7774 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7775 not fully specified.)
7776 It returns the length of the encoded text. */)
7777 (start, end, coding_system, destination)
7778 Lisp_Object start, end, coding_system, destination;
7779 {
7780 return code_convert_region (start, end, coding_system, destination, 1, 0);
7781 }
7782
7783 Lisp_Object
7784 code_convert_string (string, coding_system, dst_object,
7785 encodep, nocopy, norecord)
7786 Lisp_Object string, coding_system, dst_object;
7787 int encodep, nocopy, norecord;
7788 {
7789 struct coding_system coding;
7790 EMACS_INT chars, bytes;
7791
7792 CHECK_STRING (string);
7793 if (NILP (coding_system))
7794 {
7795 if (! norecord)
7796 Vlast_coding_system_used = Qno_conversion;
7797 if (NILP (dst_object))
7798 return (nocopy ? Fcopy_sequence (string) : string);
7799 }
7800
7801 if (NILP (coding_system))
7802 coding_system = Qno_conversion;
7803 else
7804 CHECK_CODING_SYSTEM (coding_system);
7805 if (NILP (dst_object))
7806 dst_object = Qt;
7807 else if (! EQ (dst_object, Qt))
7808 CHECK_BUFFER (dst_object);
7809
7810 setup_coding_system (coding_system, &coding);
7811 coding.mode |= CODING_MODE_LAST_BLOCK;
7812 chars = SCHARS (string);
7813 bytes = SBYTES (string);
7814 if (encodep)
7815 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7816 else
7817 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7818 if (! norecord)
7819 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7820
7821 return (BUFFERP (dst_object)
7822 ? make_number (coding.produced_char)
7823 : coding.dst_object);
7824 }
7825
7826
7827 /* Encode or decode STRING according to CODING_SYSTEM.
7828 Do not set Vlast_coding_system_used.
7829
7830 This function is called only from macros DECODE_FILE and
7831 ENCODE_FILE, thus we ignore character composition. */
7832
7833 Lisp_Object
7834 code_convert_string_norecord (string, coding_system, encodep)
7835 Lisp_Object string, coding_system;
7836 int encodep;
7837 {
7838 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7839 }
7840
7841
7842 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7843 2, 4, 0,
7844 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7845
7846 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7847 if the decoding operation is trivial.
7848
7849 Optional fourth arg BUFFER non-nil meant that the decoded text is
7850 inserted in BUFFER instead of returned as a string. In this case,
7851 the return value is BUFFER.
7852
7853 This function sets `last-coding-system-used' to the precise coding system
7854 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7855 not fully specified. */)
7856 (string, coding_system, nocopy, buffer)
7857 Lisp_Object string, coding_system, nocopy, buffer;
7858 {
7859 return code_convert_string (string, coding_system, buffer,
7860 0, ! NILP (nocopy), 0);
7861 }
7862
7863 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7864 2, 4, 0,
7865 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7866
7867 Optional third arg NOCOPY non-nil means it is OK to return STRING
7868 itself if the encoding operation is trivial.
7869
7870 Optional fourth arg BUFFER non-nil meant that the encoded text is
7871 inserted in BUFFER instead of returned as a string. In this case,
7872 the return value is BUFFER.
7873
7874 This function sets `last-coding-system-used' to the precise coding system
7875 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7876 not fully specified.) */)
7877 (string, coding_system, nocopy, buffer)
7878 Lisp_Object string, coding_system, nocopy, buffer;
7879 {
7880 return code_convert_string (string, coding_system, buffer,
7881 1, ! NILP (nocopy), 1);
7882 }
7883
7884 \f
7885 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7886 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7887 Return the corresponding character. */)
7888 (code)
7889 Lisp_Object code;
7890 {
7891 Lisp_Object spec, attrs, val;
7892 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7893 int c;
7894
7895 CHECK_NATNUM (code);
7896 c = XFASTINT (code);
7897 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7898 attrs = AREF (spec, 0);
7899
7900 if (ASCII_BYTE_P (c)
7901 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7902 return code;
7903
7904 val = CODING_ATTR_CHARSET_LIST (attrs);
7905 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7906 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7907 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7908
7909 if (c <= 0x7F)
7910 charset = charset_roman;
7911 else if (c >= 0xA0 && c < 0xDF)
7912 {
7913 charset = charset_kana;
7914 c -= 0x80;
7915 }
7916 else
7917 {
7918 int s1 = c >> 8, s2 = c & 0xFF;
7919
7920 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7921 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7922 error ("Invalid code: %d", code);
7923 SJIS_TO_JIS (c);
7924 charset = charset_kanji;
7925 }
7926 c = DECODE_CHAR (charset, c);
7927 if (c < 0)
7928 error ("Invalid code: %d", code);
7929 return make_number (c);
7930 }
7931
7932
7933 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7934 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7935 Return the corresponding code in SJIS. */)
7936 (ch)
7937 Lisp_Object ch;
7938 {
7939 Lisp_Object spec, attrs, charset_list;
7940 int c;
7941 struct charset *charset;
7942 unsigned code;
7943
7944 CHECK_CHARACTER (ch);
7945 c = XFASTINT (ch);
7946 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7947 attrs = AREF (spec, 0);
7948
7949 if (ASCII_CHAR_P (c)
7950 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7951 return ch;
7952
7953 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7954 charset = char_charset (c, charset_list, &code);
7955 if (code == CHARSET_INVALID_CODE (charset))
7956 error ("Can't encode by shift_jis encoding: %d", c);
7957 JIS_TO_SJIS (code);
7958
7959 return make_number (code);
7960 }
7961
7962 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7963 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7964 Return the corresponding character. */)
7965 (code)
7966 Lisp_Object code;
7967 {
7968 Lisp_Object spec, attrs, val;
7969 struct charset *charset_roman, *charset_big5, *charset;
7970 int c;
7971
7972 CHECK_NATNUM (code);
7973 c = XFASTINT (code);
7974 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7975 attrs = AREF (spec, 0);
7976
7977 if (ASCII_BYTE_P (c)
7978 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7979 return code;
7980
7981 val = CODING_ATTR_CHARSET_LIST (attrs);
7982 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7983 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7984
7985 if (c <= 0x7F)
7986 charset = charset_roman;
7987 else
7988 {
7989 int b1 = c >> 8, b2 = c & 0x7F;
7990 if (b1 < 0xA1 || b1 > 0xFE
7991 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7992 error ("Invalid code: %d", code);
7993 charset = charset_big5;
7994 }
7995 c = DECODE_CHAR (charset, (unsigned )c);
7996 if (c < 0)
7997 error ("Invalid code: %d", code);
7998 return make_number (c);
7999 }
8000
8001 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8002 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8003 Return the corresponding character code in Big5. */)
8004 (ch)
8005 Lisp_Object ch;
8006 {
8007 Lisp_Object spec, attrs, charset_list;
8008 struct charset *charset;
8009 int c;
8010 unsigned code;
8011
8012 CHECK_CHARACTER (ch);
8013 c = XFASTINT (ch);
8014 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8015 attrs = AREF (spec, 0);
8016 if (ASCII_CHAR_P (c)
8017 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8018 return ch;
8019
8020 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8021 charset = char_charset (c, charset_list, &code);
8022 if (code == CHARSET_INVALID_CODE (charset))
8023 error ("Can't encode by Big5 encoding: %d", c);
8024
8025 return make_number (code);
8026 }
8027
8028 \f
8029 DEFUN ("set-terminal-coding-system-internal",
8030 Fset_terminal_coding_system_internal,
8031 Sset_terminal_coding_system_internal, 1, 1, 0,
8032 doc: /* Internal use only. */)
8033 (coding_system)
8034 Lisp_Object coding_system;
8035 {
8036 CHECK_SYMBOL (coding_system);
8037 setup_coding_system (Fcheck_coding_system (coding_system),
8038 &terminal_coding);
8039
8040 /* We had better not send unsafe characters to terminal. */
8041 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8042 /* Characer composition should be disabled. */
8043 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8044 terminal_coding.src_multibyte = 1;
8045 terminal_coding.dst_multibyte = 0;
8046 return Qnil;
8047 }
8048
8049 DEFUN ("set-safe-terminal-coding-system-internal",
8050 Fset_safe_terminal_coding_system_internal,
8051 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8052 doc: /* Internal use only. */)
8053 (coding_system)
8054 Lisp_Object coding_system;
8055 {
8056 CHECK_SYMBOL (coding_system);
8057 setup_coding_system (Fcheck_coding_system (coding_system),
8058 &safe_terminal_coding);
8059 /* Characer composition should be disabled. */
8060 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8061 safe_terminal_coding.src_multibyte = 1;
8062 safe_terminal_coding.dst_multibyte = 0;
8063 return Qnil;
8064 }
8065
8066 DEFUN ("terminal-coding-system",
8067 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8068 doc: /* Return coding system specified for terminal output. */)
8069 ()
8070 {
8071 return CODING_ID_NAME (terminal_coding.id);
8072 }
8073
8074 DEFUN ("set-keyboard-coding-system-internal",
8075 Fset_keyboard_coding_system_internal,
8076 Sset_keyboard_coding_system_internal, 1, 1, 0,
8077 doc: /* Internal use only. */)
8078 (coding_system)
8079 Lisp_Object coding_system;
8080 {
8081 CHECK_SYMBOL (coding_system);
8082 setup_coding_system (Fcheck_coding_system (coding_system),
8083 &keyboard_coding);
8084 /* Characer composition should be disabled. */
8085 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8086 return Qnil;
8087 }
8088
8089 DEFUN ("keyboard-coding-system",
8090 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8091 doc: /* Return coding system specified for decoding keyboard input. */)
8092 ()
8093 {
8094 return CODING_ID_NAME (keyboard_coding.id);
8095 }
8096
8097 \f
8098 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8099 Sfind_operation_coding_system, 1, MANY, 0,
8100 doc: /* Choose a coding system for an operation based on the target name.
8101 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8102 DECODING-SYSTEM is the coding system to use for decoding
8103 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8104 for encoding (in case OPERATION does encoding).
8105
8106 The first argument OPERATION specifies an I/O primitive:
8107 For file I/O, `insert-file-contents' or `write-region'.
8108 For process I/O, `call-process', `call-process-region', or `start-process'.
8109 For network I/O, `open-network-stream'.
8110
8111 The remaining arguments should be the same arguments that were passed
8112 to the primitive. Depending on which primitive, one of those arguments
8113 is selected as the TARGET. For example, if OPERATION does file I/O,
8114 whichever argument specifies the file name is TARGET.
8115
8116 TARGET has a meaning which depends on OPERATION:
8117 For file I/O, TARGET is a file name.
8118 For process I/O, TARGET is a process name.
8119 For network I/O, TARGET is a service name or a port number
8120
8121 This function looks up what specified for TARGET in,
8122 `file-coding-system-alist', `process-coding-system-alist',
8123 or `network-coding-system-alist' depending on OPERATION.
8124 They may specify a coding system, a cons of coding systems,
8125 or a function symbol to call.
8126 In the last case, we call the function with one argument,
8127 which is a list of all the arguments given to this function.
8128
8129 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
8130 (nargs, args)
8131 int nargs;
8132 Lisp_Object *args;
8133 {
8134 Lisp_Object operation, target_idx, target, val;
8135 register Lisp_Object chain;
8136
8137 if (nargs < 2)
8138 error ("Too few arguments");
8139 operation = args[0];
8140 if (!SYMBOLP (operation)
8141 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8142 error ("Invalid first arguement");
8143 if (nargs < 1 + XINT (target_idx))
8144 error ("Too few arguments for operation: %s",
8145 SDATA (SYMBOL_NAME (operation)));
8146 target = args[XINT (target_idx) + 1];
8147 if (!(STRINGP (target)
8148 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8149 error ("Invalid %dth argument", XINT (target_idx) + 1);
8150
8151 chain = ((EQ (operation, Qinsert_file_contents)
8152 || EQ (operation, Qwrite_region))
8153 ? Vfile_coding_system_alist
8154 : (EQ (operation, Qopen_network_stream)
8155 ? Vnetwork_coding_system_alist
8156 : Vprocess_coding_system_alist));
8157 if (NILP (chain))
8158 return Qnil;
8159
8160 for (; CONSP (chain); chain = XCDR (chain))
8161 {
8162 Lisp_Object elt;
8163
8164 elt = XCAR (chain);
8165 if (CONSP (elt)
8166 && ((STRINGP (target)
8167 && STRINGP (XCAR (elt))
8168 && fast_string_match (XCAR (elt), target) >= 0)
8169 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8170 {
8171 val = XCDR (elt);
8172 /* Here, if VAL is both a valid coding system and a valid
8173 function symbol, we return VAL as a coding system. */
8174 if (CONSP (val))
8175 return val;
8176 if (! SYMBOLP (val))
8177 return Qnil;
8178 if (! NILP (Fcoding_system_p (val)))
8179 return Fcons (val, val);
8180 if (! NILP (Ffboundp (val)))
8181 {
8182 val = call1 (val, Flist (nargs, args));
8183 if (CONSP (val))
8184 return val;
8185 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8186 return Fcons (val, val);
8187 }
8188 return Qnil;
8189 }
8190 }
8191 return Qnil;
8192 }
8193
8194 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8195 Sset_coding_system_priority, 0, MANY, 0,
8196 doc: /* Assign higher priority to the coding systems given as arguments.
8197 If multiple coding systems belongs to the same category,
8198 all but the first one are ignored.
8199
8200 usage: (set-coding-system-priority ...) */)
8201 (nargs, args)
8202 int nargs;
8203 Lisp_Object *args;
8204 {
8205 int i, j;
8206 int changed[coding_category_max];
8207 enum coding_category priorities[coding_category_max];
8208
8209 bzero (changed, sizeof changed);
8210
8211 for (i = j = 0; i < nargs; i++)
8212 {
8213 enum coding_category category;
8214 Lisp_Object spec, attrs;
8215
8216 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8217 attrs = AREF (spec, 0);
8218 category = XINT (CODING_ATTR_CATEGORY (attrs));
8219 if (changed[category])
8220 /* Ignore this coding system because a coding system of the
8221 same category already had a higher priority. */
8222 continue;
8223 changed[category] = 1;
8224 priorities[j++] = category;
8225 if (coding_categories[category].id >= 0
8226 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8227 setup_coding_system (args[i], &coding_categories[category]);
8228 Fset (AREF (Vcoding_category_table, category), args[i]);
8229 }
8230
8231 /* Now we have decided top J priorities. Reflect the order of the
8232 original priorities to the remaining priorities. */
8233
8234 for (i = j, j = 0; i < coding_category_max; i++, j++)
8235 {
8236 while (j < coding_category_max
8237 && changed[coding_priorities[j]])
8238 j++;
8239 if (j == coding_category_max)
8240 abort ();
8241 priorities[i] = coding_priorities[j];
8242 }
8243
8244 bcopy (priorities, coding_priorities, sizeof priorities);
8245
8246 /* Update `coding-category-list'. */
8247 Vcoding_category_list = Qnil;
8248 for (i = coding_category_max - 1; i >= 0; i--)
8249 Vcoding_category_list
8250 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8251 Vcoding_category_list);
8252
8253 return Qnil;
8254 }
8255
8256 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8257 Scoding_system_priority_list, 0, 1, 0,
8258 doc: /* Return a list of coding systems ordered by their priorities.
8259 HIGHESTP non-nil means just return the highest priority one. */)
8260 (highestp)
8261 Lisp_Object highestp;
8262 {
8263 int i;
8264 Lisp_Object val;
8265
8266 for (i = 0, val = Qnil; i < coding_category_max; i++)
8267 {
8268 enum coding_category category = coding_priorities[i];
8269 int id = coding_categories[category].id;
8270 Lisp_Object attrs;
8271
8272 if (id < 0)
8273 continue;
8274 attrs = CODING_ID_ATTRS (id);
8275 if (! NILP (highestp))
8276 return CODING_ATTR_BASE_NAME (attrs);
8277 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8278 }
8279 return Fnreverse (val);
8280 }
8281
8282 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8283
8284 static Lisp_Object
8285 make_subsidiaries (base)
8286 Lisp_Object base;
8287 {
8288 Lisp_Object subsidiaries;
8289 int base_name_len = SBYTES (SYMBOL_NAME (base));
8290 char *buf = (char *) alloca (base_name_len + 6);
8291 int i;
8292
8293 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8294 subsidiaries = Fmake_vector (make_number (3), Qnil);
8295 for (i = 0; i < 3; i++)
8296 {
8297 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8298 ASET (subsidiaries, i, intern (buf));
8299 }
8300 return subsidiaries;
8301 }
8302
8303
8304 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8305 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8306 doc: /* For internal use only.
8307 usage: (define-coding-system-internal ...) */)
8308 (nargs, args)
8309 int nargs;
8310 Lisp_Object *args;
8311 {
8312 Lisp_Object name;
8313 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8314 Lisp_Object attrs; /* Vector of attributes. */
8315 Lisp_Object eol_type;
8316 Lisp_Object aliases;
8317 Lisp_Object coding_type, charset_list, safe_charsets;
8318 enum coding_category category;
8319 Lisp_Object tail, val;
8320 int max_charset_id = 0;
8321 int i;
8322
8323 if (nargs < coding_arg_max)
8324 goto short_args;
8325
8326 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8327
8328 name = args[coding_arg_name];
8329 CHECK_SYMBOL (name);
8330 CODING_ATTR_BASE_NAME (attrs) = name;
8331
8332 val = args[coding_arg_mnemonic];
8333 if (! STRINGP (val))
8334 CHECK_CHARACTER (val);
8335 CODING_ATTR_MNEMONIC (attrs) = val;
8336
8337 coding_type = args[coding_arg_coding_type];
8338 CHECK_SYMBOL (coding_type);
8339 CODING_ATTR_TYPE (attrs) = coding_type;
8340
8341 charset_list = args[coding_arg_charset_list];
8342 if (SYMBOLP (charset_list))
8343 {
8344 if (EQ (charset_list, Qiso_2022))
8345 {
8346 if (! EQ (coding_type, Qiso_2022))
8347 error ("Invalid charset-list");
8348 charset_list = Viso_2022_charset_list;
8349 }
8350 else if (EQ (charset_list, Qemacs_mule))
8351 {
8352 if (! EQ (coding_type, Qemacs_mule))
8353 error ("Invalid charset-list");
8354 charset_list = Vemacs_mule_charset_list;
8355 }
8356 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8357 if (max_charset_id < XFASTINT (XCAR (tail)))
8358 max_charset_id = XFASTINT (XCAR (tail));
8359 }
8360 else
8361 {
8362 charset_list = Fcopy_sequence (charset_list);
8363 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8364 {
8365 struct charset *charset;
8366
8367 val = Fcar (tail);
8368 CHECK_CHARSET_GET_CHARSET (val, charset);
8369 if (EQ (coding_type, Qiso_2022)
8370 ? CHARSET_ISO_FINAL (charset) < 0
8371 : EQ (coding_type, Qemacs_mule)
8372 ? CHARSET_EMACS_MULE_ID (charset) < 0
8373 : 0)
8374 error ("Can't handle charset `%s'",
8375 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8376
8377 XSETCAR (tail, make_number (charset->id));
8378 if (max_charset_id < charset->id)
8379 max_charset_id = charset->id;
8380 }
8381 }
8382 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8383
8384 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8385 make_number (255));
8386 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8387 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8388 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8389
8390 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8391
8392 val = args[coding_arg_decode_translation_table];
8393 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8394 CHECK_SYMBOL (val);
8395 CODING_ATTR_DECODE_TBL (attrs) = val;
8396
8397 val = args[coding_arg_encode_translation_table];
8398 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8399 CHECK_SYMBOL (val);
8400 CODING_ATTR_ENCODE_TBL (attrs) = val;
8401
8402 val = args[coding_arg_post_read_conversion];
8403 CHECK_SYMBOL (val);
8404 CODING_ATTR_POST_READ (attrs) = val;
8405
8406 val = args[coding_arg_pre_write_conversion];
8407 CHECK_SYMBOL (val);
8408 CODING_ATTR_PRE_WRITE (attrs) = val;
8409
8410 val = args[coding_arg_default_char];
8411 if (NILP (val))
8412 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8413 else
8414 {
8415 CHECK_CHARACTER (val);
8416 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8417 }
8418
8419 val = args[coding_arg_for_unibyte];
8420 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8421
8422 val = args[coding_arg_plist];
8423 CHECK_LIST (val);
8424 CODING_ATTR_PLIST (attrs) = val;
8425
8426 if (EQ (coding_type, Qcharset))
8427 {
8428 /* Generate a lisp vector of 256 elements. Each element is nil,
8429 integer, or a list of charset IDs.
8430
8431 If Nth element is nil, the byte code N is invalid in this
8432 coding system.
8433
8434 If Nth element is a number NUM, N is the first byte of a
8435 charset whose ID is NUM.
8436
8437 If Nth element is a list of charset IDs, N is the first byte
8438 of one of them. The list is sorted by dimensions of the
8439 charsets. A charset of smaller dimension comes firtst. */
8440 val = Fmake_vector (make_number (256), Qnil);
8441
8442 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8443 {
8444 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8445 int dim = CHARSET_DIMENSION (charset);
8446 int idx = (dim - 1) * 4;
8447
8448 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8449 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8450
8451 for (i = charset->code_space[idx];
8452 i <= charset->code_space[idx + 1]; i++)
8453 {
8454 Lisp_Object tmp, tmp2;
8455 int dim2;
8456
8457 tmp = AREF (val, i);
8458 if (NILP (tmp))
8459 tmp = XCAR (tail);
8460 else if (NUMBERP (tmp))
8461 {
8462 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8463 if (dim < dim2)
8464 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8465 else
8466 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8467 }
8468 else
8469 {
8470 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8471 {
8472 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8473 if (dim < dim2)
8474 break;
8475 }
8476 if (NILP (tmp2))
8477 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8478 else
8479 {
8480 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8481 XSETCAR (tmp2, XCAR (tail));
8482 }
8483 }
8484 ASET (val, i, tmp);
8485 }
8486 }
8487 ASET (attrs, coding_attr_charset_valids, val);
8488 category = coding_category_charset;
8489 }
8490 else if (EQ (coding_type, Qccl))
8491 {
8492 Lisp_Object valids;
8493
8494 if (nargs < coding_arg_ccl_max)
8495 goto short_args;
8496
8497 val = args[coding_arg_ccl_decoder];
8498 CHECK_CCL_PROGRAM (val);
8499 if (VECTORP (val))
8500 val = Fcopy_sequence (val);
8501 ASET (attrs, coding_attr_ccl_decoder, val);
8502
8503 val = args[coding_arg_ccl_encoder];
8504 CHECK_CCL_PROGRAM (val);
8505 if (VECTORP (val))
8506 val = Fcopy_sequence (val);
8507 ASET (attrs, coding_attr_ccl_encoder, val);
8508
8509 val = args[coding_arg_ccl_valids];
8510 valids = Fmake_string (make_number (256), make_number (0));
8511 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8512 {
8513 int from, to;
8514
8515 val = Fcar (tail);
8516 if (INTEGERP (val))
8517 {
8518 from = to = XINT (val);
8519 if (from < 0 || from > 255)
8520 args_out_of_range_3 (val, make_number (0), make_number (255));
8521 }
8522 else
8523 {
8524 CHECK_CONS (val);
8525 CHECK_NATNUM_CAR (val);
8526 CHECK_NATNUM_CDR (val);
8527 from = XINT (XCAR (val));
8528 if (from > 255)
8529 args_out_of_range_3 (XCAR (val),
8530 make_number (0), make_number (255));
8531 to = XINT (XCDR (val));
8532 if (to < from || to > 255)
8533 args_out_of_range_3 (XCDR (val),
8534 XCAR (val), make_number (255));
8535 }
8536 for (i = from; i <= to; i++)
8537 SSET (valids, i, 1);
8538 }
8539 ASET (attrs, coding_attr_ccl_valids, valids);
8540
8541 category = coding_category_ccl;
8542 }
8543 else if (EQ (coding_type, Qutf_16))
8544 {
8545 Lisp_Object bom, endian;
8546
8547 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8548
8549 if (nargs < coding_arg_utf16_max)
8550 goto short_args;
8551
8552 bom = args[coding_arg_utf16_bom];
8553 if (! NILP (bom) && ! EQ (bom, Qt))
8554 {
8555 CHECK_CONS (bom);
8556 val = XCAR (bom);
8557 CHECK_CODING_SYSTEM (val);
8558 val = XCDR (bom);
8559 CHECK_CODING_SYSTEM (val);
8560 }
8561 ASET (attrs, coding_attr_utf_16_bom, bom);
8562
8563 endian = args[coding_arg_utf16_endian];
8564 CHECK_SYMBOL (endian);
8565 if (NILP (endian))
8566 endian = Qbig;
8567 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8568 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8569 ASET (attrs, coding_attr_utf_16_endian, endian);
8570
8571 category = (CONSP (bom)
8572 ? coding_category_utf_16_auto
8573 : NILP (bom)
8574 ? (EQ (endian, Qbig)
8575 ? coding_category_utf_16_be_nosig
8576 : coding_category_utf_16_le_nosig)
8577 : (EQ (endian, Qbig)
8578 ? coding_category_utf_16_be
8579 : coding_category_utf_16_le));
8580 }
8581 else if (EQ (coding_type, Qiso_2022))
8582 {
8583 Lisp_Object initial, reg_usage, request, flags;
8584 int i;
8585
8586 if (nargs < coding_arg_iso2022_max)
8587 goto short_args;
8588
8589 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8590 CHECK_VECTOR (initial);
8591 for (i = 0; i < 4; i++)
8592 {
8593 val = Faref (initial, make_number (i));
8594 if (! NILP (val))
8595 {
8596 struct charset *charset;
8597
8598 CHECK_CHARSET_GET_CHARSET (val, charset);
8599 ASET (initial, i, make_number (CHARSET_ID (charset)));
8600 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8601 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8602 }
8603 else
8604 ASET (initial, i, make_number (-1));
8605 }
8606
8607 reg_usage = args[coding_arg_iso2022_reg_usage];
8608 CHECK_CONS (reg_usage);
8609 CHECK_NUMBER_CAR (reg_usage);
8610 CHECK_NUMBER_CDR (reg_usage);
8611
8612 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8613 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8614 {
8615 int id;
8616 Lisp_Object tmp;
8617
8618 val = Fcar (tail);
8619 CHECK_CONS (val);
8620 tmp = XCAR (val);
8621 CHECK_CHARSET_GET_ID (tmp, id);
8622 CHECK_NATNUM_CDR (val);
8623 if (XINT (XCDR (val)) >= 4)
8624 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8625 XSETCAR (val, make_number (id));
8626 }
8627
8628 flags = args[coding_arg_iso2022_flags];
8629 CHECK_NATNUM (flags);
8630 i = XINT (flags);
8631 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8632 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8633
8634 ASET (attrs, coding_attr_iso_initial, initial);
8635 ASET (attrs, coding_attr_iso_usage, reg_usage);
8636 ASET (attrs, coding_attr_iso_request, request);
8637 ASET (attrs, coding_attr_iso_flags, flags);
8638 setup_iso_safe_charsets (attrs);
8639
8640 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8641 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8642 | CODING_ISO_FLAG_SINGLE_SHIFT))
8643 ? coding_category_iso_7_else
8644 : EQ (args[coding_arg_charset_list], Qiso_2022)
8645 ? coding_category_iso_7
8646 : coding_category_iso_7_tight);
8647 else
8648 {
8649 int id = XINT (AREF (initial, 1));
8650
8651 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8652 || EQ (args[coding_arg_charset_list], Qiso_2022)
8653 || id < 0)
8654 ? coding_category_iso_8_else
8655 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8656 ? coding_category_iso_8_1
8657 : coding_category_iso_8_2);
8658 }
8659 if (category != coding_category_iso_8_1
8660 && category != coding_category_iso_8_2)
8661 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8662 }
8663 else if (EQ (coding_type, Qemacs_mule))
8664 {
8665 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8666 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8667 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8668 category = coding_category_emacs_mule;
8669 }
8670 else if (EQ (coding_type, Qshift_jis))
8671 {
8672
8673 struct charset *charset;
8674
8675 if (XINT (Flength (charset_list)) != 3
8676 && XINT (Flength (charset_list)) != 4)
8677 error ("There should be three or four charsets");
8678
8679 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8680 if (CHARSET_DIMENSION (charset) != 1)
8681 error ("Dimension of charset %s is not one",
8682 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8683 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8684 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8685
8686 charset_list = XCDR (charset_list);
8687 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8688 if (CHARSET_DIMENSION (charset) != 1)
8689 error ("Dimension of charset %s is not one",
8690 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8691
8692 charset_list = XCDR (charset_list);
8693 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8694 if (CHARSET_DIMENSION (charset) != 2)
8695 error ("Dimension of charset %s is not two",
8696 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8697
8698 charset_list = XCDR (charset_list);
8699 if (! NILP (charset_list))
8700 {
8701 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8702 if (CHARSET_DIMENSION (charset) != 2)
8703 error ("Dimension of charset %s is not two",
8704 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8705 }
8706
8707 category = coding_category_sjis;
8708 Vsjis_coding_system = name;
8709 }
8710 else if (EQ (coding_type, Qbig5))
8711 {
8712 struct charset *charset;
8713
8714 if (XINT (Flength (charset_list)) != 2)
8715 error ("There should be just two charsets");
8716
8717 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8718 if (CHARSET_DIMENSION (charset) != 1)
8719 error ("Dimension of charset %s is not one",
8720 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8721 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8722 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8723
8724 charset_list = XCDR (charset_list);
8725 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8726 if (CHARSET_DIMENSION (charset) != 2)
8727 error ("Dimension of charset %s is not two",
8728 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8729
8730 category = coding_category_big5;
8731 Vbig5_coding_system = name;
8732 }
8733 else if (EQ (coding_type, Qraw_text))
8734 {
8735 category = coding_category_raw_text;
8736 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8737 }
8738 else if (EQ (coding_type, Qutf_8))
8739 {
8740 category = coding_category_utf_8;
8741 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8742 }
8743 else if (EQ (coding_type, Qundecided))
8744 category = coding_category_undecided;
8745 else
8746 error ("Invalid coding system type: %s",
8747 SDATA (SYMBOL_NAME (coding_type)));
8748
8749 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8750 CODING_ATTR_PLIST (attrs)
8751 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8752 CODING_ATTR_PLIST (attrs)));
8753
8754 eol_type = args[coding_arg_eol_type];
8755 if (! NILP (eol_type)
8756 && ! EQ (eol_type, Qunix)
8757 && ! EQ (eol_type, Qdos)
8758 && ! EQ (eol_type, Qmac))
8759 error ("Invalid eol-type");
8760
8761 aliases = Fcons (name, Qnil);
8762
8763 if (NILP (eol_type))
8764 {
8765 eol_type = make_subsidiaries (name);
8766 for (i = 0; i < 3; i++)
8767 {
8768 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8769
8770 this_name = AREF (eol_type, i);
8771 this_aliases = Fcons (this_name, Qnil);
8772 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8773 this_spec = Fmake_vector (make_number (3), attrs);
8774 ASET (this_spec, 1, this_aliases);
8775 ASET (this_spec, 2, this_eol_type);
8776 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8777 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8778 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8779 Vcoding_system_alist);
8780 }
8781 }
8782
8783 spec_vec = Fmake_vector (make_number (3), attrs);
8784 ASET (spec_vec, 1, aliases);
8785 ASET (spec_vec, 2, eol_type);
8786
8787 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8788 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8789 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8790 Vcoding_system_alist);
8791
8792 {
8793 int id = coding_categories[category].id;
8794
8795 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8796 setup_coding_system (name, &coding_categories[category]);
8797 }
8798
8799 return Qnil;
8800
8801 short_args:
8802 return Fsignal (Qwrong_number_of_arguments,
8803 Fcons (intern ("define-coding-system-internal"),
8804 make_number (nargs)));
8805 }
8806
8807
8808 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8809 3, 3, 0,
8810 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
8811 (coding_system, prop, val)
8812 Lisp_Object coding_system, prop, val;
8813 {
8814 Lisp_Object spec, attrs, plist;
8815
8816 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8817 attrs = AREF (spec, 0);
8818 if (EQ (prop, QCmnemonic))
8819 {
8820 if (! STRINGP (val))
8821 CHECK_CHARACTER (val);
8822 CODING_ATTR_MNEMONIC (attrs) = val;
8823 }
8824 else if (EQ (prop, QCdefalut_char))
8825 {
8826 if (NILP (val))
8827 val = make_number (' ');
8828 else
8829 CHECK_CHARACTER (val);
8830 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8831 }
8832 else if (EQ (prop, QCdecode_translation_table))
8833 {
8834 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8835 CHECK_SYMBOL (val);
8836 CODING_ATTR_DECODE_TBL (attrs) = val;
8837 }
8838 else if (EQ (prop, QCencode_translation_table))
8839 {
8840 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8841 CHECK_SYMBOL (val);
8842 CODING_ATTR_ENCODE_TBL (attrs) = val;
8843 }
8844 else if (EQ (prop, QCpost_read_conversion))
8845 {
8846 CHECK_SYMBOL (val);
8847 CODING_ATTR_POST_READ (attrs) = val;
8848 }
8849 else if (EQ (prop, QCpre_write_conversion))
8850 {
8851 CHECK_SYMBOL (val);
8852 CODING_ATTR_PRE_WRITE (attrs) = val;
8853 }
8854
8855 CODING_ATTR_PLIST (attrs)
8856 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8857 return val;
8858 }
8859
8860
8861 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8862 Sdefine_coding_system_alias, 2, 2, 0,
8863 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8864 (alias, coding_system)
8865 Lisp_Object alias, coding_system;
8866 {
8867 Lisp_Object spec, aliases, eol_type;
8868
8869 CHECK_SYMBOL (alias);
8870 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8871 aliases = AREF (spec, 1);
8872 /* ALISES should be a list of length more than zero, and the first
8873 element is a base coding system. Append ALIAS at the tail of the
8874 list. */
8875 while (!NILP (XCDR (aliases)))
8876 aliases = XCDR (aliases);
8877 XSETCDR (aliases, Fcons (alias, Qnil));
8878
8879 eol_type = AREF (spec, 2);
8880 if (VECTORP (eol_type))
8881 {
8882 Lisp_Object subsidiaries;
8883 int i;
8884
8885 subsidiaries = make_subsidiaries (alias);
8886 for (i = 0; i < 3; i++)
8887 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8888 AREF (eol_type, i));
8889 }
8890
8891 Fputhash (alias, spec, Vcoding_system_hash_table);
8892 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8893 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8894 Vcoding_system_alist);
8895
8896 return Qnil;
8897 }
8898
8899 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8900 1, 1, 0,
8901 doc: /* Return the base of CODING-SYSTEM.
8902 Any alias or subsidiary coding system is not a base coding system. */)
8903 (coding_system)
8904 Lisp_Object coding_system;
8905 {
8906 Lisp_Object spec, attrs;
8907
8908 if (NILP (coding_system))
8909 return (Qno_conversion);
8910 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8911 attrs = AREF (spec, 0);
8912 return CODING_ATTR_BASE_NAME (attrs);
8913 }
8914
8915 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8916 1, 1, 0,
8917 doc: "Return the property list of CODING-SYSTEM.")
8918 (coding_system)
8919 Lisp_Object coding_system;
8920 {
8921 Lisp_Object spec, attrs;
8922
8923 if (NILP (coding_system))
8924 coding_system = Qno_conversion;
8925 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8926 attrs = AREF (spec, 0);
8927 return CODING_ATTR_PLIST (attrs);
8928 }
8929
8930
8931 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8932 1, 1, 0,
8933 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8934 (coding_system)
8935 Lisp_Object coding_system;
8936 {
8937 Lisp_Object spec;
8938
8939 if (NILP (coding_system))
8940 coding_system = Qno_conversion;
8941 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8942 return AREF (spec, 1);
8943 }
8944
8945 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8946 Scoding_system_eol_type, 1, 1, 0,
8947 doc: /* Return eol-type of CODING-SYSTEM.
8948 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8949
8950 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8951 and CR respectively.
8952
8953 A vector value indicates that a format of end-of-line should be
8954 detected automatically. Nth element of the vector is the subsidiary
8955 coding system whose eol-type is N. */)
8956 (coding_system)
8957 Lisp_Object coding_system;
8958 {
8959 Lisp_Object spec, eol_type;
8960 int n;
8961
8962 if (NILP (coding_system))
8963 coding_system = Qno_conversion;
8964 if (! CODING_SYSTEM_P (coding_system))
8965 return Qnil;
8966 spec = CODING_SYSTEM_SPEC (coding_system);
8967 eol_type = AREF (spec, 2);
8968 if (VECTORP (eol_type))
8969 return Fcopy_sequence (eol_type);
8970 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8971 return make_number (n);
8972 }
8973
8974 #endif /* emacs */
8975
8976 \f
8977 /*** 9. Post-amble ***/
8978
8979 void
8980 init_coding_once ()
8981 {
8982 int i;
8983
8984 for (i = 0; i < coding_category_max; i++)
8985 {
8986 coding_categories[i].id = -1;
8987 coding_priorities[i] = i;
8988 }
8989
8990 /* ISO2022 specific initialize routine. */
8991 for (i = 0; i < 0x20; i++)
8992 iso_code_class[i] = ISO_control_0;
8993 for (i = 0x21; i < 0x7F; i++)
8994 iso_code_class[i] = ISO_graphic_plane_0;
8995 for (i = 0x80; i < 0xA0; i++)
8996 iso_code_class[i] = ISO_control_1;
8997 for (i = 0xA1; i < 0xFF; i++)
8998 iso_code_class[i] = ISO_graphic_plane_1;
8999 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9000 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9001 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9002 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9003 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9004 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9005 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9006 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9007 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9008
9009 for (i = 0; i < 256; i++)
9010 {
9011 emacs_mule_bytes[i] = 1;
9012 }
9013 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9014 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9015 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9016 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9017 }
9018
9019 #ifdef emacs
9020
9021 void
9022 syms_of_coding ()
9023 {
9024 staticpro (&Vcoding_system_hash_table);
9025 {
9026 Lisp_Object args[2];
9027 args[0] = QCtest;
9028 args[1] = Qeq;
9029 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9030 }
9031
9032 staticpro (&Vsjis_coding_system);
9033 Vsjis_coding_system = Qnil;
9034
9035 staticpro (&Vbig5_coding_system);
9036 Vbig5_coding_system = Qnil;
9037
9038 staticpro (&Vcode_conversion_reused_workbuf);
9039 Vcode_conversion_reused_workbuf = Qnil;
9040
9041 staticpro (&Vcode_conversion_workbuf_name);
9042 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9043
9044 reused_workbuf_in_use = 0;
9045
9046 DEFSYM (Qcharset, "charset");
9047 DEFSYM (Qtarget_idx, "target-idx");
9048 DEFSYM (Qcoding_system_history, "coding-system-history");
9049 Fset (Qcoding_system_history, Qnil);
9050
9051 /* Target FILENAME is the first argument. */
9052 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9053 /* Target FILENAME is the third argument. */
9054 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9055
9056 DEFSYM (Qcall_process, "call-process");
9057 /* Target PROGRAM is the first argument. */
9058 Fput (Qcall_process, Qtarget_idx, make_number (0));
9059
9060 DEFSYM (Qcall_process_region, "call-process-region");
9061 /* Target PROGRAM is the third argument. */
9062 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9063
9064 DEFSYM (Qstart_process, "start-process");
9065 /* Target PROGRAM is the third argument. */
9066 Fput (Qstart_process, Qtarget_idx, make_number (2));
9067
9068 DEFSYM (Qopen_network_stream, "open-network-stream");
9069 /* Target SERVICE is the fourth argument. */
9070 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9071
9072 DEFSYM (Qcoding_system, "coding-system");
9073 DEFSYM (Qcoding_aliases, "coding-aliases");
9074
9075 DEFSYM (Qeol_type, "eol-type");
9076 DEFSYM (Qunix, "unix");
9077 DEFSYM (Qdos, "dos");
9078
9079 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9080 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9081 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9082 DEFSYM (Qdefault_char, "default-char");
9083 DEFSYM (Qundecided, "undecided");
9084 DEFSYM (Qno_conversion, "no-conversion");
9085 DEFSYM (Qraw_text, "raw-text");
9086
9087 DEFSYM (Qiso_2022, "iso-2022");
9088
9089 DEFSYM (Qutf_8, "utf-8");
9090 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9091
9092 DEFSYM (Qutf_16, "utf-16");
9093 DEFSYM (Qbig, "big");
9094 DEFSYM (Qlittle, "little");
9095
9096 DEFSYM (Qshift_jis, "shift-jis");
9097 DEFSYM (Qbig5, "big5");
9098
9099 DEFSYM (Qcoding_system_p, "coding-system-p");
9100
9101 DEFSYM (Qcoding_system_error, "coding-system-error");
9102 Fput (Qcoding_system_error, Qerror_conditions,
9103 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9104 Fput (Qcoding_system_error, Qerror_message,
9105 build_string ("Invalid coding system"));
9106
9107 /* Intern this now in case it isn't already done.
9108 Setting this variable twice is harmless.
9109 But don't staticpro it here--that is done in alloc.c. */
9110 Qchar_table_extra_slots = intern ("char-table-extra-slots");
9111
9112 DEFSYM (Qtranslation_table, "translation-table");
9113 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9114 DEFSYM (Qtranslation_table_id, "translation-table-id");
9115 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9116 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9117
9118 DEFSYM (Qvalid_codes, "valid-codes");
9119
9120 DEFSYM (Qemacs_mule, "emacs-mule");
9121
9122 DEFSYM (QCcategory, ":category");
9123 DEFSYM (QCmnemonic, ":mnemonic");
9124 DEFSYM (QCdefalut_char, ":default-char");
9125 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9126 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9127 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9128 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9129
9130 Vcoding_category_table
9131 = Fmake_vector (make_number (coding_category_max), Qnil);
9132 staticpro (&Vcoding_category_table);
9133 /* Followings are target of code detection. */
9134 ASET (Vcoding_category_table, coding_category_iso_7,
9135 intern ("coding-category-iso-7"));
9136 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9137 intern ("coding-category-iso-7-tight"));
9138 ASET (Vcoding_category_table, coding_category_iso_8_1,
9139 intern ("coding-category-iso-8-1"));
9140 ASET (Vcoding_category_table, coding_category_iso_8_2,
9141 intern ("coding-category-iso-8-2"));
9142 ASET (Vcoding_category_table, coding_category_iso_7_else,
9143 intern ("coding-category-iso-7-else"));
9144 ASET (Vcoding_category_table, coding_category_iso_8_else,
9145 intern ("coding-category-iso-8-else"));
9146 ASET (Vcoding_category_table, coding_category_utf_8,
9147 intern ("coding-category-utf-8"));
9148 ASET (Vcoding_category_table, coding_category_utf_16_be,
9149 intern ("coding-category-utf-16-be"));
9150 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9151 intern ("coding-category-utf-16-auto"));
9152 ASET (Vcoding_category_table, coding_category_utf_16_le,
9153 intern ("coding-category-utf-16-le"));
9154 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9155 intern ("coding-category-utf-16-be-nosig"));
9156 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9157 intern ("coding-category-utf-16-le-nosig"));
9158 ASET (Vcoding_category_table, coding_category_charset,
9159 intern ("coding-category-charset"));
9160 ASET (Vcoding_category_table, coding_category_sjis,
9161 intern ("coding-category-sjis"));
9162 ASET (Vcoding_category_table, coding_category_big5,
9163 intern ("coding-category-big5"));
9164 ASET (Vcoding_category_table, coding_category_ccl,
9165 intern ("coding-category-ccl"));
9166 ASET (Vcoding_category_table, coding_category_emacs_mule,
9167 intern ("coding-category-emacs-mule"));
9168 /* Followings are NOT target of code detection. */
9169 ASET (Vcoding_category_table, coding_category_raw_text,
9170 intern ("coding-category-raw-text"));
9171 ASET (Vcoding_category_table, coding_category_undecided,
9172 intern ("coding-category-undecided"));
9173
9174 DEFSYM (Qinsufficient_source, "insufficient-source");
9175 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9176 DEFSYM (Qinvalid_source, "invalid-source");
9177 DEFSYM (Qinterrupted, "interrupted");
9178 DEFSYM (Qinsufficient_memory, "insufficient-memory");
9179
9180 defsubr (&Scoding_system_p);
9181 defsubr (&Sread_coding_system);
9182 defsubr (&Sread_non_nil_coding_system);
9183 defsubr (&Scheck_coding_system);
9184 defsubr (&Sdetect_coding_region);
9185 defsubr (&Sdetect_coding_string);
9186 defsubr (&Sfind_coding_systems_region_internal);
9187 defsubr (&Sunencodable_char_position);
9188 defsubr (&Scheck_coding_systems_region);
9189 defsubr (&Sdecode_coding_region);
9190 defsubr (&Sencode_coding_region);
9191 defsubr (&Sdecode_coding_string);
9192 defsubr (&Sencode_coding_string);
9193 defsubr (&Sdecode_sjis_char);
9194 defsubr (&Sencode_sjis_char);
9195 defsubr (&Sdecode_big5_char);
9196 defsubr (&Sencode_big5_char);
9197 defsubr (&Sset_terminal_coding_system_internal);
9198 defsubr (&Sset_safe_terminal_coding_system_internal);
9199 defsubr (&Sterminal_coding_system);
9200 defsubr (&Sset_keyboard_coding_system_internal);
9201 defsubr (&Skeyboard_coding_system);
9202 defsubr (&Sfind_operation_coding_system);
9203 defsubr (&Sset_coding_system_priority);
9204 defsubr (&Sdefine_coding_system_internal);
9205 defsubr (&Sdefine_coding_system_alias);
9206 defsubr (&Scoding_system_put);
9207 defsubr (&Scoding_system_base);
9208 defsubr (&Scoding_system_plist);
9209 defsubr (&Scoding_system_aliases);
9210 defsubr (&Scoding_system_eol_type);
9211 defsubr (&Scoding_system_priority_list);
9212
9213 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9214 doc: /* List of coding systems.
9215
9216 Do not alter the value of this variable manually. This variable should be
9217 updated by the functions `define-coding-system' and
9218 `define-coding-system-alias'. */);
9219 Vcoding_system_list = Qnil;
9220
9221 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9222 doc: /* Alist of coding system names.
9223 Each element is one element list of coding system name.
9224 This variable is given to `completing-read' as TABLE argument.
9225
9226 Do not alter the value of this variable manually. This variable should be
9227 updated by the functions `make-coding-system' and
9228 `define-coding-system-alias'. */);
9229 Vcoding_system_alist = Qnil;
9230
9231 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9232 doc: /* List of coding-categories (symbols) ordered by priority.
9233
9234 On detecting a coding system, Emacs tries code detection algorithms
9235 associated with each coding-category one by one in this order. When
9236 one algorithm agrees with a byte sequence of source text, the coding
9237 system bound to the corresponding coding-category is selected. */);
9238 {
9239 int i;
9240
9241 Vcoding_category_list = Qnil;
9242 for (i = coding_category_max - 1; i >= 0; i--)
9243 Vcoding_category_list
9244 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9245 Vcoding_category_list);
9246 }
9247
9248 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9249 doc: /* Specify the coding system for read operations.
9250 It is useful to bind this variable with `let', but do not set it globally.
9251 If the value is a coding system, it is used for decoding on read operation.
9252 If not, an appropriate element is used from one of the coding system alists:
9253 There are three such tables, `file-coding-system-alist',
9254 `process-coding-system-alist', and `network-coding-system-alist'. */);
9255 Vcoding_system_for_read = Qnil;
9256
9257 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9258 doc: /* Specify the coding system for write operations.
9259 Programs bind this variable with `let', but you should not set it globally.
9260 If the value is a coding system, it is used for encoding of output,
9261 when writing it to a file and when sending it to a file or subprocess.
9262
9263 If this does not specify a coding system, an appropriate element
9264 is used from one of the coding system alists:
9265 There are three such tables, `file-coding-system-alist',
9266 `process-coding-system-alist', and `network-coding-system-alist'.
9267 For output to files, if the above procedure does not specify a coding system,
9268 the value of `buffer-file-coding-system' is used. */);
9269 Vcoding_system_for_write = Qnil;
9270
9271 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9272 doc: /*
9273 Coding system used in the latest file or process I/O. */);
9274 Vlast_coding_system_used = Qnil;
9275
9276 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9277 doc: /*
9278 Error status of the last code conversion.
9279
9280 When an error was detected in the last code conversion, this variable
9281 is set to one of the following symbols.
9282 `insufficient-source'
9283 `inconsistent-eol'
9284 `invalid-source'
9285 `interrupted'
9286 `insufficient-memory'
9287 When no error was detected, the value doesn't change. So, to check
9288 the error status of a code conversion by this variable, you must
9289 explicitly set this variable to nil before performing code
9290 conversion. */);
9291 Vlast_code_conversion_error = Qnil;
9292
9293 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9294 doc: /*
9295 *Non-nil means always inhibit code conversion of end-of-line format.
9296 See info node `Coding Systems' and info node `Text and Binary' concerning
9297 such conversion. */);
9298 inhibit_eol_conversion = 0;
9299
9300 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9301 doc: /*
9302 Non-nil means process buffer inherits coding system of process output.
9303 Bind it to t if the process output is to be treated as if it were a file
9304 read from some filesystem. */);
9305 inherit_process_coding_system = 0;
9306
9307 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9308 doc: /*
9309 Alist to decide a coding system to use for a file I/O operation.
9310 The format is ((PATTERN . VAL) ...),
9311 where PATTERN is a regular expression matching a file name,
9312 VAL is a coding system, a cons of coding systems, or a function symbol.
9313 If VAL is a coding system, it is used for both decoding and encoding
9314 the file contents.
9315 If VAL is a cons of coding systems, the car part is used for decoding,
9316 and the cdr part is used for encoding.
9317 If VAL is a function symbol, the function must return a coding system
9318 or a cons of coding systems which are used as above. The function gets
9319 the arguments with which `find-operation-coding-systems' was called.
9320
9321 See also the function `find-operation-coding-system'
9322 and the variable `auto-coding-alist'. */);
9323 Vfile_coding_system_alist = Qnil;
9324
9325 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9326 doc: /*
9327 Alist to decide a coding system to use for a process I/O operation.
9328 The format is ((PATTERN . VAL) ...),
9329 where PATTERN is a regular expression matching a program name,
9330 VAL is a coding system, a cons of coding systems, or a function symbol.
9331 If VAL is a coding system, it is used for both decoding what received
9332 from the program and encoding what sent to the program.
9333 If VAL is a cons of coding systems, the car part is used for decoding,
9334 and the cdr part is used for encoding.
9335 If VAL is a function symbol, the function must return a coding system
9336 or a cons of coding systems which are used as above.
9337
9338 See also the function `find-operation-coding-system'. */);
9339 Vprocess_coding_system_alist = Qnil;
9340
9341 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9342 doc: /*
9343 Alist to decide a coding system to use for a network I/O operation.
9344 The format is ((PATTERN . VAL) ...),
9345 where PATTERN is a regular expression matching a network service name
9346 or is a port number to connect to,
9347 VAL is a coding system, a cons of coding systems, or a function symbol.
9348 If VAL is a coding system, it is used for both decoding what received
9349 from the network stream and encoding what sent to the network stream.
9350 If VAL is a cons of coding systems, the car part is used for decoding,
9351 and the cdr part is used for encoding.
9352 If VAL is a function symbol, the function must return a coding system
9353 or a cons of coding systems which are used as above.
9354
9355 See also the function `find-operation-coding-system'. */);
9356 Vnetwork_coding_system_alist = Qnil;
9357
9358 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9359 doc: /* Coding system to use with system messages.
9360 Also used for decoding keyboard input on X Window system. */);
9361 Vlocale_coding_system = Qnil;
9362
9363 /* The eol mnemonics are reset in startup.el system-dependently. */
9364 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9365 doc: /*
9366 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
9367 eol_mnemonic_unix = build_string (":");
9368
9369 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9370 doc: /*
9371 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
9372 eol_mnemonic_dos = build_string ("\\");
9373
9374 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9375 doc: /*
9376 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
9377 eol_mnemonic_mac = build_string ("/");
9378
9379 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9380 doc: /*
9381 *String displayed in mode line when end-of-line format is not yet determined. */);
9382 eol_mnemonic_undecided = build_string (":");
9383
9384 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9385 doc: /*
9386 *Non-nil enables character translation while encoding and decoding. */);
9387 Venable_character_translation = Qt;
9388
9389 DEFVAR_LISP ("standard-translation-table-for-decode",
9390 &Vstandard_translation_table_for_decode,
9391 doc: /* Table for translating characters while decoding. */);
9392 Vstandard_translation_table_for_decode = Qnil;
9393
9394 DEFVAR_LISP ("standard-translation-table-for-encode",
9395 &Vstandard_translation_table_for_encode,
9396 doc: /* Table for translating characters while encoding. */);
9397 Vstandard_translation_table_for_encode = Qnil;
9398
9399 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9400 doc: /* Alist of charsets vs revision numbers.
9401 While encoding, if a charset (car part of an element) is found,
9402 designate it with the escape sequence identifying revision (cdr part
9403 of the element). */);
9404 Vcharset_revision_table = Qnil;
9405
9406 DEFVAR_LISP ("default-process-coding-system",
9407 &Vdefault_process_coding_system,
9408 doc: /* Cons of coding systems used for process I/O by default.
9409 The car part is used for decoding a process output,
9410 the cdr part is used for encoding a text to be sent to a process. */);
9411 Vdefault_process_coding_system = Qnil;
9412
9413 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9414 doc: /*
9415 Table of extra Latin codes in the range 128..159 (inclusive).
9416 This is a vector of length 256.
9417 If Nth element is non-nil, the existence of code N in a file
9418 \(or output of subprocess) doesn't prevent it to be detected as
9419 a coding system of ISO 2022 variant which has a flag
9420 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9421 or reading output of a subprocess.
9422 Only 128th through 159th elements has a meaning. */);
9423 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9424
9425 DEFVAR_LISP ("select-safe-coding-system-function",
9426 &Vselect_safe_coding_system_function,
9427 doc: /*
9428 Function to call to select safe coding system for encoding a text.
9429
9430 If set, this function is called to force a user to select a proper
9431 coding system which can encode the text in the case that a default
9432 coding system used in each operation can't encode the text.
9433
9434 The default value is `select-safe-coding-system' (which see). */);
9435 Vselect_safe_coding_system_function = Qnil;
9436
9437 DEFVAR_BOOL ("coding-system-require-warning",
9438 &coding_system_require_warning,
9439 doc: /* Internal use only.
9440 If non-nil, on writing a file, `select-safe-coding-system-function' is
9441 called even if `coding-system-for-write' is non-nil. The command
9442 `universal-coding-system-argument' binds this variable to t temporarily. */);
9443 coding_system_require_warning = 0;
9444
9445
9446 DEFVAR_BOOL ("inhibit-iso-escape-detection",
9447 &inhibit_iso_escape_detection,
9448 doc: /*
9449 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9450
9451 By default, on reading a file, Emacs tries to detect how the text is
9452 encoded. This code detection is sensitive to escape sequences. If
9453 the sequence is valid as ISO2022, the code is determined as one of
9454 the ISO2022 encodings, and the file is decoded by the corresponding
9455 coding system (e.g. `iso-2022-7bit').
9456
9457 However, there may be a case that you want to read escape sequences in
9458 a file as is. In such a case, you can set this variable to non-nil.
9459 Then, as the code detection ignores any escape sequences, no file is
9460 detected as encoded in some ISO2022 encoding. The result is that all
9461 escape sequences become visible in a buffer.
9462
9463 The default value is nil, and it is strongly recommended not to change
9464 it. That is because many Emacs Lisp source files that contain
9465 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9466 in Emacs's distribution, and they won't be decoded correctly on
9467 reading if you suppress escape sequence detection.
9468
9469 The other way to read escape sequences in a file without decoding is
9470 to explicitly specify some coding system that doesn't use ISO2022's
9471 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
9472 inhibit_iso_escape_detection = 0;
9473
9474 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9475 doc: /* Char table for translating self-inserting characters.
9476 This is applied to the result of input methods, not their input. See also
9477 `keyboard-translate-table'. */);
9478 Vtranslation_table_for_input = Qnil;
9479
9480 {
9481 Lisp_Object args[coding_arg_max];
9482 Lisp_Object plist[16];
9483 int i;
9484
9485 for (i = 0; i < coding_arg_max; i++)
9486 args[i] = Qnil;
9487
9488 plist[0] = intern (":name");
9489 plist[1] = args[coding_arg_name] = Qno_conversion;
9490 plist[2] = intern (":mnemonic");
9491 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9492 plist[4] = intern (":coding-type");
9493 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9494 plist[6] = intern (":ascii-compatible-p");
9495 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9496 plist[8] = intern (":default-char");
9497 plist[9] = args[coding_arg_default_char] = make_number (0);
9498 plist[10] = intern (":for-unibyte");
9499 plist[11] = args[coding_arg_for_unibyte] = Qt;
9500 plist[12] = intern (":docstring");
9501 plist[13] = build_string ("Do no conversion.\n\
9502 \n\
9503 When you visit a file with this coding, the file is read into a\n\
9504 unibyte buffer as is, thus each byte of a file is treated as a\n\
9505 character.");
9506 plist[14] = intern (":eol-type");
9507 plist[15] = args[coding_arg_eol_type] = Qunix;
9508 args[coding_arg_plist] = Flist (16, plist);
9509 Fdefine_coding_system_internal (coding_arg_max, args);
9510 }
9511
9512 setup_coding_system (Qno_conversion, &keyboard_coding);
9513 setup_coding_system (Qno_conversion, &terminal_coding);
9514 setup_coding_system (Qno_conversion, &safe_terminal_coding);
9515
9516 {
9517 int i;
9518
9519 for (i = 0; i < coding_category_max; i++)
9520 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9521 }
9522 }
9523
9524 char *
9525 emacs_strerror (error_number)
9526 int error_number;
9527 {
9528 char *str;
9529
9530 synchronize_system_messages_locale ();
9531 str = strerror (error_number);
9532
9533 if (! NILP (Vlocale_coding_system))
9534 {
9535 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9536 Vlocale_coding_system,
9537 0);
9538 str = (char *) SDATA (dec);
9539 }
9540
9541 return str;
9542 }
9543
9544 #endif /* emacs */