misc changes
[bpt/emacs.git] / src / charset.c
1 /* Basic character set support.
2
3 Copyright (C) 2001-2014 Free Software Foundation, Inc.
4
5 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H14PRO021
9
10 Copyright (C) 2003, 2004
11 National Institute of Advanced Industrial Science and Technology (AIST)
12 Registration Number H13PRO009
13
14 This file is part of GNU Emacs.
15
16 GNU Emacs is free software: you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation, either version 3 of the License, or
19 (at your option) any later version.
20
21 GNU Emacs is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU General Public License for more details.
25
26 You should have received a copy of the GNU General Public License
27 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
28
29 #include <config.h>
30
31 #include <errno.h>
32 #include <stdio.h>
33 #include <unistd.h>
34 #include <limits.h>
35 #include <sys/types.h>
36 #include <c-ctype.h>
37 #include "lisp.h"
38 #include "character.h"
39 #include "charset.h"
40 #include "coding.h"
41 #include "disptab.h"
42 #include "buffer.h"
43
44 /*** GENERAL NOTES on CODED CHARACTER SETS (CHARSETS) ***
45
46 A coded character set ("charset" hereafter) is a meaningful
47 collection (i.e. language, culture, functionality, etc.) of
48 characters. Emacs handles multiple charsets at once. In Emacs Lisp
49 code, a charset is represented by a symbol. In C code, a charset is
50 represented by its ID number or by a pointer to a struct charset.
51
52 The actual information about each charset is stored in two places.
53 Lispy information is stored in the hash table Vcharset_hash_table as
54 a vector (charset attributes). The other information is stored in
55 charset_table as a struct charset.
56
57 */
58
59 /* Hash table that contains attributes of each charset. Keys are
60 charset symbols, and values are vectors of charset attributes. */
61 Lisp_Object Vcharset_hash_table;
62
63 /* Table of struct charset. */
64 struct charset *charset_table;
65
66 static ptrdiff_t charset_table_size;
67 static int charset_table_used;
68
69 Lisp_Object Qcharsetp;
70
71 /* Special charset symbols. */
72 Lisp_Object Qascii;
73 static Lisp_Object Qeight_bit;
74 static Lisp_Object Qiso_8859_1;
75 static Lisp_Object Qunicode;
76 static Lisp_Object Qemacs;
77
78 /* The corresponding charsets. */
79 int charset_ascii;
80 int charset_eight_bit;
81 static int charset_iso_8859_1;
82 int charset_unicode;
83 static int charset_emacs;
84
85 /* The other special charsets. */
86 int charset_jisx0201_roman;
87 int charset_jisx0208_1978;
88 int charset_jisx0208;
89 int charset_ksc5601;
90
91 /* Value of charset attribute `charset-iso-plane'. */
92 static Lisp_Object Qgl, Qgr;
93
94 /* Charset of unibyte characters. */
95 int charset_unibyte;
96
97 /* List of charsets ordered by the priority. */
98 Lisp_Object Vcharset_ordered_list;
99
100 /* Sub-list of Vcharset_ordered_list that contains all non-preferred
101 charsets. */
102 Lisp_Object Vcharset_non_preferred_head;
103
104 /* Incremented everytime we change Vcharset_ordered_list. This is
105 unsigned short so that it fits in Lisp_Int and never matches
106 -1. */
107 unsigned short charset_ordered_list_tick;
108
109 /* List of iso-2022 charsets. */
110 Lisp_Object Viso_2022_charset_list;
111
112 /* List of emacs-mule charsets. */
113 Lisp_Object Vemacs_mule_charset_list;
114
115 int emacs_mule_charset[256];
116
117 /* Mapping table from ISO2022's charset (specified by DIMENSION,
118 CHARS, and FINAL-CHAR) to Emacs' charset. */
119 int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
120
121 #define CODE_POINT_TO_INDEX(charset, code) \
122 ((charset)->code_linear_p \
123 ? (int) ((code) - (charset)->min_code) \
124 : (((charset)->code_space_mask[(code) >> 24] & 0x8) \
125 && ((charset)->code_space_mask[((code) >> 16) & 0xFF] & 0x4) \
126 && ((charset)->code_space_mask[((code) >> 8) & 0xFF] & 0x2) \
127 && ((charset)->code_space_mask[(code) & 0xFF] & 0x1)) \
128 ? (int) (((((code) >> 24) - (charset)->code_space[12]) \
129 * (charset)->code_space[11]) \
130 + (((((code) >> 16) & 0xFF) - (charset)->code_space[8]) \
131 * (charset)->code_space[7]) \
132 + (((((code) >> 8) & 0xFF) - (charset)->code_space[4]) \
133 * (charset)->code_space[3]) \
134 + (((code) & 0xFF) - (charset)->code_space[0]) \
135 - ((charset)->char_index_offset)) \
136 : -1)
137
138
139 /* Return the code-point for the character index IDX in CHARSET.
140 IDX should be an unsigned int variable in a valid range (which is
141 always in nonnegative int range too). IDX contains garbage afterwards. */
142
143 #define INDEX_TO_CODE_POINT(charset, idx) \
144 ((charset)->code_linear_p \
145 ? (idx) + (charset)->min_code \
146 : (idx += (charset)->char_index_offset, \
147 (((charset)->code_space[0] + (idx) % (charset)->code_space[2]) \
148 | (((charset)->code_space[4] \
149 + ((idx) / (charset)->code_space[3] % (charset)->code_space[6])) \
150 << 8) \
151 | (((charset)->code_space[8] \
152 + ((idx) / (charset)->code_space[7] % (charset)->code_space[10])) \
153 << 16) \
154 | (((charset)->code_space[12] + ((idx) / (charset)->code_space[11])) \
155 << 24))))
156
157 /* Structure to hold mapping tables for a charset. Used by temacs
158 invoked for dumping. */
159
160 static struct
161 {
162 /* The current charset for which the following tables are setup. */
163 struct charset *current;
164
165 /* 1 iff the following table is used for encoder. */
166 short for_encoder;
167
168 /* When the following table is used for encoding, minimum and
169 maximum character of the current charset. */
170 int min_char, max_char;
171
172 /* A Unicode character corresponding to the code index 0 (i.e. the
173 minimum code-point) of the current charset, or -1 if the code
174 index 0 is not a Unicode character. This is checked when
175 table.encoder[CHAR] is zero. */
176 int zero_index_char;
177
178 union {
179 /* Table mapping code-indices (not code-points) of the current
180 charset to Unicode characters. If decoder[CHAR] is -1, CHAR
181 doesn't belong to the current charset. */
182 int decoder[0x10000];
183 /* Table mapping Unicode characters to code-indices of the current
184 charset. The first 0x10000 elements are for BMP (0..0xFFFF),
185 and the last 0x10000 are for SMP (0x10000..0x1FFFF) or SIP
186 (0x20000..0x2FFFF). Note that there is no charset map that
187 uses both SMP and SIP. */
188 unsigned short encoder[0x20000];
189 } table;
190 } *temp_charset_work;
191
192 #define SET_TEMP_CHARSET_WORK_ENCODER(C, CODE) \
193 do { \
194 if ((CODE) == 0) \
195 temp_charset_work->zero_index_char = (C); \
196 else if ((C) < 0x20000) \
197 temp_charset_work->table.encoder[(C)] = (CODE); \
198 else \
199 temp_charset_work->table.encoder[(C) - 0x10000] = (CODE); \
200 } while (0)
201
202 #define GET_TEMP_CHARSET_WORK_ENCODER(C) \
203 ((C) == temp_charset_work->zero_index_char ? 0 \
204 : (C) < 0x20000 ? (temp_charset_work->table.encoder[(C)] \
205 ? (int) temp_charset_work->table.encoder[(C)] : -1) \
206 : temp_charset_work->table.encoder[(C) - 0x10000] \
207 ? temp_charset_work->table.encoder[(C) - 0x10000] : -1)
208
209 #define SET_TEMP_CHARSET_WORK_DECODER(C, CODE) \
210 (temp_charset_work->table.decoder[(CODE)] = (C))
211
212 #define GET_TEMP_CHARSET_WORK_DECODER(CODE) \
213 (temp_charset_work->table.decoder[(CODE)])
214 \f
215
216 /* Set to 1 to warn that a charset map is loaded and thus a buffer
217 text and a string data may be relocated. */
218 bool charset_map_loaded;
219
220 struct charset_map_entries
221 {
222 struct {
223 unsigned from, to;
224 int c;
225 } *entry; /* [0x10000] */
226 struct charset_map_entries *next;
227 };
228
229 /* Load the mapping information of CHARSET from ENTRIES for
230 initializing (CONTROL_FLAG == 0), decoding (CONTROL_FLAG == 1), and
231 encoding (CONTROL_FLAG == 2).
232
233 If CONTROL_FLAG is 0, setup CHARSET->min_char, CHARSET->max_char,
234 and CHARSET->fast_map.
235
236 If CONTROL_FLAG is 1, setup the following tables according to
237 CHARSET->method and inhibit_load_charset_map.
238
239 CHARSET->method | inhibit_lcm == 0 | inhibit_lcm == 1
240 ----------------------+--------------------+---------------------------
241 CHARSET_METHOD_MAP | CHARSET->decoder | temp_charset_work->decoder
242 ----------------------+--------------------+---------------------------
243 CHARSET_METHOD_OFFSET | Vchar_unify_table | temp_charset_work->decoder
244
245 If CONTROL_FLAG is 2, setup the following tables.
246
247 CHARSET->method | inhibit_lcm == 0 | inhibit_lcm == 1
248 ----------------------+--------------------+---------------------------
249 CHARSET_METHOD_MAP | CHARSET->encoder | temp_charset_work->encoder
250 ----------------------+--------------------+--------------------------
251 CHARSET_METHOD_OFFSET | CHARSET->deunifier | temp_charset_work->encoder
252 */
253
254 static void
255 load_charset_map (struct charset *charset, struct charset_map_entries *entries, int n_entries, int control_flag)
256 {
257 Lisp_Object vec IF_LINT (= Qnil), table IF_LINT (= Qnil);
258 unsigned max_code = CHARSET_MAX_CODE (charset);
259 bool ascii_compatible_p = charset->ascii_compatible_p;
260 int min_char, max_char, nonascii_min_char;
261 int i;
262 unsigned char *fast_map = charset->fast_map;
263
264 if (n_entries <= 0)
265 return;
266
267 if (control_flag)
268 {
269 if (! inhibit_load_charset_map)
270 {
271 if (control_flag == 1)
272 {
273 if (charset->method == CHARSET_METHOD_MAP)
274 {
275 int n = CODE_POINT_TO_INDEX (charset, max_code) + 1;
276
277 vec = Fmake_vector (make_number (n), make_number (-1));
278 set_charset_attr (charset, charset_decoder, vec);
279 }
280 else
281 {
282 char_table_set_range (Vchar_unify_table,
283 charset->min_char, charset->max_char,
284 Qnil);
285 }
286 }
287 else
288 {
289 table = Fmake_char_table (Qnil, Qnil);
290 set_charset_attr (charset,
291 (charset->method == CHARSET_METHOD_MAP
292 ? charset_encoder : charset_deunifier),
293 table);
294 }
295 }
296 else
297 {
298 if (! temp_charset_work)
299 temp_charset_work = xmalloc (sizeof *temp_charset_work);
300 if (control_flag == 1)
301 {
302 memset (temp_charset_work->table.decoder, -1,
303 sizeof (int) * 0x10000);
304 }
305 else
306 {
307 memset (temp_charset_work->table.encoder, 0,
308 sizeof (unsigned short) * 0x20000);
309 temp_charset_work->zero_index_char = -1;
310 }
311 temp_charset_work->current = charset;
312 temp_charset_work->for_encoder = (control_flag == 2);
313 control_flag += 2;
314 }
315 charset_map_loaded = 1;
316 }
317
318 min_char = max_char = entries->entry[0].c;
319 nonascii_min_char = MAX_CHAR;
320 for (i = 0; i < n_entries; i++)
321 {
322 unsigned from, to;
323 int from_index, to_index, lim_index;
324 int from_c, to_c;
325 int idx = i % 0x10000;
326
327 if (i > 0 && idx == 0)
328 entries = entries->next;
329 from = entries->entry[idx].from;
330 to = entries->entry[idx].to;
331 from_c = entries->entry[idx].c;
332 from_index = CODE_POINT_TO_INDEX (charset, from);
333 if (from == to)
334 {
335 to_index = from_index;
336 to_c = from_c;
337 }
338 else
339 {
340 to_index = CODE_POINT_TO_INDEX (charset, to);
341 to_c = from_c + (to_index - from_index);
342 }
343 if (from_index < 0 || to_index < 0)
344 continue;
345 lim_index = to_index + 1;
346
347 if (to_c > max_char)
348 max_char = to_c;
349 else if (from_c < min_char)
350 min_char = from_c;
351
352 if (control_flag == 1)
353 {
354 if (charset->method == CHARSET_METHOD_MAP)
355 for (; from_index < lim_index; from_index++, from_c++)
356 ASET (vec, from_index, make_number (from_c));
357 else
358 for (; from_index < lim_index; from_index++, from_c++)
359 CHAR_TABLE_SET (Vchar_unify_table,
360 CHARSET_CODE_OFFSET (charset) + from_index,
361 make_number (from_c));
362 }
363 else if (control_flag == 2)
364 {
365 if (charset->method == CHARSET_METHOD_MAP
366 && CHARSET_COMPACT_CODES_P (charset))
367 for (; from_index < lim_index; from_index++, from_c++)
368 {
369 unsigned code = from_index;
370 code = INDEX_TO_CODE_POINT (charset, code);
371
372 if (NILP (CHAR_TABLE_REF (table, from_c)))
373 CHAR_TABLE_SET (table, from_c, make_number (code));
374 }
375 else
376 for (; from_index < lim_index; from_index++, from_c++)
377 {
378 if (NILP (CHAR_TABLE_REF (table, from_c)))
379 CHAR_TABLE_SET (table, from_c, make_number (from_index));
380 }
381 }
382 else if (control_flag == 3)
383 for (; from_index < lim_index; from_index++, from_c++)
384 SET_TEMP_CHARSET_WORK_DECODER (from_c, from_index);
385 else if (control_flag == 4)
386 for (; from_index < lim_index; from_index++, from_c++)
387 SET_TEMP_CHARSET_WORK_ENCODER (from_c, from_index);
388 else /* control_flag == 0 */
389 {
390 if (ascii_compatible_p)
391 {
392 if (! ASCII_CHAR_P (from_c))
393 {
394 if (from_c < nonascii_min_char)
395 nonascii_min_char = from_c;
396 }
397 else if (! ASCII_CHAR_P (to_c))
398 {
399 nonascii_min_char = 0x80;
400 }
401 }
402
403 for (; from_c <= to_c; from_c++)
404 CHARSET_FAST_MAP_SET (from_c, fast_map);
405 }
406 }
407
408 if (control_flag == 0)
409 {
410 CHARSET_MIN_CHAR (charset) = (ascii_compatible_p
411 ? nonascii_min_char : min_char);
412 CHARSET_MAX_CHAR (charset) = max_char;
413 }
414 else if (control_flag == 4)
415 {
416 temp_charset_work->min_char = min_char;
417 temp_charset_work->max_char = max_char;
418 }
419 }
420
421
422 /* Read a hexadecimal number (preceded by "0x") from the file FP while
423 paying attention to comment character '#'. */
424
425 static unsigned
426 read_hex (FILE *fp, bool *eof, bool *overflow)
427 {
428 int c;
429 unsigned n;
430
431 while ((c = getc (fp)) != EOF)
432 {
433 if (c == '#')
434 {
435 while ((c = getc (fp)) != EOF && c != '\n');
436 }
437 else if (c == '0')
438 {
439 if ((c = getc (fp)) == EOF || c == 'x')
440 break;
441 }
442 }
443 if (c == EOF)
444 {
445 *eof = 1;
446 return 0;
447 }
448 n = 0;
449 while (c_isxdigit (c = getc (fp)))
450 {
451 if (UINT_MAX >> 4 < n)
452 *overflow = 1;
453 n = ((n << 4)
454 | (c - ('0' <= c && c <= '9' ? '0'
455 : 'A' <= c && c <= 'F' ? 'A' - 10
456 : 'a' - 10)));
457 }
458 if (c != EOF)
459 ungetc (c, fp);
460 return n;
461 }
462
463 /* Return a mapping vector for CHARSET loaded from MAPFILE.
464 Each line of MAPFILE has this form
465 0xAAAA 0xCCCC
466 where 0xAAAA is a code-point and 0xCCCC is the corresponding
467 character code, or this form
468 0xAAAA-0xBBBB 0xCCCC
469 where 0xAAAA and 0xBBBB are code-points specifying a range, and
470 0xCCCC is the first character code of the range.
471
472 The returned vector has this form:
473 [ CODE1 CHAR1 CODE2 CHAR2 .... ]
474 where CODE1 is a code-point or a cons of code-points specifying a
475 range.
476
477 Note that this function uses `openp' to open MAPFILE but ignores
478 `file-name-handler-alist' to avoid running any Lisp code. */
479
480 static void
481 load_charset_map_from_file (struct charset *charset, Lisp_Object mapfile,
482 int control_flag)
483 {
484 unsigned min_code = CHARSET_MIN_CODE (charset);
485 unsigned max_code = CHARSET_MAX_CODE (charset);
486 int fd;
487 FILE *fp = NULL;
488 Lisp_Object suffixes;
489 struct charset_map_entries *head, *entries;
490 int n_entries;
491 ptrdiff_t count;
492
493 suffixes = list2 (build_string (".map"), build_string (".TXT"));
494
495 dynwind_begin ();
496 record_unwind_protect_ptr (fclose_ptr_unwind, &fp);
497 {
498 dynwind_begin ();
499 specbind (Qfile_name_handler_alist, Qnil);
500 fd = openp (Vcharset_map_path, mapfile, suffixes, NULL, Qnil, false);
501 fp = fd < 0 ? 0 : fdopen (fd, "r");
502 if (!fp)
503 {
504 int open_errno = errno;
505 emacs_close (fd);
506 report_file_errno ("Loading charset map", mapfile, open_errno);
507 }
508 dynwind_end ();
509 }
510
511 /* Use record, as `charset_map_entries' is large (larger than
512 MAX_ALLOCA). */
513 head = xmalloc (sizeof *head);
514 head->entry = xmalloc_atomic (0x10000 * sizeof (*head->entry));
515 entries = head;
516
517 n_entries = 0;
518 while (1)
519 {
520 unsigned from, to, c;
521 int idx;
522 bool eof = 0, overflow = 0;
523
524 from = read_hex (fp, &eof, &overflow);
525 if (eof)
526 break;
527 if (getc (fp) == '-')
528 to = read_hex (fp, &eof, &overflow);
529 else
530 to = from;
531 if (eof)
532 break;
533 c = read_hex (fp, &eof, &overflow);
534 if (eof)
535 break;
536
537 if (overflow)
538 continue;
539 if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
540 continue;
541
542 if (n_entries == 0x10000)
543 {
544 entries->next = xmalloc (sizeof *entries->next);
545 entries->next->entry = xmalloc_atomic (0x10000 * (sizeof *entries->next));
546 entries = entries->next;
547 n_entries = 0;
548 }
549 idx = n_entries;
550 entries->entry[idx].from = from;
551 entries->entry[idx].to = to;
552 entries->entry[idx].c = c;
553 n_entries++;
554 }
555 fclose (fp);
556 fp = NULL;
557
558 load_charset_map (charset, head, n_entries, control_flag);
559 dynwind_end ();
560 }
561
562 static void
563 load_charset_map_from_vector (struct charset *charset, Lisp_Object vec, int control_flag)
564 {
565 unsigned min_code = CHARSET_MIN_CODE (charset);
566 unsigned max_code = CHARSET_MAX_CODE (charset);
567 struct charset_map_entries *head, *entries;
568 int n_entries;
569 int len = ASIZE (vec);
570 int i;
571 USE_SAFE_ALLOCA;
572
573 if (len % 2 == 1)
574 {
575 add_to_log ("Failure in loading charset map: %V", vec, Qnil);
576 return;
577 }
578
579 /* Use SAFE_ALLOCA instead of alloca, as `charset_map_entries' is
580 large (larger than MAX_ALLOCA). */
581 head = SAFE_ALLOCA (sizeof *head);
582 head->entry = xmalloc_atomic (0x10000 * (sizeof *head->entry));
583 entries = head;
584 memset (entries, 0, sizeof (struct charset_map_entries));
585
586 n_entries = 0;
587 for (i = 0; i < len; i += 2)
588 {
589 Lisp_Object val, val2;
590 unsigned from, to;
591 EMACS_INT c;
592 int idx;
593
594 val = AREF (vec, i);
595 if (CONSP (val))
596 {
597 val2 = XCDR (val);
598 val = XCAR (val);
599 from = XFASTINT (val);
600 to = XFASTINT (val2);
601 }
602 else
603 from = to = XFASTINT (val);
604 val = AREF (vec, i + 1);
605 CHECK_NATNUM (val);
606 c = XFASTINT (val);
607
608 if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
609 continue;
610
611 if (n_entries > 0 && (n_entries % 0x10000) == 0)
612 {
613 entries->next = SAFE_ALLOCA (sizeof *entries->next);
614 entries = entries->next;
615 memset (entries, 0, sizeof (struct charset_map_entries));
616 }
617 idx = n_entries % 0x10000;
618 entries->entry[idx].from = from;
619 entries->entry[idx].to = to;
620 entries->entry[idx].c = c;
621 n_entries++;
622 }
623
624 load_charset_map (charset, head, n_entries, control_flag);
625 SAFE_FREE ();
626 }
627
628
629 /* Load a mapping table for CHARSET. CONTROL-FLAG tells what kind of
630 map it is (see the comment of load_charset_map for the detail). */
631
632 static void
633 load_charset (struct charset *charset, int control_flag)
634 {
635 Lisp_Object map;
636
637 if (inhibit_load_charset_map
638 && temp_charset_work
639 && charset == temp_charset_work->current
640 && ((control_flag == 2) == temp_charset_work->for_encoder))
641 return;
642
643 if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
644 map = CHARSET_MAP (charset);
645 else
646 {
647 if (! CHARSET_UNIFIED_P (charset))
648 emacs_abort ();
649 map = CHARSET_UNIFY_MAP (charset);
650 }
651 if (STRINGP (map))
652 load_charset_map_from_file (charset, map, control_flag);
653 else
654 load_charset_map_from_vector (charset, map, control_flag);
655 }
656
657
658 DEFUN ("charsetp", Fcharsetp, Scharsetp, 1, 1, 0,
659 doc: /* Return non-nil if and only if OBJECT is a charset.*/)
660 (Lisp_Object object)
661 {
662 return (CHARSETP (object) ? Qt : Qnil);
663 }
664
665
666 static void
667 map_charset_for_dump (void (*c_function) (Lisp_Object, Lisp_Object),
668 Lisp_Object function, Lisp_Object arg,
669 unsigned int from, unsigned int to)
670 {
671 int from_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, from);
672 int to_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, to);
673 Lisp_Object range;
674 int c, stop;
675 struct gcpro gcpro1;
676
677 range = Fcons (Qnil, Qnil);
678 GCPRO1 (range);
679
680 c = temp_charset_work->min_char;
681 stop = (temp_charset_work->max_char < 0x20000
682 ? temp_charset_work->max_char : 0xFFFF);
683
684 while (1)
685 {
686 int idx = GET_TEMP_CHARSET_WORK_ENCODER (c);
687
688 if (idx >= from_idx && idx <= to_idx)
689 {
690 if (NILP (XCAR (range)))
691 XSETCAR (range, make_number (c));
692 }
693 else if (! NILP (XCAR (range)))
694 {
695 XSETCDR (range, make_number (c - 1));
696 if (c_function)
697 (*c_function) (arg, range);
698 else
699 call2 (function, range, arg);
700 XSETCAR (range, Qnil);
701 }
702 if (c == stop)
703 {
704 if (c == temp_charset_work->max_char)
705 {
706 if (! NILP (XCAR (range)))
707 {
708 XSETCDR (range, make_number (c));
709 if (c_function)
710 (*c_function) (arg, range);
711 else
712 call2 (function, range, arg);
713 }
714 break;
715 }
716 c = 0x1FFFF;
717 stop = temp_charset_work->max_char;
718 }
719 c++;
720 }
721 UNGCPRO;
722 }
723
724 void
725 map_charset_chars (void (*c_function)(Lisp_Object, Lisp_Object), Lisp_Object function,
726 Lisp_Object arg, struct charset *charset, unsigned from, unsigned to)
727 {
728 Lisp_Object range;
729 bool partial = (from > CHARSET_MIN_CODE (charset)
730 || to < CHARSET_MAX_CODE (charset));
731
732 if (CHARSET_METHOD (charset) == CHARSET_METHOD_OFFSET)
733 {
734 int from_idx = CODE_POINT_TO_INDEX (charset, from);
735 int to_idx = CODE_POINT_TO_INDEX (charset, to);
736 int from_c = from_idx + CHARSET_CODE_OFFSET (charset);
737 int to_c = to_idx + CHARSET_CODE_OFFSET (charset);
738
739 if (CHARSET_UNIFIED_P (charset))
740 {
741 if (! CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
742 load_charset (charset, 2);
743 if (CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
744 map_char_table_for_charset (c_function, function,
745 CHARSET_DEUNIFIER (charset), arg,
746 partial ? charset : NULL, from, to);
747 else
748 map_charset_for_dump (c_function, function, arg, from, to);
749 }
750
751 range = Fcons (make_number (from_c), make_number (to_c));
752 if (NILP (function))
753 (*c_function) (arg, range);
754 else
755 call2 (function, range, arg);
756 }
757 else if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
758 {
759 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
760 load_charset (charset, 2);
761 if (CHAR_TABLE_P (CHARSET_ENCODER (charset)))
762 map_char_table_for_charset (c_function, function,
763 CHARSET_ENCODER (charset), arg,
764 partial ? charset : NULL, from, to);
765 else
766 map_charset_for_dump (c_function, function, arg, from, to);
767 }
768 else if (CHARSET_METHOD (charset) == CHARSET_METHOD_SUBSET)
769 {
770 Lisp_Object subset_info;
771 int offset;
772
773 subset_info = CHARSET_SUBSET (charset);
774 charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
775 offset = XINT (AREF (subset_info, 3));
776 from -= offset;
777 if (from < XFASTINT (AREF (subset_info, 1)))
778 from = XFASTINT (AREF (subset_info, 1));
779 to -= offset;
780 if (to > XFASTINT (AREF (subset_info, 2)))
781 to = XFASTINT (AREF (subset_info, 2));
782 map_charset_chars (c_function, function, arg, charset, from, to);
783 }
784 else /* i.e. CHARSET_METHOD_SUPERSET */
785 {
786 Lisp_Object parents;
787
788 for (parents = CHARSET_SUPERSET (charset); CONSP (parents);
789 parents = XCDR (parents))
790 {
791 int offset;
792 unsigned this_from, this_to;
793
794 charset = CHARSET_FROM_ID (XFASTINT (XCAR (XCAR (parents))));
795 offset = XINT (XCDR (XCAR (parents)));
796 this_from = from > offset ? from - offset : 0;
797 this_to = to > offset ? to - offset : 0;
798 if (this_from < CHARSET_MIN_CODE (charset))
799 this_from = CHARSET_MIN_CODE (charset);
800 if (this_to > CHARSET_MAX_CODE (charset))
801 this_to = CHARSET_MAX_CODE (charset);
802 map_charset_chars (c_function, function, arg, charset,
803 this_from, this_to);
804 }
805 }
806 }
807
808 DEFUN ("map-charset-chars", Fmap_charset_chars, Smap_charset_chars, 2, 5, 0,
809 doc: /* Call FUNCTION for all characters in CHARSET.
810 FUNCTION is called with an argument RANGE and the optional 3rd
811 argument ARG.
812
813 RANGE is a cons (FROM . TO), where FROM and TO indicate a range of
814 characters contained in CHARSET.
815
816 The optional 4th and 5th arguments FROM-CODE and TO-CODE specify the
817 range of code points (in CHARSET) of target characters. */)
818 (Lisp_Object function, Lisp_Object charset, Lisp_Object arg, Lisp_Object from_code, Lisp_Object to_code)
819 {
820 struct charset *cs;
821 unsigned from, to;
822
823 CHECK_CHARSET_GET_CHARSET (charset, cs);
824 if (NILP (from_code))
825 from = CHARSET_MIN_CODE (cs);
826 else
827 {
828 from = XINT (from_code);
829 if (from < CHARSET_MIN_CODE (cs))
830 from = CHARSET_MIN_CODE (cs);
831 }
832 if (NILP (to_code))
833 to = CHARSET_MAX_CODE (cs);
834 else
835 {
836 to = XINT (to_code);
837 if (to > CHARSET_MAX_CODE (cs))
838 to = CHARSET_MAX_CODE (cs);
839 }
840 map_charset_chars (NULL, function, arg, cs, from, to);
841 return Qnil;
842 }
843
844
845 /* Define a charset according to the arguments. The Nth argument is
846 the Nth attribute of the charset (the last attribute `charset-id'
847 is not included). See the docstring of `define-charset' for the
848 detail. */
849
850 DEFUN ("define-charset-internal", Fdefine_charset_internal,
851 Sdefine_charset_internal, charset_arg_max, MANY, 0,
852 doc: /* For internal use only.
853 usage: (define-charset-internal ...) */)
854 (ptrdiff_t nargs, Lisp_Object *args)
855 {
856 /* Charset attr vector. */
857 Lisp_Object attrs;
858 Lisp_Object val;
859 EMACS_UINT hash_code;
860 struct Lisp_Hash_Table *hash_table = XHASH_TABLE (Vcharset_hash_table);
861 int i, j;
862 struct charset charset;
863 int id;
864 int dimension;
865 bool new_definition_p;
866 int nchars;
867
868 if (nargs != charset_arg_max)
869 return Fsignal (Qwrong_number_of_arguments,
870 Fcons (intern ("define-charset-internal"),
871 make_number (nargs)));
872
873 attrs = Fmake_vector (make_number (charset_attr_max), Qnil);
874
875 CHECK_SYMBOL (args[charset_arg_name]);
876 ASET (attrs, charset_name, args[charset_arg_name]);
877
878 val = args[charset_arg_code_space];
879 for (i = 0, dimension = 0, nchars = 1; ; i++)
880 {
881 Lisp_Object min_byte_obj, max_byte_obj;
882 int min_byte, max_byte;
883
884 min_byte_obj = Faref (val, make_number (i * 2));
885 max_byte_obj = Faref (val, make_number (i * 2 + 1));
886 CHECK_RANGED_INTEGER (min_byte_obj, 0, 255);
887 min_byte = XINT (min_byte_obj);
888 CHECK_RANGED_INTEGER (max_byte_obj, min_byte, 255);
889 max_byte = XINT (max_byte_obj);
890 charset.code_space[i * 4] = min_byte;
891 charset.code_space[i * 4 + 1] = max_byte;
892 charset.code_space[i * 4 + 2] = max_byte - min_byte + 1;
893 if (max_byte > 0)
894 dimension = i + 1;
895 if (i == 3)
896 break;
897 nchars *= charset.code_space[i * 4 + 2];
898 charset.code_space[i * 4 + 3] = nchars;
899 }
900
901 val = args[charset_arg_dimension];
902 if (NILP (val))
903 charset.dimension = dimension;
904 else
905 {
906 CHECK_RANGED_INTEGER (val, 1, 4);
907 charset.dimension = XINT (val);
908 }
909
910 charset.code_linear_p
911 = (charset.dimension == 1
912 || (charset.code_space[2] == 256
913 && (charset.dimension == 2
914 || (charset.code_space[6] == 256
915 && (charset.dimension == 3
916 || charset.code_space[10] == 256)))));
917
918 if (! charset.code_linear_p)
919 {
920 charset.code_space_mask = xzalloc_atomic (256);
921 for (i = 0; i < 4; i++)
922 for (j = charset.code_space[i * 4]; j <= charset.code_space[i * 4 + 1];
923 j++)
924 charset.code_space_mask[j] |= (1 << i);
925 }
926
927 charset.iso_chars_96 = charset.code_space[2] == 96;
928
929 charset.min_code = (charset.code_space[0]
930 | (charset.code_space[4] << 8)
931 | (charset.code_space[8] << 16)
932 | ((unsigned) charset.code_space[12] << 24));
933 charset.max_code = (charset.code_space[1]
934 | (charset.code_space[5] << 8)
935 | (charset.code_space[9] << 16)
936 | ((unsigned) charset.code_space[13] << 24));
937 charset.char_index_offset = 0;
938
939 val = args[charset_arg_min_code];
940 if (! NILP (val))
941 {
942 unsigned code = cons_to_unsigned (val, UINT_MAX);
943
944 if (code < charset.min_code
945 || code > charset.max_code)
946 args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
947 make_fixnum_or_float (charset.max_code), val);
948 charset.char_index_offset = CODE_POINT_TO_INDEX (&charset, code);
949 charset.min_code = code;
950 }
951
952 val = args[charset_arg_max_code];
953 if (! NILP (val))
954 {
955 unsigned code = cons_to_unsigned (val, UINT_MAX);
956
957 if (code < charset.min_code
958 || code > charset.max_code)
959 args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
960 make_fixnum_or_float (charset.max_code), val);
961 charset.max_code = code;
962 }
963
964 charset.compact_codes_p = charset.max_code < 0x10000;
965
966 val = args[charset_arg_invalid_code];
967 if (NILP (val))
968 {
969 if (charset.min_code > 0)
970 charset.invalid_code = 0;
971 else
972 {
973 if (charset.max_code < UINT_MAX)
974 charset.invalid_code = charset.max_code + 1;
975 else
976 error ("Attribute :invalid-code must be specified");
977 }
978 }
979 else
980 charset.invalid_code = cons_to_unsigned (val, UINT_MAX);
981
982 val = args[charset_arg_iso_final];
983 if (NILP (val))
984 charset.iso_final = -1;
985 else
986 {
987 CHECK_NUMBER (val);
988 if (XINT (val) < '0' || XINT (val) > 127)
989 error ("Invalid iso-final-char: %"pI"d", XINT (val));
990 charset.iso_final = XINT (val);
991 }
992
993 val = args[charset_arg_iso_revision];
994 if (NILP (val))
995 charset.iso_revision = -1;
996 else
997 {
998 CHECK_RANGED_INTEGER (val, -1, 63);
999 charset.iso_revision = XINT (val);
1000 }
1001
1002 val = args[charset_arg_emacs_mule_id];
1003 if (NILP (val))
1004 charset.emacs_mule_id = -1;
1005 else
1006 {
1007 CHECK_NATNUM (val);
1008 if ((XINT (val) > 0 && XINT (val) <= 128) || XINT (val) >= 256)
1009 error ("Invalid emacs-mule-id: %"pI"d", XINT (val));
1010 charset.emacs_mule_id = XINT (val);
1011 }
1012
1013 charset.ascii_compatible_p = ! NILP (args[charset_arg_ascii_compatible_p]);
1014
1015 charset.supplementary_p = ! NILP (args[charset_arg_supplementary_p]);
1016
1017 charset.unified_p = 0;
1018
1019 memset (charset.fast_map, 0, sizeof (charset.fast_map));
1020
1021 if (! NILP (args[charset_arg_code_offset]))
1022 {
1023 val = args[charset_arg_code_offset];
1024 CHECK_CHARACTER (val);
1025
1026 charset.method = CHARSET_METHOD_OFFSET;
1027 charset.code_offset = XINT (val);
1028
1029 i = CODE_POINT_TO_INDEX (&charset, charset.max_code);
1030 if (MAX_CHAR - charset.code_offset < i)
1031 error ("Unsupported max char: %d", charset.max_char);
1032 charset.max_char = i + charset.code_offset;
1033 i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
1034 charset.min_char = i + charset.code_offset;
1035
1036 i = (charset.min_char >> 7) << 7;
1037 for (; i < 0x10000 && i <= charset.max_char; i += 128)
1038 CHARSET_FAST_MAP_SET (i, charset.fast_map);
1039 i = (i >> 12) << 12;
1040 for (; i <= charset.max_char; i += 0x1000)
1041 CHARSET_FAST_MAP_SET (i, charset.fast_map);
1042 if (charset.code_offset == 0 && charset.max_char >= 0x80)
1043 charset.ascii_compatible_p = 1;
1044 }
1045 else if (! NILP (args[charset_arg_map]))
1046 {
1047 val = args[charset_arg_map];
1048 ASET (attrs, charset_map, val);
1049 charset.method = CHARSET_METHOD_MAP;
1050 }
1051 else if (! NILP (args[charset_arg_subset]))
1052 {
1053 Lisp_Object parent;
1054 Lisp_Object parent_min_code, parent_max_code, parent_code_offset;
1055 struct charset *parent_charset;
1056
1057 val = args[charset_arg_subset];
1058 parent = Fcar (val);
1059 CHECK_CHARSET_GET_CHARSET (parent, parent_charset);
1060 parent_min_code = Fnth (make_number (1), val);
1061 CHECK_NATNUM (parent_min_code);
1062 parent_max_code = Fnth (make_number (2), val);
1063 CHECK_NATNUM (parent_max_code);
1064 parent_code_offset = Fnth (make_number (3), val);
1065 CHECK_NUMBER (parent_code_offset);
1066 val = make_uninit_vector (4);
1067 ASET (val, 0, make_number (parent_charset->id));
1068 ASET (val, 1, parent_min_code);
1069 ASET (val, 2, parent_max_code);
1070 ASET (val, 3, parent_code_offset);
1071 ASET (attrs, charset_subset, val);
1072
1073 charset.method = CHARSET_METHOD_SUBSET;
1074 /* Here, we just copy the parent's fast_map. It's not accurate,
1075 but at least it works for quickly detecting which character
1076 DOESN'T belong to this charset. */
1077 for (i = 0; i < 190; i++)
1078 charset.fast_map[i] = parent_charset->fast_map[i];
1079
1080 /* We also copy these for parents. */
1081 charset.min_char = parent_charset->min_char;
1082 charset.max_char = parent_charset->max_char;
1083 }
1084 else if (! NILP (args[charset_arg_superset]))
1085 {
1086 val = args[charset_arg_superset];
1087 charset.method = CHARSET_METHOD_SUPERSET;
1088 val = Fcopy_sequence (val);
1089 ASET (attrs, charset_superset, val);
1090
1091 charset.min_char = MAX_CHAR;
1092 charset.max_char = 0;
1093 for (; ! NILP (val); val = Fcdr (val))
1094 {
1095 Lisp_Object elt, car_part, cdr_part;
1096 int this_id, offset;
1097 struct charset *this_charset;
1098
1099 elt = Fcar (val);
1100 if (CONSP (elt))
1101 {
1102 car_part = XCAR (elt);
1103 cdr_part = XCDR (elt);
1104 CHECK_CHARSET_GET_ID (car_part, this_id);
1105 CHECK_TYPE_RANGED_INTEGER (int, cdr_part);
1106 offset = XINT (cdr_part);
1107 }
1108 else
1109 {
1110 CHECK_CHARSET_GET_ID (elt, this_id);
1111 offset = 0;
1112 }
1113 XSETCAR (val, Fcons (make_number (this_id), make_number (offset)));
1114
1115 this_charset = CHARSET_FROM_ID (this_id);
1116 if (charset.min_char > this_charset->min_char)
1117 charset.min_char = this_charset->min_char;
1118 if (charset.max_char < this_charset->max_char)
1119 charset.max_char = this_charset->max_char;
1120 for (i = 0; i < 190; i++)
1121 charset.fast_map[i] |= this_charset->fast_map[i];
1122 }
1123 }
1124 else
1125 error ("None of :code-offset, :map, :parents are specified");
1126
1127 val = args[charset_arg_unify_map];
1128 if (! NILP (val) && !STRINGP (val))
1129 CHECK_VECTOR (val);
1130 ASET (attrs, charset_unify_map, val);
1131
1132 CHECK_LIST (args[charset_arg_plist]);
1133 ASET (attrs, charset_plist, args[charset_arg_plist]);
1134
1135 charset.hash_index = hash_lookup (hash_table, args[charset_arg_name],
1136 &hash_code);
1137 if (charset.hash_index >= 0)
1138 {
1139 new_definition_p = 0;
1140 id = XFASTINT (CHARSET_SYMBOL_ID (args[charset_arg_name]));
1141 set_hash_value_slot (hash_table, charset.hash_index, attrs);
1142 }
1143 else
1144 {
1145 charset.hash_index = hash_put (hash_table, args[charset_arg_name], attrs,
1146 hash_code);
1147 if (charset_table_used == charset_table_size)
1148 {
1149 /* Ensure that charset IDs fit into 'int' as well as into the
1150 restriction imposed by fixnums. Although the 'int' restriction
1151 could be removed, too much other code would need altering; for
1152 example, the IDs are stuffed into struct
1153 coding_system.charbuf[i] entries, which are 'int'. */
1154 int old_size = charset_table_size;
1155 ptrdiff_t new_size = old_size;
1156 struct charset *new_table =
1157 xpalloc (0, &new_size, 1,
1158 min (INT_MAX, MOST_POSITIVE_FIXNUM),
1159 sizeof *charset_table);
1160 memcpy (new_table, charset_table, old_size * sizeof *new_table);
1161 charset_table = new_table;
1162 charset_table_size = new_size;
1163 /* FIXME: This leaks memory, as the old charset_table becomes
1164 unreachable. If the old charset table is charset_table_init
1165 then this leak is intentional; otherwise, it's unclear.
1166 If the latter memory leak is intentional, a
1167 comment should be added to explain this. If not, the old
1168 charset_table should be freed, by passing it as the 1st argument
1169 to xpalloc and removing the memcpy. */
1170 }
1171 id = charset_table_used++;
1172 new_definition_p = 1;
1173 }
1174
1175 ASET (attrs, charset_id, make_number (id));
1176 charset.id = id;
1177 charset_table[id] = charset;
1178
1179 if (charset.method == CHARSET_METHOD_MAP)
1180 {
1181 load_charset (&charset, 0);
1182 charset_table[id] = charset;
1183 }
1184
1185 if (charset.iso_final >= 0)
1186 {
1187 ISO_CHARSET_TABLE (charset.dimension, charset.iso_chars_96,
1188 charset.iso_final) = id;
1189 if (new_definition_p)
1190 Viso_2022_charset_list = nconc2 (Viso_2022_charset_list,
1191 list1 (make_number (id)));
1192 if (ISO_CHARSET_TABLE (1, 0, 'J') == id)
1193 charset_jisx0201_roman = id;
1194 else if (ISO_CHARSET_TABLE (2, 0, '@') == id)
1195 charset_jisx0208_1978 = id;
1196 else if (ISO_CHARSET_TABLE (2, 0, 'B') == id)
1197 charset_jisx0208 = id;
1198 else if (ISO_CHARSET_TABLE (2, 0, 'C') == id)
1199 charset_ksc5601 = id;
1200 }
1201
1202 if (charset.emacs_mule_id >= 0)
1203 {
1204 emacs_mule_charset[charset.emacs_mule_id] = id;
1205 if (charset.emacs_mule_id < 0xA0)
1206 emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 1;
1207 else
1208 emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 2;
1209 if (new_definition_p)
1210 Vemacs_mule_charset_list = nconc2 (Vemacs_mule_charset_list,
1211 list1 (make_number (id)));
1212 }
1213
1214 if (new_definition_p)
1215 {
1216 Vcharset_list = Fcons (args[charset_arg_name], Vcharset_list);
1217 if (charset.supplementary_p)
1218 Vcharset_ordered_list = nconc2 (Vcharset_ordered_list,
1219 list1 (make_number (id)));
1220 else
1221 {
1222 Lisp_Object tail;
1223
1224 for (tail = Vcharset_ordered_list; CONSP (tail); tail = XCDR (tail))
1225 {
1226 struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (tail)));
1227
1228 if (cs->supplementary_p)
1229 break;
1230 }
1231 if (EQ (tail, Vcharset_ordered_list))
1232 Vcharset_ordered_list = Fcons (make_number (id),
1233 Vcharset_ordered_list);
1234 else if (NILP (tail))
1235 Vcharset_ordered_list = nconc2 (Vcharset_ordered_list,
1236 list1 (make_number (id)));
1237 else
1238 {
1239 val = Fcons (XCAR (tail), XCDR (tail));
1240 XSETCDR (tail, val);
1241 XSETCAR (tail, make_number (id));
1242 }
1243 }
1244 charset_ordered_list_tick++;
1245 }
1246
1247 return Qnil;
1248 }
1249
1250
1251 /* Same as Fdefine_charset_internal but arguments are more convenient
1252 to call from C (typically in syms_of_charset). This can define a
1253 charset of `offset' method only. Return the ID of the new
1254 charset. */
1255
1256 static int
1257 define_charset_internal (Lisp_Object name,
1258 int dimension,
1259 const char *code_space_chars,
1260 unsigned min_code, unsigned max_code,
1261 int iso_final, int iso_revision, int emacs_mule_id,
1262 bool ascii_compatible, bool supplementary,
1263 int code_offset)
1264 {
1265 const unsigned char *code_space = (const unsigned char *) code_space_chars;
1266 Lisp_Object args[charset_arg_max];
1267 Lisp_Object val;
1268 int i;
1269
1270 args[charset_arg_name] = name;
1271 args[charset_arg_dimension] = make_number (dimension);
1272 val = make_uninit_vector (8);
1273 for (i = 0; i < 8; i++)
1274 ASET (val, i, make_number (code_space[i]));
1275 args[charset_arg_code_space] = val;
1276 args[charset_arg_min_code] = make_number (min_code);
1277 args[charset_arg_max_code] = make_number (max_code);
1278 args[charset_arg_iso_final]
1279 = (iso_final < 0 ? Qnil : make_number (iso_final));
1280 args[charset_arg_iso_revision] = make_number (iso_revision);
1281 args[charset_arg_emacs_mule_id]
1282 = (emacs_mule_id < 0 ? Qnil : make_number (emacs_mule_id));
1283 args[charset_arg_ascii_compatible_p] = ascii_compatible ? Qt : Qnil;
1284 args[charset_arg_supplementary_p] = supplementary ? Qt : Qnil;
1285 args[charset_arg_invalid_code] = Qnil;
1286 args[charset_arg_code_offset] = make_number (code_offset);
1287 args[charset_arg_map] = Qnil;
1288 args[charset_arg_subset] = Qnil;
1289 args[charset_arg_superset] = Qnil;
1290 args[charset_arg_unify_map] = Qnil;
1291
1292 args[charset_arg_plist] =
1293 listn (CONSTYPE_HEAP, 14,
1294 intern_c_string (":name"),
1295 args[charset_arg_name],
1296 intern_c_string (":dimension"),
1297 args[charset_arg_dimension],
1298 intern_c_string (":code-space"),
1299 args[charset_arg_code_space],
1300 intern_c_string (":iso-final-char"),
1301 args[charset_arg_iso_final],
1302 intern_c_string (":emacs-mule-id"),
1303 args[charset_arg_emacs_mule_id],
1304 intern_c_string (":ascii-compatible-p"),
1305 args[charset_arg_ascii_compatible_p],
1306 intern_c_string (":code-offset"),
1307 args[charset_arg_code_offset]);
1308 Fdefine_charset_internal (charset_arg_max, args);
1309
1310 return XINT (CHARSET_SYMBOL_ID (name));
1311 }
1312
1313
1314 DEFUN ("define-charset-alias", Fdefine_charset_alias,
1315 Sdefine_charset_alias, 2, 2, 0,
1316 doc: /* Define ALIAS as an alias for charset CHARSET. */)
1317 (Lisp_Object alias, Lisp_Object charset)
1318 {
1319 Lisp_Object attr;
1320
1321 CHECK_CHARSET_GET_ATTR (charset, attr);
1322 Fputhash (alias, attr, Vcharset_hash_table);
1323 Vcharset_list = Fcons (alias, Vcharset_list);
1324 return Qnil;
1325 }
1326
1327
1328 DEFUN ("charset-plist", Fcharset_plist, Scharset_plist, 1, 1, 0,
1329 doc: /* Return the property list of CHARSET. */)
1330 (Lisp_Object charset)
1331 {
1332 Lisp_Object attrs;
1333
1334 CHECK_CHARSET_GET_ATTR (charset, attrs);
1335 return CHARSET_ATTR_PLIST (attrs);
1336 }
1337
1338
1339 DEFUN ("set-charset-plist", Fset_charset_plist, Sset_charset_plist, 2, 2, 0,
1340 doc: /* Set CHARSET's property list to PLIST. */)
1341 (Lisp_Object charset, Lisp_Object plist)
1342 {
1343 Lisp_Object attrs;
1344
1345 CHECK_CHARSET_GET_ATTR (charset, attrs);
1346 ASET (attrs, charset_plist, plist);
1347 return plist;
1348 }
1349
1350
1351 DEFUN ("unify-charset", Funify_charset, Sunify_charset, 1, 3, 0,
1352 doc: /* Unify characters of CHARSET with Unicode.
1353 This means reading the relevant file and installing the table defined
1354 by CHARSET's `:unify-map' property.
1355
1356 Optional second arg UNIFY-MAP is a file name string or a vector. It has
1357 the same meaning as the `:unify-map' attribute in the function
1358 `define-charset' (which see).
1359
1360 Optional third argument DEUNIFY, if non-nil, means to de-unify CHARSET. */)
1361 (Lisp_Object charset, Lisp_Object unify_map, Lisp_Object deunify)
1362 {
1363 int id;
1364 struct charset *cs;
1365
1366 CHECK_CHARSET_GET_ID (charset, id);
1367 cs = CHARSET_FROM_ID (id);
1368 if (NILP (deunify)
1369 ? CHARSET_UNIFIED_P (cs) && ! NILP (CHARSET_DEUNIFIER (cs))
1370 : ! CHARSET_UNIFIED_P (cs))
1371 return Qnil;
1372
1373 CHARSET_UNIFIED_P (cs) = 0;
1374 if (NILP (deunify))
1375 {
1376 if (CHARSET_METHOD (cs) != CHARSET_METHOD_OFFSET
1377 || CHARSET_CODE_OFFSET (cs) < 0x110000)
1378 error ("Can't unify charset: %s", SDATA (SYMBOL_NAME (charset)));
1379 if (NILP (unify_map))
1380 unify_map = CHARSET_UNIFY_MAP (cs);
1381 else
1382 {
1383 if (! STRINGP (unify_map) && ! VECTORP (unify_map))
1384 signal_error ("Bad unify-map", unify_map);
1385 set_charset_attr (cs, charset_unify_map, unify_map);
1386 }
1387 if (NILP (Vchar_unify_table))
1388 Vchar_unify_table = Fmake_char_table (Qnil, Qnil);
1389 char_table_set_range (Vchar_unify_table,
1390 cs->min_char, cs->max_char, charset);
1391 CHARSET_UNIFIED_P (cs) = 1;
1392 }
1393 else if (CHAR_TABLE_P (Vchar_unify_table))
1394 {
1395 unsigned min_code = CHARSET_MIN_CODE (cs);
1396 unsigned max_code = CHARSET_MAX_CODE (cs);
1397 int min_char = DECODE_CHAR (cs, min_code);
1398 int max_char = DECODE_CHAR (cs, max_code);
1399
1400 char_table_set_range (Vchar_unify_table, min_char, max_char, Qnil);
1401 }
1402
1403 return Qnil;
1404 }
1405
1406 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
1407 Sget_unused_iso_final_char, 2, 2, 0,
1408 doc: /*
1409 Return an unused ISO final char for a charset of DIMENSION and CHARS.
1410 DIMENSION is the number of bytes to represent a character: 1 or 2.
1411 CHARS is the number of characters in a dimension: 94 or 96.
1412
1413 This final char is for private use, thus the range is `0' (48) .. `?' (63).
1414 If there's no unused final char for the specified kind of charset,
1415 return nil. */)
1416 (Lisp_Object dimension, Lisp_Object chars)
1417 {
1418 int final_char;
1419
1420 CHECK_NUMBER (dimension);
1421 CHECK_NUMBER (chars);
1422 if (XINT (dimension) != 1 && XINT (dimension) != 2 && XINT (dimension) != 3)
1423 args_out_of_range_3 (dimension, make_number (1), make_number (3));
1424 if (XINT (chars) != 94 && XINT (chars) != 96)
1425 args_out_of_range_3 (chars, make_number (94), make_number (96));
1426 for (final_char = '0'; final_char <= '?'; final_char++)
1427 if (ISO_CHARSET_TABLE (XINT (dimension), XINT (chars), final_char) < 0)
1428 break;
1429 return (final_char <= '?' ? make_number (final_char) : Qnil);
1430 }
1431
1432 static void
1433 check_iso_charset_parameter (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char)
1434 {
1435 CHECK_NATNUM (dimension);
1436 CHECK_NATNUM (chars);
1437 CHECK_CHARACTER (final_char);
1438
1439 if (XINT (dimension) > 3)
1440 error ("Invalid DIMENSION %"pI"d, it should be 1, 2, or 3",
1441 XINT (dimension));
1442 if (XINT (chars) != 94 && XINT (chars) != 96)
1443 error ("Invalid CHARS %"pI"d, it should be 94 or 96", XINT (chars));
1444 if (XINT (final_char) < '0' || XINT (final_char) > '~')
1445 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'",
1446 (int)XINT (final_char));
1447 }
1448
1449
1450 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
1451 4, 4, 0,
1452 doc: /* Declare an equivalent charset for ISO-2022 decoding.
1453
1454 On decoding by an ISO-2022 base coding system, when a charset
1455 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
1456 if CHARSET is designated instead. */)
1457 (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char, Lisp_Object charset)
1458 {
1459 int id;
1460 bool chars_flag;
1461
1462 CHECK_CHARSET_GET_ID (charset, id);
1463 check_iso_charset_parameter (dimension, chars, final_char);
1464 chars_flag = XINT (chars) == 96;
1465 ISO_CHARSET_TABLE (XINT (dimension), chars_flag, XINT (final_char)) = id;
1466 return Qnil;
1467 }
1468
1469
1470 /* Return information about charsets in the text at PTR of NBYTES
1471 bytes, which are NCHARS characters. The value is:
1472
1473 0: Each character is represented by one byte. This is always
1474 true for a unibyte string. For a multibyte string, true if
1475 it contains only ASCII characters.
1476
1477 1: No charsets other than ascii, control-1, and latin-1 are
1478 found.
1479
1480 2: Otherwise.
1481 */
1482
1483 int
1484 string_xstring_p (Lisp_Object string)
1485 {
1486 const unsigned char *p = SDATA (string);
1487 const unsigned char *endp = p + SBYTES (string);
1488
1489 if (SCHARS (string) == SBYTES (string))
1490 return 0;
1491
1492 while (p < endp)
1493 {
1494 int c = STRING_CHAR_ADVANCE (p);
1495
1496 if (c >= 0x100)
1497 return 2;
1498 }
1499 return 1;
1500 }
1501
1502
1503 /* Find charsets in the string at PTR of NCHARS and NBYTES.
1504
1505 CHARSETS is a vector. If Nth element is non-nil, it means the
1506 charset whose id is N is already found.
1507
1508 It may lookup a translation table TABLE if supplied. */
1509
1510 static void
1511 find_charsets_in_text (const unsigned char *ptr, ptrdiff_t nchars,
1512 ptrdiff_t nbytes, Lisp_Object charsets,
1513 Lisp_Object table, bool multibyte)
1514 {
1515 const unsigned char *pend = ptr + nbytes;
1516
1517 if (nchars == nbytes)
1518 {
1519 if (multibyte)
1520 ASET (charsets, charset_ascii, Qt);
1521 else
1522 while (ptr < pend)
1523 {
1524 int c = *ptr++;
1525
1526 if (!NILP (table))
1527 c = translate_char (table, c);
1528 if (ASCII_CHAR_P (c))
1529 ASET (charsets, charset_ascii, Qt);
1530 else
1531 ASET (charsets, charset_eight_bit, Qt);
1532 }
1533 }
1534 else
1535 {
1536 while (ptr < pend)
1537 {
1538 int c = STRING_CHAR_ADVANCE (ptr);
1539 struct charset *charset;
1540
1541 if (!NILP (table))
1542 c = translate_char (table, c);
1543 charset = CHAR_CHARSET (c);
1544 ASET (charsets, CHARSET_ID (charset), Qt);
1545 }
1546 }
1547 }
1548
1549 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
1550 2, 3, 0,
1551 doc: /* Return a list of charsets in the region between BEG and END.
1552 BEG and END are buffer positions.
1553 Optional arg TABLE if non-nil is a translation table to look up.
1554
1555 If the current buffer is unibyte, the returned list may contain
1556 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
1557 (Lisp_Object beg, Lisp_Object end, Lisp_Object table)
1558 {
1559 Lisp_Object charsets;
1560 ptrdiff_t from, from_byte, to, stop, stop_byte;
1561 int i;
1562 Lisp_Object val;
1563 bool multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
1564
1565 validate_region (&beg, &end);
1566 from = XFASTINT (beg);
1567 stop = to = XFASTINT (end);
1568
1569 if (from < GPT && GPT < to)
1570 {
1571 stop = GPT;
1572 stop_byte = GPT_BYTE;
1573 }
1574 else
1575 stop_byte = CHAR_TO_BYTE (stop);
1576
1577 from_byte = CHAR_TO_BYTE (from);
1578
1579 charsets = Fmake_vector (make_number (charset_table_used), Qnil);
1580 while (1)
1581 {
1582 find_charsets_in_text (BYTE_POS_ADDR (from_byte), stop - from,
1583 stop_byte - from_byte, charsets, table,
1584 multibyte);
1585 if (stop < to)
1586 {
1587 from = stop, from_byte = stop_byte;
1588 stop = to, stop_byte = CHAR_TO_BYTE (stop);
1589 }
1590 else
1591 break;
1592 }
1593
1594 val = Qnil;
1595 for (i = charset_table_used - 1; i >= 0; i--)
1596 if (!NILP (AREF (charsets, i)))
1597 val = Fcons (CHARSET_NAME (charset_table + i), val);
1598 return val;
1599 }
1600
1601 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
1602 1, 2, 0,
1603 doc: /* Return a list of charsets in STR.
1604 Optional arg TABLE if non-nil is a translation table to look up.
1605
1606 If STR is unibyte, the returned list may contain
1607 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
1608 (Lisp_Object str, Lisp_Object table)
1609 {
1610 Lisp_Object charsets;
1611 int i;
1612 Lisp_Object val;
1613
1614 CHECK_STRING (str);
1615
1616 charsets = Fmake_vector (make_number (charset_table_used), Qnil);
1617 find_charsets_in_text (SDATA (str), SCHARS (str), SBYTES (str),
1618 charsets, table,
1619 STRING_MULTIBYTE (str));
1620 val = Qnil;
1621 for (i = charset_table_used - 1; i >= 0; i--)
1622 if (!NILP (AREF (charsets, i)))
1623 val = Fcons (CHARSET_NAME (charset_table + i), val);
1624 return val;
1625 }
1626
1627 \f
1628
1629 /* Return a unified character code for C (>= 0x110000). VAL is a
1630 value of Vchar_unify_table for C; i.e. it is nil, an integer, or a
1631 charset symbol. */
1632 static int
1633 maybe_unify_char (int c, Lisp_Object val)
1634 {
1635 struct charset *charset;
1636
1637 if (INTEGERP (val))
1638 return XFASTINT (val);
1639 if (NILP (val))
1640 return c;
1641
1642 CHECK_CHARSET_GET_CHARSET (val, charset);
1643 #ifdef REL_ALLOC
1644 /* The call to load_charset below can allocate memory, which screws
1645 callers of this function through STRING_CHAR_* macros that hold C
1646 pointers to buffer text, if REL_ALLOC is used. */
1647 r_alloc_inhibit_buffer_relocation (1);
1648 #endif
1649 load_charset (charset, 1);
1650 if (! inhibit_load_charset_map)
1651 {
1652 val = CHAR_TABLE_REF (Vchar_unify_table, c);
1653 if (! NILP (val))
1654 c = XFASTINT (val);
1655 }
1656 else
1657 {
1658 int code_index = c - CHARSET_CODE_OFFSET (charset);
1659 int unified = GET_TEMP_CHARSET_WORK_DECODER (code_index);
1660
1661 if (unified > 0)
1662 c = unified;
1663 }
1664 #ifdef REL_ALLOC
1665 r_alloc_inhibit_buffer_relocation (0);
1666 #endif
1667 return c;
1668 }
1669
1670
1671 /* Return a character corresponding to the code-point CODE of
1672 CHARSET. */
1673
1674 int
1675 decode_char (struct charset *charset, unsigned int code)
1676 {
1677 int c, char_index;
1678 enum charset_method method = CHARSET_METHOD (charset);
1679
1680 if (code < CHARSET_MIN_CODE (charset) || code > CHARSET_MAX_CODE (charset))
1681 return -1;
1682
1683 if (method == CHARSET_METHOD_SUBSET)
1684 {
1685 Lisp_Object subset_info;
1686
1687 subset_info = CHARSET_SUBSET (charset);
1688 charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
1689 code -= XINT (AREF (subset_info, 3));
1690 if (code < XFASTINT (AREF (subset_info, 1))
1691 || code > XFASTINT (AREF (subset_info, 2)))
1692 c = -1;
1693 else
1694 c = DECODE_CHAR (charset, code);
1695 }
1696 else if (method == CHARSET_METHOD_SUPERSET)
1697 {
1698 Lisp_Object parents;
1699
1700 parents = CHARSET_SUPERSET (charset);
1701 c = -1;
1702 for (; CONSP (parents); parents = XCDR (parents))
1703 {
1704 int id = XINT (XCAR (XCAR (parents)));
1705 int code_offset = XINT (XCDR (XCAR (parents)));
1706 unsigned this_code = code - code_offset;
1707
1708 charset = CHARSET_FROM_ID (id);
1709 if ((c = DECODE_CHAR (charset, this_code)) >= 0)
1710 break;
1711 }
1712 }
1713 else
1714 {
1715 char_index = CODE_POINT_TO_INDEX (charset, code);
1716 if (char_index < 0)
1717 return -1;
1718
1719 if (method == CHARSET_METHOD_MAP)
1720 {
1721 Lisp_Object decoder;
1722
1723 decoder = CHARSET_DECODER (charset);
1724 if (! VECTORP (decoder))
1725 {
1726 load_charset (charset, 1);
1727 decoder = CHARSET_DECODER (charset);
1728 }
1729 if (VECTORP (decoder))
1730 c = XINT (AREF (decoder, char_index));
1731 else
1732 c = GET_TEMP_CHARSET_WORK_DECODER (char_index);
1733 }
1734 else /* method == CHARSET_METHOD_OFFSET */
1735 {
1736 c = char_index + CHARSET_CODE_OFFSET (charset);
1737 if (CHARSET_UNIFIED_P (charset)
1738 && MAX_UNICODE_CHAR < c && c <= MAX_5_BYTE_CHAR)
1739 {
1740 /* Unify C with a Unicode character if possible. */
1741 Lisp_Object val = CHAR_TABLE_REF (Vchar_unify_table, c);
1742 c = maybe_unify_char (c, val);
1743 }
1744 }
1745 }
1746
1747 return c;
1748 }
1749
1750 /* Variable used temporarily by the macro ENCODE_CHAR. */
1751 Lisp_Object charset_work;
1752
1753 /* Return a code-point of C in CHARSET. If C doesn't belong to
1754 CHARSET, return CHARSET_INVALID_CODE (CHARSET). If STRICT is true,
1755 use CHARSET's strict_max_char instead of max_char. */
1756
1757 unsigned
1758 encode_char (struct charset *charset, int c)
1759 {
1760 unsigned code;
1761 enum charset_method method = CHARSET_METHOD (charset);
1762
1763 if (CHARSET_UNIFIED_P (charset))
1764 {
1765 Lisp_Object deunifier;
1766 int code_index = -1;
1767
1768 deunifier = CHARSET_DEUNIFIER (charset);
1769 if (! CHAR_TABLE_P (deunifier))
1770 {
1771 load_charset (charset, 2);
1772 deunifier = CHARSET_DEUNIFIER (charset);
1773 }
1774 if (CHAR_TABLE_P (deunifier))
1775 {
1776 Lisp_Object deunified = CHAR_TABLE_REF (deunifier, c);
1777
1778 if (INTEGERP (deunified))
1779 code_index = XINT (deunified);
1780 }
1781 else
1782 {
1783 code_index = GET_TEMP_CHARSET_WORK_ENCODER (c);
1784 }
1785 if (code_index >= 0)
1786 c = CHARSET_CODE_OFFSET (charset) + code_index;
1787 }
1788
1789 if (method == CHARSET_METHOD_SUBSET)
1790 {
1791 Lisp_Object subset_info;
1792 struct charset *this_charset;
1793
1794 subset_info = CHARSET_SUBSET (charset);
1795 this_charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
1796 code = ENCODE_CHAR (this_charset, c);
1797 if (code == CHARSET_INVALID_CODE (this_charset)
1798 || code < XFASTINT (AREF (subset_info, 1))
1799 || code > XFASTINT (AREF (subset_info, 2)))
1800 return CHARSET_INVALID_CODE (charset);
1801 code += XINT (AREF (subset_info, 3));
1802 return code;
1803 }
1804
1805 if (method == CHARSET_METHOD_SUPERSET)
1806 {
1807 Lisp_Object parents;
1808
1809 parents = CHARSET_SUPERSET (charset);
1810 for (; CONSP (parents); parents = XCDR (parents))
1811 {
1812 int id = XINT (XCAR (XCAR (parents)));
1813 int code_offset = XINT (XCDR (XCAR (parents)));
1814 struct charset *this_charset = CHARSET_FROM_ID (id);
1815
1816 code = ENCODE_CHAR (this_charset, c);
1817 if (code != CHARSET_INVALID_CODE (this_charset))
1818 return code + code_offset;
1819 }
1820 return CHARSET_INVALID_CODE (charset);
1821 }
1822
1823 if (! CHARSET_FAST_MAP_REF ((c), charset->fast_map)
1824 || c < CHARSET_MIN_CHAR (charset) || c > CHARSET_MAX_CHAR (charset))
1825 return CHARSET_INVALID_CODE (charset);
1826
1827 if (method == CHARSET_METHOD_MAP)
1828 {
1829 Lisp_Object encoder;
1830 Lisp_Object val;
1831
1832 encoder = CHARSET_ENCODER (charset);
1833 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
1834 {
1835 load_charset (charset, 2);
1836 encoder = CHARSET_ENCODER (charset);
1837 }
1838 if (CHAR_TABLE_P (encoder))
1839 {
1840 val = CHAR_TABLE_REF (encoder, c);
1841 if (NILP (val))
1842 return CHARSET_INVALID_CODE (charset);
1843 code = XINT (val);
1844 if (! CHARSET_COMPACT_CODES_P (charset))
1845 code = INDEX_TO_CODE_POINT (charset, code);
1846 }
1847 else
1848 {
1849 code = GET_TEMP_CHARSET_WORK_ENCODER (c);
1850 code = INDEX_TO_CODE_POINT (charset, code);
1851 }
1852 }
1853 else /* method == CHARSET_METHOD_OFFSET */
1854 {
1855 unsigned code_index = c - CHARSET_CODE_OFFSET (charset);
1856
1857 code = INDEX_TO_CODE_POINT (charset, code_index);
1858 }
1859
1860 return code;
1861 }
1862
1863
1864 DEFUN ("decode-char", Fdecode_char, Sdecode_char, 2, 3, 0,
1865 doc: /* Decode the pair of CHARSET and CODE-POINT into a character.
1866 Return nil if CODE-POINT is not valid in CHARSET.
1867
1868 CODE-POINT may be a cons (HIGHER-16-BIT-VALUE . LOWER-16-BIT-VALUE). */)
1869 (Lisp_Object charset, Lisp_Object code_point, Lisp_Object restriction)
1870 {
1871 int c, id;
1872 unsigned code;
1873 struct charset *charsetp;
1874
1875 CHECK_CHARSET_GET_ID (charset, id);
1876 code = cons_to_unsigned (code_point, UINT_MAX);
1877 charsetp = CHARSET_FROM_ID (id);
1878 c = DECODE_CHAR (charsetp, code);
1879 return (c >= 0 ? make_number (c) : Qnil);
1880 }
1881
1882
1883 DEFUN ("encode-char", Fencode_char, Sencode_char, 2, 3, 0,
1884 doc: /* Encode the character CH into a code-point of CHARSET.
1885 Return nil if CHARSET doesn't include CH. */)
1886 (Lisp_Object ch, Lisp_Object charset, Lisp_Object restriction)
1887 {
1888 int c, id;
1889 unsigned code;
1890 struct charset *charsetp;
1891
1892 CHECK_CHARSET_GET_ID (charset, id);
1893 CHECK_CHARACTER (ch);
1894 c = XFASTINT (ch);
1895 charsetp = CHARSET_FROM_ID (id);
1896 code = ENCODE_CHAR (charsetp, c);
1897 if (code == CHARSET_INVALID_CODE (charsetp))
1898 return Qnil;
1899 return INTEGER_TO_CONS (code);
1900 }
1901
1902
1903 DEFUN ("make-char", Fmake_char, Smake_char, 1, 5, 0,
1904 doc:
1905 /* Return a character of CHARSET whose position codes are CODEn.
1906
1907 CODE1 through CODE4 are optional, but if you don't supply sufficient
1908 position codes, it is assumed that the minimum code in each dimension
1909 is specified. */)
1910 (Lisp_Object charset, Lisp_Object code1, Lisp_Object code2, Lisp_Object code3, Lisp_Object code4)
1911 {
1912 int id, dimension;
1913 struct charset *charsetp;
1914 unsigned code;
1915 int c;
1916
1917 CHECK_CHARSET_GET_ID (charset, id);
1918 charsetp = CHARSET_FROM_ID (id);
1919
1920 dimension = CHARSET_DIMENSION (charsetp);
1921 if (NILP (code1))
1922 code = (CHARSET_ASCII_COMPATIBLE_P (charsetp)
1923 ? 0 : CHARSET_MIN_CODE (charsetp));
1924 else
1925 {
1926 CHECK_NATNUM (code1);
1927 if (XFASTINT (code1) >= 0x100)
1928 args_out_of_range (make_number (0xFF), code1);
1929 code = XFASTINT (code1);
1930
1931 if (dimension > 1)
1932 {
1933 code <<= 8;
1934 if (NILP (code2))
1935 code |= charsetp->code_space[(dimension - 2) * 4];
1936 else
1937 {
1938 CHECK_NATNUM (code2);
1939 if (XFASTINT (code2) >= 0x100)
1940 args_out_of_range (make_number (0xFF), code2);
1941 code |= XFASTINT (code2);
1942 }
1943
1944 if (dimension > 2)
1945 {
1946 code <<= 8;
1947 if (NILP (code3))
1948 code |= charsetp->code_space[(dimension - 3) * 4];
1949 else
1950 {
1951 CHECK_NATNUM (code3);
1952 if (XFASTINT (code3) >= 0x100)
1953 args_out_of_range (make_number (0xFF), code3);
1954 code |= XFASTINT (code3);
1955 }
1956
1957 if (dimension > 3)
1958 {
1959 code <<= 8;
1960 if (NILP (code4))
1961 code |= charsetp->code_space[0];
1962 else
1963 {
1964 CHECK_NATNUM (code4);
1965 if (XFASTINT (code4) >= 0x100)
1966 args_out_of_range (make_number (0xFF), code4);
1967 code |= XFASTINT (code4);
1968 }
1969 }
1970 }
1971 }
1972 }
1973
1974 if (CHARSET_ISO_FINAL (charsetp) >= 0)
1975 code &= 0x7F7F7F7F;
1976 c = DECODE_CHAR (charsetp, code);
1977 if (c < 0)
1978 error ("Invalid code(s)");
1979 return make_number (c);
1980 }
1981
1982
1983 /* Return the first charset in CHARSET_LIST that contains C.
1984 CHARSET_LIST is a list of charset IDs. If it is nil, use
1985 Vcharset_ordered_list. */
1986
1987 struct charset *
1988 char_charset (int c, Lisp_Object charset_list, unsigned int *code_return)
1989 {
1990 bool maybe_null = 0;
1991
1992 if (NILP (charset_list))
1993 charset_list = Vcharset_ordered_list;
1994 else
1995 maybe_null = 1;
1996
1997 while (CONSP (charset_list))
1998 {
1999 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
2000 unsigned code = ENCODE_CHAR (charset, c);
2001
2002 if (code != CHARSET_INVALID_CODE (charset))
2003 {
2004 if (code_return)
2005 *code_return = code;
2006 return charset;
2007 }
2008 charset_list = XCDR (charset_list);
2009 if (! maybe_null
2010 && c <= MAX_UNICODE_CHAR
2011 && EQ (charset_list, Vcharset_non_preferred_head))
2012 return CHARSET_FROM_ID (charset_unicode);
2013 }
2014 return (maybe_null ? NULL
2015 : c <= MAX_5_BYTE_CHAR ? CHARSET_FROM_ID (charset_emacs)
2016 : CHARSET_FROM_ID (charset_eight_bit));
2017 }
2018
2019
2020 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
2021 doc:
2022 /*Return list of charset and one to four position-codes of CH.
2023 The charset is decided by the current priority order of charsets.
2024 A position-code is a byte value of each dimension of the code-point of
2025 CH in the charset. */)
2026 (Lisp_Object ch)
2027 {
2028 struct charset *charset;
2029 int c, dimension;
2030 unsigned code;
2031 Lisp_Object val;
2032
2033 CHECK_CHARACTER (ch);
2034 c = XFASTINT (ch);
2035 charset = CHAR_CHARSET (c);
2036 if (! charset)
2037 emacs_abort ();
2038 code = ENCODE_CHAR (charset, c);
2039 if (code == CHARSET_INVALID_CODE (charset))
2040 emacs_abort ();
2041 dimension = CHARSET_DIMENSION (charset);
2042 for (val = Qnil; dimension > 0; dimension--)
2043 {
2044 val = Fcons (make_number (code & 0xFF), val);
2045 code >>= 8;
2046 }
2047 return Fcons (CHARSET_NAME (charset), val);
2048 }
2049
2050
2051 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 2, 0,
2052 doc: /* Return the charset of highest priority that contains CH.
2053 ASCII characters are an exception: for them, this function always
2054 returns `ascii'.
2055 If optional 2nd arg RESTRICTION is non-nil, it is a list of charsets
2056 from which to find the charset. It may also be a coding system. In
2057 that case, find the charset from what supported by that coding system. */)
2058 (Lisp_Object ch, Lisp_Object restriction)
2059 {
2060 struct charset *charset;
2061
2062 CHECK_CHARACTER (ch);
2063 if (NILP (restriction))
2064 charset = CHAR_CHARSET (XINT (ch));
2065 else
2066 {
2067 if (CONSP (restriction))
2068 {
2069 int c = XFASTINT (ch);
2070
2071 for (; CONSP (restriction); restriction = XCDR (restriction))
2072 {
2073 struct charset *rcharset;
2074
2075 CHECK_CHARSET_GET_CHARSET (XCAR (restriction), rcharset);
2076 if (ENCODE_CHAR (rcharset, c) != CHARSET_INVALID_CODE (rcharset))
2077 return XCAR (restriction);
2078 }
2079 return Qnil;
2080 }
2081 restriction = coding_system_charset_list (restriction);
2082 charset = char_charset (XINT (ch), restriction, NULL);
2083 if (! charset)
2084 return Qnil;
2085 }
2086 return (CHARSET_NAME (charset));
2087 }
2088
2089
2090 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
2091 doc: /*
2092 Return charset of a character in the current buffer at position POS.
2093 If POS is nil, it defaults to the current point.
2094 If POS is out of range, the value is nil. */)
2095 (Lisp_Object pos)
2096 {
2097 Lisp_Object ch;
2098 struct charset *charset;
2099
2100 ch = Fchar_after (pos);
2101 if (! INTEGERP (ch))
2102 return ch;
2103 charset = CHAR_CHARSET (XINT (ch));
2104 return (CHARSET_NAME (charset));
2105 }
2106
2107
2108 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2109 doc: /*
2110 Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
2111
2112 ISO 2022's designation sequence (escape sequence) distinguishes charsets
2113 by their DIMENSION, CHARS, and FINAL-CHAR,
2114 whereas Emacs distinguishes them by charset symbol.
2115 See the documentation of the function `charset-info' for the meanings of
2116 DIMENSION, CHARS, and FINAL-CHAR. */)
2117 (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char)
2118 {
2119 int id;
2120 bool chars_flag;
2121
2122 check_iso_charset_parameter (dimension, chars, final_char);
2123 chars_flag = XFASTINT (chars) == 96;
2124 id = ISO_CHARSET_TABLE (XFASTINT (dimension), chars_flag,
2125 XFASTINT (final_char));
2126 return (id >= 0 ? CHARSET_NAME (CHARSET_FROM_ID (id)) : Qnil);
2127 }
2128
2129
2130 DEFUN ("clear-charset-maps", Fclear_charset_maps, Sclear_charset_maps,
2131 0, 0, 0,
2132 doc: /*
2133 Internal use only.
2134 Clear temporary charset mapping tables.
2135 It should be called only from temacs invoked for dumping. */)
2136 (void)
2137 {
2138 if (temp_charset_work)
2139 {
2140 xfree (temp_charset_work);
2141 temp_charset_work = NULL;
2142 }
2143
2144 if (CHAR_TABLE_P (Vchar_unify_table))
2145 Foptimize_char_table (Vchar_unify_table, Qnil);
2146
2147 return Qnil;
2148 }
2149
2150 DEFUN ("charset-priority-list", Fcharset_priority_list,
2151 Scharset_priority_list, 0, 1, 0,
2152 doc: /* Return the list of charsets ordered by priority.
2153 HIGHESTP non-nil means just return the highest priority one. */)
2154 (Lisp_Object highestp)
2155 {
2156 Lisp_Object val = Qnil, list = Vcharset_ordered_list;
2157
2158 if (!NILP (highestp))
2159 return CHARSET_NAME (CHARSET_FROM_ID (XINT (Fcar (list))));
2160
2161 while (!NILP (list))
2162 {
2163 val = Fcons (CHARSET_NAME (CHARSET_FROM_ID (XINT (XCAR (list)))), val);
2164 list = XCDR (list);
2165 }
2166 return Fnreverse (val);
2167 }
2168
2169 DEFUN ("set-charset-priority", Fset_charset_priority, Sset_charset_priority,
2170 1, MANY, 0,
2171 doc: /* Assign higher priority to the charsets given as arguments.
2172 usage: (set-charset-priority &rest charsets) */)
2173 (ptrdiff_t nargs, Lisp_Object *args)
2174 {
2175 Lisp_Object new_head, old_list, arglist[2];
2176 Lisp_Object list_2022, list_emacs_mule;
2177 ptrdiff_t i;
2178 int id;
2179
2180 old_list = Fcopy_sequence (Vcharset_ordered_list);
2181 new_head = Qnil;
2182 for (i = 0; i < nargs; i++)
2183 {
2184 CHECK_CHARSET_GET_ID (args[i], id);
2185 if (! NILP (Fmemq (make_number (id), old_list)))
2186 {
2187 old_list = Fdelq (make_number (id), old_list);
2188 new_head = Fcons (make_number (id), new_head);
2189 }
2190 }
2191 arglist[0] = Fnreverse (new_head);
2192 arglist[1] = Vcharset_non_preferred_head = old_list;
2193 Vcharset_ordered_list = Fnconc (2, arglist);
2194 charset_ordered_list_tick++;
2195
2196 charset_unibyte = -1;
2197 for (old_list = Vcharset_ordered_list, list_2022 = list_emacs_mule = Qnil;
2198 CONSP (old_list); old_list = XCDR (old_list))
2199 {
2200 if (! NILP (Fmemq (XCAR (old_list), Viso_2022_charset_list)))
2201 list_2022 = Fcons (XCAR (old_list), list_2022);
2202 if (! NILP (Fmemq (XCAR (old_list), Vemacs_mule_charset_list)))
2203 list_emacs_mule = Fcons (XCAR (old_list), list_emacs_mule);
2204 if (charset_unibyte < 0)
2205 {
2206 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (old_list)));
2207
2208 if (CHARSET_DIMENSION (charset) == 1
2209 && CHARSET_ASCII_COMPATIBLE_P (charset)
2210 && CHARSET_MAX_CHAR (charset) >= 0x80)
2211 charset_unibyte = CHARSET_ID (charset);
2212 }
2213 }
2214 Viso_2022_charset_list = Fnreverse (list_2022);
2215 Vemacs_mule_charset_list = Fnreverse (list_emacs_mule);
2216 if (charset_unibyte < 0)
2217 charset_unibyte = charset_iso_8859_1;
2218
2219 return Qnil;
2220 }
2221
2222 DEFUN ("charset-id-internal", Fcharset_id_internal, Scharset_id_internal,
2223 0, 1, 0,
2224 doc: /* Internal use only.
2225 Return charset identification number of CHARSET. */)
2226 (Lisp_Object charset)
2227 {
2228 int id;
2229
2230 CHECK_CHARSET_GET_ID (charset, id);
2231 return make_number (id);
2232 }
2233
2234 struct charset_sort_data
2235 {
2236 Lisp_Object charset;
2237 int id;
2238 ptrdiff_t priority;
2239 };
2240
2241 static int
2242 charset_compare (const void *d1, const void *d2)
2243 {
2244 const struct charset_sort_data *data1 = d1, *data2 = d2;
2245 if (data1->priority != data2->priority)
2246 return data1->priority < data2->priority ? -1 : 1;
2247 return 0;
2248 }
2249
2250 DEFUN ("sort-charsets", Fsort_charsets, Ssort_charsets, 1, 1, 0,
2251 doc: /* Sort charset list CHARSETS by a priority of each charset.
2252 Return the sorted list. CHARSETS is modified by side effects.
2253 See also `charset-priority-list' and `set-charset-priority'. */)
2254 (Lisp_Object charsets)
2255 {
2256 Lisp_Object len = Flength (charsets);
2257 ptrdiff_t n = XFASTINT (len), i, j;
2258 int done;
2259 Lisp_Object tail, elt, attrs;
2260 struct charset_sort_data *sort_data;
2261 int id, min_id = INT_MAX, max_id = INT_MIN;
2262 USE_SAFE_ALLOCA;
2263
2264 if (n == 0)
2265 return Qnil;
2266 SAFE_NALLOCA (sort_data, 1, n);
2267 for (tail = charsets, i = 0; CONSP (tail); tail = XCDR (tail), i++)
2268 {
2269 elt = XCAR (tail);
2270 CHECK_CHARSET_GET_ATTR (elt, attrs);
2271 sort_data[i].charset = elt;
2272 sort_data[i].id = id = XINT (CHARSET_ATTR_ID (attrs));
2273 if (id < min_id)
2274 min_id = id;
2275 if (id > max_id)
2276 max_id = id;
2277 }
2278 for (done = 0, tail = Vcharset_ordered_list, i = 0;
2279 done < n && CONSP (tail); tail = XCDR (tail), i++)
2280 {
2281 elt = XCAR (tail);
2282 id = XFASTINT (elt);
2283 if (id >= min_id && id <= max_id)
2284 for (j = 0; j < n; j++)
2285 if (sort_data[j].id == id)
2286 {
2287 sort_data[j].priority = i;
2288 done++;
2289 }
2290 }
2291 qsort (sort_data, n, sizeof *sort_data, charset_compare);
2292 for (i = 0, tail = charsets; CONSP (tail); tail = XCDR (tail), i++)
2293 XSETCAR (tail, sort_data[i].charset);
2294 SAFE_FREE ();
2295 return charsets;
2296 }
2297
2298 \f
2299 void
2300 init_charset (void)
2301 {
2302 Lisp_Object tempdir;
2303 tempdir = Fexpand_file_name (build_string ("charsets"), Vdata_directory);
2304 if (! file_accessible_directory_p (SSDATA (tempdir)))
2305 {
2306 /* This used to be non-fatal (dir_warning), but it should not
2307 happen, and if it does sooner or later it will cause some
2308 obscure problem (eg bug#6401), so better abort. */
2309 fprintf (stderr, "Error: charsets directory not found:\n\
2310 %s\n\
2311 Emacs will not function correctly without the character map files.\n%s\
2312 Please check your installation!\n",
2313 SDATA (tempdir),
2314 egetenv("EMACSDATA") ? "The EMACSDATA environment \
2315 variable is set, maybe it has the wrong value?\n" : "");
2316 exit (1);
2317 }
2318
2319 Vcharset_map_path = list1 (tempdir);
2320 }
2321
2322
2323 void
2324 init_charset_once (void)
2325 {
2326 int i, j, k;
2327
2328 for (i = 0; i < ISO_MAX_DIMENSION; i++)
2329 for (j = 0; j < ISO_MAX_CHARS; j++)
2330 for (k = 0; k < ISO_MAX_FINAL; k++)
2331 iso_charset_table[i][j][k] = -1;
2332
2333 for (i = 0; i < 256; i++)
2334 emacs_mule_charset[i] = -1;
2335
2336 charset_jisx0201_roman = -1;
2337 charset_jisx0208_1978 = -1;
2338 charset_jisx0208 = -1;
2339 charset_ksc5601 = -1;
2340 }
2341
2342 #ifdef emacs
2343
2344 /* Allocate an initial charset table that is large enough to handle
2345 Emacs while it is bootstrapping. As of September 2011, the size
2346 needs to be at least 166; make it a bit bigger to allow for future
2347 expansion.
2348
2349 Don't make the value so small that the table is reallocated during
2350 bootstrapping, as glibc malloc calls larger than just under 64 KiB
2351 during an initial bootstrap wreak havoc after dumping; see the
2352 M_MMAP_THRESHOLD value in alloc.c, plus there is a extra overhead
2353 internal to glibc malloc and perhaps to Emacs malloc debugging. */
2354 static struct charset charset_table_init[180];
2355
2356 void
2357 syms_of_charset (void)
2358 {
2359 #include "charset.x"
2360
2361 DEFSYM (Qcharsetp, "charsetp");
2362
2363 DEFSYM (Qascii, "ascii");
2364 DEFSYM (Qunicode, "unicode");
2365 DEFSYM (Qemacs, "emacs");
2366 DEFSYM (Qeight_bit, "eight-bit");
2367 DEFSYM (Qiso_8859_1, "iso-8859-1");
2368
2369 DEFSYM (Qgl, "gl");
2370 DEFSYM (Qgr, "gr");
2371
2372 staticpro (&Vcharset_ordered_list);
2373 Vcharset_ordered_list = Qnil;
2374
2375 staticpro (&Viso_2022_charset_list);
2376 Viso_2022_charset_list = Qnil;
2377
2378 staticpro (&Vemacs_mule_charset_list);
2379 Vemacs_mule_charset_list = Qnil;
2380
2381 /* Don't staticpro them here. It's done in syms_of_fns. */
2382 QCtest = intern_c_string (":test");
2383 Qeq = intern_c_string ("eq");
2384
2385 staticpro (&Vcharset_hash_table);
2386 {
2387 Lisp_Object args[2];
2388 args[0] = QCtest;
2389 args[1] = Qeq;
2390 Vcharset_hash_table = Fmake_hash_table (2, args);
2391 }
2392
2393 charset_table = charset_table_init;
2394 charset_table_size = ARRAYELTS (charset_table_init);
2395 charset_table_used = 0;
2396
2397 DEFVAR_LISP ("charset-map-path", Vcharset_map_path,
2398 doc: /* List of directories to search for charset map files. */);
2399 Vcharset_map_path = Qnil;
2400
2401 DEFVAR_BOOL ("inhibit-load-charset-map", inhibit_load_charset_map,
2402 doc: /* Inhibit loading of charset maps. Used when dumping Emacs. */);
2403 inhibit_load_charset_map = 0;
2404
2405 DEFVAR_LISP ("charset-list", Vcharset_list,
2406 doc: /* List of all charsets ever defined. */);
2407 Vcharset_list = Qnil;
2408
2409 DEFVAR_LISP ("current-iso639-language", Vcurrent_iso639_language,
2410 doc: /* ISO639 language mnemonic symbol for the current language environment.
2411 If the current language environment is for multiple languages (e.g. "Latin-1"),
2412 the value may be a list of mnemonics. */);
2413 Vcurrent_iso639_language = Qnil;
2414
2415 charset_ascii
2416 = define_charset_internal (Qascii, 1, "\x00\x7F\0\0\0\0\0",
2417 0, 127, 'B', -1, 0, 1, 0, 0);
2418 charset_iso_8859_1
2419 = define_charset_internal (Qiso_8859_1, 1, "\x00\xFF\0\0\0\0\0",
2420 0, 255, -1, -1, -1, 1, 0, 0);
2421 charset_unicode
2422 = define_charset_internal (Qunicode, 3, "\x00\xFF\x00\xFF\x00\x10\0",
2423 0, MAX_UNICODE_CHAR, -1, 0, -1, 1, 0, 0);
2424 charset_emacs
2425 = define_charset_internal (Qemacs, 3, "\x00\xFF\x00\xFF\x00\x3F\0",
2426 0, MAX_5_BYTE_CHAR, -1, 0, -1, 1, 1, 0);
2427 charset_eight_bit
2428 = define_charset_internal (Qeight_bit, 1, "\x80\xFF\0\0\0\0\0",
2429 128, 255, -1, 0, -1, 0, 1,
2430 MAX_5_BYTE_CHAR + 1);
2431 charset_unibyte = charset_iso_8859_1;
2432 }
2433
2434 #endif /* emacs */