(syms_of_character): Fix CHAR_TABLE_SET call.
[bpt/emacs.git] / src / charset.h
CommitLineData
3263d5a2 1/* Header for charset handler.
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
75c8c592 3 Licensed to the Free Software Foundation.
e06aa1f9 4 Copyright (C) 2001 Free Software Foundation, Inc.
3263d5a2
KH
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
4ed46869 8
369314dc
KH
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
4ed46869 15
369314dc
KH
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
4ed46869 20
369314dc
KH
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
4ed46869 25
aa01a892
KH
26#ifndef EMACS_CHARSET_H
27#define EMACS_CHARSET_H
4ed46869 28
3263d5a2
KH
/* Index to arguments of Fdefine_charset_internal.

   The enumerators are consecutive, starting at 0, and each one names
   one argument slot.  `charset_arg_max' comes last and is therefore
   the number of slots.  */

enum define_charset_arg_index
  {
    charset_arg_name,
    charset_arg_dimension,
    charset_arg_code_space,
    charset_arg_min_code,
    charset_arg_max_code,
    charset_arg_iso_final,
    charset_arg_iso_revision,
    charset_arg_emacs_mule_id,
    charset_arg_ascii_compatible_p,
    charset_arg_supplementary_p,
    charset_arg_invalid_code,
    charset_arg_code_offset,
    charset_arg_map,
    charset_arg_subset,
    charset_arg_superset,
    charset_arg_unify_map,
    charset_arg_plist,

    /* Number of argument slots; must stay last.  */
    charset_arg_max
  };
52
53
/* Indices to charset attributes vector.

   The CHARSET_ATTR_* macros use these enumerators as the index
   argument of AREF.  */

enum charset_attr_index
  {
    /* ID number of the charset.  */
    charset_id,

    /* Name of the charset (symbol).  */
    charset_name,

    /* Property list of the charset.  */
    charset_plist,

    /* If the method of the charset is `MAP_DEFERRED', the value is a
       mapping vector or a file name that contains mapping vector.
       Otherwise, nil.  */
    charset_map,

    /* If the method of the charset is `MAP', the value is a vector
       that maps code points of the charset to characters.  The vector
       is indexed by a character index.  A character index is
       calculated from a code point and the code-space table of the
       charset.  */
    charset_decoder,

    /* If the method of the charset is `MAP', the value is a
       char-table that maps characters of the charset to code
       points.  */
    charset_encoder,

    /* If the method of the charset is `SUBSET', the value is a vector
       that has this form:

	[ CHARSET-ID MIN-CODE MAX-CODE OFFSET ]

       CHARSET-ID is an ID number of a parent charset.  MIN-CODE and
       MAX-CODE specify the range of characters inherited from the
       parent.  OFFSET is an integer value to add to a code point of
       the parent charset to get the corresponding code point of this
       charset.  */
    charset_subset,

    /* If the method of the charset is `SUPERSET', the value is a list
       whose elements have this form:

	(CHARSET-ID . OFFSET)

       CHARSET-IDs are ID numbers of parent charsets.  OFFSET is an
       integer value to add to a code point of the parent charset to
       get the corresponding code point of this charset.  */
    charset_superset,

    /* The value is a mapping vector or a file name that contains the
       mapping.  This defines how characters in the charset should be
       unified with Unicode.  The value of the member
       `charset_deunifier' is created from this information.  */
    charset_unify_map,

    /* If characters in the charset must be unified with Unicode, the
       value is a char table that maps a character code in the charset
       to the corresponding Unicode character.  */
    charset_deunifier,

    /* The length of the charset attribute vector; must stay last.  */
    charset_attr_max
  };
120
/* Methods for converting code points and characters of charsets.
   The value is stored in the `method' member of `struct charset'.  */

enum charset_method
  {
    /* For a charset of this method, a character code is calculated
       from a character index (which is calculated from a code point)
       simply by adding an offset value.  */
    CHARSET_METHOD_OFFSET,

    /* For a charset of this method, a decoder vector and an encoder
       char-table is used for code point <-> character code
       conversion.  */
    CHARSET_METHOD_MAP,

    /* Same as above but decoder and encoder are loaded from a file on
       demand.  Once loaded, the method is changed to
       CHARSET_METHOD_MAP.  */
    CHARSET_METHOD_MAP_DEFERRED,

    /* A charset of this method is a subset of another charset.  */
    CHARSET_METHOD_SUBSET,

    /* A charset of this method is a superset of other charsets.  */
    CHARSET_METHOD_SUPERSET
  };
146
147struct charset
148{
3e4abc9e 149 /* Index to charset_table. */
3263d5a2
KH
150 int id;
151
3e4abc9e 152 /* Index to Vcharset_hash_table. */
3263d5a2
KH
153 int hash_index;
154
155 /* Dimension of the charset: 1, 2, 3, or 4. */
156 int dimension;
157
ac6a8028
KH
158 /* Byte code range of each dimension. <code_space>[4N] is a mininum
159 byte code of the (N+1)th dimension, <code_space>[4N+1] is a
160 maximum byte code of the (N+1)th dimension, <code_space>[4N+2] is
161 (<code_space>[4N+1] - <code_space>[4N] + 1), <code_space>[4N+3]
162 is a number of characters containd in the first to (N+1)th
163 dismesions. We get `char-index' of a `code-point' from this
164 information. */
3263d5a2
KH
165 int code_space[16];
166
ac6a8028
KH
167 /* If B is a byte of Nth dimension of a code-point, the (N-1)th bit
168 of code_space_mask[B] is set. This array is used to quickly
169 check if a code-point is in a valid range. */
170 unsigned char *code_space_mask;
171
3263d5a2
KH
172 /* 1 if there's no gap in code-points. */
173 int code_linear_p;
174
175 /* If the charset is treated as 94-chars in ISO-2022, the value is 0.
176 If the charset is treated as 96-chars in ISO-2022, the value is 1. */
177 int iso_chars_96;
178
3e4abc9e
KH
179 /* ISO final byte of the charset: 48..127. It may be -1 if the
180 charset doesn't conform to ISO-2022. */
3263d5a2
KH
181 int iso_final;
182
3e4abc9e 183 /* ISO revision number of the charset. */
3263d5a2
KH
184 int iso_revision;
185
186 /* If the charset is identical to what supported by Emacs 21 and the
187 priors, the identification number of the charset used in those
188 version. Otherwise, -1. */
189 int emacs_mule_id;
190
191 /* Nonzero iff the charset is compatible with ASCII. */
192 int ascii_compatible_p;
193
194 /* Nonzero iff the charset is supplementary. */
195 int supplementary_p;
196
197 /* Nonzero iff all the code points are representable by Lisp_Int. */
198 int compact_codes_p;
199
200 /* The method for encoding/decoding characters of the charset. */
201 enum charset_method method;
202
203 /* Mininum and Maximum code points of the charset. */
204 unsigned min_code, max_code;
205
315c0139
KH
206 /* Offset value used by macros CODE_POINT_TO_INDEX and
207 INDEX_TO_CODE_POINT. . */
208 unsigned char_index_offset;
209
3263d5a2
KH
210 /* Mininum and Maximum character codes of the charset. If the
211 charset is compatible with ASCII, min_char is a minimum non-ASCII
212 character of the charset. */
213 int min_char, max_char;
214
215 /* The code returned by ENCODE_CHAR if a character is not encodable
216 by the charset. */
217 unsigned invalid_code;
218
219 /* If the method of the charset is CHARSET_METHOD_MAP, this is a
220 table of bits used to quickly and roughly guess if a character
221 belongs to the charset.
222
223 The first 64 elements are 512 bits for characters less than
224 0x10000. Each bit corresponds to 128-character block. The last
225 126 elements are 1008 bits for the greater characters
226 (0x10000..0x3FFFFF). Each bit corresponds to 4096-character
227 block.
228
7619dee9 229 If a bit is 1, at least one character in the corresponding block is
3263d5a2
KH
230 in this charset. */
231 unsigned char fast_map[190];
232
233 /* Offset value to calculate a character code from code-point, and
234 visa versa. */
235 int code_offset;
236
237 int unified_p;
238};
239
240/* Hash table of charset symbols vs. the correponding attribute
241 vectors. */
242extern Lisp_Object Vcharset_hash_table;
243
244/* Table of struct charset. */
245extern struct charset *charset_table;
246extern int charset_table_used;
247
248#define CHARSET_FROM_ID(id) (charset_table + (id))
249
250extern Lisp_Object Vcharset_list;
251extern Lisp_Object Viso_2022_charset_list;
252extern Lisp_Object Vemacs_mule_charset_list;
253
254extern struct charset *emacs_mule_charset[256];
255
256
/* Macros to access information about charset.  */

/* Return the attribute vector of charset whose symbol is SYMBOL.
   The value is Qnil if SYMBOL is not a key of Vcharset_hash_table.  */
#define CHARSET_SYMBOL_ATTRIBUTES(symbol)	\
  Fgethash ((symbol), Vcharset_hash_table, Qnil)

/* Return one element of the attribute vector ATTRS.  Each macro is an
   AREF with the matching `enum charset_attr_index' value, so the
   result is a Lisp_Object.  */
#define CHARSET_ATTR_ID(attrs)		AREF ((attrs), charset_id)
#define CHARSET_ATTR_NAME(attrs)	AREF ((attrs), charset_name)
#define CHARSET_ATTR_PLIST(attrs)	AREF ((attrs), charset_plist)
#define CHARSET_ATTR_MAP(attrs)		AREF ((attrs), charset_map)
#define CHARSET_ATTR_DECODER(attrs)	AREF ((attrs), charset_decoder)
#define CHARSET_ATTR_ENCODER(attrs)	AREF ((attrs), charset_encoder)
#define CHARSET_ATTR_SUBSET(attrs)	AREF ((attrs), charset_subset)
#define CHARSET_ATTR_SUPERSET(attrs)	AREF ((attrs), charset_superset)
#define CHARSET_ATTR_UNIFY_MAP(attrs)	AREF ((attrs), charset_unify_map)
#define CHARSET_ATTR_DEUNIFIER(attrs)	AREF ((attrs), charset_deunifier)

/* Return the `charset_id' attribute of the charset whose symbol is
   SYMBOL.  The value is whatever AREF yields (a Lisp_Object, not a C
   int), and SYMBOL is not checked for validity first.  */
#define CHARSET_SYMBOL_ID(symbol)	\
  CHARSET_ATTR_ID (CHARSET_SYMBOL_ATTRIBUTES (symbol))
276
/* Return an index to Vcharset_hash_table of the charset whose symbol
   is SYMBOL.  CHARSETP and CHECK_CHARSET treat a negative result as
   "SYMBOL is not a charset".  */
#define CHARSET_SYMBOL_HASH_INDEX(symbol)	\
  hash_lookup (XHASH_TABLE (Vcharset_hash_table), (symbol), NULL)

/* Return the attribute vector of CHARSET (a `struct charset *'),
   fetched from Vcharset_hash_table via CHARSET's `hash_index'.  */
#define CHARSET_ATTRIBUTES(charset)	\
  (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), (charset)->hash_index))
285
/* Accessors for members of `struct charset'.  CHARSET is a pointer to
   the structure; every macro expands to the member itself, so the
   result can also be used as an lvalue.  */
#define CHARSET_ID(charset)		((charset)->id)
#define CHARSET_HASH_INDEX(charset)	((charset)->hash_index)
#define CHARSET_DIMENSION(charset)	((charset)->dimension)
#define CHARSET_CODE_SPACE(charset)	((charset)->code_space)
#define CHARSET_CODE_LINEAR_P(charset)	((charset)->code_linear_p)
#define CHARSET_ISO_CHARS_96(charset)	((charset)->iso_chars_96)
#define CHARSET_ISO_FINAL(charset)	((charset)->iso_final)
/* NOTE(review): `struct charset' as declared in this header has no
   member named `iso_plane', so any use of CHARSET_ISO_PLANE fails to
   compile.  Kept only so that the set of defined names is unchanged.  */
#define CHARSET_ISO_PLANE(charset)	((charset)->iso_plane)
#define CHARSET_ISO_REVISION(charset)	((charset)->iso_revision)
#define CHARSET_EMACS_MULE_ID(charset)	((charset)->emacs_mule_id)
#define CHARSET_ASCII_COMPATIBLE_P(charset) ((charset)->ascii_compatible_p)
#define CHARSET_COMPACT_CODES_P(charset) ((charset)->compact_codes_p)
#define CHARSET_METHOD(charset)		((charset)->method)
#define CHARSET_MIN_CODE(charset)	((charset)->min_code)
#define CHARSET_MAX_CODE(charset)	((charset)->max_code)
#define CHARSET_INVALID_CODE(charset)	((charset)->invalid_code)
#define CHARSET_MIN_CHAR(charset)	((charset)->min_char)
#define CHARSET_MAX_CHAR(charset)	((charset)->max_char)
#define CHARSET_CODE_OFFSET(charset)	((charset)->code_offset)
#define CHARSET_UNIFIED_P(charset)	((charset)->unified_p)
306
/* Return one attribute of CHARSET (a `struct charset *').  Each macro
   fetches CHARSET's attribute vector with CHARSET_ATTRIBUTES and then
   applies the matching CHARSET_ATTR_* accessor, so the result is a
   Lisp_Object.  */
#define CHARSET_NAME(charset)		\
  (CHARSET_ATTR_NAME (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_MAP(charset)		\
  (CHARSET_ATTR_MAP (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_DECODER(charset)	\
  (CHARSET_ATTR_DECODER (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_ENCODER(charset)	\
  (CHARSET_ATTR_ENCODER (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_SUBSET(charset)		\
  (CHARSET_ATTR_SUBSET (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_SUPERSET(charset)	\
  (CHARSET_ATTR_SUPERSET (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_UNIFY_MAP(charset)	\
  (CHARSET_ATTR_UNIFY_MAP (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_DEUNIFIER(charset)	\
  (CHARSET_ATTR_DEUNIFIER (CHARSET_ATTRIBUTES (charset)))
323
324
/* Nonzero iff OBJ is a valid charset symbol, i.e. iff looking OBJ up
   in Vcharset_hash_table yields a non-negative index.  */
#define CHARSETP(obj) (CHARSET_SYMBOL_HASH_INDEX (obj) >= 0)
327
/* Check if X is a valid charset symbol.  If not, signal an error.

   X must be an lvalue: on failure the value returned by
   wrong_type_argument is stored back into it.  X is evaluated more
   than once.  */
#define CHECK_CHARSET(x)					\
  do {								\
    if (! SYMBOLP (x) || CHARSET_SYMBOL_HASH_INDEX (x) < 0)	\
      x = wrong_type_argument (Qcharsetp, (x));			\
  } while (0)
54e15bb9 334
4ed46869 335
3263d5a2
KH
/* Check if X is a valid charset symbol.  If valid, set ID to the id
   number of the charset.  Otherwise, signal an error.

   X must be an lvalue (it receives the value of wrong_type_argument on
   failure) and is evaluated more than once.  ID is assigned a C
   integer: the `charset_id' attribute is read from a Lisp vector with
   AREF and is therefore a Lisp_Object, so it is converted with XINT
   before the assignment.  CHECK_CHARSET_GET_CHARSET relies on this
   because it uses ID as an index into charset_table.

   NOTE(review): if wrong_type_argument returns instead of exiting
   non-locally, `idx' is not recomputed for the new X -- confirm that
   it cannot return here.  */
#define CHECK_CHARSET_GET_ID(x, id)					\
  do {									\
    int idx;								\
									\
    if (! SYMBOLP (x) || (idx = CHARSET_SYMBOL_HASH_INDEX (x)) < 0)	\
      x = wrong_type_argument (Qcharsetp, (x));				\
    id = XINT (AREF (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), idx), \
		     charset_id));					\
  } while (0)
347
6e4dc3e1 348
3263d5a2
KH
/* Check if X is a valid charset symbol.  If valid, set ATTR to the
   attr vector of the charset.  Otherwise, signal an error.

   ATTR is assigned inside the condition, so it holds the result of
   CHARSET_SYMBOL_ATTRIBUTES (X) whenever X is a symbol.  X must be an
   lvalue and is evaluated more than once.  */
#define CHECK_CHARSET_GET_ATTR(x, attr)				\
  do {									\
    if (!SYMBOLP (x) || NILP (attr = CHARSET_SYMBOL_ATTRIBUTES (x)))	\
      x = wrong_type_argument (Qcharsetp, (x));				\
  } while (0)
356
6e4dc3e1 357
3263d5a2
KH
/* Check if X is a valid charset symbol.  If valid, set CHARSET to the
   `struct charset *' for it; otherwise an error is signalled by
   CHECK_CHARSET_GET_ID.

   The id obtained from CHECK_CHARSET_GET_ID is held in a block-local
   `int id' and turned into a pointer with CHARSET_FROM_ID.  Because of
   that local, the CHARSET argument must not itself mention a variable
   named `id'.  */
#define CHECK_CHARSET_GET_CHARSET(x, charset)	\
  do {						\
    int id;					\
    CHECK_CHARSET_GET_ID (x, id);		\
    charset = CHARSET_FROM_ID (id);		\
  } while (0)
364
c399b461 365
3263d5a2
KH
/* Lookup Vcharset_order_list and return the first charset that
   contains the character C.

   This is char_charset called with Qnil as its second argument and
   NULL as its third; the result is a `struct charset *'.  */
#define CHAR_CHARSET(c)	\
  char_charset ((c), Qnil, NULL)
370
#if 0
/* Disabled code: nothing between `#if 0' and `#endif' is compiled.  */

/* Char-table of charset-sets.  Each element is a bool vector indexed
   by a charset ID.  */
extern Lisp_Object Vchar_charset_set;

/* Charset-bag of character C.  */
#define CHAR_CHARSET_SET(c) \
  CHAR_TABLE_REF (Vchar_charset_set, c)

/* Check if two characters C1 and C2 belong to the same charset.  */
#define SAME_CHARSET_P(c1, c2)	\
  intersection_p (CHAR_CHARSET_SET (c1), CHAR_CHARSET_SET (c2))

#endif
385
386
/* Return a character corresponding to the code-point CODE of CHARSET.
   Try some optimization before calling decode_char.

   The cases are tried in this order:
   - CODE is an ASCII byte and CHARSET is ASCII compatible: CODE itself;
   - CODE is outside CHARSET's min_code..max_code: -1;
   - CHARSET is unified: decode_char;
   - method OFFSET with linear code space: computed inline as
     CODE - min_code + code_offset;
   - method MAP with linear code space: read from the decoder vector
     at index CODE - min_code;
   - anything else: decode_char.

   CHARSET is a `struct charset *'.  Both arguments are evaluated more
   than once.  */

#define DECODE_CHAR(charset, code)					\
  ((ASCII_BYTE_P (code) && (charset)->ascii_compatible_p)		\
   ? (code)								\
   : ((code) < (charset)->min_code || (code) > (charset)->max_code)	\
   ? -1									\
   : (charset)->unified_p						\
   ? decode_char ((charset), (code))					\
   : (charset)->method == CHARSET_METHOD_OFFSET				\
   ? ((charset)->code_linear_p						\
      ? (code) - (charset)->min_code + (charset)->code_offset		\
      : decode_char ((charset), (code)))				\
   : (charset)->method == CHARSET_METHOD_MAP				\
   ? ((charset)->code_linear_p						\
      ? XINT (AREF (CHARSET_DECODER (charset),				\
		    (code) - (charset)->min_code))			\
      : decode_char ((charset), (code)))				\
   : decode_char ((charset), (code)))
407
408
ec7dd615
KH
409extern Lisp_Object charset_work;
410
3263d5a2
KH
411/* Return a code point of CHAR in CHARSET.
412 Try some optimization before calling encode_char. */
413
ec7dd615
KH
414#define ENCODE_CHAR(charset, c) \
415 ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p) \
416 ? (c) \
417 : (charset)->unified_p \
418 ? encode_char ((charset), (c)) \
419 : ((c) < (charset)->min_char || (c) > (charset)->max_char) \
420 ? (charset)->invalid_code \
421 : (charset)->method == CHARSET_METHOD_OFFSET \
422 ? ((charset)->code_linear_p \
423 ? (c) - (charset)->code_offset + (charset)->min_code \
424 : encode_char ((charset), (c))) \
425 : (charset)->method == CHARSET_METHOD_MAP \
426 ? ((charset)->compact_codes_p \
427 ? (charset_work = CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c)), \
428 (NILP (charset_work) \
429 ? (charset)->invalid_code \
430 : XFASTINT (charset_work))) \
431 : encode_char ((charset), (c))) \
3263d5a2
KH
432 : encode_char ((charset), (c)))
433
434
/* Set to 1 when a charset map is loaded to warn that a buffer text
   and a string data may be relocated.  */
extern int charset_map_loaded;
438
439
/* Set CHARSET to the charset highest priority of C, CODE to the
   code-point of C in CHARSET.

   CHARSET must be an lvalue of type `struct charset *'.  The address
   of CODE is passed to char_charset as its `unsigned *' argument, so
   CODE must be an `unsigned' lvalue.  */
#define SPLIT_CHAR(c, charset, code)	\
  ((charset) = char_charset ((c), Qnil, &(code)))
444
445
/* Extents of iso_charset_table.  ISO_CHARSET_TABLE subtracts 1 from
   the dimension, so dimensions 1..ISO_MAX_DIMENSION are addressable.  */
#define ISO_MAX_DIMENSION 3
#define ISO_MAX_CHARS 2
#define ISO_MAX_FINAL 0x80	/* only 0x30..0x7F are used */

/* Mapping table from ISO2022's charset (specified by DIMENSION,
   CHARS, and FINAL_CHAR) to Emacs' charset ID.  Should be accessed by
   macro ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR).  */
extern int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
c399b461 454
3263d5a2
KH
/* A charset of type iso2022 who has DIMENSION, CHARS, and FINAL
   (final character).

   Expands to the element of iso_charset_table at
   [DIMENSION - 1][CHARS_96][FINAL], so it can also be assigned to.
   No range check is done on the arguments.  */
#define ISO_CHARSET_TABLE(dimension, chars_96, final)	\
  iso_charset_table[(dimension) - 1][(chars_96)][(final)]
6e4dc3e1 459
3263d5a2
KH
/* Nonzero iff the charset who has FAST_MAP may contain C.

   FAST_MAP has the layout of the `fast_map' member of
   `struct charset'.  For C below 0x10000 the byte index is C >> 10 and
   the bit is (C >> 7) & 7; for larger C the byte index is
   (C >> 15) + 62 and the bit is (C >> 12) & 7.  The result is the
   masked byte, not a normalized 0/1.  C is evaluated more than once.

   FAST_MAP is parenthesized in the expansion so that an expression
   argument such as `p + 1' is subscripted as a whole.  */
#define CHARSET_FAST_MAP_REF(c, fast_map)				\
  ((c) < 0x10000							\
   ? (fast_map)[(c) >> 10] & (1 << (((c) >> 7) & 7))			\
   : (fast_map)[((c) >> 15) + 62] & (1 << (((c) >> 12) & 7)))
6e4dc3e1 465
/* Record in FAST_MAP that the charset owning it contains C, by
   setting the bit that CHARSET_FAST_MAP_REF tests for C: byte C >> 10,
   bit (C >> 7) & 7 for C below 0x10000; byte (C >> 15) + 62, bit
   (C >> 12) & 7 otherwise.  C is evaluated more than once.  */
#define CHARSET_FAST_MAP_SET(c, fast_map)				\
  do {									\
    if ((c) < 0x10000)							\
      (fast_map)[(c) >> 10] |= 1 << (((c) >> 7) & 7);			\
    else								\
      (fast_map)[((c) >> 15) + 62] |= 1 << (((c) >> 12) & 7);		\
  } while (0)
473
6e4dc3e1 474
6e4dc3e1 475
/* 1 iff CHARSET may contain the character C.

   An ASCII character in an ASCII-compatible charset is accepted
   immediately.  For a unified charset the answer comes from
   encode_char.  Otherwise C must first pass the fast_map test, and
   then:
   - method OFFSET: C must lie in min_char..max_char;
   - method MAP with compact codes: the encoder char-table entry for C
     must be non-nil;
   - anything else: encode_char must not return invalid_code.

   CHARSET is a `struct charset *'.  Both arguments are evaluated more
   than once.  */
#define CHAR_CHARSET_P(c, charset)					\
  ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p)			\
   || (CHARSET_UNIFIED_P (charset)					\
       ? encode_char ((charset), (c)) != (charset)->invalid_code	\
       : (CHARSET_FAST_MAP_REF ((c), (charset)->fast_map)		\
	  && ((charset)->method == CHARSET_METHOD_OFFSET		\
	      ? (c) >= (charset)->min_char && (c) <= (charset)->max_char \
	      : ((charset)->method == CHARSET_METHOD_MAP		\
		 && (charset)->compact_codes_p)				\
	      ? ! NILP (CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c))) \
	      : encode_char ((charset), (c)) != (charset)->invalid_code))))
6e4dc3e1 488
4ed46869 489
3263d5a2
KH
490extern Lisp_Object Qcharsetp;
491
492extern Lisp_Object Qascii, Qunicode;
493extern int charset_ascii, charset_8_bit_control, charset_8_bit_graphic;
494extern int charset_iso_8859_1;
495extern int charset_primary;
e73576a3
KH
496extern int charset_jisx0201_roman;
497extern int charset_jisx0208_1978;
498extern int charset_jisx0208;
3263d5a2
KH
499
500extern struct charset *char_charset P_ ((int, Lisp_Object, unsigned *));
501extern Lisp_Object charset_attributes P_ ((int));
502
503extern int decode_char P_ ((struct charset *, unsigned));
504extern unsigned encode_char P_ ((struct charset *, int));
505extern int string_xstring_p P_ ((Lisp_Object));
506
ec7dd615
KH
507extern void map_charset_chars P_ ((void (*) (Lisp_Object, Lisp_Object),
508 Lisp_Object, Lisp_Object,
509 struct charset *, unsigned, unsigned));
510
3263d5a2 511EXFUN (Funify_charset, 2);
c1f6608b 512
aa01a892 513#endif /* EMACS_CHARSET_H */