(load_charset_map): Fix previous change.
[bpt/emacs.git] / src / charset.h
CommitLineData
3263d5a2 1/* Header for charset handler.
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
75c8c592 3 Licensed to the Free Software Foundation.
e06aa1f9 4 Copyright (C) 2001 Free Software Foundation, Inc.
3263d5a2
KH
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
4ed46869 8
369314dc
KH
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
4ed46869 15
369314dc
KH
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
4ed46869 20
369314dc
KH
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
4ed46869 25
aa01a892
KH
26#ifndef EMACS_CHARSET_H
27#define EMACS_CHARSET_H
4ed46869 28
3263d5a2
KH
/* Positions of the arguments of Fdefine_charset_internal.  The last
   enumerator, charset_arg_max, is not an argument position; its
   value equals the number of positions listed before it.  */

enum define_charset_arg_index
{
  charset_arg_name = 0,
  charset_arg_dimension,
  charset_arg_code_space,
  charset_arg_iso_final,
  charset_arg_iso_revision,
  charset_arg_emacs_mule_id,
  charset_arg_ascii_compatible_p,
  charset_arg_supplementary_p,
  charset_arg_invalid_code,
  charset_arg_code_offset,
  charset_arg_map,
  charset_arg_parents,
  charset_arg_unify_map,
  charset_arg_plist,
  charset_arg_max
};
49
50
/* Indices into the attribute vector of a charset.  */

enum charset_attr_index
{
  /* ID number of the charset.  */
  charset_id = 0,

  /* Name of the charset (a symbol).  */
  charset_name,

  /* Property list of the charset.  */
  charset_plist,

  /* For a charset whose method is `MAP_DEFERRED': a mapping vector,
     or the name of a file that contains a mapping vector.  For any
     other method: nil.  */
  charset_map,

  /* For a charset whose method is `MAP': a vector that maps code
     points of the charset to characters.  The vector is indexed by
     a character index, which is computed from a code point and the
     code-space table of the charset.  */
  charset_decoder,

  /* For a charset whose method is `MAP': a char-table that maps
     characters of the charset to code points.  */
  charset_encoder,

  /* For a charset whose method is `INHERIT': a list of the form
     (PARENT-CHARSET-ID . CODE-OFFSET).  */
  charset_parents,

  /* A mapping vector, or the name of a file that contains a mapping
     vector.  It describes how characters in the charset should be
     unified with Unicode.  The value of the `charset_deunifier'
     slot is created from this information.  */
  charset_unify_map,

  /* If characters in the charset must be unified with Unicode, a
     char-table that maps a character code in the charset to the
     corresponding Unicode character.  */
  charset_deunifier,

  /* Length of the charset attribute vector (not a slot itself).  */
  charset_attr_max
};
99
/* The ways in which code points of a charset are converted to
   characters and back.  */

enum charset_method
{
  /* A character code is obtained from a character index (itself
     computed from a code point) simply by adding an offset.  */
  CHARSET_METHOD_OFFSET = 0,

  /* Code point <-> character code conversion goes through a decoder
     vector and an encoder char-table.  */
  CHARSET_METHOD_MAP,

  /* Like CHARSET_METHOD_MAP, except that the decoder and the encoder
     are loaded from a file on demand.  Once they are loaded, the
     method is changed to CHARSET_METHOD_MAP.  */
  CHARSET_METHOD_MAP_DEFERRED,

  /* The charset inherits its characters from other charsets.  */
  CHARSET_METHOD_INHERIT
};
123
124struct charset
125{
3e4abc9e 126 /* Index to charset_table. */
3263d5a2
KH
127 int id;
128
3e4abc9e 129 /* Index to Vcharset_hash_table. */
3263d5a2
KH
130 int hash_index;
131
132 /* Dimension of the charset: 1, 2, 3, or 4. */
133 int dimension;
134
135 /* Minimum byte code in each dimension. */
136 int code_space[16];
137
138 /* 1 if there's no gap in code-points. */
139 int code_linear_p;
140
141 /* If the charset is treated as 94-chars in ISO-2022, the value is 0.
142 If the charset is treated as 96-chars in ISO-2022, the value is 1. */
143 int iso_chars_96;
144
3e4abc9e
KH
145 /* ISO final byte of the charset: 48..127. It may be -1 if the
146 charset doesn't conform to ISO-2022. */
3263d5a2
KH
147 int iso_final;
148
3e4abc9e 149 /* ISO revision number of the charset. */
3263d5a2
KH
150 int iso_revision;
151
152 /* If the charset is identical to what supported by Emacs 21 and the
153 priors, the identification number of the charset used in those
154 version. Otherwise, -1. */
155 int emacs_mule_id;
156
157 /* Nonzero iff the charset is compatible with ASCII. */
158 int ascii_compatible_p;
159
160 /* Nonzero iff the charset is supplementary. */
161 int supplementary_p;
162
163 /* Nonzero iff all the code points are representable by Lisp_Int. */
164 int compact_codes_p;
165
166 /* The method for encoding/decoding characters of the charset. */
167 enum charset_method method;
168
169 /* Mininum and Maximum code points of the charset. */
170 unsigned min_code, max_code;
171
172 /* Mininum and Maximum character codes of the charset. If the
173 charset is compatible with ASCII, min_char is a minimum non-ASCII
174 character of the charset. */
175 int min_char, max_char;
176
177 /* The code returned by ENCODE_CHAR if a character is not encodable
178 by the charset. */
179 unsigned invalid_code;
180
181 /* If the method of the charset is CHARSET_METHOD_MAP, this is a
182 table of bits used to quickly and roughly guess if a character
183 belongs to the charset.
184
185 The first 64 elements are 512 bits for characters less than
186 0x10000. Each bit corresponds to 128-character block. The last
187 126 elements are 1008 bits for the greater characters
188 (0x10000..0x3FFFFF). Each bit corresponds to 4096-character
189 block.
190
191 If a bit is 1, at least one character in the corresponds block is
192 in this charset. */
193 unsigned char fast_map[190];
194
195 /* Offset value to calculate a character code from code-point, and
196 visa versa. */
197 int code_offset;
198
199 int unified_p;
200};
201
202/* Hash table of charset symbols vs. the correponding attribute
203 vectors. */
204extern Lisp_Object Vcharset_hash_table;
205
206/* Table of struct charset. */
207extern struct charset *charset_table;
208extern int charset_table_used;
209
210#define CHARSET_FROM_ID(id) (charset_table + (id))
211
212extern Lisp_Object Vcharset_list;
213extern Lisp_Object Viso_2022_charset_list;
214extern Lisp_Object Vemacs_mule_charset_list;
215
216extern struct charset *emacs_mule_charset[256];
217
218
/* Macros to access information about a charset.  */

/* Look SYMBOL up in Vcharset_hash_table and return the attribute
   vector found there.  Qnil is passed to Fgethash as the value to
   use when SYMBOL has no entry.  */
#define CHARSET_SYMBOL_ATTRIBUTES(symbol)       \
  Fgethash ((symbol), Vcharset_hash_table, Qnil)

/* Each of the following returns one slot of the attribute vector
   ATTRS; the slot numbers are those of enum charset_attr_index.  */
#define CHARSET_ATTR_ID(attrs)          AREF ((attrs), charset_id)
#define CHARSET_ATTR_NAME(attrs)        AREF ((attrs), charset_name)
#define CHARSET_ATTR_PLIST(attrs)       AREF ((attrs), charset_plist)
#define CHARSET_ATTR_MAP(attrs)         AREF ((attrs), charset_map)
#define CHARSET_ATTR_DECODER(attrs)     AREF ((attrs), charset_decoder)
#define CHARSET_ATTR_ENCODER(attrs)     AREF ((attrs), charset_encoder)
#define CHARSET_ATTR_PARENTS(attrs)     AREF ((attrs), charset_parents)
#define CHARSET_ATTR_UNIFY_MAP(attrs)   AREF ((attrs), charset_unify_map)
#define CHARSET_ATTR_DEUNIFIER(attrs)   AREF ((attrs), charset_deunifier)

/* The ID slot of the attribute vector registered for SYMBOL.  */
#define CHARSET_SYMBOL_ID(symbol)                       \
  CHARSET_ATTR_ID (CHARSET_SYMBOL_ATTRIBUTES (symbol))

/* Return the index in Vcharset_hash_table of the charset whose
   symbol is SYMBOL.  */
#define CHARSET_SYMBOL_HASH_INDEX(symbol)                       \
  hash_lookup (XHASH_TABLE (Vcharset_hash_table), (symbol), NULL)

/* Return the attribute vector of CHARSET (a pointer to struct
   charset), fetched through the hash index stored in it.  */
#define CHARSET_ATTRIBUTES(charset)                             \
  (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), (charset)->hash_index))
246
/* Accessors for the members of struct charset.  CHARSET is a pointer
   to struct charset.  Each macro expands to the parenthesized member
   access itself, so it can be read from and assigned to.  */
#define CHARSET_ID(charset)             ((charset)->id)
#define CHARSET_HASH_INDEX(charset)     ((charset)->hash_index)
#define CHARSET_DIMENSION(charset)      ((charset)->dimension)
#define CHARSET_CODE_SPACE(charset)     ((charset)->code_space)
#define CHARSET_CODE_LINEAR_P(charset)  ((charset)->code_linear_p)
#define CHARSET_ISO_CHARS_96(charset)   ((charset)->iso_chars_96)
#define CHARSET_ISO_FINAL(charset)      ((charset)->iso_final)
/* NOTE(review): struct charset as declared in this header has no
   `iso_plane' member, so any use of CHARSET_ISO_PLANE fails to
   compile.  The definition is kept only so that the set of defined
   names is unchanged; either add the member or remove the macro.  */
#define CHARSET_ISO_PLANE(charset)      ((charset)->iso_plane)
#define CHARSET_ISO_REVISION(charset)   ((charset)->iso_revision)
#define CHARSET_EMACS_MULE_ID(charset)  ((charset)->emacs_mule_id)
#define CHARSET_ASCII_COMPATIBLE_P(charset) ((charset)->ascii_compatible_p)
/* Added so that supplementary_p has an accessor like its neighbours.  */
#define CHARSET_SUPPLEMENTARY_P(charset) ((charset)->supplementary_p)
#define CHARSET_COMPACT_CODES_P(charset) ((charset)->compact_codes_p)
#define CHARSET_METHOD(charset)         ((charset)->method)
#define CHARSET_MIN_CODE(charset)       ((charset)->min_code)
#define CHARSET_MAX_CODE(charset)       ((charset)->max_code)
#define CHARSET_INVALID_CODE(charset)   ((charset)->invalid_code)
#define CHARSET_MIN_CHAR(charset)       ((charset)->min_char)
#define CHARSET_MAX_CHAR(charset)       ((charset)->max_char)
#define CHARSET_CODE_OFFSET(charset)    ((charset)->code_offset)
#define CHARSET_UNIFIED_P(charset)      ((charset)->unified_p)
267
/* Slots of the attribute vector of CHARSET (a pointer to struct
   charset).  Each macro first obtains the vector with
   CHARSET_ATTRIBUTES and then applies the matching CHARSET_ATTR_*
   accessor to it.  */
#define CHARSET_NAME(charset)                           \
  (CHARSET_ATTR_NAME (CHARSET_ATTRIBUTES (charset)))

#define CHARSET_MAP(charset)                            \
  (CHARSET_ATTR_MAP (CHARSET_ATTRIBUTES (charset)))

#define CHARSET_DECODER(charset)                        \
  (CHARSET_ATTR_DECODER (CHARSET_ATTRIBUTES (charset)))

#define CHARSET_ENCODER(charset)                        \
  (CHARSET_ATTR_ENCODER (CHARSET_ATTRIBUTES (charset)))

#define CHARSET_PARENTS(charset)                        \
  (CHARSET_ATTR_PARENTS (CHARSET_ATTRIBUTES (charset)))

#define CHARSET_UNIFY_MAP(charset)                      \
  (CHARSET_ATTR_UNIFY_MAP (CHARSET_ATTRIBUTES (charset)))

#define CHARSET_DEUNIFIER(charset)                      \
  (CHARSET_ATTR_DEUNIFIER (CHARSET_ATTRIBUTES (charset)))
282
283
/* Nonzero iff OBJ is a valid charset symbol, i.e. iff looking OBJ up
   in Vcharset_hash_table yields a non-negative index.  */
#define CHARSETP(obj) (CHARSET_SYMBOL_HASH_INDEX (obj) >= 0)

/* Verify that X is a valid charset symbol.  If it is not, call
   wrong_type_argument with Qcharsetp and store the value that call
   returns back into X (so X must be assignable).  */
#define CHECK_CHARSET(x)                                        \
  do                                                            \
    {                                                           \
      if (! (SYMBOLP (x) && CHARSETP (x)))                      \
        x = wrong_type_argument (Qcharsetp, (x));               \
    }                                                           \
  while (0)
54e15bb9 293
4ed46869 294
3263d5a2
KH
/* Check if X is a valid charset symbol.  If valid, set ID to the id
   number of the charset, as a C integer.  Otherwise, signal an error
   through wrong_type_argument (Qcharsetp, X).

   X must be assignable: the value returned by wrong_type_argument is
   stored back into it.  The validity test is a loop so that the hash
   lookup is repeated for such a replacement value; IDX is therefore
   always the index of the symbol finally held in X, never a stale or
   unset value.

   The charset_id slot is a Lisp_Object, so it is converted with XINT
   before being stored into ID, in the same way DECODE_CHAR converts
   decoder entries.  */
#define CHECK_CHARSET_GET_ID(x, id)                                     \
  do {                                                                  \
    int idx;                                                            \
                                                                        \
    while (! SYMBOLP (x) || (idx = CHARSET_SYMBOL_HASH_INDEX (x)) < 0)  \
      x = wrong_type_argument (Qcharsetp, (x));                         \
    (id) = XINT (AREF (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table),   \
                                   idx),                                \
                       charset_id));                                    \
  } while (0)
306
6e4dc3e1 307
3263d5a2
KH
/* Check if X is a valid charset symbol.  If valid, set ATTR to the
   attribute vector of the charset.  Otherwise, signal an error
   through wrong_type_argument (Qcharsetp, X).

   X must be assignable: the value returned by wrong_type_argument is
   stored back into it.  The test is a loop so that ATTR is looked up
   again for such a replacement value instead of being left nil.  */
#define CHECK_CHARSET_GET_ATTR(x, attr)                                 \
  do {                                                                  \
    while (! SYMBOLP (x)                                                \
           || NILP ((attr) = CHARSET_SYMBOL_ATTRIBUTES (x)))            \
      x = wrong_type_argument (Qcharsetp, (x));                         \
  } while (0)
315
6e4dc3e1 316
3263d5a2
KH
/* Check that X is a valid charset symbol and set CHARSET to the
   address of the matching entry of charset_table.  The validation,
   and the error signalled when it fails, are those of
   CHECK_CHARSET_GET_ID.  */
#define CHECK_CHARSET_GET_CHARSET(x, charset)           \
  do                                                    \
    {                                                   \
      int cs_table_index;                               \
      CHECK_CHARSET_GET_ID (x, cs_table_index);         \
      charset = CHARSET_FROM_ID (cs_table_index);       \
    }                                                   \
  while (0)
323
c399b461 324
3263d5a2
KH
/* Look up Vcharset_order_list and return the first charset that
   contains the character C.  This is char_charset called with Qnil
   as its second argument and no place to store a code point.  */
#define CHAR_CHARSET(c) char_charset ((c), Qnil, NULL)

#if 0
/* This section is compiled out.  */

/* Char-table of charset sets.  Each element is a bool vector indexed
   by a charset ID.  */
extern Lisp_Object Vchar_charset_set;

/* The charset set (charset-bag) of character C.  */
#define CHAR_CHARSET_SET(c) CHAR_TABLE_REF (Vchar_charset_set, c)

/* Check whether the two characters C1 and C2 belong to the same
   charset.  */
#define SAME_CHARSET_P(c1, c2)                                  \
  intersection_p (CHAR_CHARSET_SET (c1), CHAR_CHARSET_SET (c2))

#endif
344
345
/* Return the character corresponding to the code point CODE of
   CHARSET, trying the cheap cases inline before falling back on
   decode_char:

     - an ASCII byte in an ASCII-compatible charset is the character
       itself;
     - a CODE outside min_code..max_code yields -1;
     - a unified charset always goes through decode_char;
     - a linear CHARSET_METHOD_OFFSET charset is a plain addition;
     - a linear CHARSET_METHOD_MAP charset is one decoder lookup;
     - anything else goes through decode_char.

   Both arguments are evaluated more than once.  */

#define DECODE_CHAR(charset, code)                                      \
  ((ASCII_BYTE_P (code) && (charset)->ascii_compatible_p)               \
   ? (code)                                                             \
   : (((code) < (charset)->min_code || (code) > (charset)->max_code)    \
      ? -1                                                              \
      : ((charset)->unified_p                                           \
         ? decode_char ((charset), (code))                              \
         : ((charset)->method == CHARSET_METHOD_OFFSET                  \
            ? ((charset)->code_linear_p                                 \
               ? (code) - (charset)->min_code + (charset)->code_offset  \
               : decode_char ((charset), (code)))                       \
            : ((charset)->method == CHARSET_METHOD_MAP                  \
               ? ((charset)->code_linear_p                              \
                  ? XINT (AREF (CHARSET_DECODER (charset),              \
                                (code) - (charset)->min_code))          \
                  : decode_char ((charset), (code)))                    \
               : decode_char ((charset), (code)))))))
366
367
/* Return the code point of the character C in CHARSET, trying the
   cheap cases inline before falling back on encode_char:

     - an ASCII character in an ASCII-compatible charset is its own
       code point;
     - a unified charset always goes through encode_char;
     - a C outside min_char..max_char yields invalid_code;
     - a linear CHARSET_METHOD_OFFSET charset is a plain subtraction;
     - a CHARSET_METHOD_MAP charset with compact codes is one encoder
       lookup;
     - anything else goes through encode_char.

   Both arguments are evaluated more than once.  */

#define ENCODE_CHAR(charset, c)                                         \
  ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p)                  \
   ? (c)                                                                \
   : ((charset)->unified_p                                              \
      ? encode_char ((charset), (c))                                    \
      : (((c) < (charset)->min_char || (c) > (charset)->max_char)       \
         ? (charset)->invalid_code                                      \
         : ((charset)->method == CHARSET_METHOD_OFFSET                  \
            ? ((charset)->code_linear_p                                 \
               ? (c) - (charset)->code_offset + (charset)->min_code     \
               : encode_char ((charset), (c)))                          \
            : ((charset)->method == CHARSET_METHOD_MAP                  \
               ? ((charset)->compact_codes_p                            \
                  ? XFASTINT (CHAR_TABLE_REF (CHARSET_ENCODER (charset), \
                                              (c)))                     \
                  : encode_char ((charset), (c)))                       \
               : encode_char ((charset), (c)))))))
387
388
/* Set to 1 when a charset map is loaded, as a warning that buffer
   text and string data may have been relocated.  */
extern int charset_map_loaded;


/* Set CHARSET to the highest-priority charset of the character C,
   and CODE to the code point of C in that charset.  CODE must be an
   lvalue, because its address is handed to char_charset.  The value
   of the whole expression is the charset that was stored.  */
#define SPLIT_CHAR(c, charset, code)                    \
  ((charset) = char_charset ((c), Qnil, &(code)))
398
399
/* Extents of iso_charset_table.  The last extent is 0x80, so a final
   byte used as an index must be below 0x80; together with the range
   documented for iso_final in struct charset (48..127), only the
   entries 0x30..0x7F are used.  */
#define ISO_MAX_DIMENSION 3
#define ISO_MAX_CHARS 2
#define ISO_MAX_FINAL 0x80

/* Mapping table from an ISO-2022 charset (specified by DIMENSION,
   CHARS, and FINAL_CHAR) to the Emacs charset ID.  It should be
   accessed through the macro ISO_CHARSET_TABLE (DIMENSION, CHARS,
   FINAL_CHAR).  */
extern int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];

/* The entry of iso_charset_table for the ISO-2022 charset that has
   DIMENSION (counted from 1), CHARS_96 (0 or 1, cf. iso_chars_96 in
   struct charset) and FINAL (the final byte).  The arguments are not
   range-checked.  */
#define ISO_CHARSET_TABLE(dimension, chars_96, final)           \
  iso_charset_table[(dimension) - 1][(chars_96)][(final)]
6e4dc3e1 413
3263d5a2
KH
/* Nonzero iff the charset that owns FAST_MAP may contain C.

   For C below 0x10000 the bit tested is bit ((C >> 7) & 7) of
   element (C >> 10), i.e. one bit per block of 128 characters.  For
   larger C it is bit ((C >> 12) & 7) of element ((C >> 15) + 62),
   i.e. one bit per block of 4096 characters, starting at element 64.

   FAST_MAP is parenthesized in the expansion (as it already was in
   CHARSET_FAST_MAP_SET), so any pointer expression may be passed.
   C is evaluated more than once.  */
#define CHARSET_FAST_MAP_REF(c, fast_map)                               \
  ((c) < 0x10000                                                        \
   ? (fast_map)[(c) >> 10] & (1 << (((c) >> 7) & 7))                    \
   : (fast_map)[((c) >> 15) + 62] & (1 << (((c) >> 12) & 7)))

/* Set in FAST_MAP the bit that CHARSET_FAST_MAP_REF tests for C.  */
#define CHARSET_FAST_MAP_SET(c, fast_map)                               \
  do {                                                                  \
    if ((c) < 0x10000)                                                  \
      (fast_map)[(c) >> 10] |= 1 << (((c) >> 7) & 7);                   \
    else                                                                \
      (fast_map)[((c) >> 15) + 62] |= 1 << (((c) >> 12) & 7);           \
  } while (0)
427
6e4dc3e1 428
6e4dc3e1 429
3263d5a2
KH
/* 1 iff CHARSET may contain the character C.

   An ASCII character always belongs to an ASCII-compatible charset.
   For a unified charset the answer comes from encode_char alone.
   Otherwise the fast map must allow C, and then:
     - a CHARSET_METHOD_OFFSET charset contains C iff C lies within
       min_char..max_char;
     - a CHARSET_METHOD_MAP charset with compact codes contains C iff
       its encoder entry differs from invalid_code;
     - any other charset contains C iff encode_char returns something
       other than invalid_code.

   Both arguments are evaluated more than once.  */
#define CHAR_CHARSET_P(c, charset)                                      \
  ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p)                  \
   || (CHARSET_UNIFIED_P (charset)                                      \
       ? encode_char ((charset), (c)) != (charset)->invalid_code        \
       : (CHARSET_FAST_MAP_REF ((c), (charset)->fast_map)               \
          && ((charset)->method == CHARSET_METHOD_OFFSET                \
              ? ((c) >= (charset)->min_char                             \
                 && (c) <= (charset)->max_char)                         \
              : (((charset)->method == CHARSET_METHOD_MAP               \
                  && (charset)->compact_codes_p)                        \
                 ? (XFASTINT (CHAR_TABLE_REF (CHARSET_ENCODER (charset), \
                                              (c)))                     \
                    != (charset)->invalid_code)                         \
                 : (encode_char ((charset), (c))                        \
                    != (charset)->invalid_code))))))
6e4dc3e1 443
4ed46869 444
3263d5a2
KH
445extern Lisp_Object Qcharsetp;
446
447extern Lisp_Object Qascii, Qunicode;
448extern int charset_ascii, charset_8_bit_control, charset_8_bit_graphic;
449extern int charset_iso_8859_1;
450extern int charset_primary;
451
452extern struct charset *char_charset P_ ((int, Lisp_Object, unsigned *));
453extern Lisp_Object charset_attributes P_ ((int));
454
455extern int decode_char P_ ((struct charset *, unsigned));
456extern unsigned encode_char P_ ((struct charset *, int));
457extern int string_xstring_p P_ ((Lisp_Object));
458
459EXFUN (Funify_charset, 2);
c1f6608b 460
aa01a892 461#endif /* EMACS_CHARSET_H */