* character.c (char_string): Remove unnecessary casts.
[bpt/emacs.git] / src / charset.h
... / ...
CommitLineData
1/* Header for charset handler.
2 Copyright (C) 2001-2011 Free Software Foundation, Inc.
3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 2005, 2006, 2007, 2008, 2009, 2010, 2011
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
7
8 Copyright (C) 2003
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
11
12This file is part of GNU Emacs.
13
14GNU Emacs is free software: you can redistribute it and/or modify
15it under the terms of the GNU General Public License as published by
16the Free Software Foundation, either version 3 of the License, or
17(at your option) any later version.
18
19GNU Emacs is distributed in the hope that it will be useful,
20but WITHOUT ANY WARRANTY; without even the implied warranty of
21MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22GNU General Public License for more details.
23
24You should have received a copy of the GNU General Public License
25along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
26
27#ifndef EMACS_CHARSET_H
28#define EMACS_CHARSET_H
29
30/* Indices of the arguments of Fdefine_charset_internal. */
31
32enum define_charset_arg_index
33 {
34 charset_arg_name,
35 charset_arg_dimension,
36 charset_arg_code_space,
37 charset_arg_min_code,
38 charset_arg_max_code,
39 charset_arg_iso_final,
40 charset_arg_iso_revision,
41 charset_arg_emacs_mule_id,
42 charset_arg_ascii_compatible_p,
43 charset_arg_supplementary_p,
44 charset_arg_invalid_code,
45 charset_arg_code_offset,
46 charset_arg_map,
47 charset_arg_subset,
48 charset_arg_superset,
49 charset_arg_unify_map,
50 charset_arg_plist,
51 charset_arg_max /* Last enumerator: equals the number of argument indices above. */
52 };
53
54
55/* Indices into the charset attribute vector. */
56
57enum charset_attr_index
58 {
59 /* ID number of the charset. */
60 charset_id,
61
62 /* Name of the charset (symbol). */
63 charset_name,
64
65 /* Property list of the charset. */
66 charset_plist,
67
68 /* If the method of the charset is `MAP', the value is a mapping
69 vector or the name of a file that contains the mapping vector.
70 Otherwise, nil. */
71 charset_map,
72
73 /* If the method of the charset is `MAP', the value is a vector
74 that maps code points of the charset to characters. The vector
75 is indexed by a character index. A character index is
76 calculated from a code point and the code-space table of the
77 charset. */
78 charset_decoder,
79
80 /* If the method of the charset is `MAP', the value is a
81 char-table that maps characters of the charset to code
82 points. */
83 charset_encoder,
84
85 /* If the method of the charset is `SUBSET', the value is a vector
86 that has this form:
87
88 [ CHARSET-ID MIN-CODE MAX-CODE OFFSET ]
89
90 CHARSET-ID is the ID number of the parent charset. MIN-CODE and
91 MAX-CODE specify the range of characters inherited from the
92 parent. OFFSET is an integer value to add to a code point of
93 the parent charset to get the corresponding code point of this
94 charset. */
95 charset_subset,
96
97 /* If the method of the charset is `SUPERSET', the value is a list
98 whose elements have this form:
99
100 (CHARSET-ID . OFFSET)
101
102 CHARSET-IDs are ID numbers of parent charsets. OFFSET is an
103 integer value to add to a code point of the parent charset to
104 get the corresponding code point of this charset. */
105 charset_superset,
106
107 /* The value is a mapping vector or the name of a file that contains
108 the mapping. This defines how characters in the charset should be
109 unified with Unicode. The value of the member
110 `charset_deunifier' is created from this information. */
111 charset_unify_map,
112
113 /* If characters in the charset must be unified with Unicode, the
114 value is a char table that maps a unified Unicode character code
115 to the non-unified character code in the charset. */
116 charset_deunifier,
117
118 /* The length of the charset attribute vector. */
119 charset_attr_max
120 };
121
122/* Methods for converting between code points and characters of charsets. */
123
124enum charset_method
125 {
126 /* For a charset of this method, a character code is calculated
127 from a character index (which is calculated from a code point)
128 simply by adding an offset value. */
129 CHARSET_METHOD_OFFSET,
130
131 /* For a charset of this method, a decoder vector and an encoder
132 char-table are used for code point <-> character code
133 conversion. */
134 CHARSET_METHOD_MAP,
135
136 /* A charset of this method is a subset of another charset. */
137 CHARSET_METHOD_SUBSET,
138
139 /* A charset of this method is a superset of other charsets. */
140 CHARSET_METHOD_SUPERSET
141 };
142/* Per-charset data. charset_table is an array of these, indexed by the `id' member (see CHARSET_FROM_ID). */
143struct charset
144{
145 /* Index to charset_table. */
146 int id;
147
148 /* Index to Vcharset_hash_table. */
149 EMACS_INT hash_index;
150
151 /* Dimension of the charset: 1, 2, 3, or 4. */
152 int dimension;
153
154 /* Byte code range of each dimension. <code_space>[4N] is the minimum
155 byte code of the (N+1)th dimension, <code_space>[4N+1] is the
156 maximum byte code of the (N+1)th dimension, <code_space>[4N+2] is
157 (<code_space>[4N+1] - <code_space>[4N] + 1), <code_space>[4N+3]
158 is the number of characters contained in the first through (N+1)th
159 dimensions, except that there is no <code_space>[15].
160 We get the `char-index' of a `code-point' from this
161 information. */
162 int code_space[15];
163
164 /* If B is a byte of Nth dimension of a code-point, the (N-1)th bit
165 of code_space_mask[B] is set. This array is used to quickly
166 check if a code-point is in a valid range. */
167 unsigned char *code_space_mask;
168
169 /* 1 if there's no gap in code-points. */
170 int code_linear_p;
171
172 /* If the charset is treated as 94-chars in ISO-2022, the value is 0.
173 If the charset is treated as 96-chars in ISO-2022, the value is 1. */
174 int iso_chars_96;
175
176 /* ISO final byte of the charset: 48..127. It may be -1 if the
177 charset doesn't conform to ISO-2022. */
178 int iso_final;
179
180 /* ISO revision number of the charset. */
181 int iso_revision;
182
183 /* If the charset is identical to one supported by Emacs 21 and
184 earlier, the identification number of the charset used in those
185 versions. Otherwise, -1. */
186 int emacs_mule_id;
187
188 /* Nonzero if the charset is compatible with ASCII. */
189 int ascii_compatible_p;
190
191 /* Nonzero if the charset is supplementary. */
192 int supplementary_p;
193
194 /* Nonzero if all the code points are representable by Lisp_Int. */
195 int compact_codes_p;
196
197 /* The method for encoding/decoding characters of the charset. */
198 enum charset_method method;
199
200 /* Minimum and maximum code points of the charset. */
201 unsigned min_code, max_code;
202
203 /* Offset value used by macros CODE_POINT_TO_INDEX and
204 INDEX_TO_CODE_POINT. */
205 unsigned char_index_offset;
206
207 /* Minimum and maximum character codes of the charset. If the
208 charset is compatible with ASCII, min_char is the minimum non-ASCII
209 character of the charset. If the method of the charset is
210 CHARSET_METHOD_OFFSET, even if the charset is unified, min_char
211 and max_char don't change. */
212 int min_char, max_char;
213
214 /* The code returned by ENCODE_CHAR if a character is not encodable
215 by the charset. */
216 unsigned invalid_code;
217
218 /* If the method of the charset is CHARSET_METHOD_MAP, this is a
219 table of bits used to quickly and roughly guess if a character
220 belongs to the charset.
221
222 The first 64 elements are 512 bits for characters less than
223 0x10000. Each bit corresponds to a 128-character block. The last
224 126 elements are 1008 bits for the greater characters
225 (0x10000..0x3FFFFF). Each bit corresponds to a 4096-character
226 block.
227
228 If a bit is 1, at least one character in the corresponding block is
229 in this charset. */
230 unsigned char fast_map[190];
231
232 /* Offset value to calculate a character code from a code-point, and
233 vice versa. */
234 int code_offset;
235 /* Nonzero if the charset is unified; DECODE_CHAR, ENCODE_CHAR and CHAR_CHARSET_P then skip their offset/table fast paths and call decode_char or encode_char. NOTE(review): presumably "unified with Unicode" as described for charset_unify_map -- confirm. */
236 int unified_p;
237};
238
239/* Hash table of charset symbols vs. the corresponding attribute
240 vectors. */
241extern Lisp_Object Vcharset_hash_table;
242
243/* Table of struct charset. */
244extern struct charset *charset_table;
245/* Return a pointer to the charset_table entry at index ID. No range check is done here. */
246#define CHARSET_FROM_ID(id) (charset_table + (id))
247/* Vcharset_ordered_list is the list searched by CHAR_CHARSET. NOTE(review): presumably ordered by charset priority -- confirm in charset.c. */
248extern Lisp_Object Vcharset_ordered_list;
249extern Lisp_Object Vcharset_non_preferred_head;
250
251/* Incremented every time we change the priority of charsets. */
252extern unsigned short charset_ordered_list_tick;
253
254extern Lisp_Object Viso_2022_charset_list;
255extern Lisp_Object Vemacs_mule_charset_list;
256
257extern int emacs_mule_charset[256];
258
259/* Macros to access information about a charset. */
260
261/* Return the attribute vector of the charset whose symbol is SYMBOL, or Qnil (the default passed to Fgethash here) if SYMBOL has no entry in Vcharset_hash_table. */
262#define CHARSET_SYMBOL_ATTRIBUTES(symbol) \
263 Fgethash ((symbol), Vcharset_hash_table, Qnil)
264/* Each of these fetches one slot of the attribute vector ATTRS; the slots are named by enum charset_attr_index. */
265#define CHARSET_ATTR_ID(attrs) AREF ((attrs), charset_id)
266#define CHARSET_ATTR_NAME(attrs) AREF ((attrs), charset_name)
267#define CHARSET_ATTR_PLIST(attrs) AREF ((attrs), charset_plist)
268#define CHARSET_ATTR_MAP(attrs) AREF ((attrs), charset_map)
269#define CHARSET_ATTR_DECODER(attrs) AREF ((attrs), charset_decoder)
270#define CHARSET_ATTR_ENCODER(attrs) AREF ((attrs), charset_encoder)
271#define CHARSET_ATTR_SUBSET(attrs) AREF ((attrs), charset_subset)
272#define CHARSET_ATTR_SUPERSET(attrs) AREF ((attrs), charset_superset)
273#define CHARSET_ATTR_UNIFY_MAP(attrs) AREF ((attrs), charset_unify_map)
274#define CHARSET_ATTR_DEUNIFIER(attrs) AREF ((attrs), charset_deunifier)
275/* Return the `charset_id' slot of the attribute vector of the charset whose symbol is SYMBOL. */
276#define CHARSET_SYMBOL_ID(symbol) \
277 CHARSET_ATTR_ID (CHARSET_SYMBOL_ATTRIBUTES (symbol))
278
279/* Return the index into Vcharset_hash_table of the charset whose symbol
280 is SYMBOL. CHARSETP and CHECK_CHARSET treat a negative result as "not a charset". */
281#define CHARSET_SYMBOL_HASH_INDEX(symbol) \
282 hash_lookup (XHASH_TABLE (Vcharset_hash_table), symbol, NULL)
283
284/* Return the attribute vector of CHARSET (a pointer to struct charset), looked up in Vcharset_hash_table by CHARSET's hash_index. */
285#define CHARSET_ATTRIBUTES(charset) \
286 (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), (charset)->hash_index))
287/* Field accessors. CHARSET is a pointer to struct charset. */
288#define CHARSET_ID(charset) ((charset)->id)
289#define CHARSET_HASH_INDEX(charset) ((charset)->hash_index)
290#define CHARSET_DIMENSION(charset) ((charset)->dimension)
291#define CHARSET_CODE_SPACE(charset) ((charset)->code_space)
292#define CHARSET_CODE_LINEAR_P(charset) ((charset)->code_linear_p)
293#define CHARSET_ISO_CHARS_96(charset) ((charset)->iso_chars_96)
294#define CHARSET_ISO_FINAL(charset) ((charset)->iso_final)
295#define CHARSET_ISO_PLANE(charset) ((charset)->iso_plane) /* NOTE(review): struct charset as declared in this header has no `iso_plane' member, so any use of this macro would fail to compile. */
296#define CHARSET_ISO_REVISION(charset) ((charset)->iso_revision)
297#define CHARSET_EMACS_MULE_ID(charset) ((charset)->emacs_mule_id)
298#define CHARSET_ASCII_COMPATIBLE_P(charset) ((charset)->ascii_compatible_p)
299#define CHARSET_COMPACT_CODES_P(charset) ((charset)->compact_codes_p)
300#define CHARSET_METHOD(charset) ((charset)->method)
301#define CHARSET_MIN_CODE(charset) ((charset)->min_code)
302#define CHARSET_MAX_CODE(charset) ((charset)->max_code)
303#define CHARSET_INVALID_CODE(charset) ((charset)->invalid_code)
304#define CHARSET_MIN_CHAR(charset) ((charset)->min_char)
305#define CHARSET_MAX_CHAR(charset) ((charset)->max_char)
306#define CHARSET_CODE_OFFSET(charset) ((charset)->code_offset)
307#define CHARSET_UNIFIED_P(charset) ((charset)->unified_p)
308/* Slots of the attribute vector of CHARSET (a pointer to struct charset), each fetched through CHARSET_ATTRIBUTES. */
309#define CHARSET_NAME(charset) \
310 (CHARSET_ATTR_NAME (CHARSET_ATTRIBUTES (charset)))
311#define CHARSET_MAP(charset) \
312 (CHARSET_ATTR_MAP (CHARSET_ATTRIBUTES (charset)))
313#define CHARSET_DECODER(charset) \
314 (CHARSET_ATTR_DECODER (CHARSET_ATTRIBUTES (charset)))
315#define CHARSET_ENCODER(charset) \
316 (CHARSET_ATTR_ENCODER (CHARSET_ATTRIBUTES (charset)))
317#define CHARSET_SUBSET(charset) \
318 (CHARSET_ATTR_SUBSET (CHARSET_ATTRIBUTES (charset)))
319#define CHARSET_SUPERSET(charset) \
320 (CHARSET_ATTR_SUPERSET (CHARSET_ATTRIBUTES (charset)))
321#define CHARSET_UNIFY_MAP(charset) \
322 (CHARSET_ATTR_UNIFY_MAP (CHARSET_ATTRIBUTES (charset)))
323#define CHARSET_DEUNIFIER(charset) \
324 (CHARSET_ATTR_DEUNIFIER (CHARSET_ATTRIBUTES (charset)))
325
326
327/* Nonzero if OBJ has an entry in Vcharset_hash_table, i.e. names a charset. Unlike CHECK_CHARSET, this does not test SYMBOLP first. */
328#define CHARSETP(obj) (CHARSET_SYMBOL_HASH_INDEX (obj) >= 0)
329
330/* Check if X is a valid charset symbol. If not, signal an error via wrong_type_argument with Qcharsetp. X may be evaluated more than once. */
331#define CHECK_CHARSET(x) \
332 do { \
333 if (! SYMBOLP (x) || CHARSET_SYMBOL_HASH_INDEX (x) < 0) \
334 wrong_type_argument (Qcharsetp, (x)); \
335 } while (0)
336
337
338/* Check if X is a valid charset symbol. If valid, set ID to the id
339 number of the charset. Otherwise, signal an error. The expansion declares a local `idx' before evaluating X, so X must not refer to a variable of that name. */
340#define CHECK_CHARSET_GET_ID(x, id) \
341 do { \
342 int idx; \
343 \
344 if (! SYMBOLP (x) || (idx = CHARSET_SYMBOL_HASH_INDEX (x)) < 0) \
345 wrong_type_argument (Qcharsetp, (x)); \
346 id = XINT (AREF (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), idx), \
347 charset_id)); \
348 } while (0)
349
350
351/* Check if X is a valid charset symbol. If valid, set ATTR to the
352 attribute vector of the charset. Otherwise, signal an error. ATTR is assigned even when the lookup yields nil. */
353#define CHECK_CHARSET_GET_ATTR(x, attr) \
354 do { \
355 if (!SYMBOLP (x) || NILP (attr = CHARSET_SYMBOL_ATTRIBUTES (x))) \
356 wrong_type_argument (Qcharsetp, (x)); \
357 } while (0)
358
359/* Check if X is a valid charset symbol. If valid, set CHARSET to the charset_table entry (struct charset *) of that charset. Otherwise, signal an error. */
360#define CHECK_CHARSET_GET_CHARSET(x, charset) \
361 do { \
362 int csid; \
363 CHECK_CHARSET_GET_ID (x, csid); \
364 charset = CHARSET_FROM_ID (csid); \
365 } while (0)
366
367
368/* Look up Vcharset_ordered_list and return the first charset that
369 contains the character C. A C below 0x80 skips the lookup and yields the charset_table entry of charset_ascii. */
370#define CHAR_CHARSET(c) \
371 ((c) < 0x80 ? CHARSET_FROM_ID (charset_ascii) \
372 : char_charset ((c), Qnil, NULL))
373
374#if 0
375/* Char-table of charset-sets. Each element is a bool vector indexed
376 by a charset ID. */
377extern Lisp_Object Vchar_charset_set;
378
379/* Charset-set of character C. */
380#define CHAR_CHARSET_SET(c) \
381 CHAR_TABLE_REF (Vchar_charset_set, c)
382
383/* Check if two characters C1 and C2 belong to the same charset. */
384#define SAME_CHARSET_P(c1, c2) \
385 intersection_p (CHAR_CHARSET_SET (c1), CHAR_CHARSET_SET (c2))
386/* Everything between the `#if 0' above and the `#endif' below is compiled out. */
387#endif
388
389
390/* Return the character corresponding to the code-point CODE of CHARSET.
391 Try some optimization before calling decode_char. */
392/* Yields CODE itself for an ASCII byte of an ASCII-compatible charset, and -1 when CODE lies outside min_code..max_code. CHARSET and CODE may be evaluated several times. */
393#define DECODE_CHAR(charset, code) \
394 ((ASCII_BYTE_P (code) && (charset)->ascii_compatible_p) \
395 ? (code) \
396 : ((code) < (charset)->min_code || (code) > (charset)->max_code) \
397 ? -1 \
398 : (charset)->unified_p \
399 ? decode_char ((charset), (code)) \
400 : (charset)->method == CHARSET_METHOD_OFFSET \
401 ? ((charset)->code_linear_p \
402 ? (code) - (charset)->min_code + (charset)->code_offset \
403 : decode_char ((charset), (code))) \
404 : (charset)->method == CHARSET_METHOD_MAP \
405 ? (((charset)->code_linear_p \
406 && VECTORP (CHARSET_DECODER (charset))) \
407 ? XINT (AREF (CHARSET_DECODER (charset), \
408 (code) - (charset)->min_code)) \
409 : decode_char ((charset), (code))) \
410 : decode_char ((charset), (code)))
411
412
413/* If CHARSET is a simple offset-based charset (method CHARSET_METHOD_OFFSET, code_linear_p set, not unified), return its offset, i.e. code_offset - min_code;
414 otherwise return -1. */
415#define CHARSET_OFFSET(charset) \
416 (((charset)->method == CHARSET_METHOD_OFFSET \
417 && (charset)->code_linear_p \
418 && ! (charset)->unified_p) \
419 ? (charset)->code_offset - (charset)->min_code \
420 : -1)
421/* Scratch variable: ENCODE_CHAR assigns the result of its encoder char-table lookup to it. */
422extern Lisp_Object charset_work;
423
424/* Return the code point of the character C in CHARSET; an unencodable C yields CHARSET's invalid_code on the inline paths.
425 Try some optimization before calling encode_char. CHARSET and C may be evaluated several times, and the MAP path assigns to charset_work. */
426
427#define ENCODE_CHAR(charset, c) \
428 ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p) \
429 ? (c) \
430 : ((charset)->unified_p \
431 || (charset)->method == CHARSET_METHOD_SUBSET \
432 || (charset)->method == CHARSET_METHOD_SUPERSET) \
433 ? encode_char ((charset), (c)) \
434 : ((c) < (charset)->min_char || (c) > (charset)->max_char) \
435 ? (charset)->invalid_code \
436 : (charset)->method == CHARSET_METHOD_OFFSET \
437 ? ((charset)->code_linear_p \
438 ? (c) - (charset)->code_offset + (charset)->min_code \
439 : encode_char ((charset), (c))) \
440 : (charset)->method == CHARSET_METHOD_MAP \
441 ? (((charset)->compact_codes_p \
442 && CHAR_TABLE_P (CHARSET_ENCODER (charset))) \
443 ? (charset_work = CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c)), \
444 (NILP (charset_work) \
445 ? (charset)->invalid_code \
446 : XFASTINT (charset_work))) \
447 : encode_char ((charset), (c))) \
448 : encode_char ((charset), (c)))
449
450
451/* Set to 1 when a charset map is loaded, to warn that buffer text
452 and string data may have been relocated. */
453extern int charset_map_loaded;
454
455
456/* Set CHARSET to the highest-priority charset of C, and CODE to the
457 code-point of C in CHARSET. CODE must be an lvalue of type unsigned: its address is passed to char_charset. */
458#define SPLIT_CHAR(c, charset, code) \
459 ((charset) = char_charset ((c), Qnil, &(code)))
460
461/* Extents of iso_charset_table: dimensions 1..3, two kinds of sets (94-char and 96-char), final bytes below 0x80. */
462#define ISO_MAX_DIMENSION 3
463#define ISO_MAX_CHARS 2
464#define ISO_MAX_FINAL 0x80 /* only 0x30..0x7F are used */
465
466/* Mapping table from an ISO-2022 charset (specified by DIMENSION,
467 CHARS, and FINAL_CHAR) to the Emacs charset ID. Should be accessed by
468 the macro ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR). */
469extern int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
470
471/* The iso_charset_table entry for the ISO-2022 charset that has DIMENSION (counted from 1), CHARS_96 (0 for a 94-char set, 1 for a 96-char set), and FINAL
472 (final character). The expansion is an lvalue. */
473#define ISO_CHARSET_TABLE(dimension, chars_96, final) \
474 iso_charset_table[(dimension) - 1][(chars_96)][(final)]
475
476/* Nonzero if the charset whose fast map is FAST_MAP may contain C. A character below 0x10000 is covered by one bit per 128 characters; a larger one by one bit per 4096 characters, starting at byte 64. C is evaluated more than once. FAST_MAP is parenthesized so that any pointer expression works as the argument. */
477#define CHARSET_FAST_MAP_REF(c, fast_map) \
478 ((c) < 0x10000 \
479 ? (fast_map)[(c) >> 10] & (1 << (((c) >> 7) & 7)) \
480 : (fast_map)[((c) >> 15) + 62] & (1 << (((c) >> 12) & 7)))
481/* Set the bit of FAST_MAP that covers the character C, using the same layout that CHARSET_FAST_MAP_REF reads. C is evaluated more than once. */
482#define CHARSET_FAST_MAP_SET(c, fast_map) \
483 do { \
484 if ((c) < 0x10000) \
485 (fast_map)[(c) >> 10] |= 1 << (((c) >> 7) & 7); \
486 else \
487 (fast_map)[((c) >> 15) + 62] |= 1 << (((c) >> 12) & 7); \
488 } while (0)
489
490
491/* For unified, SUBSET and SUPERSET charsets the answer comes from encode_char; otherwise CHARSET_FAST_MAP_REF must pass before the per-method check runs. C and CHARSET may be evaluated several times. */
492/* 1 if CHARSET may contain the character C. */
493#define CHAR_CHARSET_P(c, charset) \
494 ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p) \
495 || ((CHARSET_UNIFIED_P (charset) \
496 || (charset)->method == CHARSET_METHOD_SUBSET \
497 || (charset)->method == CHARSET_METHOD_SUPERSET) \
498 ? encode_char ((charset), (c)) != (charset)->invalid_code \
499 : (CHARSET_FAST_MAP_REF ((c), (charset)->fast_map) \
500 && ((charset)->method == CHARSET_METHOD_OFFSET \
501 ? (c) >= (charset)->min_char && (c) <= (charset)->max_char \
502 : ((charset)->method == CHARSET_METHOD_MAP \
503 && (charset)->compact_codes_p \
504 && CHAR_TABLE_P (CHARSET_ENCODER (charset))) \
505 ? ! NILP (CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c))) \
506 : encode_char ((charset), (c)) != (charset)->invalid_code))))
507
508\f
509/* Special macros for emacs-mule encoding. */
510
511/* Leading codes that are followed by an extended leading code. The trailing comments give DIMENSION/COLUMN. */
512#define EMACS_MULE_LEADING_CODE_PRIVATE_11 0x9A /* 1/1 */
513#define EMACS_MULE_LEADING_CODE_PRIVATE_12 0x9B /* 1/2 */
514#define EMACS_MULE_LEADING_CODE_PRIVATE_21 0x9C /* 2/1 */
515#define EMACS_MULE_LEADING_CODE_PRIVATE_22 0x9D /* 2/2 */
516
517\f
518/* Symbol passed to wrong_type_argument by the CHECK_CHARSET* macros. */
519extern Lisp_Object Qcharsetp;
520/* NOTE(review): the charset_* ints below look like charset IDs (CHAR_CHARSET passes charset_ascii to CHARSET_FROM_ID) -- confirm against charset.c. */
521extern Lisp_Object Qascii;
522extern int charset_ascii, charset_eight_bit;
523extern int charset_unicode;
524extern int charset_jisx0201_roman;
525extern int charset_jisx0208_1978;
526extern int charset_jisx0208;
527extern int charset_ksc5601;
528
529extern int charset_unibyte;
530/* char_charset is what CHAR_CHARSET and SPLIT_CHAR call; SPLIT_CHAR passes the address of its CODE argument as the third parameter. */
531extern struct charset *char_charset (int, Lisp_Object, unsigned *);
532extern Lisp_Object charset_attributes (int);
533/* decode_char and encode_char are the out-of-line fallbacks of DECODE_CHAR and ENCODE_CHAR. */
534extern int maybe_unify_char (int, Lisp_Object);
535extern int decode_char (struct charset *, unsigned);
536extern unsigned encode_char (struct charset *, int);
537extern int string_xstring_p (Lisp_Object);
538
539extern void map_charset_chars (void (*) (Lisp_Object, Lisp_Object),
540 Lisp_Object, Lisp_Object,
541 struct charset *, unsigned, unsigned);
542
543#endif /* EMACS_CHARSET_H */