1 /* Header for charset handler.
2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
3 2006 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2003
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
11 This file is part of GNU Emacs.
13 GNU Emacs is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
18 GNU Emacs is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with GNU Emacs; see the file COPYING. If not, write to
25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 Boston, MA 02110-1301, USA. */
28 #ifndef EMACS_CHARSET_H
29 #define EMACS_CHARSET_H
31 /* Index to arguments of Fdefine_charset_internal. */
33 enum define_charset_arg_index
36 charset_arg_dimension
,
37 charset_arg_code_space
,
40 charset_arg_iso_final
,
41 charset_arg_iso_revision
,
42 charset_arg_emacs_mule_id
,
43 charset_arg_ascii_compatible_p
,
44 charset_arg_supplementary_p
,
45 charset_arg_invalid_code
,
46 charset_arg_code_offset
,
50 charset_arg_unify_map
,
56 /* Indices to charset attributes vector. */
58 enum charset_attr_index
60 /* ID number of the charset. */
63 /* Name of the charset (symbol). */
66 /* Property list of the charset. */
69 /* If the method of the charset is `MAP_DEFERRED', the value is a
70 mapping vector or a file name that contains mapping vector.
74 /* If the method of the charset is `MAP', the value is a vector
75 that maps code points of the charset to characters. The vector
76 is indexed by a character index. A character index is
77 calculated from a code point and the code-space table of the
81 /* If the method of the charset is `MAP', the value is a
82 char-table that maps characters of the charset to code
86 /* If the method of the charset is `SUBSET', the value is a vector
89 [ CHARSET-ID MIN-CODE MAX-CODE OFFSET ]
91 CHARSET-ID is an ID number of a parent charset. MIN-CODE and
92 MAX-CODE specify the range of characters inherited from the
93 parent. OFFSET is an integer value to add to a code point of
94 the parent charset to get the corresponding code point of this
98 /* If the method of the charset is `SUPERSET', the value is a list
99 whose elements have this form:
101 (CHARSET-ID . OFFSET)
103 CHARSET-IDs are ID numbers of parent charsets. OFFSET is an
104 integer value to add to a code point of the parent charset to
105 get the corresponding code point of this charset. */
108 /* The value is a mapping vector or a file name that contains the
109 mapping. This defines how characters in the charset should be
110 unified with Unicode. The value of the member
111 `charset_deunifier' is created from this information. */
114 /* If characters in the charset must be unified with Unicode, the value
115 is a char table that maps a unified Unicode character code to
116 the non-unified character code in the charset. */
119 /* The length of the charset attribute vector. */
123 /* Methods for converting code points and characters of charsets. */
127 /* For a charset of this method, a character code is calculated
128 from a character index (which is calculated from a code point)
129 simply by adding an offset value. */
130 CHARSET_METHOD_OFFSET
,
132 /* For a charset of this method, a decoder vector and an encoder
133 char-table are used for code point <-> character code
137 /* Same as above but decoder and encoder are loaded from a file on
138 demand. Once loaded, the method is changed to
139 CHARSET_METHOD_MAP. */
140 CHARSET_METHOD_MAP_DEFERRED
,
142 /* A charset of this method is a subset of another charset. */
143 CHARSET_METHOD_SUBSET
,
145 /* A charset of this method is a superset of other charsets. */
146 CHARSET_METHOD_SUPERSET
151 /* Index to charset_table. */
154 /* Index to Vcharset_hash_table. */
157 /* Dimension of the charset: 1, 2, 3, or 4. */
160 /* Byte code range of each dimension. <code_space>[4N] is a minimum
161 byte code of the (N+1)th dimension, <code_space>[4N+1] is a
162 maximum byte code of the (N+1)th dimension, <code_space>[4N+2] is
163 (<code_space>[4N+1] - <code_space>[4N] + 1), <code_space>[4N+3]
164 is a number of characters contained in the first to (N+1)th
165 dimensions. We get `char-index' of a `code-point' from this
169 /* If B is a byte of Nth dimension of a code-point, the (N-1)th bit
170 of code_space_mask[B] is set. This array is used to quickly
171 check if a code-point is in a valid range. */
172 unsigned char *code_space_mask
;
174 /* 1 if there's no gap in code-points. */
177 /* If the charset is treated as 94-chars in ISO-2022, the value is 0.
178 If the charset is treated as 96-chars in ISO-2022, the value is 1. */
181 /* ISO final byte of the charset: 48..127. It may be -1 if the
182 charset doesn't conform to ISO-2022. */
185 /* ISO revision number of the charset. */
188 /* If the charset is identical to one supported by Emacs 21 and
189 earlier versions, the identification number of the charset used in
190 those versions. Otherwise, -1. */
193 /* Nonzero iff the charset is compatible with ASCII. */
194 int ascii_compatible_p
;
196 /* Nonzero iff the charset is supplementary. */
199 /* Nonzero iff all the code points are representable by Lisp_Int. */
202 /* The method for encoding/decoding characters of the charset. */
203 enum charset_method method
;
205 /* Minimum and maximum code points of the charset. */
206 unsigned min_code
, max_code
;
208 /* Offset value used by macros CODE_POINT_TO_INDEX and
209 INDEX_TO_CODE_POINT. */
210 unsigned char_index_offset
;
212 /* Minimum and maximum character codes of the charset. If the
213 charset is compatible with ASCII, min_char is a minimum non-ASCII
214 character of the charset. If the method of charset is
215 CHARSET_METHOD_OFFSET, even if the charset is unified, min_char
216 and max_char don't change. */
217 int min_char
, max_char
;
219 /* The code returned by ENCODE_CHAR if a character is not encodable
221 unsigned invalid_code
;
223 /* If the method of the charset is CHARSET_METHOD_MAP, this is a
224 table of bits used to quickly and roughly guess if a character
225 belongs to the charset.
227 The first 64 elements are 512 bits for characters less than
228 0x10000. Each bit corresponds to 128-character block. The last
229 126 elements are 1008 bits for the greater characters
230 (0x10000..0x3FFFFF). Each bit corresponds to 4096-character
233 If a bit is 1, at least one character in the corresponding block is
235 unsigned char fast_map
[190];
237 /* Offset value to calculate a character code from code-point, and
244 /* Hash table of charset symbols vs. the correponding attribute
246 extern Lisp_Object Vcharset_hash_table
;
/* Table of struct charset.  */
extern struct charset *charset_table;

/* Return a pointer to the entry of charset_table whose index is ID.  */
#define CHARSET_FROM_ID(id) (&charset_table[(id)])
253 extern Lisp_Object Vcharset_ordered_list
;
255 /* Incremented everytime we change the priority of charsets. */
256 extern unsigned short charset_ordered_list_tick
;
258 extern Lisp_Object Vcharset_list
;
259 extern Lisp_Object Viso_2022_charset_list
;
260 extern Lisp_Object Vemacs_mule_charset_list
;
262 extern struct charset
*emacs_mule_charset
[256];
/* Macros to access information about charset.  */

/* Return the attribute vector of the charset whose symbol is SYMBOL.
   The lookup is done in Vcharset_hash_table with Qnil passed as the
   default argument of Fgethash.  */
#define CHARSET_SYMBOL_ATTRIBUTES(symbol)                       \
  Fgethash ((symbol), Vcharset_hash_table, Qnil)

/* Fetch a single slot of the attribute vector ATTRS.  The slot is
   selected by the matching member of enum charset_attr_index.  */
#define CHARSET_ATTR_ID(attrs)          AREF ((attrs), charset_id)
#define CHARSET_ATTR_NAME(attrs)        AREF ((attrs), charset_name)
#define CHARSET_ATTR_PLIST(attrs)       AREF ((attrs), charset_plist)
#define CHARSET_ATTR_MAP(attrs)         AREF ((attrs), charset_map)
#define CHARSET_ATTR_DECODER(attrs)     AREF ((attrs), charset_decoder)
#define CHARSET_ATTR_ENCODER(attrs)     AREF ((attrs), charset_encoder)
#define CHARSET_ATTR_SUBSET(attrs)      AREF ((attrs), charset_subset)
#define CHARSET_ATTR_SUPERSET(attrs)    AREF ((attrs), charset_superset)
#define CHARSET_ATTR_UNIFY_MAP(attrs)   AREF ((attrs), charset_unify_map)
#define CHARSET_ATTR_DEUNIFIER(attrs)   AREF ((attrs), charset_deunifier)

/* Return the ID slot of the attribute vector found for SYMBOL.  */
#define CHARSET_SYMBOL_ID(symbol)                               \
  CHARSET_ATTR_ID (CHARSET_SYMBOL_ATTRIBUTES (symbol))
/* Return an index to Vcharset_hash_table of the charset whose symbol
   is SYMBOL.  Other macros in this file treat a negative result as
   "SYMBOL is not a charset".  */
#define CHARSET_SYMBOL_HASH_INDEX(symbol)                       \
  hash_lookup (XHASH_TABLE (Vcharset_hash_table), (symbol), NULL)

/* Return the attribute vector of CHARSET (a pointer to struct
   charset), using the hash index recorded in CHARSET.  */
#define CHARSET_ATTRIBUTES(charset)                             \
  (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table), (charset)->hash_index))
/* Accessors for the members of struct charset.  CHARSET is a pointer
   to struct charset; each macro expands to the member itself.  */

/* Identification.  */
#define CHARSET_ID(charset)                     ((charset)->id)
#define CHARSET_HASH_INDEX(charset)             ((charset)->hash_index)

/* Shape of the code space.  */
#define CHARSET_DIMENSION(charset)              ((charset)->dimension)
#define CHARSET_CODE_SPACE(charset)             ((charset)->code_space)
#define CHARSET_CODE_LINEAR_P(charset)          ((charset)->code_linear_p)

/* ISO-2022 and emacs-mule related members.  */
#define CHARSET_ISO_CHARS_96(charset)           ((charset)->iso_chars_96)
#define CHARSET_ISO_FINAL(charset)              ((charset)->iso_final)
#define CHARSET_ISO_PLANE(charset)              ((charset)->iso_plane)
#define CHARSET_ISO_REVISION(charset)           ((charset)->iso_revision)
#define CHARSET_EMACS_MULE_ID(charset)          ((charset)->emacs_mule_id)

/* Flags and conversion method.  */
#define CHARSET_ASCII_COMPATIBLE_P(charset)     ((charset)->ascii_compatible_p)
#define CHARSET_COMPACT_CODES_P(charset)        ((charset)->compact_codes_p)
#define CHARSET_METHOD(charset)                 ((charset)->method)

/* Code-point and character ranges.  */
#define CHARSET_MIN_CODE(charset)               ((charset)->min_code)
#define CHARSET_MAX_CODE(charset)               ((charset)->max_code)
#define CHARSET_INVALID_CODE(charset)           ((charset)->invalid_code)
#define CHARSET_MIN_CHAR(charset)               ((charset)->min_char)
#define CHARSET_MAX_CHAR(charset)               ((charset)->max_char)
#define CHARSET_CODE_OFFSET(charset)            ((charset)->code_offset)
#define CHARSET_UNIFIED_P(charset)              ((charset)->unified_p)
/* Accessors for the slots of the attribute vector of CHARSET (a
   pointer to struct charset).  Each one first fetches the vector with
   CHARSET_ATTRIBUTES and then picks the slot with the matching
   CHARSET_ATTR_* macro.  */
#define CHARSET_NAME(charset)           (CHARSET_ATTR_NAME (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_MAP(charset)            (CHARSET_ATTR_MAP (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_DECODER(charset)        (CHARSET_ATTR_DECODER (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_ENCODER(charset)        (CHARSET_ATTR_ENCODER (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_SUBSET(charset)         (CHARSET_ATTR_SUBSET (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_SUPERSET(charset)       (CHARSET_ATTR_SUPERSET (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_UNIFY_MAP(charset)      (CHARSET_ATTR_UNIFY_MAP (CHARSET_ATTRIBUTES (charset)))
#define CHARSET_DEUNIFIER(charset)      (CHARSET_ATTR_DEUNIFIER (CHARSET_ATTRIBUTES (charset)))
/* Nonzero iff OBJ is a valid charset symbol, i.e. iff looking OBJ up
   in Vcharset_hash_table does not yield a negative index.  */
#define CHARSETP(obj) (! (CHARSET_SYMBOL_HASH_INDEX (obj) < 0))
/* Check if X is a valid charset symbol.  If not, signal an error.

   X must be an lvalue: when the check fails, X is assigned the value
   returned by wrong_type_argument.  X is evaluated more than once.
   The body is wrapped in do { ... } while (0) so that the macro
   behaves as a single statement.  */
#define CHECK_CHARSET(x)                                                \
  do {                                                                  \
    if (! SYMBOLP (x) || CHARSET_SYMBOL_HASH_INDEX (x) < 0)             \
      (x) = wrong_type_argument (Qcharsetp, (x));                       \
  } while (0)
/* Check if X is a valid charset symbol.  If valid, set ID to the id
   number of the charset.  Otherwise, signal an error.

   X and ID must be lvalues; X is evaluated more than once.

   The validity test is a loop rather than a single `if': should
   wrong_type_argument return a replacement value for X instead of
   signaling, the replacement is validated again, so that IDX is
   always a freshly computed, non-negative hash index when it is used
   below (with a single `if', IDX would be negative, or not set at all
   when X is not a symbol).  */
#define CHECK_CHARSET_GET_ID(x, id)                                     \
  do {                                                                  \
    int idx;                                                            \
                                                                        \
    while (! SYMBOLP (x) || (idx = CHARSET_SYMBOL_HASH_INDEX (x)) < 0)  \
      (x) = wrong_type_argument (Qcharsetp, (x));                       \
    (id) = XINT (AREF (HASH_VALUE (XHASH_TABLE (Vcharset_hash_table),   \
                                   idx),                                \
                       charset_id));                                    \
  } while (0)
/* Check if X is a valid charset symbol.  If valid, set ATTR to the
   attr vector of the charset.  Otherwise, signal an error.

   X and ATTR must be lvalues; X is evaluated more than once.

   The test is a loop so that, should wrong_type_argument return a
   replacement value for X instead of signaling, ATTR is looked up
   again for the replacement rather than being left nil or unset.  */
#define CHECK_CHARSET_GET_ATTR(x, attr)                                 \
  do {                                                                  \
    while (! SYMBOLP (x)                                                \
           || NILP ((attr) = CHARSET_SYMBOL_ATTRIBUTES (x)))            \
      (x) = wrong_type_argument (Qcharsetp, (x));                       \
  } while (0)
/* Check if X is a valid charset symbol.  If valid, set CHARSET to the
   pointer to struct charset obtained from the id number of the
   charset (see CHECK_CHARSET_GET_ID and CHARSET_FROM_ID).  Otherwise,
   an error is signaled by CHECK_CHARSET_GET_ID.

   CHARSET must be an lvalue.  The body is wrapped in
   do { ... } while (0) so that the temporary ID stays local and the
   macro behaves as a single statement.  */
#define CHECK_CHARSET_GET_CHARSET(x, charset)                           \
  do {                                                                  \
    int id;                                                             \
                                                                        \
    CHECK_CHARSET_GET_ID (x, id);                                       \
    (charset) = CHARSET_FROM_ID (id);                                   \
  } while (0)
/* Return the charset that contains the character C.  Characters
   below 0x80 are answered directly with the charset whose ID is
   charset_ascii; for all others char_charset is called with Qnil and
   a null code pointer (the original comment describes this as looking
   up Vcharset_ordered_list and returning the first charset that
   contains C).  C is evaluated more than once.  */
#define CHAR_CHARSET(c)                                 \
  ((c) >= 0x80                                          \
   ? char_charset ((c), Qnil, NULL)                     \
   : CHARSET_FROM_ID (charset_ascii))
381 /* Char-table of charset-sets. Each element is a bool vector indexed
383 extern Lisp_Object Vchar_charset_set
;
385 /* Charset-bag of character C. */
386 #define CHAR_CHARSET_SET(c) \
387 CHAR_TABLE_REF (Vchar_charset_set, c)
389 /* Check if two characters C1 and C2 belong to the same charset. */
390 #define SAME_CHARSET_P(c1, c2) \
391 intersection_p (CHAR_CHARSET_SET (c1), CHAR_CHARSET_SET (c2))
396 /* Return a character corresponding to the code-point CODE of CHARSET.
397 Try some optimization before calling decode_char.
CHARSET is a pointer to struct charset. The cases are tried in this
order:
- CODE is an ASCII byte and CHARSET is ASCII compatible;
- CODE is outside of min_code..max_code of CHARSET;
- CHARSET is unified: decode_char is called;
- the method is CHARSET_METHOD_OFFSET and the code space is linear:
the character is CODE - min_code + code_offset;
- the method is CHARSET_METHOD_MAP and the code space is linear:
the character is read from the decoder vector at CODE - min_code;
- anything else: decode_char is called.
CHARSET and CODE are evaluated more than once.
NOTE(review): the result expressions of the first two cases are not
visible in this copy of the macro. */
399 #define DECODE_CHAR(charset, code) \
400 ((ASCII_BYTE_P (code) && (charset)->ascii_compatible_p) \
402 : ((code) < (charset)->min_code || (code) > (charset)->max_code) \
404 : (charset)->unified_p \
405 ? decode_char ((charset), (code)) \
406 : (charset)->method == CHARSET_METHOD_OFFSET \
407 ? ((charset)->code_linear_p \
408 ? (code) - (charset)->min_code + (charset)->code_offset \
409 : decode_char ((charset), (code))) \
410 : (charset)->method == CHARSET_METHOD_MAP \
411 ? ((charset)->code_linear_p \
412 ? XINT (AREF (CHARSET_DECODER (charset), \
413 (code) - (charset)->min_code)) \
414 : decode_char ((charset), (code))) \
415 : decode_char ((charset), (code)))
/* If CHARSET is a simple offset-based charset, return its offset,
   otherwise return -1.

   "Simple offset-based" means: the method is CHARSET_METHOD_OFFSET,
   the code space is linear, and the charset is not unified.  The
   offset is code_offset - min_code, i.e. the value that added to a
   code point gives the character code (compare the
   CHARSET_METHOD_OFFSET branch of DECODE_CHAR).  CHARSET is a pointer
   to struct charset and is evaluated more than once.  */
#define CHARSET_OFFSET(charset)                                 \
  (((charset)->method == CHARSET_METHOD_OFFSET                  \
    && (charset)->code_linear_p                                 \
    && ! (charset)->unified_p)                                  \
   ? (charset)->code_offset - (charset)->min_code               \
   : -1)
/* Scratch variable assigned inside ENCODE_CHAR (below) to hold the
result of the encoder char-table lookup. */
427 extern Lisp_Object charset_work
;
429 /* Return a code point of the character C in CHARSET.
430 Try some optimization before calling encode_char.
CHARSET is a pointer to struct charset. The cases are tried in this
order:
- C is an ASCII character and CHARSET is ASCII compatible;
- CHARSET is unified, or its method is CHARSET_METHOD_SUBSET or
CHARSET_METHOD_SUPERSET: encode_char is called;
- C is outside of min_char..max_char of CHARSET: the result is
invalid_code of CHARSET;
- the method is CHARSET_METHOD_OFFSET and the code space is linear:
the code point is C - code_offset + min_code;
- the method is CHARSET_METHOD_MAP and compact_codes_p is set: C is
looked up in the encoder char-table; nil gives invalid_code;
- anything else: encode_char is called.
CHARSET and C are evaluated more than once, and charset_work is
overwritten in the char-table case.
NOTE(review): the result expression of the first case is not visible
in this copy of the macro. */
432 #define ENCODE_CHAR(charset, c) \
433 ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p) \
435 : ((charset)->unified_p \
436 || (charset)->method == CHARSET_METHOD_SUBSET \
437 || (charset)->method == CHARSET_METHOD_SUPERSET) \
438 ? encode_char ((charset), (c)) \
439 : ((c) < (charset)->min_char || (c) > (charset)->max_char) \
440 ? (charset)->invalid_code \
441 : (charset)->method == CHARSET_METHOD_OFFSET \
442 ? ((charset)->code_linear_p \
443 ? (c) - (charset)->code_offset + (charset)->min_code \
444 : encode_char ((charset), (c))) \
445 : (charset)->method == CHARSET_METHOD_MAP \
446 ? ((charset)->compact_codes_p \
447 ? (charset_work = CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c)), \
448 (NILP (charset_work) \
449 ? (charset)->invalid_code \
450 : XFASTINT (charset_work))) \
451 : encode_char ((charset), (c))) \
452 : encode_char ((charset), (c)))
/* Warning flag: set to 1 when a charset map is loaded, because buffer
   text and string data may be relocated at that time.  */
extern int charset_map_loaded;
/* Set CHARSET to the highest-priority charset of the character C,
   and CODE to the code-point of C in that charset.  CHARSET and CODE
   must be lvalues; the value of the whole expression is the value
   assigned to CHARSET.  */
#define SPLIT_CHAR(c, charset, code)                            \
  ((charset) = char_charset ((c), Qnil, &(code)))
/* Sizes of the three dimensions of iso_charset_table: the number of
   ISO-2022 dimensions, the number of character-set sizes (94 or 96),
   and the number of slots for final bytes.  */
#define ISO_MAX_DIMENSION 3
#define ISO_MAX_CHARS 2
#define ISO_MAX_FINAL 0x80 /* only 0x30..0x7F are used */
470 /* Mapping table from ISO2022's charset (specified by DIMENSION,
471 CHARS, and FINAL_CHAR) to Emacs' charset ID. Should be accessed by
472 macro ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR). */
473 extern int iso_charset_table
[ISO_MAX_DIMENSION
][ISO_MAX_CHARS
][ISO_MAX_FINAL
];
475 /* A charset of type iso2022 who has DIMENSION, CHARS, and FINAL
476 (final character). */
477 #define ISO_CHARSET_TABLE(dimension, chars_96, final) \
478 iso_charset_table[(dimension) - 1][(chars_96)][(final)]
/* Nonzero iff the charset who has FAST_MAP may contain C.

   FAST_MAP is an array of unsigned char used as a bit table.  A
   character below 0x10000 selects bit ((C >> 7) & 7) of element
   (C >> 10), i.e. one bit per 128-character block.  Any other
   character selects bit ((C >> 12) & 7) of element ((C >> 15) + 62),
   i.e. one bit per 4096-character block, starting at element 64.
   C and FAST_MAP are evaluated more than once.  */
#define CHARSET_FAST_MAP_REF(c, fast_map)                               \
  ((c) < 0x10000                                                        \
   ? (fast_map)[(c) >> 10] & (1 << (((c) >> 7) & 7))                    \
   : (fast_map)[((c) >> 15) + 62] & (1 << (((c) >> 12) & 7)))

/* Set the bit of FAST_MAP that corresponds to the block containing
   C; the bit is selected as in CHARSET_FAST_MAP_REF.  Wrapped in
   do { ... } while (0) so that it behaves as a single statement.  */
#define CHARSET_FAST_MAP_SET(c, fast_map)                               \
  do {                                                                  \
    if ((c) < 0x10000)                                                  \
      (fast_map)[(c) >> 10] |= 1 << (((c) >> 7) & 7);                   \
    else                                                                \
      (fast_map)[((c) >> 15) + 62] |= 1 << (((c) >> 12) & 7);           \
  } while (0)
/* 1 iff CHARSET may contain the character C.

   CHARSET is a pointer to struct charset.  An ASCII character in an
   ASCII-compatible charset is accepted at once.  For a unified
   charset, or one whose method is CHARSET_METHOD_SUBSET or
   CHARSET_METHOD_SUPERSET, the answer is whether encode_char yields
   something other than invalid_code.  Otherwise C must first pass the
   fast_map test, and is then checked against min_char..max_char
   (method CHARSET_METHOD_OFFSET), against the encoder char-table
   (method CHARSET_METHOD_MAP with compact_codes_p set), or with
   encode_char (anything else).  C and CHARSET are evaluated more than
   once.  */
#define CHAR_CHARSET_P(c, charset)                                         \
  ((ASCII_CHAR_P (c) && (charset)->ascii_compatible_p)                     \
   || (((charset)->unified_p                                               \
        || (charset)->method == CHARSET_METHOD_SUBSET                      \
        || (charset)->method == CHARSET_METHOD_SUPERSET)                   \
       ? encode_char ((charset), (c)) != (charset)->invalid_code           \
       : (CHARSET_FAST_MAP_REF ((c), (charset)->fast_map)                  \
          && ((charset)->method == CHARSET_METHOD_OFFSET                   \
              ? ! ((c) < (charset)->min_char || (c) > (charset)->max_char) \
              : ((charset)->method == CHARSET_METHOD_MAP                   \
                 && (charset)->compact_codes_p)                            \
              ? ! NILP (CHAR_TABLE_REF (CHARSET_ENCODER (charset), (c)))   \
              : encode_char ((charset), (c)) != (charset)->invalid_code))))
/* Special macros for emacs-mule encoding.  */

/* Leading-code followed by extended leading-code.  The trailing
   comment on each line is DIMENSION/COLUMN, matching the two digits
   at the end of the macro name.  */
#define EMACS_MULE_LEADING_CODE_PRIVATE_11 0x9A /* 1/1 */
#define EMACS_MULE_LEADING_CODE_PRIVATE_12 0x9B /* 1/2 */
#define EMACS_MULE_LEADING_CODE_PRIVATE_21 0x9C /* 2/1 */
#define EMACS_MULE_LEADING_CODE_PRIVATE_22 0x9D /* 2/2 */

/* NOTE(review): this array is also declared earlier in this header,
   next to Vemacs_mule_charset_list; the repeated declaration is
   harmless and is kept as is.  */
extern struct charset *emacs_mule_charset[256];
524 extern Lisp_Object Qcharsetp
;
526 extern Lisp_Object Qascii
, Qunicode
;
527 extern int charset_ascii
, charset_eight_bit
;
528 extern int charset_iso_8859_1
;
529 extern int charset_unicode
;
530 extern int charset_jisx0201_roman
;
531 extern int charset_jisx0208_1978
;
532 extern int charset_jisx0208
;
534 extern int charset_unibyte
;
536 extern struct charset
*char_charset
P_ ((int, Lisp_Object
, unsigned *));
537 extern Lisp_Object charset_attributes
P_ ((int));
539 extern int decode_char
P_ ((struct charset
*, unsigned));
540 extern unsigned encode_char
P_ ((struct charset
*, int));
541 extern int string_xstring_p
P_ ((Lisp_Object
));
543 extern void map_charset_chars
P_ ((void (*) (Lisp_Object
, Lisp_Object
),
544 Lisp_Object
, Lisp_Object
,
545 struct charset
*, unsigned, unsigned));
547 EXFUN (Funify_charset
, 3);
549 #endif /* EMACS_CHARSET_H */
551 /* arch-tag: 3b96db55-4961-481d-ac3e-219f46a2b3aa
552 (do not change this comment) */