1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* At first, see the document in `charset.h' to understand the code in
29 #include <sys/types.h>
43 Lisp_Object Qcharset
, Qascii
, Qcomposition
;
45 /* Declaration of special leading-codes. */
46 int leading_code_composition
; /* for composite characters */
47 int leading_code_private_11
; /* for private DIMENSION1 of 1-column */
48 int leading_code_private_12
; /* for private DIMENSION1 of 2-column */
49 int leading_code_private_21
; /* for private DIMENSION2 of 1-column */
50 int leading_code_private_22
; /* for private DIMENSION2 of 2-column */
52 /* Declaration of special charsets. */
53 int charset_ascii
; /* ASCII */
54 int charset_composition
; /* for a composite character */
55 int charset_latin_iso8859_1
; /* ISO8859-1 (Latin-1) */
56 int charset_jisx0208_1978
; /* JISX0208.1978 (Japanese Kanji old set) */
57 int charset_jisx0208
; /* JISX0208.1983 (Japanese Kanji) */
58 int charset_katakana_jisx0201
; /* JISX0201.Kana (Japanese Katakana) */
59 int charset_latin_jisx0201
; /* JISX0201.Roman (Japanese Roman) */
60 int charset_big5_1
; /* Big5 Level 1 (Chinese Traditional) */
61 int charset_big5_2
; /* Big5 Level 2 (Chinese Traditional) */
63 int min_composite_char
;
65 Lisp_Object Qcharset_table
;
67 /* A char-table containing information of each character set. */
68 Lisp_Object Vcharset_table
;
70 /* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72 Lisp_Object Vcharset_symbol_table
;
74 /* A list of charset symbols ever defined. */
75 Lisp_Object Vcharset_list
;
77 /* Vector of unification table ever defined.
78 An ID of a unification table is an index of this vector. */
79 Lisp_Object Vcharacter_unification_table_vector
;
81 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
82 int bytes_by_char_head
[256];
83 int width_by_char_head
[256];
85 /* Mapping table from ISO2022's charset (specified by DIMENSION,
86 CHARS, and FINAL-CHAR) to Emacs' charset. */
87 int iso_charset_table
[2][2][128];
89 /* Table of pointers to the structure `cmpchar_info' indexed by
91 struct cmpchar_info
**cmpchar_table
;
92 /* The current size of `cmpchar_table'. */
93 static int cmpchar_table_size
;
94 /* Number of the current composite characters. */
97 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
98 unsigned char *_fetch_multibyte_char_p
;
99 int _fetch_multibyte_char_len
;
101 /* Offset to add to a non-ASCII value when inserting it. */
102 int nonascii_insert_offset
;
104 /* Translation table for converting non-ASCII unibyte characters
105 to multibyte codes, or nil. */
106 Lisp_Object Vnonascii_translate_table
;
108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
112 invalid_character (c
)
115 error ("Invalid character: %o, %d, 0x%x", c
);
119 /* Set STR a pointer to the multi-byte form of the character C. If C
120 is not a composite character, the multi-byte form is set in WORKBUF
121 and STR points WORKBUF. The caller should allocate at least 4-byte
122 area at WORKBUF in advance. Returns the length of the multi-byte
123 form. If C is an invalid character to have a multi-byte form,
126 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
127 function directly if C can be an ASCII character. */
130 non_ascii_char_to_string (c
, workbuf
, str
)
132 unsigned char *workbuf
, **str
;
136 if (COMPOSITE_CHAR_P (c
))
138 int cmpchar_id
= COMPOSITE_CHAR_ID (c
);
140 if (cmpchar_id
< n_cmpchars
)
142 *str
= cmpchar_table
[cmpchar_id
]->data
;
143 return cmpchar_table
[cmpchar_id
]->len
;
147 invalid_character (c
);
151 SPLIT_NON_ASCII_CHAR (c
, charset
, c1
, c2
);
153 || ! CHARSET_DEFINED_P (charset
)
154 || c1
>= 0 && c1
< 32
155 || c2
>= 0 && c2
< 32)
156 invalid_character (c
);
159 *workbuf
++ = CHARSET_LEADING_CODE_BASE (charset
);
160 if (*workbuf
= CHARSET_LEADING_CODE_EXT (charset
))
162 *workbuf
++ = c1
| 0x80;
164 *workbuf
++ = c2
| 0x80;
166 return (workbuf
- *str
);
169 /* Return a non-ASCII character of which multi-byte form is at STR of
170 length LEN. If ACTUAL_LEN is not NULL, the actual length of the
171 character is set to the address ACTUAL_LEN.
173 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
174 directly if STR can hold an ASCII character. */
176 string_to_non_ascii_char (str
, len
, actual_len
)
178 int len
, *actual_len
;
181 unsigned char c1
, c2
;
184 if (SPLIT_STRING (str
, len
, charset
, c1
, c2
) == CHARSET_ASCII
)
191 c
= MAKE_NON_ASCII_CHAR (charset
, c1
, c2
);
194 *actual_len
= (charset
== CHARSET_COMPOSITION
195 ? cmpchar_table
[COMPOSITE_CHAR_ID (c
)]->len
196 : BYTES_BY_CHAR_HEAD (*str
));
200 /* Return the length of the multi-byte form at string STR of length LEN. */
202 multibyte_form_length (str
, len
)
207 unsigned char c1
, c2
;
210 if (SPLIT_STRING (str
, len
, charset
, c1
, c2
) == CHARSET_ASCII
)
213 return (charset
== CHARSET_COMPOSITION
214 ? cmpchar_table
[(c1
<< 7) | c2
]->len
215 : BYTES_BY_CHAR_HEAD (*str
));
218 /* Check if string STR of length LEN contains valid multi-byte form of
219 a character. If valid, charset and position codes of the character
220 is set at *CHARSET, *C1, and *C2, and return 0. If not valid,
221 return -1. This should be used only in the macro SPLIT_STRING
222 which checks range of STR in advance. */
224 split_non_ascii_string (str
, len
, charset
, c1
, c2
)
225 register unsigned char *str
, *c1
, *c2
;
226 register int len
, *charset
;
228 register unsigned int cs
= *str
++;
230 if (cs
== LEADING_CODE_COMPOSITION
)
232 int cmpchar_id
= str_cmpchar_id (str
- 1, len
);
236 *charset
= cs
, *c1
= cmpchar_id
>> 7, *c2
= cmpchar_id
& 0x7F;
238 else if ((cs
< LEADING_CODE_PRIVATE_11
|| (cs
= *str
++) >= 0xA0)
239 && CHARSET_DEFINED_P (cs
))
244 *c1
= (*str
++) & 0x7F;
245 if (CHARSET_DIMENSION (cs
) == 2)
249 *c2
= (*str
++) & 0x7F;
257 /* Return a character unified with C (or a character made of CHARSET,
258 C1, and C2) in unification table TABLE. If no unification is found
259 in TABLE, return C. */
260 unify_char (table
, c
, charset
, c1
, c2
)
262 int c
, charset
, c1
, c2
;
265 int alt_charset
, alt_c1
, alt_c2
, dimension
;
267 if (c
< 0) c
= MAKE_CHAR (charset
, c1
, c2
);
268 if (!CHAR_TABLE_P (table
)
269 || (ch
= Faref (table
, make_number (c
)), !INTEGERP (ch
))
273 SPLIT_CHAR (XFASTINT (ch
), alt_charset
, alt_c1
, alt_c2
);
274 dimension
= CHARSET_DIMENSION (alt_charset
);
275 if (dimension
== 1 && alt_c1
> 0 || dimension
== 2 && alt_c2
> 0)
276 /* CH is not a generic character, just return it. */
277 return XFASTINT (ch
);
279 /* Since CH is a generic character, we must return a specific
280 charater which has the same position codes as C from CH. */
282 SPLIT_CHAR (c
, charset
, c1
, c2
);
283 if (dimension
!= CHARSET_DIMENSION (charset
))
284 /* We can't make such a character because of dimension mismatch. */
286 return MAKE_CHAR (alt_charset
, c1
, c2
);
289 /* Convert the unibyte character C to multibyte based on
290 Vnonascii_translate_table or nonascii_insert_offset. If they can't
291 convert C to a valid multibyte character, convert it based on
292 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
294 unibyte_char_to_multibyte (c
)
297 if (c
>= 0240 && c
< 0400)
301 if (! NILP (Vnonascii_translate_table
))
302 c
= XINT (Faref (Vnonascii_translate_table
, make_number (c
)));
303 else if (nonascii_insert_offset
> 0)
304 c
+= nonascii_insert_offset
;
305 if (c
>= 0240 && (c
< 0400 || ! VALID_MULTIBYTE_CHAR_P (c
)))
306 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
311 /* Update the table Vcharset_table with the given arguments (see the
312 document of `define-charset' for the meaning of each argument).
313 Several other table contents are also updated. The caller should
314 check the validity of CHARSET-ID and the remaining arguments in
318 update_charset_table (charset_id
, dimension
, chars
, width
, direction
,
319 iso_final_char
, iso_graphic_plane
,
320 short_name
, long_name
, description
)
321 Lisp_Object charset_id
, dimension
, chars
, width
, direction
;
322 Lisp_Object iso_final_char
, iso_graphic_plane
;
323 Lisp_Object short_name
, long_name
, description
;
325 int charset
= XINT (charset_id
);
327 unsigned char leading_code_base
, leading_code_ext
;
329 if (NILP (CHARSET_TABLE_ENTRY (charset
)))
330 CHARSET_TABLE_ENTRY (charset
)
331 = Fmake_vector (make_number (CHARSET_MAX_IDX
), Qnil
);
333 /* Get byte length of multibyte form, base leading-code, and
334 extended leading-code of the charset. See the comment under the
335 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
336 bytes
= XINT (dimension
);
337 if (charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
339 /* Official charset, it doesn't have an extended leading-code. */
340 if (charset
!= CHARSET_ASCII
)
341 bytes
+= 1; /* For a base leading-code. */
342 leading_code_base
= charset
;
343 leading_code_ext
= 0;
347 /* Private charset. */
348 bytes
+= 2; /* For base and extended leading-codes. */
350 = (charset
< LEADING_CODE_EXT_12
351 ? LEADING_CODE_PRIVATE_11
352 : (charset
< LEADING_CODE_EXT_21
353 ? LEADING_CODE_PRIVATE_12
354 : (charset
< LEADING_CODE_EXT_22
355 ? LEADING_CODE_PRIVATE_21
356 : LEADING_CODE_PRIVATE_22
)));
357 leading_code_ext
= charset
;
360 CHARSET_TABLE_INFO (charset
, CHARSET_ID_IDX
) = charset_id
;
361 CHARSET_TABLE_INFO (charset
, CHARSET_BYTES_IDX
) = make_number (bytes
);
362 CHARSET_TABLE_INFO (charset
, CHARSET_DIMENSION_IDX
) = dimension
;
363 CHARSET_TABLE_INFO (charset
, CHARSET_CHARS_IDX
) = chars
;
364 CHARSET_TABLE_INFO (charset
, CHARSET_WIDTH_IDX
) = width
;
365 CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
) = direction
;
366 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_BASE_IDX
)
367 = make_number (leading_code_base
);
368 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_EXT_IDX
)
369 = make_number (leading_code_ext
);
370 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_FINAL_CHAR_IDX
) = iso_final_char
;
371 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_GRAPHIC_PLANE_IDX
)
373 CHARSET_TABLE_INFO (charset
, CHARSET_SHORT_NAME_IDX
) = short_name
;
374 CHARSET_TABLE_INFO (charset
, CHARSET_LONG_NAME_IDX
) = long_name
;
375 CHARSET_TABLE_INFO (charset
, CHARSET_DESCRIPTION_IDX
) = description
;
376 CHARSET_TABLE_INFO (charset
, CHARSET_PLIST_IDX
) = Qnil
;
379 /* If we have already defined a charset which has the same
380 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
381 DIRECTION, we must update the entry REVERSE-CHARSET of both
382 charsets. If there's no such charset, the value of the entry
386 for (i
= 0; i
<= MAX_CHARSET
; i
++)
387 if (!NILP (CHARSET_TABLE_ENTRY (i
)))
389 if (CHARSET_DIMENSION (i
) == XINT (dimension
)
390 && CHARSET_CHARS (i
) == XINT (chars
)
391 && CHARSET_ISO_FINAL_CHAR (i
) == XINT (iso_final_char
)
392 && CHARSET_DIRECTION (i
) != XINT (direction
))
394 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
396 CHARSET_TABLE_INFO (i
, CHARSET_REVERSE_CHARSET_IDX
) = charset_id
;
401 /* No such a charset. */
402 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
406 if (charset
!= CHARSET_ASCII
407 && charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
409 /* Update tables bytes_by_char_head and width_by_char_head. */
410 bytes_by_char_head
[leading_code_base
] = bytes
;
411 width_by_char_head
[leading_code_base
] = XINT (width
);
413 /* Update table emacs_code_class. */
414 emacs_code_class
[charset
] = (bytes
== 2
415 ? EMACS_leading_code_2
417 ? EMACS_leading_code_3
418 : EMACS_leading_code_4
));
421 /* Update table iso_charset_table. */
422 if (ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) < 0)
423 ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) = charset
;
428 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
431 get_charset_id (charset_symbol
)
432 Lisp_Object charset_symbol
;
437 return ((SYMBOLP (charset_symbol
)
438 && (val
= Fget (charset_symbol
, Qcharset
), VECTORP (val
))
439 && (charset
= XINT (XVECTOR (val
)->contents
[CHARSET_ID_IDX
]),
440 CHARSET_VALID_P (charset
)))
444 /* Return an identification number for a new private charset of
445 DIMENSION and WIDTH. If there's no more room for the new charset,
448 get_new_private_charset_id (dimension
, width
)
449 int dimension
, width
;
451 int charset
, from
, to
;
456 from
= LEADING_CODE_EXT_11
, to
= LEADING_CODE_EXT_12
;
458 from
= LEADING_CODE_EXT_12
, to
= LEADING_CODE_EXT_21
;
463 from
= LEADING_CODE_EXT_21
, to
= LEADING_CODE_EXT_22
;
465 from
= LEADING_CODE_EXT_22
, to
= LEADING_CODE_EXT_MAX
+ 1;
468 for (charset
= from
; charset
< to
; charset
++)
469 if (!CHARSET_DEFINED_P (charset
)) break;
471 return make_number (charset
< to
? charset
: 0);
474 DEFUN ("define-charset", Fdefine_charset
, Sdefine_charset
, 3, 3, 0,
475 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
476 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
477 treated as a private charset.\n\
478 INFO-VECTOR is a vector of the format:\n\
479 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
480 SHORT-NAME LONG-NAME DESCRIPTION]\n\
481 The meanings of each elements is as follows:\n\
482 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
483 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
484 WIDTH (integer) is the number of columns a character in the charset\n\
485 occupies on the screen: one of 0, 1, and 2.\n\
487 DIRECTION (integer) is the rendering direction of characters in the\n\
488 charset when rendering. If 0, render from right to left, else\n\
489 render from left to right.\n\
491 ISO-FINAL-CHAR (character) is the final character of the\n\
492 corresponding ISO 2022 charset.\n\
494 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
495 while encoding to variants of ISO 2022 coding system, one of the\n\
496 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
498 SHORT-NAME (string) is the short name to refer to the charset.\n\
500 LONG-NAME (string) is the long name to refer to the charset.\n\
502 DESCRIPTION (string) is the description string of the charset.")
503 (charset_id
, charset_symbol
, info_vector
)
504 Lisp_Object charset_id
, charset_symbol
, info_vector
;
508 if (!NILP (charset_id
))
509 CHECK_NUMBER (charset_id
, 0);
510 CHECK_SYMBOL (charset_symbol
, 1);
511 CHECK_VECTOR (info_vector
, 2);
513 if (! NILP (charset_id
))
515 if (! CHARSET_VALID_P (XINT (charset_id
)))
516 error ("Invalid CHARSET: %d", XINT (charset_id
));
517 else if (CHARSET_DEFINED_P (XINT (charset_id
)))
518 error ("Already defined charset: %d", XINT (charset_id
));
521 vec
= XVECTOR (info_vector
)->contents
;
522 if (XVECTOR (info_vector
)->size
!= 9
523 || !INTEGERP (vec
[0]) || !(XINT (vec
[0]) == 1 || XINT (vec
[0]) == 2)
524 || !INTEGERP (vec
[1]) || !(XINT (vec
[1]) == 94 || XINT (vec
[1]) == 96)
525 || !INTEGERP (vec
[2]) || !(XINT (vec
[2]) == 1 || XINT (vec
[2]) == 2)
526 || !INTEGERP (vec
[3]) || !(XINT (vec
[3]) == 0 || XINT (vec
[3]) == 1)
527 || !INTEGERP (vec
[4]) || !(XINT (vec
[4]) >= '0' && XINT (vec
[4]) <= '~')
528 || !INTEGERP (vec
[5]) || !(XINT (vec
[5]) == 0 || XINT (vec
[5]) == 1)
531 || !STRINGP (vec
[8]))
532 error ("Invalid info-vector argument for defining charset %s",
533 XSYMBOL (charset_symbol
)->name
->data
);
535 if (NILP (charset_id
))
537 charset_id
= get_new_private_charset_id (XINT (vec
[0]), XINT (vec
[2]));
538 if (XINT (charset_id
) == 0)
539 error ("There's no room for a new private charset %s",
540 XSYMBOL (charset_symbol
)->name
->data
);
543 update_charset_table (charset_id
, vec
[0], vec
[1], vec
[2], vec
[3],
544 vec
[4], vec
[5], vec
[6], vec
[7], vec
[8]);
545 Fput (charset_symbol
, Qcharset
, CHARSET_TABLE_ENTRY (XINT (charset_id
)));
546 CHARSET_SYMBOL (XINT (charset_id
)) = charset_symbol
;
547 Vcharset_list
= Fcons (charset_symbol
, Vcharset_list
);
551 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char
,
552 Sget_unused_iso_final_char
, 2, 2, 0,
553 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
554 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
555 CHARS is the number of characters in a dimension: 94 or 96.\n\
557 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
558 If there's no unused final char for the specified kind of charset,\n\
561 Lisp_Object dimension
, chars
;
565 CHECK_NUMBER (dimension
, 0);
566 CHECK_NUMBER (chars
, 1);
567 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
568 error ("Invalid charset dimension %d, it should be 1 or 2",
570 if (XINT (chars
) != 94 && XINT (chars
) != 96)
571 error ("Invalid charset chars %d, it should be 94 or 96",
573 for (final_char
= '0'; final_char
<= '?'; final_char
++)
575 if (ISO_CHARSET_TABLE (dimension
, chars
, make_number (final_char
)) < 0)
578 return (final_char
<= '?' ? make_number (final_char
) : Qnil
);
581 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset
, Sdeclare_equiv_charset
,
583 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
584 CHARSET should be defined by `defined-charset' in advance.")
585 (dimension
, chars
, final_char
, charset_symbol
)
586 Lisp_Object dimension
, chars
, final_char
, charset_symbol
;
590 CHECK_NUMBER (dimension
, 0);
591 CHECK_NUMBER (chars
, 1);
592 CHECK_NUMBER (final_char
, 2);
593 CHECK_SYMBOL (charset_symbol
, 3);
595 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
596 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension
));
597 if (XINT (chars
) != 94 && XINT (chars
) != 96)
598 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars
));
599 if (XINT (final_char
) < '0' || XFASTINT (final_char
) > '~')
600 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars
));
601 if ((charset
= get_charset_id (charset_symbol
)) < 0)
602 error ("Invalid charset %s", XSYMBOL (charset_symbol
)->name
->data
);
604 ISO_CHARSET_TABLE (dimension
, chars
, final_char
) = charset
;
608 /* Return number of different charsets in STR of length LEN. In
609 addition, for each found charset N, CHARSETS[N] is set 1. The
610 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
611 It may lookup a unification table TABLE if supplied. */
614 find_charset_in_str (str
, len
, charsets
, table
)
619 register int num
= 0, c
;
621 if (! CHAR_TABLE_P (table
))
629 if (c
== LEADING_CODE_COMPOSITION
)
631 int cmpchar_id
= str_cmpchar_id (str
, len
);
636 struct cmpchar_info
*cmpcharp
= cmpchar_table
[cmpchar_id
];
639 for (i
= 0; i
< cmpcharp
->glyph_len
; i
++)
641 c
= cmpcharp
->glyph
[i
];
644 if ((c
= unify_char (table
, c
, 0, 0, 0)) < 0)
645 c
= cmpcharp
->glyph
[i
];
647 if ((charset
= CHAR_CHARSET (c
)) < 0)
648 charset
= CHARSET_ASCII
;
649 if (!charsets
[charset
])
651 charsets
[charset
] = 1;
655 str
+= cmpcharp
->len
;
656 len
-= cmpcharp
->len
;
660 charset
= CHARSET_ASCII
;
665 c
= STRING_CHAR_AND_LENGTH (str
, len
, bytes
);
668 int c1
= unify_char (table
, c
, 0, 0, 0);
672 charset
= CHAR_CHARSET (c
);
675 if (!charsets
[charset
])
677 charsets
[charset
] = 1;
686 DEFUN ("find-charset-region", Ffind_charset_region
, Sfind_charset_region
,
688 "Return a list of charsets in the region between BEG and END.\n\
689 BEG and END are buffer positions.\n\
690 Optional arg TABLE if non-nil is a unification table to look up.")
692 Lisp_Object beg
, end
, table
;
694 int charsets
[MAX_CHARSET
+ 1];
695 int from
, from_byte
, to
, stop
, stop_byte
, i
;
698 validate_region (&beg
, &end
);
699 from
= XFASTINT (beg
);
700 stop
= to
= XFASTINT (end
);
702 if (from
< GPT
&& GPT
< to
)
705 stop_byte
= GPT_BYTE
;
708 stop_byte
= CHAR_TO_BYTE (stop
);
710 from_byte
= CHAR_TO_BYTE (from
);
712 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
715 find_charset_in_str (BYTE_POS_ADDR (from_byte
), stop_byte
- from_byte
,
719 from
= stop
, from_byte
= stop_byte
;
720 stop
= to
, stop_byte
= CHAR_TO_BYTE (stop
);
727 for (i
= MAX_CHARSET
; i
>= 0; i
--)
729 val
= Fcons (CHARSET_SYMBOL (i
), val
);
733 DEFUN ("find-charset-string", Ffind_charset_string
, Sfind_charset_string
,
735 "Return a list of charsets in STR.\n\
736 Optional arg TABLE if non-nil is a unification table to look up.")
738 Lisp_Object str
, table
;
740 int charsets
[MAX_CHARSET
+ 1];
744 CHECK_STRING (str
, 0);
746 if (! STRING_MULTIBYTE (str
))
749 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
750 find_charset_in_str (XSTRING (str
)->data
, XSTRING (str
)->size_byte
,
753 for (i
= MAX_CHARSET
; i
>= 0; i
--)
755 val
= Fcons (CHARSET_SYMBOL (i
), val
);
759 DEFUN ("make-char-internal", Fmake_char_internal
, Smake_char_internal
, 1, 3, 0,
761 (charset
, code1
, code2
)
762 Lisp_Object charset
, code1
, code2
;
764 CHECK_NUMBER (charset
, 0);
767 XSETFASTINT (code1
, 0);
769 CHECK_NUMBER (code1
, 1);
771 XSETFASTINT (code2
, 0);
773 CHECK_NUMBER (code2
, 2);
775 if (!CHARSET_DEFINED_P (XINT (charset
)))
776 error ("Invalid charset: %d", XINT (charset
));
778 return make_number (MAKE_CHAR (XINT (charset
), XINT (code1
), XINT (code2
)));
781 DEFUN ("split-char", Fsplit_char
, Ssplit_char
, 1, 1, 0,
782 "Return list of charset and one or two position-codes of CHAR.")
789 CHECK_NUMBER (ch
, 0);
790 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
792 ? Fcons (CHARSET_SYMBOL (charset
),
793 Fcons (make_number (c1
), Fcons (make_number (c2
), Qnil
)))
794 : Fcons (CHARSET_SYMBOL (charset
), Fcons (make_number (c1
), Qnil
)));
797 DEFUN ("char-charset", Fchar_charset
, Schar_charset
, 1, 1, 0,
798 "Return charset of CHAR.")
802 CHECK_NUMBER (ch
, 0);
804 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch
)));
807 DEFUN ("iso-charset", Fiso_charset
, Siso_charset
, 3, 3, 0,
808 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
810 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
811 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
812 where as Emacs distinguishes them by charset symbol.\n\
813 See the documentation of the function `charset-info' for the meanings of\n\
814 DIMENSION, CHARS, and FINAL-CHAR.")
815 (dimension
, chars
, final_char
)
816 Lisp_Object dimension
, chars
, final_char
;
820 CHECK_NUMBER (dimension
, 0);
821 CHECK_NUMBER (chars
, 1);
822 CHECK_NUMBER (final_char
, 2);
824 if ((charset
= ISO_CHARSET_TABLE (dimension
, chars
, final_char
)) < 0)
826 return CHARSET_SYMBOL (charset
);
829 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
830 generic character. If GENERICP is zero, return nonzero iff C is a
831 valid normal character. Do not call this function directly,
832 instead use macro CHAR_VALID_P. */
834 char_valid_p (c
, genericp
)
841 if (SINGLE_BYTE_CHAR_P (c
))
843 SPLIT_NON_ASCII_CHAR (c
, charset
, c1
, c2
);
844 if (!CHARSET_VALID_P (charset
))
846 return (c
< MIN_CHAR_COMPOSITION
847 ? ((c
& CHAR_FIELD1_MASK
) /* i.e. dimension of C is two. */
848 ? (genericp
&& c1
== 0 && c2
== 0
849 || c1
>= 32 && c2
>= 32)
850 : (genericp
&& c1
== 0
852 : c
< MIN_CHAR_COMPOSITION
+ n_cmpchars
);
855 DEFUN ("char-valid-p", Fchar_valid_p
, Schar_valid_p
, 1, 2, 0,
856 "Return t if OBJECT is a valid normal character.\n\
857 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
858 a valid generic character.")
860 Lisp_Object object
, genericp
;
862 if (! NATNUMP (object
))
864 return (CHAR_VALID_P (XFASTINT (object
), !NILP (genericp
)) ? Qt
: Qnil
);
867 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
868 Sunibyte_char_to_multibyte
, 1, 1, 0,
869 "Convert the unibyte character CH to multibyte character.\n\
870 The conversion is done based on nonascii-translate-table (which see)\n\
871 or nonascii-insert-offset (which see).")
877 CHECK_NUMBER (ch
, 0);
879 if (c
< 0 || c
>= 0400)
880 error ("Invalid unibyte character: %d", c
);
881 c
= unibyte_char_to_multibyte (c
);
883 error ("Can't convert to multibyte character: %d", XINT (ch
));
884 return make_number (c
);
887 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
888 "Return byte length of multi-byte form of CHAR.")
895 CHECK_NUMBER (ch
, 0);
896 if (COMPOSITE_CHAR_P (XFASTINT (ch
)))
898 unsigned int id
= COMPOSITE_CHAR_ID (XFASTINT (ch
));
900 bytes
= (id
< n_cmpchars
? cmpchar_table
[id
]->len
: 1);
904 int charset
= CHAR_CHARSET (XFASTINT (ch
));
906 bytes
= CHARSET_DEFINED_P (charset
) ? CHARSET_BYTES (charset
) : 1;
909 XSETFASTINT (val
, bytes
);
913 /* Return the width of character of which multi-byte form starts with
914 C. The width is measured by how many columns occupied on the
915 screen when displayed in the current buffer. */
917 #define ONE_BYTE_CHAR_WIDTH(c) \
920 ? XFASTINT (current_buffer->tab_width) \
921 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
925 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
926 : ((! NILP (current_buffer->enable_multibyte_characters) \
927 && BASE_LEADING_CODE_P (c)) \
928 ? WIDTH_BY_CHAR_HEAD (c) \
932 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
933 "Return width of CHAR when displayed in the current buffer.\n\
934 The width is measured by how many columns it occupies on the screen.")
938 Lisp_Object val
, disp
;
940 struct Lisp_Char_Table
*dp
= buffer_display_table ();
942 CHECK_NUMBER (ch
, 0);
946 /* Get the way the display table would display it. */
947 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
950 XSETINT (val
, XVECTOR (disp
)->size
);
951 else if (SINGLE_BYTE_CHAR_P (c
))
952 XSETINT (val
, ONE_BYTE_CHAR_WIDTH (c
));
953 else if (COMPOSITE_CHAR_P (c
))
955 int id
= COMPOSITE_CHAR_ID (XFASTINT (ch
));
956 XSETFASTINT (val
, (id
< n_cmpchars
? cmpchar_table
[id
]->width
: 0));
960 int charset
= CHAR_CHARSET (c
);
962 XSETFASTINT (val
, CHARSET_WIDTH (charset
));
967 /* Return width of string STR of length LEN when displayed in the
968 current buffer. The width is measured by how many columns it
969 occupies on the screen. */
976 unsigned char *endp
= str
+ len
;
978 struct Lisp_Char_Table
*dp
= buffer_display_table ();
982 if (*str
== LEADING_CODE_COMPOSITION
)
984 int id
= str_cmpchar_id (str
, endp
- str
);
993 width
+= cmpchar_table
[id
]->width
;
994 str
+= cmpchar_table
[id
]->len
;
1001 int c
= STRING_CHAR_AND_LENGTH (str
, endp
- str
, thislen
);
1003 /* Get the way the display table would display it. */
1005 disp
= DISP_CHAR_VECTOR (dp
, c
);
1010 width
+= XVECTOR (disp
)->size
;
1012 width
+= ONE_BYTE_CHAR_WIDTH (*str
);
1020 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
1021 "Return width of STRING when displayed in the current buffer.\n\
1022 Width is measured by how many columns it occupies on the screen.\n\
1023 When calculating width of a multibyte character in STRING,\n\
1024 only the base leading-code is considered; the validity of\n\
1025 the following bytes is not checked.")
1031 CHECK_STRING (str
, 0);
1032 XSETFASTINT (val
, strwidth (XSTRING (str
)->data
, XSTRING (str
)->size_byte
));
1036 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
1037 "Return the direction of CHAR.\n\
1038 The returned value is 0 for left-to-right and 1 for right-to-left.")
1044 CHECK_NUMBER (ch
, 0);
1045 charset
= CHAR_CHARSET (XFASTINT (ch
));
1046 if (!CHARSET_DEFINED_P (charset
))
1047 invalid_character (XINT (ch
));
1048 return CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
);
1051 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
1052 "Return number of characters between BEG and END.")
1054 Lisp_Object beg
, end
;
1058 from
= min (XFASTINT (beg
), XFASTINT (end
));
1059 to
= max (XFASTINT (beg
), XFASTINT (end
));
1064 /* Return the number of characters in the NBYTES bytes at PTR.
1065 This works by looking at the contents and checking for multibyte sequences.
1066 However, if the current buffer has enable-multibyte-characters = nil,
1067 we treat each byte as a character. */
1070 chars_in_text (ptr
, nbytes
)
1074 unsigned char *endp
, c
;
1077 /* current_buffer is null at early stages of Emacs initialization. */
1078 if (current_buffer
== 0
1079 || NILP (current_buffer
->enable_multibyte_characters
))
1082 endp
= ptr
+ nbytes
;
1089 if (BASE_LEADING_CODE_P (c
))
1090 while (ptr
< endp
&& ! CHAR_HEAD_P (*ptr
)) ptr
++;
1097 /* Return the number of characters in the NBYTES bytes at PTR.
1098 This works by looking at the contents and checking for multibyte sequences.
1099 It ignores enable-multibyte-characters. */
1102 multibyte_chars_in_text (ptr
, nbytes
)
1106 unsigned char *endp
, c
;
1109 endp
= ptr
+ nbytes
;
1116 if (BASE_LEADING_CODE_P (c
))
1117 while (ptr
< endp
&& ! CHAR_HEAD_P (*ptr
)) ptr
++;
1124 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
1125 "Concatenate all the argument characters and make the result a string.")
1132 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM
* n
);
1133 unsigned char *p
= buf
;
1136 for (i
= 0; i
< n
; i
++)
1141 if (!INTEGERP (args
[i
]))
1142 CHECK_NUMBER (args
[i
], 0);
1144 len
= CHAR_STRING (c
, p
, str
);
1146 /* C is a composite character. */
1147 bcopy (str
, p
, len
);
1151 val
= make_multibyte_string (buf
, n
, p
- buf
);
1157 /*** Composite characters staffs ***/
1159 /* Each composite character is identified by CMPCHAR-ID which is
1160 assigned when Emacs needs the character code of the composite
1161 character (e.g. when displaying it on the screen). See the
1162 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1163 composite character is represented in Emacs. */
1165 /* If `static' is defined, it means that it is defined to null string. */
1167 /* The following function is copied from lread.c. */
1169 hash_string (ptr
, len
)
1173 register unsigned char *p
= ptr
;
1174 register unsigned char *end
= p
+ len
;
1175 register unsigned char c
;
1176 register int hash
= 0;
1181 if (c
>= 0140) c
-= 40;
1182 hash
= ((hash
<<3) + (hash
>>28) + c
);
1184 return hash
& 07777777777;
1188 #define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1190 static int *cmpchar_hash_table
[CMPCHAR_HASH_TABLE_SIZE
];
1192 /* Each element of `cmpchar_hash_table' is a pointer to an array of
1193 integer, where the 1st element is the size of the array, the 2nd
1194 element is how many elements are actually used in the array, and
1195 the remaining elements are CMPCHAR-IDs of composite characters of
1196 the same hash value. */
1197 #define CMPCHAR_HASH_SIZE(table) table[0]
1198 #define CMPCHAR_HASH_USED(table) table[1]
1199 #define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1201 /* Return CMPCHAR-ID of the composite character in STR of the length
1202 LEN. If the composite character has not yet been registered,
1203 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1204 is the sole function for assigning CMPCHAR-ID. */
1206 str_cmpchar_id (str
, len
)
1210 int hash_idx
, *hashp
;
1212 int embedded_rule
; /* 1 if composition rule is embedded. */
1213 int chars
; /* number of components. */
1215 struct cmpchar_info
*cmpcharp
;
1217 /* The second byte 0xFF means compostion rule is embedded. */
1218 embedded_rule
= (str
[1] == 0xFF);
1220 /* At first, get the actual length of the composite character. */
1222 unsigned char *p
, *endp
= str
+ 1, *lastp
= str
+ len
;
1225 while (endp
< lastp
&& ! CHAR_HEAD_P (*endp
)) endp
++;
1227 /* Any composite char have at least 5-byte length. */
1234 if (embedded_rule
) p
++;
1235 /* No need of checking if *P is 0xA0 because
1236 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1237 p
+= BYTES_BY_CHAR_HEAD (*p
- 0x20);
1240 if (p
> endp
|| chars
< 2 || chars
> MAX_COMPONENT_COUNT
)
1241 /* Invalid components. */
1245 hash_idx
= hash_string (str
, len
) % CMPCHAR_HASH_TABLE_SIZE
;
1246 hashp
= cmpchar_hash_table
[hash_idx
];
1248 /* Then, look into the hash table. */
1250 /* Find the correct one among composite characters of the same
1252 for (i
= 2; i
< CMPCHAR_HASH_USED (hashp
); i
++)
1254 cmpcharp
= cmpchar_table
[CMPCHAR_HASH_CMPCHAR_ID (hashp
, i
)];
1255 if (len
== cmpcharp
->len
1256 && ! bcmp (str
, cmpcharp
->data
, len
))
1257 return CMPCHAR_HASH_CMPCHAR_ID (hashp
, i
);
1260 /* We have to register the composite character in cmpchar_table. */
1261 if (n_cmpchars
> (CHAR_FIELD2_MASK
| CHAR_FIELD3_MASK
))
1262 /* No, we have no more room for a new composite character. */
1265 /* Make the entry in hash table. */
1268 /* Make a table for 8 composite characters initially. */
1269 hashp
= (cmpchar_hash_table
[hash_idx
]
1270 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1271 CMPCHAR_HASH_SIZE (hashp
) = 10;
1272 CMPCHAR_HASH_USED (hashp
) = 2;
1274 else if (CMPCHAR_HASH_USED (hashp
) >= CMPCHAR_HASH_SIZE (hashp
))
1276 CMPCHAR_HASH_SIZE (hashp
) += 8;
1277 hashp
= (cmpchar_hash_table
[hash_idx
]
1278 = (int *) xrealloc (hashp
,
1279 sizeof (int) * CMPCHAR_HASH_SIZE (hashp
)));
1281 CMPCHAR_HASH_CMPCHAR_ID (hashp
, CMPCHAR_HASH_USED (hashp
)) = n_cmpchars
;
1282 CMPCHAR_HASH_USED (hashp
)++;
1284 /* Set information of the composite character in cmpchar_table. */
1285 if (cmpchar_table_size
== 0)
1287 /* This is the first composite character to be registered. */
1288 cmpchar_table_size
= 256;
1290 = (struct cmpchar_info
**) xmalloc (sizeof (cmpchar_table
[0])
1291 * cmpchar_table_size
);
1293 else if (cmpchar_table_size
<= n_cmpchars
)
1295 cmpchar_table_size
+= 256;
1297 = (struct cmpchar_info
**) xrealloc (cmpchar_table
,
1298 sizeof (cmpchar_table
[0])
1299 * cmpchar_table_size
);
1302 cmpcharp
= (struct cmpchar_info
*) xmalloc (sizeof (struct cmpchar_info
));
1304 cmpcharp
->len
= len
;
1305 cmpcharp
->data
= (unsigned char *) xmalloc (len
+ 1);
1306 bcopy (str
, cmpcharp
->data
, len
);
1307 cmpcharp
->data
[len
] = 0;
1308 cmpcharp
->glyph_len
= chars
;
1309 cmpcharp
->glyph
= (GLYPH
*) xmalloc (sizeof (GLYPH
) * chars
);
1312 cmpcharp
->cmp_rule
= (unsigned char *) xmalloc (chars
);
1313 cmpcharp
->col_offset
= (float *) xmalloc (sizeof (float) * chars
);
1317 cmpcharp
->cmp_rule
= NULL
;
1318 cmpcharp
->col_offset
= NULL
;
1321 /* Setup GLYPH data and composition rules (if any) so as not to make
1322 them every time on displaying. */
1324 unsigned char *bufp
;
1326 float leftmost
= 0.0, rightmost
= 1.0;
1329 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1330 cmpcharp
->col_offset
[0] = 0;
1332 for (i
= 0, bufp
= cmpcharp
->data
+ 1; i
< chars
; i
++)
1335 cmpcharp
->cmp_rule
[i
] = *bufp
++;
1337 if (*bufp
== 0xA0) /* This is an ASCII character. */
1339 cmpcharp
->glyph
[i
] = FAST_MAKE_GLYPH ((*++bufp
& 0x7F), 0);
1343 else /* Multibyte character. */
1345 /* Make `bufp' point normal multi-byte form temporally. */
1348 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp
, 4, 0), 0);
1349 width
= WIDTH_BY_CHAR_HEAD (*bufp
);
1351 bufp
+= BYTES_BY_CHAR_HEAD (*bufp
- 0x20);
1354 if (embedded_rule
&& i
> 0)
1356 /* Reference points (global_ref and new_ref) are
1367 Now, we calculate the column offset of the new glyph
1368 from the left edge of the first glyph. This can avoid
1369 the same calculation everytime displaying this
1370 composite character. */
1372 /* Reference points of global glyph and new glyph. */
1373 int global_ref
= (cmpcharp
->cmp_rule
[i
] - 0xA0) / 9;
1374 int new_ref
= (cmpcharp
->cmp_rule
[i
] - 0xA0) % 9;
1375 /* Column offset relative to the first glyph. */
1376 float left
= (leftmost
1377 + (global_ref
% 3) * (rightmost
- leftmost
) / 2.0
1378 - (new_ref
% 3) * width
/ 2.0);
1380 cmpcharp
->col_offset
[i
] = left
;
1381 if (left
< leftmost
)
1383 if (left
+ width
> rightmost
)
1384 rightmost
= left
+ width
;
1388 if (width
> rightmost
)
1394 /* Now col_offset[N] are relative to the left edge of the
1395 first component. Make them relative to the left edge of
1397 for (i
= 0; i
< chars
; i
++)
1398 cmpcharp
->col_offset
[i
] -= leftmost
;
1399 /* Make rightmost holds width of overall glyph. */
1400 rightmost
-= leftmost
;
1403 cmpcharp
->width
= rightmost
;
1404 if (cmpcharp
->width
< rightmost
)
1405 /* To get a ceiling integer value. */
1409 cmpchar_table
[n_cmpchars
] = cmpcharp
;
1411 return n_cmpchars
++;
1414 /* Return the Nth element of the composite character C. */
1416 cmpchar_component (c
, n
)
1419 int id
= COMPOSITE_CHAR_ID (c
);
1421 if (id
>= n_cmpchars
/* C is not a valid composite character. */
1422 || n
>= cmpchar_table
[id
]->glyph_len
) /* No such component. */
1424 /* No face data is stored in glyph code. */
1425 return ((int) (cmpchar_table
[id
]->glyph
[n
]));
1428 DEFUN ("cmpcharp", Fcmpcharp
, Scmpcharp
, 1, 1, 0,
1429 "T if CHAR is a composite character.")
1433 CHECK_NUMBER (ch
, 0);
1434 return (COMPOSITE_CHAR_P (XINT (ch
)) ? Qt
: Qnil
);
1437 DEFUN ("composite-char-component", Fcmpchar_component
, Scmpchar_component
,
1439 "Return the IDXth component character of composite character CHARACTER.")
1441 Lisp_Object character
, idx
;
1445 CHECK_NUMBER (character
, 0);
1446 CHECK_NUMBER (idx
, 1);
1448 if ((c
= cmpchar_component (XINT (character
), XINT (idx
))) < 0)
1449 args_out_of_range (character
, idx
);
1451 return make_number (c
);
1454 DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule
, Scmpchar_cmp_rule
,
1456 "Return the Nth composition rule embedded in composite character CHARACTER.\n\
1457 The returned rule is for composing the Nth component\n\
1458 on the (N-1)th component. If N is 0, the returned value is always 255.")
1460 Lisp_Object character
, n
;
1464 CHECK_NUMBER (character
, 0);
1465 CHECK_NUMBER (n
, 1);
1467 id
= COMPOSITE_CHAR_ID (XINT (character
));
1468 if (id
< 0 || id
>= n_cmpchars
)
1469 error ("Invalid composite character: %d", XINT (character
));
1471 if (i
> cmpchar_table
[id
]->glyph_len
)
1472 args_out_of_range (character
, n
);
1474 return make_number (cmpchar_table
[id
]->cmp_rule
[i
]);
1477 DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p
,
1478 Scmpchar_cmp_rule_p
, 1, 1, 0,
1479 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1481 Lisp_Object character
;
1485 CHECK_NUMBER (character
, 0);
1486 id
= COMPOSITE_CHAR_ID (XINT (character
));
1487 if (id
< 0 || id
>= n_cmpchars
)
1488 error ("Invalid composite character: %d", XINT (character
));
1490 return (cmpchar_table
[id
]->cmp_rule
? Qt
: Qnil
);
1493 DEFUN ("composite-char-component-count", Fcmpchar_cmp_count
,
1494 Scmpchar_cmp_count
, 1, 1, 0,
1495 "Return number of compoents of composite character CHARACTER.")
1497 Lisp_Object character
;
1501 CHECK_NUMBER (character
, 0);
1502 id
= COMPOSITE_CHAR_ID (XINT (character
));
1503 if (id
< 0 || id
>= n_cmpchars
)
1504 error ("Invalid composite character: %d", XINT (character
));
1506 return (make_number (cmpchar_table
[id
]->glyph_len
));
1509 DEFUN ("compose-string", Fcompose_string
, Scompose_string
,
1511 "Return one char string composed from all characters in STRING.")
1515 unsigned char buf
[MAX_LENGTH_OF_MULTI_BYTE_FORM
], *p
, *pend
, *ptemp
;
1518 CHECK_STRING (str
, 0);
1520 buf
[0] = LEADING_CODE_COMPOSITION
;
1521 p
= XSTRING (str
)->data
;
1522 pend
= p
+ XSTRING (str
)->size_byte
;
1526 if (*p
< 0x20 || *p
== 127) /* control code */
1527 error ("Invalid component character: %d", *p
);
1528 else if (*p
< 0x80) /* ASCII */
1530 if (i
+ 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM
)
1531 error ("Too long string to be composed: %s", XSTRING (str
)->data
);
1532 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1535 buf
[i
++] = *p
++ + 0x80;
1537 else if (*p
== LEADING_CODE_COMPOSITION
) /* composite char */
1539 /* Already composed. Eliminate the heading
1540 LEADING_CODE_COMPOSITION, keep the remaining bytes
1544 while (! CHAR_HEAD_P (*p
)) p
++;
1545 if (i
+ (p
- ptemp
) >= MAX_LENGTH_OF_MULTI_BYTE_FORM
)
1546 error ("Too long string to be composed: %s", XSTRING (str
)->data
);
1547 bcopy (ptemp
, buf
+ i
, p
- ptemp
);
1550 else /* multibyte char */
1552 /* Add 0x20 to the base leading-code, keep the remaining
1554 len
= BYTES_BY_CHAR_HEAD (*p
);
1555 if (i
+ len
>= MAX_LENGTH_OF_MULTI_BYTE_FORM
)
1556 error ("Too long string to be composed: %s", XSTRING (str
)->data
);
1557 bcopy (p
, buf
+ i
, len
);
1564 /* STR contains only one character, which can't be composed. */
1565 error ("Too short string to be composed: %s", XSTRING (str
)->data
);
1567 return make_multibyte_string (buf
, 1, i
);
1571 charset_id_internal (charset_name
)
1574 Lisp_Object val
= Fget (intern (charset_name
), Qcharset
);
1577 error ("Charset %s is not defined", charset_name
);
1579 return (XINT (XVECTOR (val
)->contents
[0]));
1582 DEFUN ("setup-special-charsets", Fsetup_special_charsets
,
1583 Ssetup_special_charsets
, 0, 0, 0, "Internal use only.")
1586 charset_latin_iso8859_1
= charset_id_internal ("latin-iso8859-1");
1587 charset_jisx0208_1978
= charset_id_internal ("japanese-jisx0208-1978");
1588 charset_jisx0208
= charset_id_internal ("japanese-jisx0208");
1589 charset_katakana_jisx0201
= charset_id_internal ("katakana-jisx0201");
1590 charset_latin_jisx0201
= charset_id_internal ("latin-jisx0201");
1591 charset_big5_1
= charset_id_internal ("chinese-big5-1");
1592 charset_big5_2
= charset_id_internal ("chinese-big5-2");
1596 init_charset_once ()
1600 staticpro (&Vcharset_table
);
1601 staticpro (&Vcharset_symbol_table
);
1603 /* This has to be done here, before we call Fmake_char_table. */
1604 Qcharset_table
= intern ("charset-table");
1605 staticpro (&Qcharset_table
);
1607 /* Intern this now in case it isn't already done.
1608 Setting this variable twice is harmless.
1609 But don't staticpro it here--that is done in alloc.c. */
1610 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
1612 /* Now we are ready to set up this property, so we can
1613 create the charset table. */
1614 Fput (Qcharset_table
, Qchar_table_extra_slots
, make_number (0));
1615 Vcharset_table
= Fmake_char_table (Qcharset_table
, Qnil
);
1617 Vcharset_symbol_table
= Fmake_vector (make_number (MAX_CHARSET
+ 1), Qnil
);
1620 for (i
= 0; i
< 2; i
++)
1621 for (j
= 0; j
< 2; j
++)
1622 for (k
= 0; k
< 128; k
++)
1623 iso_charset_table
[i
][j
][k
] = -1;
1625 bzero (cmpchar_hash_table
, sizeof cmpchar_hash_table
);
1626 cmpchar_table_size
= n_cmpchars
= 0;
1628 for (i
= 0; i
< 256; i
++)
1629 BYTES_BY_CHAR_HEAD (i
) = 1;
1630 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11
) = 3;
1631 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12
) = 3;
1632 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21
) = 4;
1633 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22
) = 4;
1634 /* The following doesn't reflect the actual bytes, but just to tell
1635 that it is a start of a multibyte character. */
1636 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION
) = 2;
1638 for (i
= 0; i
< 128; i
++)
1639 WIDTH_BY_CHAR_HEAD (i
) = 1;
1640 for (; i
< 256; i
++)
1641 WIDTH_BY_CHAR_HEAD (i
) = 4;
1642 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11
) = 1;
1643 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12
) = 2;
1644 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21
) = 1;
1645 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22
) = 2;
1652 Qascii
= intern ("ascii");
1653 staticpro (&Qascii
);
1655 Qcharset
= intern ("charset");
1656 staticpro (&Qcharset
);
1658 /* Define ASCII charset now. */
1659 update_charset_table (make_number (CHARSET_ASCII
),
1660 make_number (1), make_number (94),
1665 build_string ("ASCII"),
1666 build_string ("ASCII"),
1667 build_string ("ASCII (ISO646 IRV)"));
1668 CHARSET_SYMBOL (CHARSET_ASCII
) = Qascii
;
1669 Fput (Qascii
, Qcharset
, CHARSET_TABLE_ENTRY (CHARSET_ASCII
));
1671 Qcomposition
= intern ("composition");
1672 staticpro (&Qcomposition
);
1673 CHARSET_SYMBOL (CHARSET_COMPOSITION
) = Qcomposition
;
1675 defsubr (&Sdefine_charset
);
1676 defsubr (&Sget_unused_iso_final_char
);
1677 defsubr (&Sdeclare_equiv_charset
);
1678 defsubr (&Sfind_charset_region
);
1679 defsubr (&Sfind_charset_string
);
1680 defsubr (&Smake_char_internal
);
1681 defsubr (&Ssplit_char
);
1682 defsubr (&Schar_charset
);
1683 defsubr (&Siso_charset
);
1684 defsubr (&Schar_valid_p
);
1685 defsubr (&Sunibyte_char_to_multibyte
);
1686 defsubr (&Schar_bytes
);
1687 defsubr (&Schar_width
);
1688 defsubr (&Sstring_width
);
1689 defsubr (&Schar_direction
);
1690 defsubr (&Schars_in_region
);
1692 defsubr (&Scmpcharp
);
1693 defsubr (&Scmpchar_component
);
1694 defsubr (&Scmpchar_cmp_rule
);
1695 defsubr (&Scmpchar_cmp_rule_p
);
1696 defsubr (&Scmpchar_cmp_count
);
1697 defsubr (&Scompose_string
);
1698 defsubr (&Ssetup_special_charsets
);
1700 DEFVAR_LISP ("charset-list", &Vcharset_list
,
1701 "List of charsets ever defined.");
1702 Vcharset_list
= Fcons (Qascii
, Qnil
);
1704 DEFVAR_LISP ("character-unification-table-vector",
1705 &Vcharacter_unification_table_vector
,
1706 "Vector of cons cell of a symbol and unification table ever defined.\n\
1707 An ID of a unification table is an index of this vector.");
1708 Vcharacter_unification_table_vector
= Fmake_vector (make_number (16), Qnil
);
1710 DEFVAR_INT ("leading-code-composition", &leading_code_composition
,
1711 "Leading-code of composite characters.");
1712 leading_code_composition
= LEADING_CODE_COMPOSITION
;
1714 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11
,
1715 "Leading-code of private TYPE9N charset of column-width 1.");
1716 leading_code_private_11
= LEADING_CODE_PRIVATE_11
;
1718 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12
,
1719 "Leading-code of private TYPE9N charset of column-width 2.");
1720 leading_code_private_12
= LEADING_CODE_PRIVATE_12
;
1722 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21
,
1723 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1724 leading_code_private_21
= LEADING_CODE_PRIVATE_21
;
1726 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22
,
1727 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1728 leading_code_private_22
= LEADING_CODE_PRIVATE_22
;
1730 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset
,
1731 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1732 This is used for converting unibyte text to multibyte,\n\
1733 and for inserting character codes specified by number.\n\n\
1734 Conversion is performed only when multibyte characters are enabled,\n\
1735 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1736 to the corresponding Emacs character code.\n\
1737 If `nonascii-translate-table' is non-nil, it overrides this variable.");
1738 nonascii_insert_offset
= 0;
1740 DEFVAR_LISP ("nonascii-translate-table", &Vnonascii_translate_table
,
1741 "Translate table for converting non-ASCII unibyte codes to multibyte.\n\
1742 This is used for converting unibyte text to multibyte,\n\
1743 and for inserting character codes specified by number.\n\n\
1744 Conversion is performed only when multibyte characters are enabled,\n\
1745 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1746 to the corresponding Emacs character code.\n\n\
1747 If this is nil, `nonascii-insert-offset' is used instead.");
1748 Vnonascii_translate_table
= Qnil
;
1750 DEFVAR_INT ("min-composite-char", &min_composite_char
,
1751 "Minimum character code of a composite character.");
1752 min_composite_char
= MIN_CHAR_COMPOSITION
;