1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /* At first, see the document in `charset.h' to understand the code in
29 #include <sys/types.h>
43 Lisp_Object Qcharset
, Qascii
, Qcomposition
;
45 /* Declaration of special leading-codes. */
46 int leading_code_composition
; /* for composite characters */
47 int leading_code_private_11
; /* for private DIMENSION1 of 1-column */
48 int leading_code_private_12
; /* for private DIMENSION1 of 2-column */
49 int leading_code_private_21
; /* for private DIMENSION2 of 1-column */
50 int leading_code_private_22
; /* for private DIMENSION2 of 2-column */
52 /* Declaration of special charsets. */
53 int charset_ascii
; /* ASCII */
54 int charset_composition
; /* for a composite character */
55 int charset_latin_iso8859_1
; /* ISO8859-1 (Latin-1) */
56 int charset_jisx0208_1978
; /* JISX0208.1978 (Japanese Kanji old set) */
57 int charset_jisx0208
; /* JISX0208.1983 (Japanese Kanji) */
58 int charset_katakana_jisx0201
; /* JISX0201.Kana (Japanese Katakana) */
59 int charset_latin_jisx0201
; /* JISX0201.Roman (Japanese Roman) */
60 int charset_big5_1
; /* Big5 Level 1 (Chinese Traditional) */
61 int charset_big5_2
; /* Big5 Level 2 (Chinese Traditional) */
63 int min_composite_char
;
65 Lisp_Object Qcharset_table
;
67 /* A char-table containing information of each character set. */
68 Lisp_Object Vcharset_table
;
70 /* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72 Lisp_Object Vcharset_symbol_table
;
74 /* A list of charset symbols ever defined. */
75 Lisp_Object Vcharset_list
;
77 /* Vector of unification table ever defined.
78 An ID of a unification table is an index of this vector. */
79 Lisp_Object Vcharacter_unification_table_vector
;
81 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
82 int bytes_by_char_head
[256];
83 int width_by_char_head
[256];
85 /* Mapping table from ISO2022's charset (specified by DIMENSION,
86 CHARS, and FINAL-CHAR) to Emacs' charset. */
87 int iso_charset_table
[2][2][128];
89 /* Table of pointers to the structure `cmpchar_info' indexed by
91 struct cmpchar_info
**cmpchar_table
;
92 /* The current size of `cmpchar_table'. */
93 static int cmpchar_table_size
;
94 /* Number of the current composite characters. */
97 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
98 unsigned char *_fetch_multibyte_char_p
;
99 int _fetch_multibyte_char_len
;
101 /* Offset to add to a non-ASCII value when inserting it. */
102 int nonascii_insert_offset
;
104 /* Translation table for converting non-ASCII unibyte characters
105 to multibyte codes, or nil. */
106 Lisp_Object Vnonascii_translate_table
;
108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
112 invalid_character (c
)
115 error ("Invalid character: %o, %d, 0x%x", c
);
119 /* Set STR a pointer to the multi-byte form of the character C. If C
120 is not a composite character, the multi-byte form is set in WORKBUF
121 and STR points WORKBUF. The caller should allocate at least 4-byte
122 area at WORKBUF in advance. Returns the length of the multi-byte
123 form. If C is an invalid character to have a multi-byte form,
126 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
127 function directly if C can be an ASCII character. */
130 non_ascii_char_to_string (c
, workbuf
, str
)
132 unsigned char *workbuf
, **str
;
136 if (COMPOSITE_CHAR_P (c
))
138 int cmpchar_id
= COMPOSITE_CHAR_ID (c
);
140 if (cmpchar_id
< n_cmpchars
)
142 *str
= cmpchar_table
[cmpchar_id
]->data
;
143 return cmpchar_table
[cmpchar_id
]->len
;
147 invalid_character (c
);
151 SPLIT_NON_ASCII_CHAR (c
, charset
, c1
, c2
);
153 || ! CHARSET_DEFINED_P (charset
)
154 || c1
>= 0 && c1
< 32
155 || c2
>= 0 && c2
< 32)
156 invalid_character (c
);
159 *workbuf
++ = CHARSET_LEADING_CODE_BASE (charset
);
160 if (*workbuf
= CHARSET_LEADING_CODE_EXT (charset
))
162 *workbuf
++ = c1
| 0x80;
164 *workbuf
++ = c2
| 0x80;
166 return (workbuf
- *str
);
169 /* Return a non-ASCII character of which multi-byte form is at STR of
170 length LEN. If ACTUAL_LEN is not NULL, the actual length of the
171 character is set to the address ACTUAL_LEN.
173 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
174 directly if STR can hold an ASCII character. */
176 string_to_non_ascii_char (str
, len
, actual_len
)
178 int len
, *actual_len
;
181 unsigned char c1
, c2
;
184 if (SPLIT_STRING (str
, len
, charset
, c1
, c2
) == CHARSET_ASCII
)
191 c
= MAKE_NON_ASCII_CHAR (charset
, c1
, c2
);
194 *actual_len
= (charset
== CHARSET_COMPOSITION
195 ? cmpchar_table
[COMPOSITE_CHAR_ID (c
)]->len
196 : BYTES_BY_CHAR_HEAD (*str
));
200 /* Return the length of the multi-byte form at string STR of length LEN. */
202 multibyte_form_length (str
, len
)
207 unsigned char c1
, c2
;
210 if (SPLIT_STRING (str
, len
, charset
, c1
, c2
) == CHARSET_ASCII
)
213 return (charset
== CHARSET_COMPOSITION
214 ? cmpchar_table
[(c1
<< 7) | c2
]->len
215 : BYTES_BY_CHAR_HEAD (*str
));
218 /* Check if string STR of length LEN contains valid multi-byte form of
219 a character. If valid, charset and position codes of the character
220 is set at *CHARSET, *C1, and *C2, and return 0. If not valid,
221 return -1. This should be used only in the macro SPLIT_STRING
222 which checks range of STR in advance. */
224 split_non_ascii_string (str
, len
, charset
, c1
, c2
)
225 register unsigned char *str
, *c1
, *c2
;
226 register int len
, *charset
;
228 register unsigned int cs
= *str
++;
230 if (cs
== LEADING_CODE_COMPOSITION
)
232 int cmpchar_id
= str_cmpchar_id (str
- 1, len
);
236 *charset
= cs
, *c1
= cmpchar_id
>> 7, *c2
= cmpchar_id
& 0x7F;
238 else if ((cs
< LEADING_CODE_PRIVATE_11
|| (cs
= *str
++) >= 0xA0)
239 && CHARSET_DEFINED_P (cs
))
244 *c1
= (*str
++) & 0x7F;
245 if (CHARSET_DIMENSION (cs
) == 2)
249 *c2
= (*str
++) & 0x7F;
257 /* Return a character unified with C (or a character made of CHARSET,
258 C1, and C2) in unification table TABLE. If no unification is found
259 in TABLE, return C. */
260 unify_char (table
, c
, charset
, c1
, c2
)
262 int c
, charset
, c1
, c2
;
265 int alt_charset
, alt_c1
, alt_c2
, dimension
;
267 if (c
< 0) c
= MAKE_CHAR (charset
, c1
, c2
);
268 if (!CHAR_TABLE_P (table
)
269 || (ch
= Faref (table
, make_number (c
)), !INTEGERP (ch
))
273 SPLIT_CHAR (XFASTINT (ch
), alt_charset
, alt_c1
, alt_c2
);
274 dimension
= CHARSET_DIMENSION (alt_charset
);
275 if (dimension
== 1 && alt_c1
> 0 || dimension
== 2 && alt_c2
> 0)
276 /* CH is not a generic character, just return it. */
277 return XFASTINT (ch
);
279 /* Since CH is a generic character, we must return a specific
280 charater which has the same position codes as C from CH. */
282 SPLIT_CHAR (c
, charset
, c1
, c2
);
283 if (dimension
!= CHARSET_DIMENSION (charset
))
284 /* We can't make such a character because of dimension mismatch. */
286 return MAKE_CHAR (alt_charset
, c1
, c2
);
289 /* Convert the unibyte character C to multibyte based on
290 Vnonascii_translate_table or nonascii_insert_offset. If they can't
291 convert C to a valid multibyte character, convert it based on
292 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
294 unibyte_char_to_multibyte (c
)
297 if (c
>= 0240 && c
< 0400)
301 if (! NILP (Vnonascii_translate_table
))
302 c
= XINT (Faref (Vnonascii_translate_table
, make_number (c
)));
303 else if (nonascii_insert_offset
> 0)
304 c
+= nonascii_insert_offset
;
305 if (c
>= 0240 && (c
< 0400 || ! VALID_MULTIBYTE_CHAR_P (c
)))
306 c
= c_save
+ DEFAULT_NONASCII_INSERT_OFFSET
;
311 /* Update the table Vcharset_table with the given arguments (see the
312 document of `define-charset' for the meaning of each argument).
313 Several other table contents are also updated. The caller should
314 check the validity of CHARSET-ID and the remaining arguments in
318 update_charset_table (charset_id
, dimension
, chars
, width
, direction
,
319 iso_final_char
, iso_graphic_plane
,
320 short_name
, long_name
, description
)
321 Lisp_Object charset_id
, dimension
, chars
, width
, direction
;
322 Lisp_Object iso_final_char
, iso_graphic_plane
;
323 Lisp_Object short_name
, long_name
, description
;
325 int charset
= XINT (charset_id
);
327 unsigned char leading_code_base
, leading_code_ext
;
329 if (NILP (CHARSET_TABLE_ENTRY (charset
)))
330 CHARSET_TABLE_ENTRY (charset
)
331 = Fmake_vector (make_number (CHARSET_MAX_IDX
), Qnil
);
333 /* Get byte length of multibyte form, base leading-code, and
334 extended leading-code of the charset. See the comment under the
335 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
336 bytes
= XINT (dimension
);
337 if (charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
339 /* Official charset, it doesn't have an extended leading-code. */
340 if (charset
!= CHARSET_ASCII
)
341 bytes
+= 1; /* For a base leading-code. */
342 leading_code_base
= charset
;
343 leading_code_ext
= 0;
347 /* Private charset. */
348 bytes
+= 2; /* For base and extended leading-codes. */
350 = (charset
< LEADING_CODE_EXT_12
351 ? LEADING_CODE_PRIVATE_11
352 : (charset
< LEADING_CODE_EXT_21
353 ? LEADING_CODE_PRIVATE_12
354 : (charset
< LEADING_CODE_EXT_22
355 ? LEADING_CODE_PRIVATE_21
356 : LEADING_CODE_PRIVATE_22
)));
357 leading_code_ext
= charset
;
360 CHARSET_TABLE_INFO (charset
, CHARSET_ID_IDX
) = charset_id
;
361 CHARSET_TABLE_INFO (charset
, CHARSET_BYTES_IDX
) = make_number (bytes
);
362 CHARSET_TABLE_INFO (charset
, CHARSET_DIMENSION_IDX
) = dimension
;
363 CHARSET_TABLE_INFO (charset
, CHARSET_CHARS_IDX
) = chars
;
364 CHARSET_TABLE_INFO (charset
, CHARSET_WIDTH_IDX
) = width
;
365 CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
) = direction
;
366 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_BASE_IDX
)
367 = make_number (leading_code_base
);
368 CHARSET_TABLE_INFO (charset
, CHARSET_LEADING_CODE_EXT_IDX
)
369 = make_number (leading_code_ext
);
370 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_FINAL_CHAR_IDX
) = iso_final_char
;
371 CHARSET_TABLE_INFO (charset
, CHARSET_ISO_GRAPHIC_PLANE_IDX
)
373 CHARSET_TABLE_INFO (charset
, CHARSET_SHORT_NAME_IDX
) = short_name
;
374 CHARSET_TABLE_INFO (charset
, CHARSET_LONG_NAME_IDX
) = long_name
;
375 CHARSET_TABLE_INFO (charset
, CHARSET_DESCRIPTION_IDX
) = description
;
376 CHARSET_TABLE_INFO (charset
, CHARSET_PLIST_IDX
) = Qnil
;
379 /* If we have already defined a charset which has the same
380 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
381 DIRECTION, we must update the entry REVERSE-CHARSET of both
382 charsets. If there's no such charset, the value of the entry
386 for (i
= 0; i
<= MAX_CHARSET
; i
++)
387 if (!NILP (CHARSET_TABLE_ENTRY (i
)))
389 if (CHARSET_DIMENSION (i
) == XINT (dimension
)
390 && CHARSET_CHARS (i
) == XINT (chars
)
391 && CHARSET_ISO_FINAL_CHAR (i
) == XINT (iso_final_char
)
392 && CHARSET_DIRECTION (i
) != XINT (direction
))
394 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
396 CHARSET_TABLE_INFO (i
, CHARSET_REVERSE_CHARSET_IDX
) = charset_id
;
401 /* No such a charset. */
402 CHARSET_TABLE_INFO (charset
, CHARSET_REVERSE_CHARSET_IDX
)
406 if (charset
!= CHARSET_ASCII
407 && charset
< MIN_CHARSET_PRIVATE_DIMENSION1
)
409 /* Update tables bytes_by_char_head and width_by_char_head. */
410 bytes_by_char_head
[leading_code_base
] = bytes
;
411 width_by_char_head
[leading_code_base
] = XINT (width
);
413 /* Update table emacs_code_class. */
414 emacs_code_class
[charset
] = (bytes
== 2
415 ? EMACS_leading_code_2
417 ? EMACS_leading_code_3
418 : EMACS_leading_code_4
));
421 /* Update table iso_charset_table. */
422 if (ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) < 0)
423 ISO_CHARSET_TABLE (dimension
, chars
, iso_final_char
) = charset
;
428 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
431 get_charset_id (charset_symbol
)
432 Lisp_Object charset_symbol
;
437 return ((SYMBOLP (charset_symbol
)
438 && (val
= Fget (charset_symbol
, Qcharset
), VECTORP (val
))
439 && (charset
= XINT (XVECTOR (val
)->contents
[CHARSET_ID_IDX
]),
440 CHARSET_VALID_P (charset
)))
444 /* Return an identification number for a new private charset of
445 DIMENSION and WIDTH. If there's no more room for the new charset,
448 get_new_private_charset_id (dimension
, width
)
449 int dimension
, width
;
451 int charset
, from
, to
;
456 from
= LEADING_CODE_EXT_11
, to
= LEADING_CODE_EXT_12
;
458 from
= LEADING_CODE_EXT_12
, to
= LEADING_CODE_EXT_21
;
463 from
= LEADING_CODE_EXT_21
, to
= LEADING_CODE_EXT_22
;
465 from
= LEADING_CODE_EXT_22
, to
= LEADING_CODE_EXT_MAX
+ 1;
468 for (charset
= from
; charset
< to
; charset
++)
469 if (!CHARSET_DEFINED_P (charset
)) break;
471 return make_number (charset
< to
? charset
: 0);
474 DEFUN ("define-charset", Fdefine_charset
, Sdefine_charset
, 3, 3, 0,
475 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
476 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
477 treated as a private charset.\n\
478 INFO-VECTOR is a vector of the format:\n\
479 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
480 SHORT-NAME LONG-NAME DESCRIPTION]\n\
481 The meanings of each elements is as follows:\n\
482 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
483 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
484 WIDTH (integer) is the number of columns a character in the charset\n\
485 occupies on the screen: one of 0, 1, and 2.\n\
487 DIRECTION (integer) is the rendering direction of characters in the\n\
488 charset when rendering. If 0, render from right to left, else\n\
489 render from left to right.\n\
491 ISO-FINAL-CHAR (character) is the final character of the\n\
492 corresponding ISO 2022 charset.\n\
494 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
495 while encoding to variants of ISO 2022 coding system, one of the\n\
496 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
498 SHORT-NAME (string) is the short name to refer to the charset.\n\
500 LONG-NAME (string) is the long name to refer to the charset.\n\
502 DESCRIPTION (string) is the description string of the charset.")
503 (charset_id
, charset_symbol
, info_vector
)
504 Lisp_Object charset_id
, charset_symbol
, info_vector
;
508 if (!NILP (charset_id
))
509 CHECK_NUMBER (charset_id
, 0);
510 CHECK_SYMBOL (charset_symbol
, 1);
511 CHECK_VECTOR (info_vector
, 2);
513 if (! NILP (charset_id
))
515 if (! CHARSET_VALID_P (XINT (charset_id
)))
516 error ("Invalid CHARSET: %d", XINT (charset_id
));
517 else if (CHARSET_DEFINED_P (XINT (charset_id
)))
518 error ("Already defined charset: %d", XINT (charset_id
));
521 vec
= XVECTOR (info_vector
)->contents
;
522 if (XVECTOR (info_vector
)->size
!= 9
523 || !INTEGERP (vec
[0]) || !(XINT (vec
[0]) == 1 || XINT (vec
[0]) == 2)
524 || !INTEGERP (vec
[1]) || !(XINT (vec
[1]) == 94 || XINT (vec
[1]) == 96)
525 || !INTEGERP (vec
[2]) || !(XINT (vec
[2]) == 1 || XINT (vec
[2]) == 2)
526 || !INTEGERP (vec
[3]) || !(XINT (vec
[3]) == 0 || XINT (vec
[3]) == 1)
527 || !INTEGERP (vec
[4]) || !(XINT (vec
[4]) >= '0' && XINT (vec
[4]) <= '~')
528 || !INTEGERP (vec
[5]) || !(XINT (vec
[5]) == 0 || XINT (vec
[5]) == 1)
531 || !STRINGP (vec
[8]))
532 error ("Invalid info-vector argument for defining charset %s",
533 XSYMBOL (charset_symbol
)->name
->data
);
535 if (NILP (charset_id
))
537 charset_id
= get_new_private_charset_id (XINT (vec
[0]), XINT (vec
[2]));
538 if (XINT (charset_id
) == 0)
539 error ("There's no room for a new private charset %s",
540 XSYMBOL (charset_symbol
)->name
->data
);
543 update_charset_table (charset_id
, vec
[0], vec
[1], vec
[2], vec
[3],
544 vec
[4], vec
[5], vec
[6], vec
[7], vec
[8]);
545 Fput (charset_symbol
, Qcharset
, CHARSET_TABLE_ENTRY (XINT (charset_id
)));
546 CHARSET_SYMBOL (XINT (charset_id
)) = charset_symbol
;
547 Vcharset_list
= Fcons (charset_symbol
, Vcharset_list
);
551 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char
,
552 Sget_unused_iso_final_char
, 2, 2, 0,
553 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
554 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
555 CHARS is the number of characters in a dimension: 94 or 96.\n\
557 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
558 If there's no unused final char for the specified kind of charset,\n\
561 Lisp_Object dimension
, chars
;
565 CHECK_NUMBER (dimension
, 0);
566 CHECK_NUMBER (chars
, 1);
567 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
568 error ("Invalid charset dimension %d, it should be 1 or 2",
570 if (XINT (chars
) != 94 && XINT (chars
) != 96)
571 error ("Invalid charset chars %d, it should be 94 or 96",
573 for (final_char
= '0'; final_char
<= '?'; final_char
++)
575 if (ISO_CHARSET_TABLE (dimension
, chars
, make_number (final_char
)) < 0)
578 return (final_char
<= '?' ? make_number (final_char
) : Qnil
);
581 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset
, Sdeclare_equiv_charset
,
583 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
584 CHARSET should be defined by `defined-charset' in advance.")
585 (dimension
, chars
, final_char
, charset_symbol
)
586 Lisp_Object dimension
, chars
, final_char
, charset_symbol
;
590 CHECK_NUMBER (dimension
, 0);
591 CHECK_NUMBER (chars
, 1);
592 CHECK_NUMBER (final_char
, 2);
593 CHECK_SYMBOL (charset_symbol
, 3);
595 if (XINT (dimension
) != 1 && XINT (dimension
) != 2)
596 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension
));
597 if (XINT (chars
) != 94 && XINT (chars
) != 96)
598 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars
));
599 if (XINT (final_char
) < '0' || XFASTINT (final_char
) > '~')
600 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars
));
601 if ((charset
= get_charset_id (charset_symbol
)) < 0)
602 error ("Invalid charset %s", XSYMBOL (charset_symbol
)->name
->data
);
604 ISO_CHARSET_TABLE (dimension
, chars
, final_char
) = charset
;
608 /* Return number of different charsets in STR of length LEN. In
609 addition, for each found charset N, CHARSETS[N] is set 1. The
610 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
611 It may lookup a unification table TABLE if supplied. */
614 find_charset_in_str (str
, len
, charsets
, table
)
619 register int num
= 0, c
;
621 if (! CHAR_TABLE_P (table
))
629 if (c
== LEADING_CODE_COMPOSITION
)
631 int cmpchar_id
= str_cmpchar_id (str
, len
);
636 struct cmpchar_info
*cmpcharp
= cmpchar_table
[cmpchar_id
];
639 for (i
= 0; i
< cmpcharp
->glyph_len
; i
++)
641 c
= cmpcharp
->glyph
[i
];
644 if ((c
= unify_char (table
, c
, 0, 0, 0)) < 0)
645 c
= cmpcharp
->glyph
[i
];
647 if ((charset
= CHAR_CHARSET (c
)) < 0)
648 charset
= CHARSET_ASCII
;
649 if (!charsets
[charset
])
651 charsets
[charset
] = 1;
655 str
+= cmpcharp
->len
;
656 len
-= cmpcharp
->len
;
660 charset
= CHARSET_ASCII
;
665 c
= STRING_CHAR_AND_LENGTH (str
, len
, bytes
);
668 int c1
= unify_char (table
, c
, 0, 0, 0);
672 charset
= CHAR_CHARSET (c
);
675 if (!charsets
[charset
])
677 charsets
[charset
] = 1;
686 DEFUN ("find-charset-region", Ffind_charset_region
, Sfind_charset_region
,
688 "Return a list of charsets in the region between BEG and END.\n\
689 BEG and END are buffer positions.\n\
690 Optional arg TABLE if non-nil is a unification table to look up.")
692 Lisp_Object beg
, end
, table
;
694 int charsets
[MAX_CHARSET
+ 1];
695 int from
, from_byte
, to
, stop
, stop_byte
, i
;
698 validate_region (&beg
, &end
);
699 from
= XFASTINT (beg
);
700 stop
= to
= XFASTINT (end
);
702 if (from
< GPT
&& GPT
< to
)
705 stop_byte
= GPT_BYTE
;
708 stop_byte
= CHAR_TO_BYTE (stop
);
710 from_byte
= CHAR_TO_BYTE (from
);
712 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
715 find_charset_in_str (BYTE_POS_ADDR (from_byte
), stop_byte
- from_byte
,
719 from
= stop
, from_byte
= stop_byte
;
720 stop
= to
, stop_byte
= CHAR_TO_BYTE (stop
);
727 for (i
= MAX_CHARSET
; i
>= 0; i
--)
729 val
= Fcons (CHARSET_SYMBOL (i
), val
);
733 DEFUN ("find-charset-string", Ffind_charset_string
, Sfind_charset_string
,
735 "Return a list of charsets in STR.\n\
736 Optional arg TABLE if non-nil is a unification table to look up.")
738 Lisp_Object str
, table
;
740 int charsets
[MAX_CHARSET
+ 1];
744 CHECK_STRING (str
, 0);
746 if (! STRING_MULTIBYTE (str
))
749 bzero (charsets
, (MAX_CHARSET
+ 1) * sizeof (int));
750 find_charset_in_str (XSTRING (str
)->data
, STRING_BYTES (XSTRING (str
)),
753 for (i
= MAX_CHARSET
; i
>= 0; i
--)
755 val
= Fcons (CHARSET_SYMBOL (i
), val
);
759 DEFUN ("make-char-internal", Fmake_char_internal
, Smake_char_internal
, 1, 3, 0,
761 (charset
, code1
, code2
)
762 Lisp_Object charset
, code1
, code2
;
764 CHECK_NUMBER (charset
, 0);
767 XSETFASTINT (code1
, 0);
769 CHECK_NUMBER (code1
, 1);
771 XSETFASTINT (code2
, 0);
773 CHECK_NUMBER (code2
, 2);
775 if (!CHARSET_DEFINED_P (XINT (charset
)))
776 error ("Invalid charset: %d", XINT (charset
));
778 return make_number (MAKE_CHAR (XINT (charset
), XINT (code1
), XINT (code2
)));
781 DEFUN ("split-char", Fsplit_char
, Ssplit_char
, 1, 1, 0,
782 "Return list of charset and one or two position-codes of CHAR.")
789 CHECK_NUMBER (ch
, 0);
790 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
792 ? Fcons (CHARSET_SYMBOL (charset
),
793 Fcons (make_number (c1
), Fcons (make_number (c2
), Qnil
)))
794 : Fcons (CHARSET_SYMBOL (charset
), Fcons (make_number (c1
), Qnil
)));
797 DEFUN ("char-charset", Fchar_charset
, Schar_charset
, 1, 1, 0,
798 "Return charset of CHAR.")
802 CHECK_NUMBER (ch
, 0);
804 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch
)));
807 DEFUN ("iso-charset", Fiso_charset
, Siso_charset
, 3, 3, 0,
808 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
810 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
811 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
812 where as Emacs distinguishes them by charset symbol.\n\
813 See the documentation of the function `charset-info' for the meanings of\n\
814 DIMENSION, CHARS, and FINAL-CHAR.")
815 (dimension
, chars
, final_char
)
816 Lisp_Object dimension
, chars
, final_char
;
820 CHECK_NUMBER (dimension
, 0);
821 CHECK_NUMBER (chars
, 1);
822 CHECK_NUMBER (final_char
, 2);
824 if ((charset
= ISO_CHARSET_TABLE (dimension
, chars
, final_char
)) < 0)
826 return CHARSET_SYMBOL (charset
);
829 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
830 generic character. If GENERICP is zero, return nonzero iff C is a
831 valid normal character. Do not call this function directly,
832 instead use macro CHAR_VALID_P. */
834 char_valid_p (c
, genericp
)
841 if (SINGLE_BYTE_CHAR_P (c
))
843 SPLIT_NON_ASCII_CHAR (c
, charset
, c1
, c2
);
844 if (!CHARSET_VALID_P (charset
))
846 return (c
< MIN_CHAR_COMPOSITION
847 ? ((c
& CHAR_FIELD1_MASK
) /* i.e. dimension of C is two. */
848 ? (genericp
&& c1
== 0 && c2
== 0
849 || c1
>= 32 && c2
>= 32)
850 : (genericp
&& c1
== 0
852 : c
< MIN_CHAR_COMPOSITION
+ n_cmpchars
);
855 DEFUN ("char-valid-p", Fchar_valid_p
, Schar_valid_p
, 1, 2, 0,
856 "Return t if OBJECT is a valid normal character.\n\
857 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
858 a valid generic character.")
860 Lisp_Object object
, genericp
;
862 if (! NATNUMP (object
))
864 return (CHAR_VALID_P (XFASTINT (object
), !NILP (genericp
)) ? Qt
: Qnil
);
867 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
868 Sunibyte_char_to_multibyte
, 1, 1, 0,
869 "Convert the unibyte character CH to multibyte character.\n\
870 The conversion is done based on nonascii-translate-table (which see)\n\
871 or nonascii-insert-offset (which see).")
877 CHECK_NUMBER (ch
, 0);
879 if (c
< 0 || c
>= 0400)
880 error ("Invalid unibyte character: %d", c
);
881 c
= unibyte_char_to_multibyte (c
);
883 error ("Can't convert to multibyte character: %d", XINT (ch
));
884 return make_number (c
);
887 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
888 "Return byte length of multi-byte form of CHAR.")
895 CHECK_NUMBER (ch
, 0);
896 if (COMPOSITE_CHAR_P (XFASTINT (ch
)))
898 unsigned int id
= COMPOSITE_CHAR_ID (XFASTINT (ch
));
900 bytes
= (id
< n_cmpchars
? cmpchar_table
[id
]->len
: 1);
904 int charset
= CHAR_CHARSET (XFASTINT (ch
));
906 bytes
= CHARSET_DEFINED_P (charset
) ? CHARSET_BYTES (charset
) : 1;
909 XSETFASTINT (val
, bytes
);
913 /* Return the width of character of which multi-byte form starts with
914 C. The width is measured by how many columns occupied on the
915 screen when displayed in the current buffer. */
917 #define ONE_BYTE_CHAR_WIDTH(c) \
920 ? XFASTINT (current_buffer->tab_width) \
921 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
925 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
926 : ((! NILP (current_buffer->enable_multibyte_characters) \
927 && BASE_LEADING_CODE_P (c)) \
928 ? WIDTH_BY_CHAR_HEAD (c) \
932 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
933 "Return width of CHAR when displayed in the current buffer.\n\
934 The width is measured by how many columns it occupies on the screen.")
938 Lisp_Object val
, disp
;
940 struct Lisp_Char_Table
*dp
= buffer_display_table ();
942 CHECK_NUMBER (ch
, 0);
946 /* Get the way the display table would display it. */
947 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
950 XSETINT (val
, XVECTOR (disp
)->size
);
951 else if (SINGLE_BYTE_CHAR_P (c
))
952 XSETINT (val
, ONE_BYTE_CHAR_WIDTH (c
));
953 else if (COMPOSITE_CHAR_P (c
))
955 int id
= COMPOSITE_CHAR_ID (XFASTINT (ch
));
956 XSETFASTINT (val
, (id
< n_cmpchars
? cmpchar_table
[id
]->width
: 0));
960 int charset
= CHAR_CHARSET (c
);
962 XSETFASTINT (val
, CHARSET_WIDTH (charset
));
967 /* Return width of string STR of length LEN when displayed in the
968 current buffer. The width is measured by how many columns it
969 occupies on the screen. */
976 unsigned char *endp
= str
+ len
;
978 struct Lisp_Char_Table
*dp
= buffer_display_table ();
982 if (*str
== LEADING_CODE_COMPOSITION
)
984 int id
= str_cmpchar_id (str
, endp
- str
);
993 width
+= cmpchar_table
[id
]->width
;
994 str
+= cmpchar_table
[id
]->len
;
1001 int c
= STRING_CHAR_AND_LENGTH (str
, endp
- str
, thislen
);
1003 /* Get the way the display table would display it. */
1005 disp
= DISP_CHAR_VECTOR (dp
, c
);
1010 width
+= XVECTOR (disp
)->size
;
1012 width
+= ONE_BYTE_CHAR_WIDTH (*str
);
1020 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
1021 "Return width of STRING when displayed in the current buffer.\n\
1022 Width is measured by how many columns it occupies on the screen.\n\
1023 When calculating width of a multibyte character in STRING,\n\
1024 only the base leading-code is considered; the validity of\n\
1025 the following bytes is not checked.")
1031 CHECK_STRING (str
, 0);
1032 XSETFASTINT (val
, strwidth (XSTRING (str
)->data
,
1033 STRING_BYTES (XSTRING (str
))));
1037 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
1038 "Return the direction of CHAR.\n\
1039 The returned value is 0 for left-to-right and 1 for right-to-left.")
1045 CHECK_NUMBER (ch
, 0);
1046 charset
= CHAR_CHARSET (XFASTINT (ch
));
1047 if (!CHARSET_DEFINED_P (charset
))
1048 invalid_character (XINT (ch
));
1049 return CHARSET_TABLE_INFO (charset
, CHARSET_DIRECTION_IDX
);
1052 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
1053 "Return number of characters between BEG and END.")
1055 Lisp_Object beg
, end
;
1059 from
= min (XFASTINT (beg
), XFASTINT (end
));
1060 to
= max (XFASTINT (beg
), XFASTINT (end
));
1065 /* Return the number of characters in the NBYTES bytes at PTR.
1066 This works by looking at the contents and checking for multibyte sequences.
1067 However, if the current buffer has enable-multibyte-characters = nil,
1068 we treat each byte as a character. */
1071 chars_in_text (ptr
, nbytes
)
1075 unsigned char *endp
, c
;
1078 /* current_buffer is null at early stages of Emacs initialization. */
1079 if (current_buffer
== 0
1080 || NILP (current_buffer
->enable_multibyte_characters
))
1083 endp
= ptr
+ nbytes
;
1090 if (BASE_LEADING_CODE_P (c
))
1091 while (ptr
< endp
&& ! CHAR_HEAD_P (*ptr
)) ptr
++;
1098 /* Return the number of characters in the NBYTES bytes at PTR.
1099 This works by looking at the contents and checking for multibyte sequences.
1100 It ignores enable-multibyte-characters. */
1103 multibyte_chars_in_text (ptr
, nbytes
)
1107 unsigned char *endp
, c
;
1110 endp
= ptr
+ nbytes
;
1117 if (BASE_LEADING_CODE_P (c
))
1118 while (ptr
< endp
&& ! CHAR_HEAD_P (*ptr
)) ptr
++;
1125 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
1126 "Concatenate all the argument characters and make the result a string.")
1133 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM
* n
);
1134 unsigned char *p
= buf
;
1137 for (i
= 0; i
< n
; i
++)
1142 if (!INTEGERP (args
[i
]))
1143 CHECK_NUMBER (args
[i
], 0);
1145 len
= CHAR_STRING (c
, p
, str
);
1147 /* C is a composite character. */
1148 bcopy (str
, p
, len
);
1152 val
= make_multibyte_string (buf
, n
, p
- buf
);
1158 /*** Composite characters staffs ***/
1160 /* Each composite character is identified by CMPCHAR-ID which is
1161 assigned when Emacs needs the character code of the composite
1162 character (e.g. when displaying it on the screen). See the
1163 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1164 composite character is represented in Emacs. */
1166 /* If `static' is defined, it means that it is defined to null string. */
1168 /* The following function is copied from lread.c. */
1170 hash_string (ptr
, len
)
1174 register unsigned char *p
= ptr
;
1175 register unsigned char *end
= p
+ len
;
1176 register unsigned char c
;
1177 register int hash
= 0;
1182 if (c
>= 0140) c
-= 40;
1183 hash
= ((hash
<<3) + (hash
>>28) + c
);
1185 return hash
& 07777777777;
1189 #define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1191 static int *cmpchar_hash_table
[CMPCHAR_HASH_TABLE_SIZE
];
1193 /* Each element of `cmpchar_hash_table' is a pointer to an array of
1194 integer, where the 1st element is the size of the array, the 2nd
1195 element is how many elements are actually used in the array, and
1196 the remaining elements are CMPCHAR-IDs of composite characters of
1197 the same hash value. */
1198 #define CMPCHAR_HASH_SIZE(table) table[0]
1199 #define CMPCHAR_HASH_USED(table) table[1]
1200 #define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1202 /* Return CMPCHAR-ID of the composite character in STR of the length
1203 LEN. If the composite character has not yet been registered,
1204 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1205 is the sole function for assigning CMPCHAR-ID. */
1207 str_cmpchar_id (str
, len
)
1211 int hash_idx
, *hashp
;
1213 int embedded_rule
; /* 1 if composition rule is embedded. */
1214 int chars
; /* number of components. */
1216 struct cmpchar_info
*cmpcharp
;
1218 /* The second byte 0xFF means compostion rule is embedded. */
1219 embedded_rule
= (str
[1] == 0xFF);
1221 /* At first, get the actual length of the composite character. */
1223 unsigned char *p
, *endp
= str
+ 1, *lastp
= str
+ len
;
1226 while (endp
< lastp
&& ! CHAR_HEAD_P (*endp
)) endp
++;
1228 /* Any composite char have at least 5-byte length. */
1235 if (embedded_rule
) p
++;
1236 /* No need of checking if *P is 0xA0 because
1237 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1238 p
+= BYTES_BY_CHAR_HEAD (*p
- 0x20);
1241 if (p
> endp
|| chars
< 2 || chars
> MAX_COMPONENT_COUNT
)
1242 /* Invalid components. */
1246 hash_idx
= hash_string (str
, len
) % CMPCHAR_HASH_TABLE_SIZE
;
1247 hashp
= cmpchar_hash_table
[hash_idx
];
1249 /* Then, look into the hash table. */
1251 /* Find the correct one among composite characters of the same
1253 for (i
= 2; i
< CMPCHAR_HASH_USED (hashp
); i
++)
1255 cmpcharp
= cmpchar_table
[CMPCHAR_HASH_CMPCHAR_ID (hashp
, i
)];
1256 if (len
== cmpcharp
->len
1257 && ! bcmp (str
, cmpcharp
->data
, len
))
1258 return CMPCHAR_HASH_CMPCHAR_ID (hashp
, i
);
1261 /* We have to register the composite character in cmpchar_table. */
1262 if (n_cmpchars
> (CHAR_FIELD2_MASK
| CHAR_FIELD3_MASK
))
1263 /* No, we have no more room for a new composite character. */
1266 /* Make the entry in hash table. */
1269 /* Make a table for 8 composite characters initially. */
1270 hashp
= (cmpchar_hash_table
[hash_idx
]
1271 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1272 CMPCHAR_HASH_SIZE (hashp
) = 10;
1273 CMPCHAR_HASH_USED (hashp
) = 2;
1275 else if (CMPCHAR_HASH_USED (hashp
) >= CMPCHAR_HASH_SIZE (hashp
))
1277 CMPCHAR_HASH_SIZE (hashp
) += 8;
1278 hashp
= (cmpchar_hash_table
[hash_idx
]
1279 = (int *) xrealloc (hashp
,
1280 sizeof (int) * CMPCHAR_HASH_SIZE (hashp
)));
1282 CMPCHAR_HASH_CMPCHAR_ID (hashp
, CMPCHAR_HASH_USED (hashp
)) = n_cmpchars
;
1283 CMPCHAR_HASH_USED (hashp
)++;
1285 /* Set information of the composite character in cmpchar_table. */
1286 if (cmpchar_table_size
== 0)
1288 /* This is the first composite character to be registered. */
1289 cmpchar_table_size
= 256;
1291 = (struct cmpchar_info
**) xmalloc (sizeof (cmpchar_table
[0])
1292 * cmpchar_table_size
);
1294 else if (cmpchar_table_size
<= n_cmpchars
)
1296 cmpchar_table_size
+= 256;
1298 = (struct cmpchar_info
**) xrealloc (cmpchar_table
,
1299 sizeof (cmpchar_table
[0])
1300 * cmpchar_table_size
);
1303 cmpcharp
= (struct cmpchar_info
*) xmalloc (sizeof (struct cmpchar_info
));
1305 cmpcharp
->len
= len
;
1306 cmpcharp
->data
= (unsigned char *) xmalloc (len
+ 1);
1307 bcopy (str
, cmpcharp
->data
, len
);
1308 cmpcharp
->data
[len
] = 0;
1309 cmpcharp
->glyph_len
= chars
;
1310 cmpcharp
->glyph
= (GLYPH
*) xmalloc (sizeof (GLYPH
) * chars
);
1313 cmpcharp
->cmp_rule
= (unsigned char *) xmalloc (chars
);
1314 cmpcharp
->col_offset
= (float *) xmalloc (sizeof (float) * chars
);
1318 cmpcharp
->cmp_rule
= NULL
;
1319 cmpcharp
->col_offset
= NULL
;
1322 /* Setup GLYPH data and composition rules (if any) so as not to make
1323 them every time on displaying. */
1325 unsigned char *bufp
;
1327 float leftmost
= 0.0, rightmost
= 1.0;
1330 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1331 cmpcharp
->col_offset
[0] = 0;
1333 for (i
= 0, bufp
= cmpcharp
->data
+ 1; i
< chars
; i
++)
1336 cmpcharp
->cmp_rule
[i
] = *bufp
++;
1338 if (*bufp
== 0xA0) /* This is an ASCII character. */
1340 cmpcharp
->glyph
[i
] = FAST_MAKE_GLYPH ((*++bufp
& 0x7F), 0);
1344 else /* Multibyte character. */
1346 /* Make `bufp' point normal multi-byte form temporally. */
1349 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp
, 4, 0), 0);
1350 width
= WIDTH_BY_CHAR_HEAD (*bufp
);
1352 bufp
+= BYTES_BY_CHAR_HEAD (*bufp
- 0x20);
1355 if (embedded_rule
&& i
> 0)
1357 /* Reference points (global_ref and new_ref) are
1368 Now, we calculate the column offset of the new glyph
1369 from the left edge of the first glyph. This can avoid
1370 the same calculation everytime displaying this
1371 composite character. */
1373 /* Reference points of global glyph and new glyph. */
1374 int global_ref
= (cmpcharp
->cmp_rule
[i
] - 0xA0) / 9;
1375 int new_ref
= (cmpcharp
->cmp_rule
[i
] - 0xA0) % 9;
1376 /* Column offset relative to the first glyph. */
1377 float left
= (leftmost
1378 + (global_ref
% 3) * (rightmost
- leftmost
) / 2.0
1379 - (new_ref
% 3) * width
/ 2.0);
1381 cmpcharp
->col_offset
[i
] = left
;
1382 if (left
< leftmost
)
1384 if (left
+ width
> rightmost
)
1385 rightmost
= left
+ width
;
1389 if (width
> rightmost
)
1395 /* Now col_offset[N] are relative to the left edge of the
1396 first component. Make them relative to the left edge of
1398 for (i
= 0; i
< chars
; i
++)
1399 cmpcharp
->col_offset
[i
] -= leftmost
;
1400 /* Make rightmost holds width of overall glyph. */
1401 rightmost
-= leftmost
;
1404 cmpcharp
->width
= rightmost
;
1405 if (cmpcharp
->width
< rightmost
)
1406 /* To get a ceiling integer value. */
1410 cmpchar_table
[n_cmpchars
] = cmpcharp
;
1412 return n_cmpchars
++;
1415 /* Return the Nth element of the composite character C. */
1417 cmpchar_component (c
, n
)
1420 int id
= COMPOSITE_CHAR_ID (c
);
1422 if (id
>= n_cmpchars
/* C is not a valid composite character. */
1423 || n
>= cmpchar_table
[id
]->glyph_len
) /* No such component. */
1425 /* No face data is stored in glyph code. */
1426 return ((int) (cmpchar_table
[id
]->glyph
[n
]));
1429 DEFUN ("cmpcharp", Fcmpcharp
, Scmpcharp
, 1, 1, 0,
1430 "T if CHAR is a composite character.")
1434 CHECK_NUMBER (ch
, 0);
1435 return (COMPOSITE_CHAR_P (XINT (ch
)) ? Qt
: Qnil
);
1438 DEFUN ("composite-char-component", Fcmpchar_component
, Scmpchar_component
,
1440 "Return the IDXth component character of composite character CHARACTER.")
1442 Lisp_Object character
, idx
;
1446 CHECK_NUMBER (character
, 0);
1447 CHECK_NUMBER (idx
, 1);
1449 if ((c
= cmpchar_component (XINT (character
), XINT (idx
))) < 0)
1450 args_out_of_range (character
, idx
);
1452 return make_number (c
);
1455 DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule
, Scmpchar_cmp_rule
,
1457 "Return the Nth composition rule embedded in composite character CHARACTER.\n\
1458 The returned rule is for composing the Nth component\n\
1459 on the (N-1)th component. If N is 0, the returned value is always 255.")
1461 Lisp_Object character
, n
;
1465 CHECK_NUMBER (character
, 0);
1466 CHECK_NUMBER (n
, 1);
1468 id
= COMPOSITE_CHAR_ID (XINT (character
));
1469 if (id
< 0 || id
>= n_cmpchars
)
1470 error ("Invalid composite character: %d", XINT (character
));
1472 if (i
> cmpchar_table
[id
]->glyph_len
)
1473 args_out_of_range (character
, n
);
1475 return make_number (cmpchar_table
[id
]->cmp_rule
[i
]);
1478 DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p
,
1479 Scmpchar_cmp_rule_p
, 1, 1, 0,
1480 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1482 Lisp_Object character
;
1486 CHECK_NUMBER (character
, 0);
1487 id
= COMPOSITE_CHAR_ID (XINT (character
));
1488 if (id
< 0 || id
>= n_cmpchars
)
1489 error ("Invalid composite character: %d", XINT (character
));
1491 return (cmpchar_table
[id
]->cmp_rule
? Qt
: Qnil
);
1494 DEFUN ("composite-char-component-count", Fcmpchar_cmp_count
,
1495 Scmpchar_cmp_count
, 1, 1, 0,
1496 "Return number of compoents of composite character CHARACTER.")
1498 Lisp_Object character
;
1502 CHECK_NUMBER (character
, 0);
1503 id
= COMPOSITE_CHAR_ID (XINT (character
));
1504 if (id
< 0 || id
>= n_cmpchars
)
1505 error ("Invalid composite character: %d", XINT (character
));
1507 return (make_number (cmpchar_table
[id
]->glyph_len
));
1510 DEFUN ("compose-string", Fcompose_string
, Scompose_string
,
1512 "Return one char string composed from all characters in STRING.")
1516 unsigned char buf
[MAX_LENGTH_OF_MULTI_BYTE_FORM
], *p
, *pend
, *ptemp
;
1519 CHECK_STRING (str
, 0);
1521 buf
[0] = LEADING_CODE_COMPOSITION
;
1522 p
= XSTRING (str
)->data
;
1523 pend
= p
+ STRING_BYTES (XSTRING (str
));
1527 if (*p
< 0x20 || *p
== 127) /* control code */
1528 error ("Invalid component character: %d", *p
);
1529 else if (*p
< 0x80) /* ASCII */
1531 if (i
+ 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM
)
1532 error ("Too long string to be composed: %s", XSTRING (str
)->data
);
1533 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1536 buf
[i
++] = *p
++ + 0x80;
1538 else if (*p
== LEADING_CODE_COMPOSITION
) /* composite char */
1540 /* Already composed. Eliminate the heading
1541 LEADING_CODE_COMPOSITION, keep the remaining bytes
1545 while (! CHAR_HEAD_P (*p
)) p
++;
1546 if (i
+ (p
- ptemp
) >= MAX_LENGTH_OF_MULTI_BYTE_FORM
)
1547 error ("Too long string to be composed: %s", XSTRING (str
)->data
);
1548 bcopy (ptemp
, buf
+ i
, p
- ptemp
);
1551 else /* multibyte char */
1553 /* Add 0x20 to the base leading-code, keep the remaining
1555 len
= BYTES_BY_CHAR_HEAD (*p
);
1556 if (i
+ len
>= MAX_LENGTH_OF_MULTI_BYTE_FORM
)
1557 error ("Too long string to be composed: %s", XSTRING (str
)->data
);
1558 bcopy (p
, buf
+ i
, len
);
1565 /* STR contains only one character, which can't be composed. */
1566 error ("Too short string to be composed: %s", XSTRING (str
)->data
);
1568 return make_multibyte_string (buf
, 1, i
);
1572 charset_id_internal (charset_name
)
1575 Lisp_Object val
= Fget (intern (charset_name
), Qcharset
);
1578 error ("Charset %s is not defined", charset_name
);
1580 return (XINT (XVECTOR (val
)->contents
[0]));
1583 DEFUN ("setup-special-charsets", Fsetup_special_charsets
,
1584 Ssetup_special_charsets
, 0, 0, 0, "Internal use only.")
1587 charset_latin_iso8859_1
= charset_id_internal ("latin-iso8859-1");
1588 charset_jisx0208_1978
= charset_id_internal ("japanese-jisx0208-1978");
1589 charset_jisx0208
= charset_id_internal ("japanese-jisx0208");
1590 charset_katakana_jisx0201
= charset_id_internal ("katakana-jisx0201");
1591 charset_latin_jisx0201
= charset_id_internal ("latin-jisx0201");
1592 charset_big5_1
= charset_id_internal ("chinese-big5-1");
1593 charset_big5_2
= charset_id_internal ("chinese-big5-2");
1597 init_charset_once ()
1601 staticpro (&Vcharset_table
);
1602 staticpro (&Vcharset_symbol_table
);
1604 /* This has to be done here, before we call Fmake_char_table. */
1605 Qcharset_table
= intern ("charset-table");
1606 staticpro (&Qcharset_table
);
1608 /* Intern this now in case it isn't already done.
1609 Setting this variable twice is harmless.
1610 But don't staticpro it here--that is done in alloc.c. */
1611 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
1613 /* Now we are ready to set up this property, so we can
1614 create the charset table. */
1615 Fput (Qcharset_table
, Qchar_table_extra_slots
, make_number (0));
1616 Vcharset_table
= Fmake_char_table (Qcharset_table
, Qnil
);
1618 Vcharset_symbol_table
= Fmake_vector (make_number (MAX_CHARSET
+ 1), Qnil
);
1621 for (i
= 0; i
< 2; i
++)
1622 for (j
= 0; j
< 2; j
++)
1623 for (k
= 0; k
< 128; k
++)
1624 iso_charset_table
[i
][j
][k
] = -1;
1626 bzero (cmpchar_hash_table
, sizeof cmpchar_hash_table
);
1627 cmpchar_table_size
= n_cmpchars
= 0;
1629 for (i
= 0; i
< 256; i
++)
1630 BYTES_BY_CHAR_HEAD (i
) = 1;
1631 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11
) = 3;
1632 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12
) = 3;
1633 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21
) = 4;
1634 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22
) = 4;
1635 /* The following doesn't reflect the actual bytes, but just to tell
1636 that it is a start of a multibyte character. */
1637 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION
) = 2;
1639 for (i
= 0; i
< 128; i
++)
1640 WIDTH_BY_CHAR_HEAD (i
) = 1;
1641 for (; i
< 256; i
++)
1642 WIDTH_BY_CHAR_HEAD (i
) = 4;
1643 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11
) = 1;
1644 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12
) = 2;
1645 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21
) = 1;
1646 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22
) = 2;
1653 Qascii
= intern ("ascii");
1654 staticpro (&Qascii
);
1656 Qcharset
= intern ("charset");
1657 staticpro (&Qcharset
);
1659 /* Define ASCII charset now. */
1660 update_charset_table (make_number (CHARSET_ASCII
),
1661 make_number (1), make_number (94),
1666 build_string ("ASCII"),
1667 build_string ("ASCII"),
1668 build_string ("ASCII (ISO646 IRV)"));
1669 CHARSET_SYMBOL (CHARSET_ASCII
) = Qascii
;
1670 Fput (Qascii
, Qcharset
, CHARSET_TABLE_ENTRY (CHARSET_ASCII
));
1672 Qcomposition
= intern ("composition");
1673 staticpro (&Qcomposition
);
1674 CHARSET_SYMBOL (CHARSET_COMPOSITION
) = Qcomposition
;
1676 defsubr (&Sdefine_charset
);
1677 defsubr (&Sget_unused_iso_final_char
);
1678 defsubr (&Sdeclare_equiv_charset
);
1679 defsubr (&Sfind_charset_region
);
1680 defsubr (&Sfind_charset_string
);
1681 defsubr (&Smake_char_internal
);
1682 defsubr (&Ssplit_char
);
1683 defsubr (&Schar_charset
);
1684 defsubr (&Siso_charset
);
1685 defsubr (&Schar_valid_p
);
1686 defsubr (&Sunibyte_char_to_multibyte
);
1687 defsubr (&Schar_bytes
);
1688 defsubr (&Schar_width
);
1689 defsubr (&Sstring_width
);
1690 defsubr (&Schar_direction
);
1691 defsubr (&Schars_in_region
);
1693 defsubr (&Scmpcharp
);
1694 defsubr (&Scmpchar_component
);
1695 defsubr (&Scmpchar_cmp_rule
);
1696 defsubr (&Scmpchar_cmp_rule_p
);
1697 defsubr (&Scmpchar_cmp_count
);
1698 defsubr (&Scompose_string
);
1699 defsubr (&Ssetup_special_charsets
);
1701 DEFVAR_LISP ("charset-list", &Vcharset_list
,
1702 "List of charsets ever defined.");
1703 Vcharset_list
= Fcons (Qascii
, Qnil
);
1705 DEFVAR_LISP ("character-unification-table-vector",
1706 &Vcharacter_unification_table_vector
,
1707 "Vector of cons cell of a symbol and unification table ever defined.\n\
1708 An ID of a unification table is an index of this vector.");
1709 Vcharacter_unification_table_vector
= Fmake_vector (make_number (16), Qnil
);
1711 DEFVAR_INT ("leading-code-composition", &leading_code_composition
,
1712 "Leading-code of composite characters.");
1713 leading_code_composition
= LEADING_CODE_COMPOSITION
;
1715 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11
,
1716 "Leading-code of private TYPE9N charset of column-width 1.");
1717 leading_code_private_11
= LEADING_CODE_PRIVATE_11
;
1719 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12
,
1720 "Leading-code of private TYPE9N charset of column-width 2.");
1721 leading_code_private_12
= LEADING_CODE_PRIVATE_12
;
1723 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21
,
1724 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1725 leading_code_private_21
= LEADING_CODE_PRIVATE_21
;
1727 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22
,
1728 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1729 leading_code_private_22
= LEADING_CODE_PRIVATE_22
;
1731 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset
,
1732 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1733 This is used for converting unibyte text to multibyte,\n\
1734 and for inserting character codes specified by number.\n\n\
1735 Conversion is performed only when multibyte characters are enabled,\n\
1736 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1737 to the corresponding Emacs character code.\n\
1738 If `nonascii-translate-table' is non-nil, it overrides this variable.");
1739 nonascii_insert_offset
= 0;
1741 DEFVAR_LISP ("nonascii-translate-table", &Vnonascii_translate_table
,
1742 "Translate table for converting non-ASCII unibyte codes to multibyte.\n\
1743 This is used for converting unibyte text to multibyte,\n\
1744 and for inserting character codes specified by number.\n\n\
1745 Conversion is performed only when multibyte characters are enabled,\n\
1746 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1747 to the corresponding Emacs character code.\n\n\
1748 If this is nil, `nonascii-insert-offset' is used instead.");
1749 Vnonascii_translate_table
= Qnil
;
1751 DEFVAR_INT ("min-composite-char", &min_composite_char
,
1752 "Minimum character code of a composite character.");
1753 min_composite_char
= MIN_CHAR_COMPOSITION
;