(compare_window_configurations): New arg ignore_positions.
[bpt/emacs.git] / src / charset.c
CommitLineData
75c8c592 1/* Basic multilingual character support.
35e623fb 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
75c8c592 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25#include <stdio.h>
26
27#ifdef emacs
28
29#include <sys/types.h>
30#include <config.h>
31#include "lisp.h"
32#include "buffer.h"
33#include "charset.h"
34#include "coding.h"
fc6b09bf 35#include "disptab.h"
4ed46869
KH
36
37#else /* not emacs */
38
39#include "mulelib.h"
40
41#endif /* emacs */
42
43Lisp_Object Qcharset, Qascii, Qcomposition;
44
45/* Declaration of special leading-codes. */
46int leading_code_composition; /* for composite characters */
47int leading_code_private_11; /* for private DIMENSION1 of 1-column */
48int leading_code_private_12; /* for private DIMENSION1 of 2-column */
49int leading_code_private_21; /* for private DIMENSION2 of 1-column */
50int leading_code_private_22; /* for private DIMENSION2 of 2-column */
51
52/* Declaration of special charsets. */
53int charset_ascii; /* ASCII */
54int charset_composition; /* for a composite character */
55int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
56int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
57int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
58int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
59int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
60int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
61int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
62
b0e3cf2b
KH
63int min_composite_char;
64
4ed46869
KH
65Lisp_Object Qcharset_table;
66
67/* A char-table containing information of each character set. */
68Lisp_Object Vcharset_table;
69
70/* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72Lisp_Object Vcharset_symbol_table;
73
74/* A list of charset symbols ever defined. */
75Lisp_Object Vcharset_list;
76
b0e3cf2b
KH
77/* Vector of unification table ever defined.
78 An ID of a unification table is an index of this vector. */
79Lisp_Object Vcharacter_unification_table_vector;
80
4ed46869
KH
81/* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
82int bytes_by_char_head[256];
83int width_by_char_head[256];
84
85/* Mapping table from ISO2022's charset (specified by DIMENSION,
86 CHARS, and FINAL-CHAR) to Emacs' charset. */
87int iso_charset_table[2][2][128];
88
513ee442
KH
89/* Table of pointers to the structure `cmpchar_info' indexed by
90 CMPCHAR-ID. */
91struct cmpchar_info **cmpchar_table;
92/* The current size of `cmpchar_table'. */
93static int cmpchar_table_size;
94/* Number of the current composite characters. */
95int n_cmpchars;
96
4ed46869
KH
97/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
98unsigned char *_fetch_multibyte_char_p;
99int _fetch_multibyte_char_len;
100
35e623fb
RS
101/* Offset to add to a non-ASCII value when inserting it. */
102int nonascii_insert_offset;
103
4cf9710d
RS
104/* Translation table for converting non-ASCII unibyte characters
105 to multibyte codes, or nil. */
106Lisp_Object Vnonascii_translate_table;
107
046b1f03
RS
108#define min(X, Y) ((X) < (Y) ? (X) : (Y))
109#define max(X, Y) ((X) > (Y) ? (X) : (Y))
110\f
93bcb785
KH
111void
112invalid_character (c)
113 int c;
114{
115 error ("Invalid character: %o, %d, 0x%x", c);
116}
117
118
4ed46869
KH
119/* Set STR a pointer to the multi-byte form of the character C. If C
120 is not a composite character, the multi-byte form is set in WORKBUF
121 and STR points WORKBUF. The caller should allocate at least 4-byte
122 area at WORKBUF in advance. Returns the length of the multi-byte
bd4c6dd0
KH
123 form. If C is an invalid character to have a multi-byte form,
124 signal an error.
4ed46869
KH
125
126 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
127 function directly if C can be an ASCII character. */
128
129int
130non_ascii_char_to_string (c, workbuf, str)
131 int c;
132 unsigned char *workbuf, **str;
133{
6dc0722d 134 int charset, c1, c2;
4ed46869
KH
135
136 if (COMPOSITE_CHAR_P (c))
137 {
138 int cmpchar_id = COMPOSITE_CHAR_ID (c);
139
140 if (cmpchar_id < n_cmpchars)
141 {
142 *str = cmpchar_table[cmpchar_id]->data;
143 return cmpchar_table[cmpchar_id]->len;
144 }
145 else
146 {
93bcb785 147 invalid_character (c);
4ed46869
KH
148 }
149 }
150
151 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
bd4c6dd0
KH
152 if (!charset
153 || ! CHARSET_DEFINED_P (charset)
154 || c1 >= 0 && c1 < 32
155 || c2 >= 0 && c2 < 32)
93bcb785 156 invalid_character (c);
4ed46869
KH
157
158 *str = workbuf;
159 *workbuf++ = CHARSET_LEADING_CODE_BASE (charset);
160 if (*workbuf = CHARSET_LEADING_CODE_EXT (charset))
161 workbuf++;
162 *workbuf++ = c1 | 0x80;
6dc0722d 163 if (c2 >= 0)
4ed46869
KH
164 *workbuf++ = c2 | 0x80;
165
166 return (workbuf - *str);
167}
168
169/* Return a non-ASCII character of which multi-byte form is at STR of
170 length LEN. If ACTUAL_LEN is not NULL, the actual length of the
171 character is set to the address ACTUAL_LEN.
172
173 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
174 directly if STR can hold an ASCII character. */
175
176string_to_non_ascii_char (str, len, actual_len)
177 unsigned char *str;
178 int len, *actual_len;
179{
180 int charset;
181 unsigned char c1, c2;
182 register int c;
183
184 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
185 {
186 if (actual_len)
187 *actual_len = 1;
188 return (int) *str;
189 }
190
191 c = MAKE_NON_ASCII_CHAR (charset, c1, c2);
192
193 if (actual_len)
194 *actual_len = (charset == CHARSET_COMPOSITION
195 ? cmpchar_table[COMPOSITE_CHAR_ID (c)]->len
196 : BYTES_BY_CHAR_HEAD (*str));
197 return c;
198}
199
200/* Return the length of the multi-byte form at string STR of length LEN. */
201int
202multibyte_form_length (str, len)
203 unsigned char *str;
204 int len;
205{
206 int charset;
207 unsigned char c1, c2;
208 register int c;
209
210 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
211 return 1;
212
213 return (charset == CHARSET_COMPOSITION
214 ? cmpchar_table[(c1 << 7) | c2]->len
215 : BYTES_BY_CHAR_HEAD (*str));
216}
217
218/* Check if string STR of length LEN contains valid multi-byte form of
219 a character. If valid, charset and position codes of the character
220 is set at *CHARSET, *C1, and *C2, and return 0. If not valid,
221 return -1. This should be used only in the macro SPLIT_STRING
222 which checks range of STR in advance. */
223
224split_non_ascii_string (str, len, charset, c1, c2)
225 register unsigned char *str, *c1, *c2;
226 register int len, *charset;
227{
228 register unsigned int cs = *str++;
229
230 if (cs == LEADING_CODE_COMPOSITION)
231 {
232 int cmpchar_id = str_cmpchar_id (str - 1, len);
233
234 if (cmpchar_id < 0)
235 return -1;
236 *charset = cs, *c1 = cmpchar_id >> 7, *c2 = cmpchar_id & 0x7F;
237 }
238 else if ((cs < LEADING_CODE_PRIVATE_11 || (cs = *str++) >= 0xA0)
239 && CHARSET_DEFINED_P (cs))
240 {
241 *charset = cs;
242 if (*str < 0xA0)
243 return -1;
244 *c1 = (*str++) & 0x7F;
245 if (CHARSET_DIMENSION (cs) == 2)
246 {
247 if (*str < 0xA0)
248 return -1;
249 *c2 = (*str++) & 0x7F;
250 }
251 }
252 else
253 return -1;
254 return 0;
255}
256
23d2a7f1
KH
257/* Return a character unified with C (or a character made of CHARSET,
258 C1, and C2) in unification table TABLE. If no unification is found
259 in TABLE, return C. */
260unify_char (table, c, charset, c1, c2)
261 Lisp_Object table;
262 int c, charset, c1, c2;
263{
264 Lisp_Object ch;
265 int alt_charset, alt_c1, alt_c2, dimension;
266
267 if (c < 0) c = MAKE_CHAR (charset, c1, c2);
268 if (!CHAR_TABLE_P (table)
269 || (ch = Faref (table, make_number (c)), !INTEGERP (ch))
270 || XINT (ch) < 0)
271 return c;
272
273 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
274 dimension = CHARSET_DIMENSION (alt_charset);
275 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
276 /* CH is not a generic character, just return it. */
277 return XFASTINT (ch);
278
279 /* Since CH is a generic character, we must return a specific
280 charater which has the same position codes as C from CH. */
281 if (charset < 0)
282 SPLIT_CHAR (c, charset, c1, c2);
283 if (dimension != CHARSET_DIMENSION (charset))
284 /* We can't make such a character because of dimension mismatch. */
285 return c;
23d2a7f1
KH
286 return MAKE_CHAR (alt_charset, c1, c2);
287}
288
35e623fb
RS
289#define DEFAULT_NONASCII_INSERT_OFFSET 0x800
290
291/* Convert the unibyte character C to multibyte
4cf9710d
RS
292 based on Vnonascii_translate_table or nonascii_insert_offset.
293 Note that copy_text in insdel.c has similar code. */
35e623fb
RS
294
295int
296unibyte_char_to_multibyte (c)
297 int c;
298{
299 if (c >= 0200 && c < 0400)
300 {
4cf9710d
RS
301 if (! NILP (Vnonascii_translate_table))
302 c = XINT (Faref (Vnonascii_translate_table, make_number (c)));
303 else if (nonascii_insert_offset > 0)
35e623fb
RS
304 c += nonascii_insert_offset;
305 else
306 c += DEFAULT_NONASCII_INSERT_OFFSET;
307 }
308 return c;
309}
310\f
4ed46869
KH
311/* Update the table Vcharset_table with the given arguments (see the
312 document of `define-charset' for the meaning of each argument).
313 Several other table contents are also updated. The caller should
314 check the validity of CHARSET-ID and the remaining arguments in
315 advance. */
316
317void
318update_charset_table (charset_id, dimension, chars, width, direction,
319 iso_final_char, iso_graphic_plane,
320 short_name, long_name, description)
321 Lisp_Object charset_id, dimension, chars, width, direction;
322 Lisp_Object iso_final_char, iso_graphic_plane;
323 Lisp_Object short_name, long_name, description;
324{
325 int charset = XINT (charset_id);
326 int bytes;
327 unsigned char leading_code_base, leading_code_ext;
328
6dc0722d
KH
329 if (NILP (CHARSET_TABLE_ENTRY (charset)))
330 CHARSET_TABLE_ENTRY (charset)
331 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
4ed46869
KH
332
333 /* Get byte length of multibyte form, base leading-code, and
334 extended leading-code of the charset. See the comment under the
335 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
336 bytes = XINT (dimension);
337 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
338 {
339 /* Official charset, it doesn't have an extended leading-code. */
340 if (charset != CHARSET_ASCII)
341 bytes += 1; /* For a base leading-code. */
342 leading_code_base = charset;
343 leading_code_ext = 0;
344 }
345 else
346 {
347 /* Private charset. */
348 bytes += 2; /* For base and extended leading-codes. */
349 leading_code_base
350 = (charset < LEADING_CODE_EXT_12
351 ? LEADING_CODE_PRIVATE_11
352 : (charset < LEADING_CODE_EXT_21
353 ? LEADING_CODE_PRIVATE_12
354 : (charset < LEADING_CODE_EXT_22
355 ? LEADING_CODE_PRIVATE_21
356 : LEADING_CODE_PRIVATE_22)));
357 leading_code_ext = charset;
358 }
359
360 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
361 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
362 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
363 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
364 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
365 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
366 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
367 = make_number (leading_code_base);
368 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
369 = make_number (leading_code_ext);
370 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
371 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
372 = iso_graphic_plane;
373 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
374 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
375 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
376 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
377
378 {
379 /* If we have already defined a charset which has the same
380 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
381 DIRECTION, we must update the entry REVERSE-CHARSET of both
382 charsets. If there's no such charset, the value of the entry
383 is set to nil. */
384 int i;
385
513ee442 386 for (i = 0; i <= MAX_CHARSET; i++)
4ed46869
KH
387 if (!NILP (CHARSET_TABLE_ENTRY (i)))
388 {
389 if (CHARSET_DIMENSION (i) == XINT (dimension)
390 && CHARSET_CHARS (i) == XINT (chars)
391 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
392 && CHARSET_DIRECTION (i) != XINT (direction))
393 {
394 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
395 = make_number (i);
396 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
397 break;
398 }
399 }
513ee442 400 if (i > MAX_CHARSET)
4ed46869
KH
401 /* No such a charset. */
402 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
403 = make_number (-1);
404 }
405
406 if (charset != CHARSET_ASCII
407 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
408 {
409 /* Update tables bytes_by_char_head and width_by_char_head. */
410 bytes_by_char_head[leading_code_base] = bytes;
411 width_by_char_head[leading_code_base] = XINT (width);
412
413 /* Update table emacs_code_class. */
414 emacs_code_class[charset] = (bytes == 2
415 ? EMACS_leading_code_2
416 : (bytes == 3
417 ? EMACS_leading_code_3
418 : EMACS_leading_code_4));
419 }
420
421 /* Update table iso_charset_table. */
422 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
423 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
424}
425
426#ifdef emacs
427
428/* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
429 is invalid. */
430int
431get_charset_id (charset_symbol)
432 Lisp_Object charset_symbol;
433{
434 Lisp_Object val;
435 int charset;
436
437 return ((SYMBOLP (charset_symbol)
438 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
439 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
440 CHARSET_VALID_P (charset)))
441 ? charset : -1);
442}
443
444/* Return an identification number for a new private charset of
445 DIMENSION and WIDTH. If there's no more room for the new charset,
446 return 0. */
447Lisp_Object
448get_new_private_charset_id (dimension, width)
449 int dimension, width;
450{
451 int charset, from, to;
452
453 if (dimension == 1)
454 {
455 if (width == 1)
456 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
457 else
458 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
459 }
460 else
461 {
462 if (width == 1)
463 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
464 else
b0e3cf2b 465 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
4ed46869
KH
466 }
467
468 for (charset = from; charset < to; charset++)
469 if (!CHARSET_DEFINED_P (charset)) break;
470
471 return make_number (charset < to ? charset : 0);
472}
473
474DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
475 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
23d2a7f1 476If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
4ed46869
KH
477 treated as a private charset.\n\
478INFO-VECTOR is a vector of the format:\n\
479 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
480 SHORT-NAME LONG-NAME DESCRIPTION]\n\
481The meanings of each elements is as follows:\n\
482DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
483CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
484WIDTH (integer) is the number of columns a character in the charset\n\
485occupies on the screen: one of 0, 1, and 2.\n\
486\n\
487DIRECTION (integer) is the rendering direction of characters in the\n\
488charset when rendering. If 0, render from right to left, else\n\
489render from left to right.\n\
490\n\
491ISO-FINAL-CHAR (character) is the final character of the\n\
492corresponding ISO 2022 charset.\n\
493\n\
494ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
495while encoding to variants of ISO 2022 coding system, one of the\n\
496following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
497\n\
498SHORT-NAME (string) is the short name to refer to the charset.\n\
499\n\
500LONG-NAME (string) is the long name to refer to the charset.\n\
501\n\
502DESCRIPTION (string) is the description string of the charset.")
503 (charset_id, charset_symbol, info_vector)
504 Lisp_Object charset_id, charset_symbol, info_vector;
505{
506 Lisp_Object *vec;
507
508 if (!NILP (charset_id))
509 CHECK_NUMBER (charset_id, 0);
510 CHECK_SYMBOL (charset_symbol, 1);
511 CHECK_VECTOR (info_vector, 2);
512
513 if (! NILP (charset_id))
514 {
515 if (! CHARSET_VALID_P (XINT (charset_id)))
516 error ("Invalid CHARSET: %d", XINT (charset_id));
517 else if (CHARSET_DEFINED_P (XINT (charset_id)))
518 error ("Already defined charset: %d", XINT (charset_id));
519 }
520
521 vec = XVECTOR (info_vector)->contents;
522 if (XVECTOR (info_vector)->size != 9
523 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
524 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
525 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
526 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
527 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
528 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
529 || !STRINGP (vec[6])
530 || !STRINGP (vec[7])
531 || !STRINGP (vec[8]))
532 error ("Invalid info-vector argument for defining charset %s",
533 XSYMBOL (charset_symbol)->name->data);
534
535 if (NILP (charset_id))
536 {
537 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
538 if (XINT (charset_id) == 0)
539 error ("There's no room for a new private charset %s",
540 XSYMBOL (charset_symbol)->name->data);
541 }
542
543 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
544 vec[4], vec[5], vec[6], vec[7], vec[8]);
6dc0722d 545 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
4ed46869
KH
546 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
547 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
548 return Qnil;
549}
550
3fac5a51
KH
551DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
552 Sget_unused_iso_final_char, 2, 2, 0,
553 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
554DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
555CHARS is the number of characters in a dimension: 94 or 96.\n\
556\n\
557This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
558If there's no unused final char for the specified kind of charset,\n\
559return nil.")
560 (dimension, chars)
561 Lisp_Object dimension, chars;
562{
563 int final_char;
564
565 CHECK_NUMBER (dimension, 0);
566 CHECK_NUMBER (chars, 1);
567 if (XINT (dimension) != 1 && XINT (dimension) != 2)
568 error ("Invalid charset dimension %d, it should be 1 or 2",
569 XINT (dimension));
570 if (XINT (chars) != 94 && XINT (chars) != 96)
571 error ("Invalid charset chars %d, it should be 94 or 96",
572 XINT (chars));
573 for (final_char = '0'; final_char <= '?'; final_char++)
574 {
575 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
576 break;
577 }
578 return (final_char <= '?' ? make_number (final_char) : Qnil);
579}
580
4ed46869
KH
581DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
582 4, 4, 0,
583 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
584CHARSET should be defined by `defined-charset' in advance.")
585 (dimension, chars, final_char, charset_symbol)
586 Lisp_Object dimension, chars, final_char, charset_symbol;
587{
588 int charset;
589
590 CHECK_NUMBER (dimension, 0);
591 CHECK_NUMBER (chars, 1);
592 CHECK_NUMBER (final_char, 2);
593 CHECK_SYMBOL (charset_symbol, 3);
594
595 if (XINT (dimension) != 1 && XINT (dimension) != 2)
596 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
597 if (XINT (chars) != 94 && XINT (chars) != 96)
598 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
599 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
600 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
601 if ((charset = get_charset_id (charset_symbol)) < 0)
602 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
603
604 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
605 return Qnil;
606}
607
608/* Return number of different charsets in STR of length LEN. In
609 addition, for each found charset N, CHARSETS[N] is set 1. The
a29e3b1b 610 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
23d2a7f1 611 It may lookup a unification table TABLE if supplied. */
4ed46869
KH
612
613int
23d2a7f1 614find_charset_in_str (str, len, charsets, table)
028d516b
KH
615 unsigned char *str;
616 int len, *charsets;
23d2a7f1 617 Lisp_Object table;
4ed46869 618{
733eafd8 619 register int num = 0, c;
4ed46869 620
23d2a7f1
KH
621 if (! CHAR_TABLE_P (table))
622 table = Qnil;
623
4ed46869
KH
624 while (len > 0)
625 {
05505664 626 int bytes, charset;
733eafd8 627 c = *str;
23d2a7f1 628
733eafd8 629 if (c == LEADING_CODE_COMPOSITION)
05505664 630 {
733eafd8
KH
631 int cmpchar_id = str_cmpchar_id (str, len);
632 GLYPH *glyph;
05505664 633
733eafd8 634 if (cmpchar_id > 0)
05505664 635 {
733eafd8
KH
636 struct cmpchar_info *cmpcharp = cmpchar_table[cmpchar_id];
637 int i;
638
639 for (i = 0; i < cmpcharp->glyph_len; i++)
640 {
641 c = cmpcharp->glyph[i];
642 if (!NILP (table))
643 {
644 if ((c = unify_char (table, c, 0, 0, 0)) < 0)
645 c = cmpcharp->glyph[i];
646 }
647 if ((charset = CHAR_CHARSET (c)) < 0)
648 charset = CHARSET_ASCII;
649 if (!charsets[charset])
650 {
651 charsets[charset] = 1;
652 num += 1;
653 }
654 }
655 str += cmpcharp->len;
656 len -= cmpcharp->len;
657 continue;
05505664 658 }
05505664 659
733eafd8
KH
660 charset = CHARSET_ASCII;
661 bytes = 1;
662 }
23d2a7f1
KH
663 else
664 {
733eafd8
KH
665 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
666 if (! NILP (table))
667 {
668 int c1 = unify_char (table, c, 0, 0, 0);
669 if (c1 >= 0)
670 c = c1;
671 }
672 charset = CHAR_CHARSET (c);
23d2a7f1 673 }
4ed46869
KH
674
675 if (!charsets[charset])
676 {
677 charsets[charset] = 1;
678 num += 1;
679 }
680 str += bytes;
681 len -= bytes;
682 }
683 return num;
684}
685
686DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
23d2a7f1 687 2, 3, 0,
4ed46869 688 "Return a list of charsets in the region between BEG and END.\n\
23d2a7f1
KH
689BEG and END are buffer positions.\n\
690Optional arg TABLE if non-nil is a unification table to look up.")
691 (beg, end, table)
692 Lisp_Object beg, end, table;
4ed46869 693{
028d516b 694 int charsets[MAX_CHARSET + 1];
6ae1f27e 695 int from, from_byte, to, stop, stop_byte, i;
4ed46869
KH
696 Lisp_Object val;
697
698 validate_region (&beg, &end);
699 from = XFASTINT (beg);
700 stop = to = XFASTINT (end);
6ae1f27e 701
4ed46869 702 if (from < GPT && GPT < to)
6ae1f27e
RS
703 {
704 stop = GPT;
705 stop_byte = GPT_BYTE;
706 }
707 else
708 stop_byte = CHAR_TO_BYTE (stop);
709
710 from_byte = CHAR_TO_BYTE (from);
711
028d516b 712 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
4ed46869
KH
713 while (1)
714 {
6ae1f27e
RS
715 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
716 charsets, table);
4ed46869 717 if (stop < to)
6ae1f27e
RS
718 {
719 from = stop, from_byte = stop_byte;
720 stop = to, stop_byte = CHAR_TO_BYTE (stop);
721 }
4ed46869
KH
722 else
723 break;
724 }
6ae1f27e 725
4ed46869 726 val = Qnil;
513ee442 727 for (i = MAX_CHARSET; i >= 0; i--)
4ed46869
KH
728 if (charsets[i])
729 val = Fcons (CHARSET_SYMBOL (i), val);
730 return val;
731}
732
733DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
23d2a7f1
KH
734 1, 2, 0,
735 "Return a list of charsets in STR.\n\
736Optional arg TABLE if non-nil is a unification table to look up.")
737 (str, table)
738 Lisp_Object str, table;
4ed46869 739{
a29e3b1b 740 int charsets[MAX_CHARSET + 1];
4ed46869
KH
741 int i;
742 Lisp_Object val;
743
744 CHECK_STRING (str, 0);
87b089ad
RS
745
746 if (! STRING_MULTIBYTE (str))
747 return Qnil;
748
a29e3b1b 749 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
87b089ad 750 find_charset_in_str (XSTRING (str)->data, XSTRING (str)->size_byte,
23d2a7f1 751 charsets, table);
4ed46869 752 val = Qnil;
513ee442 753 for (i = MAX_CHARSET; i >= 0; i--)
4ed46869
KH
754 if (charsets[i])
755 val = Fcons (CHARSET_SYMBOL (i), val);
756 return val;
757}
758\f
759DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
513ee442 760 "")
4ed46869
KH
761 (charset, code1, code2)
762 Lisp_Object charset, code1, code2;
763{
764 CHECK_NUMBER (charset, 0);
765
766 if (NILP (code1))
767 XSETFASTINT (code1, 0);
768 else
769 CHECK_NUMBER (code1, 1);
770 if (NILP (code2))
771 XSETFASTINT (code2, 0);
772 else
773 CHECK_NUMBER (code2, 2);
774
775 if (!CHARSET_DEFINED_P (XINT (charset)))
776 error ("Invalid charset: %d", XINT (charset));
777
778 return make_number (MAKE_CHAR (XINT (charset), XINT (code1), XINT (code2)));
779}
780
781DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
782 "Return list of charset and one or two position-codes of CHAR.")
783 (ch)
784 Lisp_Object ch;
785{
786 Lisp_Object val;
6dc0722d 787 int charset, c1, c2;
4ed46869
KH
788
789 CHECK_NUMBER (ch, 0);
790 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6dc0722d 791 return (c2 >= 0
4ed46869
KH
792 ? Fcons (CHARSET_SYMBOL (charset),
793 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
794 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
795}
796
797DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
798 "Return charset of CHAR.")
799 (ch)
800 Lisp_Object ch;
801{
802 CHECK_NUMBER (ch, 0);
803
804 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
805}
806
807DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2b71bb78
KH
808 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
809\n\
810ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
811by their DIMENSION, CHARS, and FINAL-CHAR,\n\
812where as Emacs distinguishes them by charset symbol.\n\
813See the documentation of the function `charset-info' for the meanings of\n\
814DIMENSION, CHARS, and FINAL-CHAR.")
4ed46869
KH
815 (dimension, chars, final_char)
816 Lisp_Object dimension, chars, final_char;
817{
818 int charset;
819
820 CHECK_NUMBER (dimension, 0);
821 CHECK_NUMBER (chars, 1);
822 CHECK_NUMBER (final_char, 2);
823
824 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
825 return Qnil;
826 return CHARSET_SYMBOL (charset);
827}
828
9d3d8cba
KH
829/* If GENERICP is nonzero, return nonzero iff C is a valid normal or
830 generic character. If GENERICP is zero, return nonzero iff C is a
831 valid normal character. Do not call this function directly,
832 instead use macro CHAR_VALID_P. */
833int
834char_valid_p (c, genericp)
835 int c, genericp;
836{
837 int charset, c1, c2;
838
839 if (c < 0)
840 return 0;
841 if (SINGLE_BYTE_CHAR_P (c))
842 return 1;
843 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
844 if (!CHARSET_VALID_P (charset))
845 return 0;
846 return (c < MIN_CHAR_COMPOSITION
847 ? ((c & CHAR_FIELD1_MASK) /* i.e. dimension of C is two. */
848 ? (genericp && c1 == 0 && c2 == 0
849 || c1 >= 32 && c2 >= 32)
850 : (genericp && c1 == 0
851 || c1 >= 32))
852 : c < MIN_CHAR_COMPOSITION + n_cmpchars);
853}
854
855DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
a9d02884
DL
856 "Return t if OBJECT is a valid normal character.\n\
857If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
9d3d8cba
KH
858a valid generic character.")
859 (object, genericp)
860 Lisp_Object object, genericp;
861{
862 if (! NATNUMP (object))
863 return Qnil;
864 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
865}
866
4ed46869
KH
867DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
868 "Return byte length of multi-byte form of CHAR.")
869 (ch)
870 Lisp_Object ch;
871{
872 Lisp_Object val;
873 int bytes;
874
875 CHECK_NUMBER (ch, 0);
876 if (COMPOSITE_CHAR_P (XFASTINT (ch)))
877 {
878 unsigned int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
879
880 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
881 }
882 else
883 {
884 int charset = CHAR_CHARSET (XFASTINT (ch));
885
886 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
887 }
888
889 XSETFASTINT (val, bytes);
890 return val;
891}
892
893/* Return the width of character of which multi-byte form starts with
894 C. The width is measured by how many columns occupied on the
895 screen when displayed in the current buffer. */
896
897#define ONE_BYTE_CHAR_WIDTH(c) \
898 (c < 0x20 \
899 ? (c == '\t' \
53316e55 900 ? XFASTINT (current_buffer->tab_width) \
4ed46869
KH
901 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
902 : (c < 0x7f \
903 ? 1 \
904 : (c == 0x7F \
905 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
906 : ((! NILP (current_buffer->enable_multibyte_characters) \
907 && BASE_LEADING_CODE_P (c)) \
908 ? WIDTH_BY_CHAR_HEAD (c) \
909 : 4)))) \
910
911
912DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
913 "Return width of CHAR when displayed in the current buffer.\n\
914The width is measured by how many columns it occupies on the screen.")
915 (ch)
916 Lisp_Object ch;
917{
859f2b3c 918 Lisp_Object val, disp;
4ed46869 919 int c;
51c4025f 920 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869
KH
921
922 CHECK_NUMBER (ch, 0);
923
859f2b3c
RS
924 c = XINT (ch);
925
926 /* Get the way the display table would display it. */
51c4025f 927 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
859f2b3c
RS
928
929 if (VECTORP (disp))
930 XSETINT (val, XVECTOR (disp)->size);
931 else if (SINGLE_BYTE_CHAR_P (c))
932 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
4ed46869
KH
933 else if (COMPOSITE_CHAR_P (c))
934 {
935 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
936 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 0));
937 }
938 else
939 {
940 int charset = CHAR_CHARSET (c);
941
942 XSETFASTINT (val, CHARSET_WIDTH (charset));
943 }
944 return val;
945}
946
947/* Return width of string STR of length LEN when displayed in the
948 current buffer. The width is measured by how many columns it
949 occupies on the screen. */
859f2b3c 950
4ed46869
KH
951int
952strwidth (str, len)
953 unsigned char *str;
954 int len;
955{
956 unsigned char *endp = str + len;
957 int width = 0;
c4a4e28f 958 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869 959
859f2b3c
RS
960 while (str < endp)
961 {
962 if (*str == LEADING_CODE_COMPOSITION)
963 {
964 int id = str_cmpchar_id (str, endp - str);
965
966 if (id < 0)
967 {
968 width += 4;
969 str++;
970 }
971 else
972 {
973 width += cmpchar_table[id]->width;
974 str += cmpchar_table[id]->len;
975 }
976 }
977 else
978 {
979 Lisp_Object disp;
980 int thiswidth;
981 int c = STRING_CHAR (str, endp - str);
982
983 /* Get the way the display table would display it. */
acc35c36
RS
984 if (dp)
985 disp = DISP_CHAR_VECTOR (dp, c);
986 else
987 disp = Qnil;
859f2b3c
RS
988
989 if (VECTORP (disp))
990 thiswidth = XVECTOR (disp)->size;
991 else
992 thiswidth = ONE_BYTE_CHAR_WIDTH (*str);
993
994 width += thiswidth;
995 str += BYTES_BY_CHAR_HEAD (*str);
996 }
997 }
4ed46869
KH
998 return width;
999}
1000
1001DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1002 "Return width of STRING when displayed in the current buffer.\n\
1003Width is measured by how many columns it occupies on the screen.\n\
046b1f03
RS
1004When calculating width of a multibyte character in STRING,\n\
1005only the base leading-code is considered; the validity of\n\
1006the following bytes is not checked.")
4ed46869
KH
1007 (str)
1008 Lisp_Object str;
1009{
1010 Lisp_Object val;
1011
1012 CHECK_STRING (str, 0);
1013 XSETFASTINT (val, strwidth (XSTRING (str)->data, XSTRING (str)->size));
1014 return val;
1015}
1016
1017DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1018 "Return the direction of CHAR.\n\
1019The returned value is 0 for left-to-right and 1 for right-to-left.")
1020 (ch)
1021 Lisp_Object ch;
1022{
1023 int charset;
1024
1025 CHECK_NUMBER (ch, 0);
1026 charset = CHAR_CHARSET (XFASTINT (ch));
1027 if (!CHARSET_DEFINED_P (charset))
93bcb785 1028 invalid_character (XINT (ch));
4ed46869
KH
1029 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1030}
1031
af4fecb4 1032DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
6ae1f27e 1033 "Return number of characters between BEG and END.")
046b1f03
RS
1034 (beg, end)
1035 Lisp_Object beg, end;
1036{
6ae1f27e 1037 int from, to;
046b1f03
RS
1038
1039 from = min (XFASTINT (beg), XFASTINT (end));
a8a35e61 1040 to = max (XFASTINT (beg), XFASTINT (end));
046b1f03 1041
6ae1f27e
RS
1042 return to - from;
1043}
9036eb45 1044
87b089ad
RS
1045/* Return the number of characters in the NBYTES bytes at PTR.
1046 This works by looking at the contents and checking for multibyte sequences.
1047 However, if the current buffer has enable-multibyte-characters = nil,
1048 we treat each byte as a character. */
1049
6ae1f27e
RS
1050int
1051chars_in_text (ptr, nbytes)
1052 unsigned char *ptr;
1053 int nbytes;
1054{
93bcb785 1055 unsigned char *endp, c;
6ae1f27e 1056 int chars;
046b1f03 1057
87b089ad
RS
1058 /* current_buffer is null at early stages of Emacs initialization. */
1059 if (current_buffer == 0
1060 || NILP (current_buffer->enable_multibyte_characters))
6ae1f27e 1061 return nbytes;
a8a35e61 1062
6ae1f27e
RS
1063 endp = ptr + nbytes;
1064 chars = 0;
046b1f03 1065
6ae1f27e
RS
1066 while (ptr < endp)
1067 {
93bcb785
KH
1068 c = *ptr++;
1069
1070 if (BASE_LEADING_CODE_P (c))
1071 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
046b1f03
RS
1072 chars++;
1073 }
1074
6ae1f27e 1075 return chars;
046b1f03
RS
1076}
1077
87b089ad
RS
1078/* Return the number of characters in the NBYTES bytes at PTR.
1079 This works by looking at the contents and checking for multibyte sequences.
1080 It ignores enable-multibyte-characters. */
1081
1082int
1083multibyte_chars_in_text (ptr, nbytes)
1084 unsigned char *ptr;
1085 int nbytes;
1086{
93bcb785 1087 unsigned char *endp, c;
87b089ad
RS
1088 int chars;
1089
1090 endp = ptr + nbytes;
1091 chars = 0;
1092
1093 while (ptr < endp)
1094 {
93bcb785
KH
1095 c = *ptr++;
1096
1097 if (BASE_LEADING_CODE_P (c))
1098 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
87b089ad
RS
1099 chars++;
1100 }
1101
1102 return chars;
1103}
1104
1105DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
4ed46869 1106 "Concatenate all the argument characters and make the result a string.")
53316e55
KH
1107 (n, args)
1108 int n;
4ed46869
KH
1109 Lisp_Object *args;
1110{
53316e55 1111 int i;
4ed46869 1112 unsigned char *buf
bd4c6dd0 1113 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
4ed46869
KH
1114 unsigned char *p = buf;
1115 Lisp_Object val;
1116
1117 for (i = 0; i < n; i++)
1118 {
1119 int c, len;
1120 unsigned char *str;
1121
1122 if (!INTEGERP (args[i]))
b0e3cf2b 1123 CHECK_NUMBER (args[i], 0);
4ed46869
KH
1124 c = XINT (args[i]);
1125 len = CHAR_STRING (c, p, str);
1126 if (p != str)
1127 /* C is a composite character. */
1128 bcopy (str, p, len);
1129 p += len;
1130 }
1131
87b089ad 1132 val = make_multibyte_string (buf, n, p - buf);
4ed46869
KH
1133 return val;
1134}
1135
1136#endif /* emacs */
1137\f
1138/*** Composite characters staffs ***/
1139
1140/* Each composite character is identified by CMPCHAR-ID which is
1141 assigned when Emacs needs the character code of the composite
1142 character (e.g. when displaying it on the screen). See the
1143 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1144 composite character is represented in Emacs. */
1145
1146/* If `static' is defined, it means that it is defined to null string. */
1147#ifndef static
1148/* The following function is copied from lread.c. */
1149static int
1150hash_string (ptr, len)
1151 unsigned char *ptr;
1152 int len;
1153{
1154 register unsigned char *p = ptr;
1155 register unsigned char *end = p + len;
1156 register unsigned char c;
1157 register int hash = 0;
1158
1159 while (p != end)
1160 {
1161 c = *p++;
1162 if (c >= 0140) c -= 40;
1163 hash = ((hash<<3) + (hash>>28) + c);
1164 }
1165 return hash & 07777777777;
1166}
1167#endif
1168
4ed46869
KH
1169#define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1170
1171static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1172
1173/* Each element of `cmpchar_hash_table' is a pointer to an array of
1174 integer, where the 1st element is the size of the array, the 2nd
1175 element is how many elements are actually used in the array, and
1176 the remaining elements are CMPCHAR-IDs of composite characters of
1177 the same hash value. */
1178#define CMPCHAR_HASH_SIZE(table) table[0]
1179#define CMPCHAR_HASH_USED(table) table[1]
1180#define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1181
1182/* Return CMPCHAR-ID of the composite character in STR of the length
1183 LEN. If the composite character has not yet been registered,
1184 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1185 is the sole function for assigning CMPCHAR-ID. */
1186int
1187str_cmpchar_id (str, len)
1188 unsigned char *str;
1189 int len;
1190{
1191 int hash_idx, *hashp;
1192 unsigned char *buf;
1193 int embedded_rule; /* 1 if composition rule is embedded. */
1194 int chars; /* number of components. */
1195 int i;
1196 struct cmpchar_info *cmpcharp;
1197
4ed46869
KH
1198 /* The second byte 0xFF means compostion rule is embedded. */
1199 embedded_rule = (str[1] == 0xFF);
1200
1201 /* At first, get the actual length of the composite character. */
1202 {
1203 unsigned char *p, *endp = str + 1, *lastp = str + len;
1204 int bytes;
1205
6ae1f27e 1206 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
93bcb785
KH
1207 if (endp - str < 5)
1208 /* Any composite char have at least 5-byte length. */
1209 return -1;
1210
4ed46869 1211 chars = 0;
93bcb785 1212 p = str + 1;
4ed46869
KH
1213 while (p < endp)
1214 {
93bcb785 1215 if (embedded_rule) p++;
4ed46869 1216 /* No need of checking if *P is 0xA0 because
93bcb785
KH
1217 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1218 p += BYTES_BY_CHAR_HEAD (*p - 0x20);
4ed46869
KH
1219 chars++;
1220 }
93bcb785
KH
1221 if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1222 /* Invalid components. */
4ed46869 1223 return -1;
93bcb785 1224 len = p - str;
4ed46869
KH
1225 }
1226 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1227 hashp = cmpchar_hash_table[hash_idx];
1228
1229 /* Then, look into the hash table. */
1230 if (hashp != NULL)
1231 /* Find the correct one among composite characters of the same
1232 hash value. */
1233 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1234 {
1235 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1236 if (len == cmpcharp->len
1237 && ! bcmp (str, cmpcharp->data, len))
1238 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1239 }
1240
1241 /* We have to register the composite character in cmpchar_table. */
513ee442
KH
1242 if (n_cmpchars > (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
1243 /* No, we have no more room for a new composite character. */
1244 return -1;
1245
4ed46869
KH
1246 /* Make the entry in hash table. */
1247 if (hashp == NULL)
1248 {
1249 /* Make a table for 8 composite characters initially. */
1250 hashp = (cmpchar_hash_table[hash_idx]
1251 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1252 CMPCHAR_HASH_SIZE (hashp) = 10;
1253 CMPCHAR_HASH_USED (hashp) = 2;
1254 }
1255 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1256 {
1257 CMPCHAR_HASH_SIZE (hashp) += 8;
1258 hashp = (cmpchar_hash_table[hash_idx]
1259 = (int *) xrealloc (hashp,
1260 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1261 }
1262 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1263 CMPCHAR_HASH_USED (hashp)++;
1264
1265 /* Set information of the composite character in cmpchar_table. */
1266 if (cmpchar_table_size == 0)
1267 {
1268 /* This is the first composite character to be registered. */
1269 cmpchar_table_size = 256;
1270 cmpchar_table
1271 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1272 * cmpchar_table_size);
1273 }
1274 else if (cmpchar_table_size <= n_cmpchars)
1275 {
1276 cmpchar_table_size += 256;
1277 cmpchar_table
1278 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1279 sizeof (cmpchar_table[0])
1280 * cmpchar_table_size);
1281 }
1282
1283 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1284
1285 cmpcharp->len = len;
1286 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1287 bcopy (str, cmpcharp->data, len);
1288 cmpcharp->data[len] = 0;
1289 cmpcharp->glyph_len = chars;
1290 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1291 if (embedded_rule)
1292 {
1293 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1294 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1295 }
1296 else
1297 {
1298 cmpcharp->cmp_rule = NULL;
1299 cmpcharp->col_offset = NULL;
1300 }
1301
1302 /* Setup GLYPH data and composition rules (if any) so as not to make
1303 them every time on displaying. */
1304 {
1305 unsigned char *bufp;
1306 int width;
1307 float leftmost = 0.0, rightmost = 1.0;
1308
1309 if (embedded_rule)
1310 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1311 cmpcharp->col_offset[0] = 0;
1312
1313 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1314 {
1315 if (embedded_rule)
1316 cmpcharp->cmp_rule[i] = *bufp++;
1317
1318 if (*bufp == 0xA0) /* This is an ASCII character. */
1319 {
1320 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1321 width = 1;
1322 bufp++;
1323 }
1324 else /* Multibyte character. */
1325 {
1326 /* Make `bufp' point normal multi-byte form temporally. */
1327 *bufp -= 0x20;
1328 cmpcharp->glyph[i]
1329 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1330 width = WIDTH_BY_CHAR_HEAD (*bufp);
1331 *bufp += 0x20;
1332 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1333 }
1334
1335 if (embedded_rule && i > 0)
1336 {
1337 /* Reference points (global_ref and new_ref) are
1338 encoded as below:
1339
1340 0--1--2 -- ascent
1341 | |
1342 | |
1343 | 4 -+--- center
1344 -- 3 5 -- baseline
1345 | |
1346 6--7--8 -- descent
1347
1348 Now, we calculate the column offset of the new glyph
1349 from the left edge of the first glyph. This can avoid
1350 the same calculation everytime displaying this
1351 composite character. */
1352
1353 /* Reference points of global glyph and new glyph. */
1354 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1355 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1356 /* Column offset relative to the first glyph. */
1357 float left = (leftmost
1358 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1359 - (new_ref % 3) * width / 2.0);
1360
1361 cmpcharp->col_offset[i] = left;
1362 if (left < leftmost)
1363 leftmost = left;
1364 if (left + width > rightmost)
1365 rightmost = left + width;
1366 }
1367 else
1368 {
1369 if (width > rightmost)
1370 rightmost = width;
1371 }
1372 }
1373 if (embedded_rule)
1374 {
1375 /* Now col_offset[N] are relative to the left edge of the
1376 first component. Make them relative to the left edge of
1377 overall glyph. */
1378 for (i = 0; i < chars; i++)
1379 cmpcharp->col_offset[i] -= leftmost;
1380 /* Make rightmost holds width of overall glyph. */
1381 rightmost -= leftmost;
1382 }
1383
1384 cmpcharp->width = rightmost;
1385 if (cmpcharp->width < rightmost)
1386 /* To get a ceiling integer value. */
1387 cmpcharp->width++;
1388 }
1389
1390 cmpchar_table[n_cmpchars] = cmpcharp;
1391
1392 return n_cmpchars++;
1393}
1394
1395/* Return the Nth element of the composite character C. */
1396int
1397cmpchar_component (c, n)
1398 unsigned int c, n;
1399{
1400 int id = COMPOSITE_CHAR_ID (c);
1401
1402 if (id >= n_cmpchars /* C is not a valid composite character. */
1403 || n >= cmpchar_table[id]->glyph_len) /* No such component. */
1404 return -1;
1405 /* No face data is stored in glyph code. */
1406 return ((int) (cmpchar_table[id]->glyph[n]));
1407}
1408
1409DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1410 "T if CHAR is a composite character.")
1411 (ch)
1412 Lisp_Object ch;
1413{
1414 CHECK_NUMBER (ch, 0);
1415 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1416}
1417
1418DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1419 2, 2, 0,
1420 "Return the IDXth component character of composite character CHARACTER.")
1421 (character, idx)
1422 Lisp_Object character, idx;
1423{
1424 int c;
1425
1426 CHECK_NUMBER (character, 0);
1427 CHECK_NUMBER (idx, 1);
1428
1429 if ((c = cmpchar_component (XINT (character), XINT (idx))) < 0)
1430 args_out_of_range (character, idx);
1431
1432 return make_number (c);
1433}
1434
1435DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1436 2, 2, 0,
55001746
KH
1437 "Return the Nth composition rule embedded in composite character CHARACTER.\n\
1438The returned rule is for composing the Nth component\n\
1439on the (N-1)th component. If N is 0, the returned value is always 255.")
1440 (character, n)
1441 Lisp_Object character, n;
4ed46869
KH
1442{
1443 int id, i;
1444
1445 CHECK_NUMBER (character, 0);
55001746 1446 CHECK_NUMBER (n, 1);
4ed46869
KH
1447
1448 id = COMPOSITE_CHAR_ID (XINT (character));
1449 if (id < 0 || id >= n_cmpchars)
1450 error ("Invalid composite character: %d", XINT (character));
55001746 1451 i = XINT (n);
4ed46869 1452 if (i > cmpchar_table[id]->glyph_len)
55001746 1453 args_out_of_range (character, n);
4ed46869
KH
1454
1455 return make_number (cmpchar_table[id]->cmp_rule[i]);
1456}
1457
1458DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1459 Scmpchar_cmp_rule_p, 1, 1, 0,
1460 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1461 (character)
1462 Lisp_Object character;
1463{
1464 int id;
1465
1466 CHECK_NUMBER (character, 0);
1467 id = COMPOSITE_CHAR_ID (XINT (character));
1468 if (id < 0 || id >= n_cmpchars)
1469 error ("Invalid composite character: %d", XINT (character));
1470
1471 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1472}
1473
1474DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1475 Scmpchar_cmp_count, 1, 1, 0,
1476 "Return number of compoents of composite character CHARACTER.")
1477 (character)
1478 Lisp_Object character;
1479{
1480 int id;
1481
1482 CHECK_NUMBER (character, 0);
1483 id = COMPOSITE_CHAR_ID (XINT (character));
1484 if (id < 0 || id >= n_cmpchars)
1485 error ("Invalid composite character: %d", XINT (character));
1486
1487 return (make_number (cmpchar_table[id]->glyph_len));
1488}
1489
1490DEFUN ("compose-string", Fcompose_string, Scompose_string,
1491 1, 1, 0,
1492 "Return one char string composed from all characters in STRING.")
1493 (str)
1494 Lisp_Object str;
1495{
1496 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1497 int len, i;
1498
1499 CHECK_STRING (str, 0);
1500
1501 buf[0] = LEADING_CODE_COMPOSITION;
1502 p = XSTRING (str)->data;
87b089ad 1503 pend = p + XSTRING (str)->size_byte;
4ed46869
KH
1504 i = 1;
1505 while (p < pend)
1506 {
1507 if (*p < 0x20 || *p == 127) /* control code */
1508 error ("Invalid component character: %d", *p);
1509 else if (*p < 0x80) /* ASCII */
1510 {
1511 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1512 error ("Too long string to be composed: %s", XSTRING (str)->data);
1513 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1514 code itself. */
1515 buf[i++] = 0xA0;
1516 buf[i++] = *p++ + 0x80;
1517 }
1518 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1519 {
1520 /* Already composed. Eliminate the heading
1521 LEADING_CODE_COMPOSITION, keep the remaining bytes
1522 unchanged. */
1523 p++;
1524 ptemp = p;
6ae1f27e 1525 while (! CHAR_HEAD_P (*p)) p++;
4ed46869
KH
1526 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1527 error ("Too long string to be composed: %s", XSTRING (str)->data);
1528 bcopy (ptemp, buf + i, p - ptemp);
1529 i += p - ptemp;
1530 }
1531 else /* multibyte char */
1532 {
1533 /* Add 0x20 to the base leading-code, keep the remaining
1534 bytes unchanged. */
1535 len = BYTES_BY_CHAR_HEAD (*p);
1536 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1537 error ("Too long string to be composed: %s", XSTRING (str)->data);
1538 bcopy (p, buf + i, len);
1539 buf[i] += 0x20;
1540 p += len, i += len;
1541 }
1542 }
1543
1544 if (i < 5)
1545 /* STR contains only one character, which can't be composed. */
1546 error ("Too short string to be composed: %s", XSTRING (str)->data);
1547
87b089ad 1548 return make_multibyte_string (buf, 1, i);
4ed46869
KH
1549}
1550
1551\f
1552charset_id_internal (charset_name)
1553 char *charset_name;
1554{
1555 Lisp_Object val = Fget (intern (charset_name), Qcharset);
1556
1557 if (!VECTORP (val))
1558 error ("Charset %s is not defined", charset_name);
1559
1560 return (XINT (XVECTOR (val)->contents[0]));
1561}
1562
1563DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1564 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1565 ()
1566{
1567 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1568 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1569 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1570 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1571 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1572 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1573 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1574 return Qnil;
1575}
1576
1577init_charset_once ()
1578{
1579 int i, j, k;
1580
1581 staticpro (&Vcharset_table);
1582 staticpro (&Vcharset_symbol_table);
1583
1584 /* This has to be done here, before we call Fmake_char_table. */
1585 Qcharset_table = intern ("charset-table");
1586 staticpro (&Qcharset_table);
1587
1588 /* Intern this now in case it isn't already done.
1589 Setting this variable twice is harmless.
1590 But don't staticpro it here--that is done in alloc.c. */
1591 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1592
1593 /* Now we are ready to set up this property, so we can
1594 create the charset table. */
1595 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1596 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1597
513ee442 1598 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1), Qnil);
4ed46869
KH
1599
1600 /* Setup tables. */
1601 for (i = 0; i < 2; i++)
1602 for (j = 0; j < 2; j++)
1603 for (k = 0; k < 128; k++)
1604 iso_charset_table [i][j][k] = -1;
1605
1606 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
1607 cmpchar_table_size = n_cmpchars = 0;
1608
1609 for (i = 0; i < 256; i++)
1610 BYTES_BY_CHAR_HEAD (i) = 1;
1611 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1612 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1613 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1614 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1615 /* The following doesn't reflect the actual bytes, but just to tell
1616 that it is a start of a multibyte character. */
1617 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
1618
1619 for (i = 0; i < 128; i++)
1620 WIDTH_BY_CHAR_HEAD (i) = 1;
1621 for (; i < 256; i++)
1622 WIDTH_BY_CHAR_HEAD (i) = 4;
1623 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1624 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1625 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1626 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1627}
1628
1629#ifdef emacs
1630
1631syms_of_charset ()
1632{
1633 Qascii = intern ("ascii");
1634 staticpro (&Qascii);
1635
1636 Qcharset = intern ("charset");
1637 staticpro (&Qcharset);
1638
1639 /* Define ASCII charset now. */
1640 update_charset_table (make_number (CHARSET_ASCII),
1641 make_number (1), make_number (94),
1642 make_number (1),
1643 make_number (0),
1644 make_number ('B'),
1645 make_number (0),
1646 build_string ("ASCII"),
1647 build_string ("ASCII"),
1648 build_string ("ASCII (ISO646 IRV)"));
1649 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1650 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1651
1652 Qcomposition = intern ("composition");
1653 staticpro (&Qcomposition);
1654 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
1655
1656 defsubr (&Sdefine_charset);
3fac5a51 1657 defsubr (&Sget_unused_iso_final_char);
4ed46869
KH
1658 defsubr (&Sdeclare_equiv_charset);
1659 defsubr (&Sfind_charset_region);
1660 defsubr (&Sfind_charset_string);
1661 defsubr (&Smake_char_internal);
1662 defsubr (&Ssplit_char);
1663 defsubr (&Schar_charset);
1664 defsubr (&Siso_charset);
9d3d8cba 1665 defsubr (&Schar_valid_p);
4ed46869
KH
1666 defsubr (&Schar_bytes);
1667 defsubr (&Schar_width);
1668 defsubr (&Sstring_width);
1669 defsubr (&Schar_direction);
af4fecb4 1670 defsubr (&Schars_in_region);
87b089ad 1671 defsubr (&Sstring);
4ed46869
KH
1672 defsubr (&Scmpcharp);
1673 defsubr (&Scmpchar_component);
1674 defsubr (&Scmpchar_cmp_rule);
1675 defsubr (&Scmpchar_cmp_rule_p);
1676 defsubr (&Scmpchar_cmp_count);
1677 defsubr (&Scompose_string);
1678 defsubr (&Ssetup_special_charsets);
1679
1680 DEFVAR_LISP ("charset-list", &Vcharset_list,
1681 "List of charsets ever defined.");
1682 Vcharset_list = Fcons (Qascii, Qnil);
1683
b0e3cf2b
KH
1684 DEFVAR_LISP ("character-unification-table-vector",
1685 &Vcharacter_unification_table_vector,
1686 "Vector of cons cell of a symbol and unification table ever defined.\n\
1687An ID of a unification table is an index of this vector.");
1688 Vcharacter_unification_table_vector = Fmake_vector (make_number (16), Qnil);
1689
4ed46869
KH
1690 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
1691 "Leading-code of composite characters.");
1692 leading_code_composition = LEADING_CODE_COMPOSITION;
1693
1694 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1695 "Leading-code of private TYPE9N charset of column-width 1.");
1696 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1697
1698 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1699 "Leading-code of private TYPE9N charset of column-width 2.");
1700 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1701
1702 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1703 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1704 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1705
1706 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1707 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1708 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
35e623fb
RS
1709
1710 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
4cf9710d
RS
1711 "Offset for converting non-ASCII unibyte codes 0200...0377 to multibyte.\n\
1712This is used for converting unibyte text to multibyte,\n\
1713and for inserting character codes specified by number.\n\n\
1714Conversion is performed only when multibyte characters are enabled,\n\
1715and it serves to convert a Latin-1 or similar 8-bit character code\n\
1716to the corresponding Emacs character code.\n\
1717If `nonascii-translate-table' is non-nil, it overrides this variable.");
35e623fb 1718 nonascii_insert_offset = 0;
b0e3cf2b 1719
4cf9710d
RS
1720 DEFVAR_LISP ("nonascii-translate-table", &Vnonascii_translate_table,
1721 "Translate table for converting non-ASCII unibyte codes to multibyte.\n\
1722This is used for converting unibyte text to multibyte,\n\
1723and for inserting character codes specified by number.\n\n\
1724Conversion is performed only when multibyte characters are enabled,\n\
1725and it serves to convert a Latin-1 or similar 8-bit character code\n\
1726to the corresponding Emacs character code.\n\n\
1727If this is nil, `nonascii-insert-offset' is used instead.");
1728 Vnonascii_translate_table = Qnil;
1729
b0e3cf2b
KH
1730 DEFVAR_INT ("min-composite-char", &min_composite_char,
1731 "Minimum character code of a composite character.");
1732 min_composite_char = MIN_CHAR_COMPOSITION;
4ed46869
KH
1733}
1734
1735#endif /* emacs */