Doc fixes.
[bpt/emacs.git] / src / charset.c
CommitLineData
75c8c592 1/* Basic multilingual character support.
35e623fb 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
75c8c592 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25#include <stdio.h>
26
27#ifdef emacs
28
29#include <sys/types.h>
30#include <config.h>
31#include "lisp.h"
32#include "buffer.h"
33#include "charset.h"
34#include "coding.h"
fc6b09bf 35#include "disptab.h"
4ed46869
KH
36
37#else /* not emacs */
38
39#include "mulelib.h"
40
41#endif /* emacs */
42
43Lisp_Object Qcharset, Qascii, Qcomposition;
44
45/* Declaration of special leading-codes. */
46int leading_code_composition; /* for composite characters */
47int leading_code_private_11; /* for private DIMENSION1 of 1-column */
48int leading_code_private_12; /* for private DIMENSION1 of 2-column */
49int leading_code_private_21; /* for private DIMENSION2 of 1-column */
50int leading_code_private_22; /* for private DIMENSION2 of 2-column */
51
52/* Declaration of special charsets. */
53int charset_ascii; /* ASCII */
54int charset_composition; /* for a composite character */
55int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
56int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
57int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
58int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
59int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
60int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
61int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
62
b0e3cf2b
KH
63int min_composite_char;
64
4ed46869
KH
65Lisp_Object Qcharset_table;
66
67/* A char-table containing information of each character set. */
68Lisp_Object Vcharset_table;
69
70/* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72Lisp_Object Vcharset_symbol_table;
73
74/* A list of charset symbols ever defined. */
75Lisp_Object Vcharset_list;
76
b0e3cf2b
KH
77/* Vector of unification table ever defined.
78 An ID of a unification table is an index of this vector. */
79Lisp_Object Vcharacter_unification_table_vector;
80
4ed46869
KH
81/* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
82int bytes_by_char_head[256];
83int width_by_char_head[256];
84
85/* Mapping table from ISO2022's charset (specified by DIMENSION,
86 CHARS, and FINAL-CHAR) to Emacs' charset. */
87int iso_charset_table[2][2][128];
88
513ee442
KH
89/* Table of pointers to the structure `cmpchar_info' indexed by
90 CMPCHAR-ID. */
91struct cmpchar_info **cmpchar_table;
92/* The current size of `cmpchar_table'. */
93static int cmpchar_table_size;
94/* Number of the current composite characters. */
95int n_cmpchars;
96
4ed46869
KH
97/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
98unsigned char *_fetch_multibyte_char_p;
99int _fetch_multibyte_char_len;
100
35e623fb
RS
101/* Offset to add to a non-ASCII value when inserting it. */
102int nonascii_insert_offset;
103
046b1f03
RS
104#define min(X, Y) ((X) < (Y) ? (X) : (Y))
105#define max(X, Y) ((X) > (Y) ? (X) : (Y))
106\f
4ed46869
KH
107/* Set STR a pointer to the multi-byte form of the character C. If C
108 is not a composite character, the multi-byte form is set in WORKBUF
109 and STR points WORKBUF. The caller should allocate at least 4-byte
110 area at WORKBUF in advance. Returns the length of the multi-byte
bd4c6dd0
KH
111 form. If C is an invalid character to have a multi-byte form,
112 signal an error.
4ed46869
KH
113
114 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
115 function directly if C can be an ASCII character. */
116
117int
118non_ascii_char_to_string (c, workbuf, str)
119 int c;
120 unsigned char *workbuf, **str;
121{
6dc0722d 122 int charset, c1, c2;
4ed46869
KH
123
124 if (COMPOSITE_CHAR_P (c))
125 {
126 int cmpchar_id = COMPOSITE_CHAR_ID (c);
127
128 if (cmpchar_id < n_cmpchars)
129 {
130 *str = cmpchar_table[cmpchar_id]->data;
131 return cmpchar_table[cmpchar_id]->len;
132 }
133 else
134 {
d16a4fc6 135 error ("Invalid character: %d", c);
4ed46869
KH
136 }
137 }
138
139 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
bd4c6dd0
KH
140 if (!charset
141 || ! CHARSET_DEFINED_P (charset)
142 || c1 >= 0 && c1 < 32
143 || c2 >= 0 && c2 < 32)
d16a4fc6 144 error ("Invalid character: %d", c);
4ed46869
KH
145
146 *str = workbuf;
147 *workbuf++ = CHARSET_LEADING_CODE_BASE (charset);
148 if (*workbuf = CHARSET_LEADING_CODE_EXT (charset))
149 workbuf++;
150 *workbuf++ = c1 | 0x80;
6dc0722d 151 if (c2 >= 0)
4ed46869
KH
152 *workbuf++ = c2 | 0x80;
153
154 return (workbuf - *str);
155}
156
157/* Return a non-ASCII character of which multi-byte form is at STR of
158 length LEN. If ACTUAL_LEN is not NULL, the actual length of the
159 character is set to the address ACTUAL_LEN.
160
161 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
162 directly if STR can hold an ASCII character. */
163
164string_to_non_ascii_char (str, len, actual_len)
165 unsigned char *str;
166 int len, *actual_len;
167{
168 int charset;
169 unsigned char c1, c2;
170 register int c;
171
172 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
173 {
174 if (actual_len)
175 *actual_len = 1;
176 return (int) *str;
177 }
178
179 c = MAKE_NON_ASCII_CHAR (charset, c1, c2);
180
181 if (actual_len)
182 *actual_len = (charset == CHARSET_COMPOSITION
183 ? cmpchar_table[COMPOSITE_CHAR_ID (c)]->len
184 : BYTES_BY_CHAR_HEAD (*str));
185 return c;
186}
187
188/* Return the length of the multi-byte form at string STR of length LEN. */
189int
190multibyte_form_length (str, len)
191 unsigned char *str;
192 int len;
193{
194 int charset;
195 unsigned char c1, c2;
196 register int c;
197
198 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
199 return 1;
200
201 return (charset == CHARSET_COMPOSITION
202 ? cmpchar_table[(c1 << 7) | c2]->len
203 : BYTES_BY_CHAR_HEAD (*str));
204}
205
206/* Check if string STR of length LEN contains valid multi-byte form of
207 a character. If valid, charset and position codes of the character
208 is set at *CHARSET, *C1, and *C2, and return 0. If not valid,
209 return -1. This should be used only in the macro SPLIT_STRING
210 which checks range of STR in advance. */
211
212split_non_ascii_string (str, len, charset, c1, c2)
213 register unsigned char *str, *c1, *c2;
214 register int len, *charset;
215{
216 register unsigned int cs = *str++;
217
218 if (cs == LEADING_CODE_COMPOSITION)
219 {
220 int cmpchar_id = str_cmpchar_id (str - 1, len);
221
222 if (cmpchar_id < 0)
223 return -1;
224 *charset = cs, *c1 = cmpchar_id >> 7, *c2 = cmpchar_id & 0x7F;
225 }
226 else if ((cs < LEADING_CODE_PRIVATE_11 || (cs = *str++) >= 0xA0)
227 && CHARSET_DEFINED_P (cs))
228 {
229 *charset = cs;
230 if (*str < 0xA0)
231 return -1;
232 *c1 = (*str++) & 0x7F;
233 if (CHARSET_DIMENSION (cs) == 2)
234 {
235 if (*str < 0xA0)
236 return -1;
237 *c2 = (*str++) & 0x7F;
238 }
239 }
240 else
241 return -1;
242 return 0;
243}
244
23d2a7f1
KH
245/* Return a character unified with C (or a character made of CHARSET,
246 C1, and C2) in unification table TABLE. If no unification is found
247 in TABLE, return C. */
248unify_char (table, c, charset, c1, c2)
249 Lisp_Object table;
250 int c, charset, c1, c2;
251{
252 Lisp_Object ch;
253 int alt_charset, alt_c1, alt_c2, dimension;
254
255 if (c < 0) c = MAKE_CHAR (charset, c1, c2);
256 if (!CHAR_TABLE_P (table)
257 || (ch = Faref (table, make_number (c)), !INTEGERP (ch))
258 || XINT (ch) < 0)
259 return c;
260
261 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
262 dimension = CHARSET_DIMENSION (alt_charset);
263 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
264 /* CH is not a generic character, just return it. */
265 return XFASTINT (ch);
266
267 /* Since CH is a generic character, we must return a specific
268 charater which has the same position codes as C from CH. */
269 if (charset < 0)
270 SPLIT_CHAR (c, charset, c1, c2);
271 if (dimension != CHARSET_DIMENSION (charset))
272 /* We can't make such a character because of dimension mismatch. */
273 return c;
23d2a7f1
KH
274 return MAKE_CHAR (alt_charset, c1, c2);
275}
276
35e623fb
RS
277#define DEFAULT_NONASCII_INSERT_OFFSET 0x800
278
279/* Convert the unibyte character C to multibyte
280 based on nonascii_insert_offset. Note that copy_text in insdel.c
281 has similar code. */
282
283int
284unibyte_char_to_multibyte (c)
285 int c;
286{
287 if (c >= 0200 && c < 0400)
288 {
289 if (nonascii_insert_offset > 0)
290 c += nonascii_insert_offset;
291 else
292 c += DEFAULT_NONASCII_INSERT_OFFSET;
293 }
294 return c;
295}
296\f
4ed46869
KH
297/* Update the table Vcharset_table with the given arguments (see the
298 document of `define-charset' for the meaning of each argument).
299 Several other table contents are also updated. The caller should
300 check the validity of CHARSET-ID and the remaining arguments in
301 advance. */
302
303void
304update_charset_table (charset_id, dimension, chars, width, direction,
305 iso_final_char, iso_graphic_plane,
306 short_name, long_name, description)
307 Lisp_Object charset_id, dimension, chars, width, direction;
308 Lisp_Object iso_final_char, iso_graphic_plane;
309 Lisp_Object short_name, long_name, description;
310{
311 int charset = XINT (charset_id);
312 int bytes;
313 unsigned char leading_code_base, leading_code_ext;
314
6dc0722d
KH
315 if (NILP (CHARSET_TABLE_ENTRY (charset)))
316 CHARSET_TABLE_ENTRY (charset)
317 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
4ed46869
KH
318
319 /* Get byte length of multibyte form, base leading-code, and
320 extended leading-code of the charset. See the comment under the
321 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
322 bytes = XINT (dimension);
323 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
324 {
325 /* Official charset, it doesn't have an extended leading-code. */
326 if (charset != CHARSET_ASCII)
327 bytes += 1; /* For a base leading-code. */
328 leading_code_base = charset;
329 leading_code_ext = 0;
330 }
331 else
332 {
333 /* Private charset. */
334 bytes += 2; /* For base and extended leading-codes. */
335 leading_code_base
336 = (charset < LEADING_CODE_EXT_12
337 ? LEADING_CODE_PRIVATE_11
338 : (charset < LEADING_CODE_EXT_21
339 ? LEADING_CODE_PRIVATE_12
340 : (charset < LEADING_CODE_EXT_22
341 ? LEADING_CODE_PRIVATE_21
342 : LEADING_CODE_PRIVATE_22)));
343 leading_code_ext = charset;
344 }
345
346 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
347 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
348 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
349 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
350 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
351 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
352 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
353 = make_number (leading_code_base);
354 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
355 = make_number (leading_code_ext);
356 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
357 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
358 = iso_graphic_plane;
359 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
360 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
361 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
362 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
363
364 {
365 /* If we have already defined a charset which has the same
366 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
367 DIRECTION, we must update the entry REVERSE-CHARSET of both
368 charsets. If there's no such charset, the value of the entry
369 is set to nil. */
370 int i;
371
513ee442 372 for (i = 0; i <= MAX_CHARSET; i++)
4ed46869
KH
373 if (!NILP (CHARSET_TABLE_ENTRY (i)))
374 {
375 if (CHARSET_DIMENSION (i) == XINT (dimension)
376 && CHARSET_CHARS (i) == XINT (chars)
377 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
378 && CHARSET_DIRECTION (i) != XINT (direction))
379 {
380 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
381 = make_number (i);
382 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
383 break;
384 }
385 }
513ee442 386 if (i > MAX_CHARSET)
4ed46869
KH
387 /* No such a charset. */
388 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
389 = make_number (-1);
390 }
391
392 if (charset != CHARSET_ASCII
393 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
394 {
395 /* Update tables bytes_by_char_head and width_by_char_head. */
396 bytes_by_char_head[leading_code_base] = bytes;
397 width_by_char_head[leading_code_base] = XINT (width);
398
399 /* Update table emacs_code_class. */
400 emacs_code_class[charset] = (bytes == 2
401 ? EMACS_leading_code_2
402 : (bytes == 3
403 ? EMACS_leading_code_3
404 : EMACS_leading_code_4));
405 }
406
407 /* Update table iso_charset_table. */
408 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
409 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
410}
411
412#ifdef emacs
413
414/* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
415 is invalid. */
416int
417get_charset_id (charset_symbol)
418 Lisp_Object charset_symbol;
419{
420 Lisp_Object val;
421 int charset;
422
423 return ((SYMBOLP (charset_symbol)
424 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
425 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
426 CHARSET_VALID_P (charset)))
427 ? charset : -1);
428}
429
430/* Return an identification number for a new private charset of
431 DIMENSION and WIDTH. If there's no more room for the new charset,
432 return 0. */
433Lisp_Object
434get_new_private_charset_id (dimension, width)
435 int dimension, width;
436{
437 int charset, from, to;
438
439 if (dimension == 1)
440 {
441 if (width == 1)
442 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
443 else
444 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
445 }
446 else
447 {
448 if (width == 1)
449 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
450 else
b0e3cf2b 451 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
4ed46869
KH
452 }
453
454 for (charset = from; charset < to; charset++)
455 if (!CHARSET_DEFINED_P (charset)) break;
456
457 return make_number (charset < to ? charset : 0);
458}
459
460DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
461 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
23d2a7f1 462If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
4ed46869
KH
463 treated as a private charset.\n\
464INFO-VECTOR is a vector of the format:\n\
465 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
466 SHORT-NAME LONG-NAME DESCRIPTION]\n\
467The meanings of each elements is as follows:\n\
468DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
469CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
470WIDTH (integer) is the number of columns a character in the charset\n\
471occupies on the screen: one of 0, 1, and 2.\n\
472\n\
473DIRECTION (integer) is the rendering direction of characters in the\n\
474charset when rendering. If 0, render from right to left, else\n\
475render from left to right.\n\
476\n\
477ISO-FINAL-CHAR (character) is the final character of the\n\
478corresponding ISO 2022 charset.\n\
479\n\
480ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
481while encoding to variants of ISO 2022 coding system, one of the\n\
482following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
483\n\
484SHORT-NAME (string) is the short name to refer to the charset.\n\
485\n\
486LONG-NAME (string) is the long name to refer to the charset.\n\
487\n\
488DESCRIPTION (string) is the description string of the charset.")
489 (charset_id, charset_symbol, info_vector)
490 Lisp_Object charset_id, charset_symbol, info_vector;
491{
492 Lisp_Object *vec;
493
494 if (!NILP (charset_id))
495 CHECK_NUMBER (charset_id, 0);
496 CHECK_SYMBOL (charset_symbol, 1);
497 CHECK_VECTOR (info_vector, 2);
498
499 if (! NILP (charset_id))
500 {
501 if (! CHARSET_VALID_P (XINT (charset_id)))
502 error ("Invalid CHARSET: %d", XINT (charset_id));
503 else if (CHARSET_DEFINED_P (XINT (charset_id)))
504 error ("Already defined charset: %d", XINT (charset_id));
505 }
506
507 vec = XVECTOR (info_vector)->contents;
508 if (XVECTOR (info_vector)->size != 9
509 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
510 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
511 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
512 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
513 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
514 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
515 || !STRINGP (vec[6])
516 || !STRINGP (vec[7])
517 || !STRINGP (vec[8]))
518 error ("Invalid info-vector argument for defining charset %s",
519 XSYMBOL (charset_symbol)->name->data);
520
521 if (NILP (charset_id))
522 {
523 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
524 if (XINT (charset_id) == 0)
525 error ("There's no room for a new private charset %s",
526 XSYMBOL (charset_symbol)->name->data);
527 }
528
529 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
530 vec[4], vec[5], vec[6], vec[7], vec[8]);
6dc0722d 531 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
4ed46869
KH
532 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
533 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
534 return Qnil;
535}
536
3fac5a51
KH
537DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
538 Sget_unused_iso_final_char, 2, 2, 0,
539 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
540DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
541CHARS is the number of characters in a dimension: 94 or 96.\n\
542\n\
543This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
544If there's no unused final char for the specified kind of charset,\n\
545return nil.")
546 (dimension, chars)
547 Lisp_Object dimension, chars;
548{
549 int final_char;
550
551 CHECK_NUMBER (dimension, 0);
552 CHECK_NUMBER (chars, 1);
553 if (XINT (dimension) != 1 && XINT (dimension) != 2)
554 error ("Invalid charset dimension %d, it should be 1 or 2",
555 XINT (dimension));
556 if (XINT (chars) != 94 && XINT (chars) != 96)
557 error ("Invalid charset chars %d, it should be 94 or 96",
558 XINT (chars));
559 for (final_char = '0'; final_char <= '?'; final_char++)
560 {
561 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
562 break;
563 }
564 return (final_char <= '?' ? make_number (final_char) : Qnil);
565}
566
4ed46869
KH
567DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
568 4, 4, 0,
569 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
570CHARSET should be defined by `defined-charset' in advance.")
571 (dimension, chars, final_char, charset_symbol)
572 Lisp_Object dimension, chars, final_char, charset_symbol;
573{
574 int charset;
575
576 CHECK_NUMBER (dimension, 0);
577 CHECK_NUMBER (chars, 1);
578 CHECK_NUMBER (final_char, 2);
579 CHECK_SYMBOL (charset_symbol, 3);
580
581 if (XINT (dimension) != 1 && XINT (dimension) != 2)
582 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
583 if (XINT (chars) != 94 && XINT (chars) != 96)
584 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
585 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
586 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
587 if ((charset = get_charset_id (charset_symbol)) < 0)
588 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
589
590 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
591 return Qnil;
592}
593
594/* Return number of different charsets in STR of length LEN. In
595 addition, for each found charset N, CHARSETS[N] is set 1. The
a29e3b1b 596 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
23d2a7f1 597 It may lookup a unification table TABLE if supplied. */
4ed46869
KH
598
599int
23d2a7f1 600find_charset_in_str (str, len, charsets, table)
028d516b
KH
601 unsigned char *str;
602 int len, *charsets;
23d2a7f1 603 Lisp_Object table;
4ed46869 604{
733eafd8 605 register int num = 0, c;
4ed46869 606
23d2a7f1
KH
607 if (! CHAR_TABLE_P (table))
608 table = Qnil;
609
4ed46869
KH
610 while (len > 0)
611 {
05505664 612 int bytes, charset;
733eafd8 613 c = *str;
23d2a7f1 614
733eafd8 615 if (c == LEADING_CODE_COMPOSITION)
05505664 616 {
733eafd8
KH
617 int cmpchar_id = str_cmpchar_id (str, len);
618 GLYPH *glyph;
05505664 619
733eafd8 620 if (cmpchar_id > 0)
05505664 621 {
733eafd8
KH
622 struct cmpchar_info *cmpcharp = cmpchar_table[cmpchar_id];
623 int i;
624
625 for (i = 0; i < cmpcharp->glyph_len; i++)
626 {
627 c = cmpcharp->glyph[i];
628 if (!NILP (table))
629 {
630 if ((c = unify_char (table, c, 0, 0, 0)) < 0)
631 c = cmpcharp->glyph[i];
632 }
633 if ((charset = CHAR_CHARSET (c)) < 0)
634 charset = CHARSET_ASCII;
635 if (!charsets[charset])
636 {
637 charsets[charset] = 1;
638 num += 1;
639 }
640 }
641 str += cmpcharp->len;
642 len -= cmpcharp->len;
643 continue;
05505664 644 }
05505664 645
733eafd8
KH
646 charset = CHARSET_ASCII;
647 bytes = 1;
648 }
23d2a7f1
KH
649 else
650 {
733eafd8
KH
651 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
652 if (! NILP (table))
653 {
654 int c1 = unify_char (table, c, 0, 0, 0);
655 if (c1 >= 0)
656 c = c1;
657 }
658 charset = CHAR_CHARSET (c);
23d2a7f1 659 }
4ed46869
KH
660
661 if (!charsets[charset])
662 {
663 charsets[charset] = 1;
664 num += 1;
665 }
666 str += bytes;
667 len -= bytes;
668 }
669 return num;
670}
671
672DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
23d2a7f1 673 2, 3, 0,
4ed46869 674 "Return a list of charsets in the region between BEG and END.\n\
23d2a7f1
KH
675BEG and END are buffer positions.\n\
676Optional arg TABLE if non-nil is a unification table to look up.")
677 (beg, end, table)
678 Lisp_Object beg, end, table;
4ed46869 679{
028d516b 680 int charsets[MAX_CHARSET + 1];
6ae1f27e 681 int from, from_byte, to, stop, stop_byte, i;
4ed46869
KH
682 Lisp_Object val;
683
684 validate_region (&beg, &end);
685 from = XFASTINT (beg);
686 stop = to = XFASTINT (end);
6ae1f27e 687
4ed46869 688 if (from < GPT && GPT < to)
6ae1f27e
RS
689 {
690 stop = GPT;
691 stop_byte = GPT_BYTE;
692 }
693 else
694 stop_byte = CHAR_TO_BYTE (stop);
695
696 from_byte = CHAR_TO_BYTE (from);
697
028d516b 698 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
4ed46869
KH
699 while (1)
700 {
6ae1f27e
RS
701 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
702 charsets, table);
4ed46869 703 if (stop < to)
6ae1f27e
RS
704 {
705 from = stop, from_byte = stop_byte;
706 stop = to, stop_byte = CHAR_TO_BYTE (stop);
707 }
4ed46869
KH
708 else
709 break;
710 }
6ae1f27e 711
4ed46869 712 val = Qnil;
513ee442 713 for (i = MAX_CHARSET; i >= 0; i--)
4ed46869
KH
714 if (charsets[i])
715 val = Fcons (CHARSET_SYMBOL (i), val);
716 return val;
717}
718
719DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
23d2a7f1
KH
720 1, 2, 0,
721 "Return a list of charsets in STR.\n\
722Optional arg TABLE if non-nil is a unification table to look up.")
723 (str, table)
724 Lisp_Object str, table;
4ed46869 725{
a29e3b1b 726 int charsets[MAX_CHARSET + 1];
4ed46869
KH
727 int i;
728 Lisp_Object val;
729
730 CHECK_STRING (str, 0);
87b089ad
RS
731
732 if (! STRING_MULTIBYTE (str))
733 return Qnil;
734
a29e3b1b 735 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
87b089ad 736 find_charset_in_str (XSTRING (str)->data, XSTRING (str)->size_byte,
23d2a7f1 737 charsets, table);
4ed46869 738 val = Qnil;
513ee442 739 for (i = MAX_CHARSET; i >= 0; i--)
4ed46869
KH
740 if (charsets[i])
741 val = Fcons (CHARSET_SYMBOL (i), val);
742 return val;
743}
744\f
745DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
513ee442 746 "")
4ed46869
KH
747 (charset, code1, code2)
748 Lisp_Object charset, code1, code2;
749{
750 CHECK_NUMBER (charset, 0);
751
752 if (NILP (code1))
753 XSETFASTINT (code1, 0);
754 else
755 CHECK_NUMBER (code1, 1);
756 if (NILP (code2))
757 XSETFASTINT (code2, 0);
758 else
759 CHECK_NUMBER (code2, 2);
760
761 if (!CHARSET_DEFINED_P (XINT (charset)))
762 error ("Invalid charset: %d", XINT (charset));
763
764 return make_number (MAKE_CHAR (XINT (charset), XINT (code1), XINT (code2)));
765}
766
767DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
768 "Return list of charset and one or two position-codes of CHAR.")
769 (ch)
770 Lisp_Object ch;
771{
772 Lisp_Object val;
6dc0722d 773 int charset, c1, c2;
4ed46869
KH
774
775 CHECK_NUMBER (ch, 0);
776 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6dc0722d 777 return (c2 >= 0
4ed46869
KH
778 ? Fcons (CHARSET_SYMBOL (charset),
779 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
780 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
781}
782
783DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
784 "Return charset of CHAR.")
785 (ch)
786 Lisp_Object ch;
787{
788 CHECK_NUMBER (ch, 0);
789
790 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
791}
792
793DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
794 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.")
795 (dimension, chars, final_char)
796 Lisp_Object dimension, chars, final_char;
797{
798 int charset;
799
800 CHECK_NUMBER (dimension, 0);
801 CHECK_NUMBER (chars, 1);
802 CHECK_NUMBER (final_char, 2);
803
804 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
805 return Qnil;
806 return CHARSET_SYMBOL (charset);
807}
808
9d3d8cba
KH
809/* If GENERICP is nonzero, return nonzero iff C is a valid normal or
810 generic character. If GENERICP is zero, return nonzero iff C is a
811 valid normal character. Do not call this function directly,
812 instead use macro CHAR_VALID_P. */
813int
814char_valid_p (c, genericp)
815 int c, genericp;
816{
817 int charset, c1, c2;
818
819 if (c < 0)
820 return 0;
821 if (SINGLE_BYTE_CHAR_P (c))
822 return 1;
823 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
824 if (!CHARSET_VALID_P (charset))
825 return 0;
826 return (c < MIN_CHAR_COMPOSITION
827 ? ((c & CHAR_FIELD1_MASK) /* i.e. dimension of C is two. */
828 ? (genericp && c1 == 0 && c2 == 0
829 || c1 >= 32 && c2 >= 32)
830 : (genericp && c1 == 0
831 || c1 >= 32))
832 : c < MIN_CHAR_COMPOSITION + n_cmpchars);
833}
834
835DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
a9d02884
DL
836 "Return t if OBJECT is a valid normal character.\n\
837If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
9d3d8cba
KH
838a valid generic character.")
839 (object, genericp)
840 Lisp_Object object, genericp;
841{
842 if (! NATNUMP (object))
843 return Qnil;
844 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
845}
846
4ed46869
KH
847DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
848 "Return byte length of multi-byte form of CHAR.")
849 (ch)
850 Lisp_Object ch;
851{
852 Lisp_Object val;
853 int bytes;
854
855 CHECK_NUMBER (ch, 0);
856 if (COMPOSITE_CHAR_P (XFASTINT (ch)))
857 {
858 unsigned int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
859
860 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
861 }
862 else
863 {
864 int charset = CHAR_CHARSET (XFASTINT (ch));
865
866 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
867 }
868
869 XSETFASTINT (val, bytes);
870 return val;
871}
872
873/* Return the width of character of which multi-byte form starts with
874 C. The width is measured by how many columns occupied on the
875 screen when displayed in the current buffer. */
876
877#define ONE_BYTE_CHAR_WIDTH(c) \
878 (c < 0x20 \
879 ? (c == '\t' \
53316e55 880 ? XFASTINT (current_buffer->tab_width) \
4ed46869
KH
881 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
882 : (c < 0x7f \
883 ? 1 \
884 : (c == 0x7F \
885 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
886 : ((! NILP (current_buffer->enable_multibyte_characters) \
887 && BASE_LEADING_CODE_P (c)) \
888 ? WIDTH_BY_CHAR_HEAD (c) \
889 : 4)))) \
890
891
892DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
893 "Return width of CHAR when displayed in the current buffer.\n\
894The width is measured by how many columns it occupies on the screen.")
895 (ch)
896 Lisp_Object ch;
897{
859f2b3c 898 Lisp_Object val, disp;
4ed46869 899 int c;
51c4025f 900 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869
KH
901
902 CHECK_NUMBER (ch, 0);
903
859f2b3c
RS
904 c = XINT (ch);
905
906 /* Get the way the display table would display it. */
51c4025f 907 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
859f2b3c
RS
908
909 if (VECTORP (disp))
910 XSETINT (val, XVECTOR (disp)->size);
911 else if (SINGLE_BYTE_CHAR_P (c))
912 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
4ed46869
KH
913 else if (COMPOSITE_CHAR_P (c))
914 {
915 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
916 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 0));
917 }
918 else
919 {
920 int charset = CHAR_CHARSET (c);
921
922 XSETFASTINT (val, CHARSET_WIDTH (charset));
923 }
924 return val;
925}
926
927/* Return width of string STR of length LEN when displayed in the
928 current buffer. The width is measured by how many columns it
929 occupies on the screen. */
859f2b3c 930
4ed46869
KH
931int
932strwidth (str, len)
933 unsigned char *str;
934 int len;
935{
936 unsigned char *endp = str + len;
937 int width = 0;
c4a4e28f 938 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869 939
859f2b3c
RS
940 while (str < endp)
941 {
942 if (*str == LEADING_CODE_COMPOSITION)
943 {
944 int id = str_cmpchar_id (str, endp - str);
945
946 if (id < 0)
947 {
948 width += 4;
949 str++;
950 }
951 else
952 {
953 width += cmpchar_table[id]->width;
954 str += cmpchar_table[id]->len;
955 }
956 }
957 else
958 {
959 Lisp_Object disp;
960 int thiswidth;
961 int c = STRING_CHAR (str, endp - str);
962
963 /* Get the way the display table would display it. */
acc35c36
RS
964 if (dp)
965 disp = DISP_CHAR_VECTOR (dp, c);
966 else
967 disp = Qnil;
859f2b3c
RS
968
969 if (VECTORP (disp))
970 thiswidth = XVECTOR (disp)->size;
971 else
972 thiswidth = ONE_BYTE_CHAR_WIDTH (*str);
973
974 width += thiswidth;
975 str += BYTES_BY_CHAR_HEAD (*str);
976 }
977 }
4ed46869
KH
978 return width;
979}
980
981DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
982 "Return width of STRING when displayed in the current buffer.\n\
983Width is measured by how many columns it occupies on the screen.\n\
046b1f03
RS
984When calculating width of a multibyte character in STRING,\n\
985only the base leading-code is considered; the validity of\n\
986the following bytes is not checked.")
4ed46869
KH
987 (str)
988 Lisp_Object str;
989{
990 Lisp_Object val;
991
992 CHECK_STRING (str, 0);
993 XSETFASTINT (val, strwidth (XSTRING (str)->data, XSTRING (str)->size));
994 return val;
995}
996
997DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
998 "Return the direction of CHAR.\n\
999The returned value is 0 for left-to-right and 1 for right-to-left.")
1000 (ch)
1001 Lisp_Object ch;
1002{
1003 int charset;
1004
1005 CHECK_NUMBER (ch, 0);
1006 charset = CHAR_CHARSET (XFASTINT (ch));
1007 if (!CHARSET_DEFINED_P (charset))
1008 error ("Invalid character: %d", XINT (ch));
1009 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1010}
1011
af4fecb4 1012DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
6ae1f27e 1013 "Return number of characters between BEG and END.")
046b1f03
RS
1014 (beg, end)
1015 Lisp_Object beg, end;
1016{
6ae1f27e 1017 int from, to;
046b1f03
RS
1018
1019 from = min (XFASTINT (beg), XFASTINT (end));
a8a35e61 1020 to = max (XFASTINT (beg), XFASTINT (end));
046b1f03 1021
6ae1f27e
RS
1022 return to - from;
1023}
9036eb45 1024
87b089ad
RS
1025/* Return the number of characters in the NBYTES bytes at PTR.
1026 This works by looking at the contents and checking for multibyte sequences.
1027 However, if the current buffer has enable-multibyte-characters = nil,
1028 we treat each byte as a character. */
1029
6ae1f27e
RS
1030int
1031chars_in_text (ptr, nbytes)
1032 unsigned char *ptr;
1033 int nbytes;
1034{
1035 unsigned char *endp;
1036 int chars;
046b1f03 1037
87b089ad
RS
1038 /* current_buffer is null at early stages of Emacs initialization. */
1039 if (current_buffer == 0
1040 || NILP (current_buffer->enable_multibyte_characters))
6ae1f27e 1041 return nbytes;
a8a35e61 1042
6ae1f27e
RS
1043 endp = ptr + nbytes;
1044 chars = 0;
046b1f03 1045
6ae1f27e
RS
1046 while (ptr < endp)
1047 {
1048 if (*ptr == LEADING_CODE_COMPOSITION)
046b1f03 1049 {
6ae1f27e
RS
1050 ptr++;
1051 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
046b1f03
RS
1052 }
1053 else
6ae1f27e 1054 ptr += BYTES_BY_CHAR_HEAD (*ptr);
046b1f03
RS
1055 chars++;
1056 }
1057
6ae1f27e 1058 return chars;
046b1f03
RS
1059}
1060
87b089ad
RS
1061/* Return the number of characters in the NBYTES bytes at PTR.
1062 This works by looking at the contents and checking for multibyte sequences.
1063 It ignores enable-multibyte-characters. */
1064
1065int
1066multibyte_chars_in_text (ptr, nbytes)
1067 unsigned char *ptr;
1068 int nbytes;
1069{
1070 unsigned char *endp;
1071 int chars;
1072
1073 endp = ptr + nbytes;
1074 chars = 0;
1075
1076 while (ptr < endp)
1077 {
1078 if (*ptr == LEADING_CODE_COMPOSITION)
1079 {
1080 ptr++;
1081 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
1082 }
1083 else
1084 ptr += BYTES_BY_CHAR_HEAD (*ptr);
1085 chars++;
1086 }
1087
1088 return chars;
1089}
1090
1091DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
4ed46869 1092 "Concatenate all the argument characters and make the result a string.")
53316e55
KH
1093 (n, args)
1094 int n;
4ed46869
KH
1095 Lisp_Object *args;
1096{
53316e55 1097 int i;
4ed46869 1098 unsigned char *buf
bd4c6dd0 1099 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
4ed46869
KH
1100 unsigned char *p = buf;
1101 Lisp_Object val;
1102
1103 for (i = 0; i < n; i++)
1104 {
1105 int c, len;
1106 unsigned char *str;
1107
1108 if (!INTEGERP (args[i]))
b0e3cf2b 1109 CHECK_NUMBER (args[i], 0);
4ed46869
KH
1110 c = XINT (args[i]);
1111 len = CHAR_STRING (c, p, str);
1112 if (p != str)
1113 /* C is a composite character. */
1114 bcopy (str, p, len);
1115 p += len;
1116 }
1117
87b089ad 1118 val = make_multibyte_string (buf, n, p - buf);
4ed46869
KH
1119 return val;
1120}
1121
1122#endif /* emacs */
1123\f
1124/*** Composite characters staffs ***/
1125
1126/* Each composite character is identified by CMPCHAR-ID which is
1127 assigned when Emacs needs the character code of the composite
1128 character (e.g. when displaying it on the screen). See the
1129 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1130 composite character is represented in Emacs. */
1131
1132/* If `static' is defined, it means that it is defined to null string. */
1133#ifndef static
1134/* The following function is copied from lread.c. */
1135static int
1136hash_string (ptr, len)
1137 unsigned char *ptr;
1138 int len;
1139{
1140 register unsigned char *p = ptr;
1141 register unsigned char *end = p + len;
1142 register unsigned char c;
1143 register int hash = 0;
1144
1145 while (p != end)
1146 {
1147 c = *p++;
1148 if (c >= 0140) c -= 40;
1149 hash = ((hash<<3) + (hash>>28) + c);
1150 }
1151 return hash & 07777777777;
1152}
1153#endif
1154
4ed46869
KH
1155#define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1156
1157static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1158
1159/* Each element of `cmpchar_hash_table' is a pointer to an array of
1160 integer, where the 1st element is the size of the array, the 2nd
1161 element is how many elements are actually used in the array, and
1162 the remaining elements are CMPCHAR-IDs of composite characters of
1163 the same hash value. */
1164#define CMPCHAR_HASH_SIZE(table) table[0]
1165#define CMPCHAR_HASH_USED(table) table[1]
1166#define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1167
1168/* Return CMPCHAR-ID of the composite character in STR of the length
1169 LEN. If the composite character has not yet been registered,
1170 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1171 is the sole function for assigning CMPCHAR-ID. */
1172int
1173str_cmpchar_id (str, len)
1174 unsigned char *str;
1175 int len;
1176{
1177 int hash_idx, *hashp;
1178 unsigned char *buf;
1179 int embedded_rule; /* 1 if composition rule is embedded. */
1180 int chars; /* number of components. */
1181 int i;
1182 struct cmpchar_info *cmpcharp;
1183
1184 if (len < 5)
1185 /* Any composite char have at least 3-byte length. */
1186 return -1;
1187
1188 /* The second byte 0xFF means compostion rule is embedded. */
1189 embedded_rule = (str[1] == 0xFF);
1190
1191 /* At first, get the actual length of the composite character. */
1192 {
1193 unsigned char *p, *endp = str + 1, *lastp = str + len;
1194 int bytes;
1195
6ae1f27e 1196 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
4ed46869
KH
1197 chars = 0;
1198 p = str + 1 + embedded_rule;
1199 while (p < endp)
1200 {
1201 /* No need of checking if *P is 0xA0 because
1202 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1203 p += (bytes = BYTES_BY_CHAR_HEAD (*p - 0x20) + embedded_rule);
1204 chars++;
1205 }
1206 len = (p -= embedded_rule) - str;
1207 if (p > endp)
1208 len -= - bytes, chars--;
1209
1210 if (chars < 2 || chars > MAX_COMPONENT_COUNT)
1211 /* Invalid number of components. */
1212 return -1;
1213 }
1214 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1215 hashp = cmpchar_hash_table[hash_idx];
1216
1217 /* Then, look into the hash table. */
1218 if (hashp != NULL)
1219 /* Find the correct one among composite characters of the same
1220 hash value. */
1221 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1222 {
1223 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1224 if (len == cmpcharp->len
1225 && ! bcmp (str, cmpcharp->data, len))
1226 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1227 }
1228
1229 /* We have to register the composite character in cmpchar_table. */
513ee442
KH
1230 if (n_cmpchars > (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
1231 /* No, we have no more room for a new composite character. */
1232 return -1;
1233
4ed46869
KH
1234 /* Make the entry in hash table. */
1235 if (hashp == NULL)
1236 {
1237 /* Make a table for 8 composite characters initially. */
1238 hashp = (cmpchar_hash_table[hash_idx]
1239 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1240 CMPCHAR_HASH_SIZE (hashp) = 10;
1241 CMPCHAR_HASH_USED (hashp) = 2;
1242 }
1243 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1244 {
1245 CMPCHAR_HASH_SIZE (hashp) += 8;
1246 hashp = (cmpchar_hash_table[hash_idx]
1247 = (int *) xrealloc (hashp,
1248 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1249 }
1250 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1251 CMPCHAR_HASH_USED (hashp)++;
1252
1253 /* Set information of the composite character in cmpchar_table. */
1254 if (cmpchar_table_size == 0)
1255 {
1256 /* This is the first composite character to be registered. */
1257 cmpchar_table_size = 256;
1258 cmpchar_table
1259 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1260 * cmpchar_table_size);
1261 }
1262 else if (cmpchar_table_size <= n_cmpchars)
1263 {
1264 cmpchar_table_size += 256;
1265 cmpchar_table
1266 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1267 sizeof (cmpchar_table[0])
1268 * cmpchar_table_size);
1269 }
1270
1271 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1272
1273 cmpcharp->len = len;
1274 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1275 bcopy (str, cmpcharp->data, len);
1276 cmpcharp->data[len] = 0;
1277 cmpcharp->glyph_len = chars;
1278 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1279 if (embedded_rule)
1280 {
1281 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1282 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1283 }
1284 else
1285 {
1286 cmpcharp->cmp_rule = NULL;
1287 cmpcharp->col_offset = NULL;
1288 }
1289
1290 /* Setup GLYPH data and composition rules (if any) so as not to make
1291 them every time on displaying. */
1292 {
1293 unsigned char *bufp;
1294 int width;
1295 float leftmost = 0.0, rightmost = 1.0;
1296
1297 if (embedded_rule)
1298 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1299 cmpcharp->col_offset[0] = 0;
1300
1301 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1302 {
1303 if (embedded_rule)
1304 cmpcharp->cmp_rule[i] = *bufp++;
1305
1306 if (*bufp == 0xA0) /* This is an ASCII character. */
1307 {
1308 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1309 width = 1;
1310 bufp++;
1311 }
1312 else /* Multibyte character. */
1313 {
1314 /* Make `bufp' point normal multi-byte form temporally. */
1315 *bufp -= 0x20;
1316 cmpcharp->glyph[i]
1317 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1318 width = WIDTH_BY_CHAR_HEAD (*bufp);
1319 *bufp += 0x20;
1320 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1321 }
1322
1323 if (embedded_rule && i > 0)
1324 {
1325 /* Reference points (global_ref and new_ref) are
1326 encoded as below:
1327
1328 0--1--2 -- ascent
1329 | |
1330 | |
1331 | 4 -+--- center
1332 -- 3 5 -- baseline
1333 | |
1334 6--7--8 -- descent
1335
1336 Now, we calculate the column offset of the new glyph
1337 from the left edge of the first glyph. This can avoid
1338 the same calculation everytime displaying this
1339 composite character. */
1340
1341 /* Reference points of global glyph and new glyph. */
1342 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1343 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1344 /* Column offset relative to the first glyph. */
1345 float left = (leftmost
1346 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1347 - (new_ref % 3) * width / 2.0);
1348
1349 cmpcharp->col_offset[i] = left;
1350 if (left < leftmost)
1351 leftmost = left;
1352 if (left + width > rightmost)
1353 rightmost = left + width;
1354 }
1355 else
1356 {
1357 if (width > rightmost)
1358 rightmost = width;
1359 }
1360 }
1361 if (embedded_rule)
1362 {
1363 /* Now col_offset[N] are relative to the left edge of the
1364 first component. Make them relative to the left edge of
1365 overall glyph. */
1366 for (i = 0; i < chars; i++)
1367 cmpcharp->col_offset[i] -= leftmost;
1368 /* Make rightmost holds width of overall glyph. */
1369 rightmost -= leftmost;
1370 }
1371
1372 cmpcharp->width = rightmost;
1373 if (cmpcharp->width < rightmost)
1374 /* To get a ceiling integer value. */
1375 cmpcharp->width++;
1376 }
1377
1378 cmpchar_table[n_cmpchars] = cmpcharp;
1379
1380 return n_cmpchars++;
1381}
1382
1383/* Return the Nth element of the composite character C. */
1384int
1385cmpchar_component (c, n)
1386 unsigned int c, n;
1387{
1388 int id = COMPOSITE_CHAR_ID (c);
1389
1390 if (id >= n_cmpchars /* C is not a valid composite character. */
1391 || n >= cmpchar_table[id]->glyph_len) /* No such component. */
1392 return -1;
1393 /* No face data is stored in glyph code. */
1394 return ((int) (cmpchar_table[id]->glyph[n]));
1395}
1396
1397DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1398 "T if CHAR is a composite character.")
1399 (ch)
1400 Lisp_Object ch;
1401{
1402 CHECK_NUMBER (ch, 0);
1403 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1404}
1405
1406DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1407 2, 2, 0,
1408 "Return the IDXth component character of composite character CHARACTER.")
1409 (character, idx)
1410 Lisp_Object character, idx;
1411{
1412 int c;
1413
1414 CHECK_NUMBER (character, 0);
1415 CHECK_NUMBER (idx, 1);
1416
1417 if ((c = cmpchar_component (XINT (character), XINT (idx))) < 0)
1418 args_out_of_range (character, idx);
1419
1420 return make_number (c);
1421}
1422
1423DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1424 2, 2, 0,
55001746
KH
1425 "Return the Nth composition rule embedded in composite character CHARACTER.\n\
1426The returned rule is for composing the Nth component\n\
1427on the (N-1)th component. If N is 0, the returned value is always 255.")
1428 (character, n)
1429 Lisp_Object character, n;
4ed46869
KH
1430{
1431 int id, i;
1432
1433 CHECK_NUMBER (character, 0);
55001746 1434 CHECK_NUMBER (n, 1);
4ed46869
KH
1435
1436 id = COMPOSITE_CHAR_ID (XINT (character));
1437 if (id < 0 || id >= n_cmpchars)
1438 error ("Invalid composite character: %d", XINT (character));
55001746 1439 i = XINT (n);
4ed46869 1440 if (i > cmpchar_table[id]->glyph_len)
55001746 1441 args_out_of_range (character, n);
4ed46869
KH
1442
1443 return make_number (cmpchar_table[id]->cmp_rule[i]);
1444}
1445
1446DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1447 Scmpchar_cmp_rule_p, 1, 1, 0,
1448 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1449 (character)
1450 Lisp_Object character;
1451{
1452 int id;
1453
1454 CHECK_NUMBER (character, 0);
1455 id = COMPOSITE_CHAR_ID (XINT (character));
1456 if (id < 0 || id >= n_cmpchars)
1457 error ("Invalid composite character: %d", XINT (character));
1458
1459 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1460}
1461
1462DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1463 Scmpchar_cmp_count, 1, 1, 0,
1464 "Return number of compoents of composite character CHARACTER.")
1465 (character)
1466 Lisp_Object character;
1467{
1468 int id;
1469
1470 CHECK_NUMBER (character, 0);
1471 id = COMPOSITE_CHAR_ID (XINT (character));
1472 if (id < 0 || id >= n_cmpchars)
1473 error ("Invalid composite character: %d", XINT (character));
1474
1475 return (make_number (cmpchar_table[id]->glyph_len));
1476}
1477
1478DEFUN ("compose-string", Fcompose_string, Scompose_string,
1479 1, 1, 0,
1480 "Return one char string composed from all characters in STRING.")
1481 (str)
1482 Lisp_Object str;
1483{
1484 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1485 int len, i;
1486
1487 CHECK_STRING (str, 0);
1488
1489 buf[0] = LEADING_CODE_COMPOSITION;
1490 p = XSTRING (str)->data;
87b089ad 1491 pend = p + XSTRING (str)->size_byte;
4ed46869
KH
1492 i = 1;
1493 while (p < pend)
1494 {
1495 if (*p < 0x20 || *p == 127) /* control code */
1496 error ("Invalid component character: %d", *p);
1497 else if (*p < 0x80) /* ASCII */
1498 {
1499 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1500 error ("Too long string to be composed: %s", XSTRING (str)->data);
1501 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1502 code itself. */
1503 buf[i++] = 0xA0;
1504 buf[i++] = *p++ + 0x80;
1505 }
1506 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1507 {
1508 /* Already composed. Eliminate the heading
1509 LEADING_CODE_COMPOSITION, keep the remaining bytes
1510 unchanged. */
1511 p++;
1512 ptemp = p;
6ae1f27e 1513 while (! CHAR_HEAD_P (*p)) p++;
4ed46869
KH
1514 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1515 error ("Too long string to be composed: %s", XSTRING (str)->data);
1516 bcopy (ptemp, buf + i, p - ptemp);
1517 i += p - ptemp;
1518 }
1519 else /* multibyte char */
1520 {
1521 /* Add 0x20 to the base leading-code, keep the remaining
1522 bytes unchanged. */
1523 len = BYTES_BY_CHAR_HEAD (*p);
1524 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1525 error ("Too long string to be composed: %s", XSTRING (str)->data);
1526 bcopy (p, buf + i, len);
1527 buf[i] += 0x20;
1528 p += len, i += len;
1529 }
1530 }
1531
1532 if (i < 5)
1533 /* STR contains only one character, which can't be composed. */
1534 error ("Too short string to be composed: %s", XSTRING (str)->data);
1535
87b089ad 1536 return make_multibyte_string (buf, 1, i);
4ed46869
KH
1537}
1538
1539\f
1540charset_id_internal (charset_name)
1541 char *charset_name;
1542{
1543 Lisp_Object val = Fget (intern (charset_name), Qcharset);
1544
1545 if (!VECTORP (val))
1546 error ("Charset %s is not defined", charset_name);
1547
1548 return (XINT (XVECTOR (val)->contents[0]));
1549}
1550
1551DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1552 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1553 ()
1554{
1555 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1556 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1557 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1558 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1559 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1560 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1561 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1562 return Qnil;
1563}
1564
1565init_charset_once ()
1566{
1567 int i, j, k;
1568
1569 staticpro (&Vcharset_table);
1570 staticpro (&Vcharset_symbol_table);
1571
1572 /* This has to be done here, before we call Fmake_char_table. */
1573 Qcharset_table = intern ("charset-table");
1574 staticpro (&Qcharset_table);
1575
1576 /* Intern this now in case it isn't already done.
1577 Setting this variable twice is harmless.
1578 But don't staticpro it here--that is done in alloc.c. */
1579 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1580
1581 /* Now we are ready to set up this property, so we can
1582 create the charset table. */
1583 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1584 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1585
513ee442 1586 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1), Qnil);
4ed46869
KH
1587
1588 /* Setup tables. */
1589 for (i = 0; i < 2; i++)
1590 for (j = 0; j < 2; j++)
1591 for (k = 0; k < 128; k++)
1592 iso_charset_table [i][j][k] = -1;
1593
1594 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
1595 cmpchar_table_size = n_cmpchars = 0;
1596
1597 for (i = 0; i < 256; i++)
1598 BYTES_BY_CHAR_HEAD (i) = 1;
1599 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1600 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1601 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1602 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1603 /* The following doesn't reflect the actual bytes, but just to tell
1604 that it is a start of a multibyte character. */
1605 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
1606
1607 for (i = 0; i < 128; i++)
1608 WIDTH_BY_CHAR_HEAD (i) = 1;
1609 for (; i < 256; i++)
1610 WIDTH_BY_CHAR_HEAD (i) = 4;
1611 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1612 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1613 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1614 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1615}
1616
1617#ifdef emacs
1618
1619syms_of_charset ()
1620{
1621 Qascii = intern ("ascii");
1622 staticpro (&Qascii);
1623
1624 Qcharset = intern ("charset");
1625 staticpro (&Qcharset);
1626
1627 /* Define ASCII charset now. */
1628 update_charset_table (make_number (CHARSET_ASCII),
1629 make_number (1), make_number (94),
1630 make_number (1),
1631 make_number (0),
1632 make_number ('B'),
1633 make_number (0),
1634 build_string ("ASCII"),
1635 build_string ("ASCII"),
1636 build_string ("ASCII (ISO646 IRV)"));
1637 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1638 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1639
1640 Qcomposition = intern ("composition");
1641 staticpro (&Qcomposition);
1642 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
1643
1644 defsubr (&Sdefine_charset);
3fac5a51 1645 defsubr (&Sget_unused_iso_final_char);
4ed46869
KH
1646 defsubr (&Sdeclare_equiv_charset);
1647 defsubr (&Sfind_charset_region);
1648 defsubr (&Sfind_charset_string);
1649 defsubr (&Smake_char_internal);
1650 defsubr (&Ssplit_char);
1651 defsubr (&Schar_charset);
1652 defsubr (&Siso_charset);
9d3d8cba 1653 defsubr (&Schar_valid_p);
4ed46869
KH
1654 defsubr (&Schar_bytes);
1655 defsubr (&Schar_width);
1656 defsubr (&Sstring_width);
1657 defsubr (&Schar_direction);
af4fecb4 1658 defsubr (&Schars_in_region);
87b089ad 1659 defsubr (&Sstring);
4ed46869
KH
1660 defsubr (&Scmpcharp);
1661 defsubr (&Scmpchar_component);
1662 defsubr (&Scmpchar_cmp_rule);
1663 defsubr (&Scmpchar_cmp_rule_p);
1664 defsubr (&Scmpchar_cmp_count);
1665 defsubr (&Scompose_string);
1666 defsubr (&Ssetup_special_charsets);
1667
1668 DEFVAR_LISP ("charset-list", &Vcharset_list,
1669 "List of charsets ever defined.");
1670 Vcharset_list = Fcons (Qascii, Qnil);
1671
b0e3cf2b
KH
1672 DEFVAR_LISP ("character-unification-table-vector",
1673 &Vcharacter_unification_table_vector,
1674 "Vector of cons cell of a symbol and unification table ever defined.\n\
1675An ID of a unification table is an index of this vector.");
1676 Vcharacter_unification_table_vector = Fmake_vector (make_number (16), Qnil);
1677
4ed46869
KH
1678 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
1679 "Leading-code of composite characters.");
1680 leading_code_composition = LEADING_CODE_COMPOSITION;
1681
1682 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1683 "Leading-code of private TYPE9N charset of column-width 1.");
1684 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1685
1686 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1687 "Leading-code of private TYPE9N charset of column-width 2.");
1688 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1689
1690 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1691 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1692 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1693
1694 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1695 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1696 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
35e623fb
RS
1697
1698 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1699 "Offset to add to a non-ascii code 0200...0377 when inserting it.\n\
1700This applies only when multibyte characters are enabled, and it serves\n\
1701to convert a Latin-1 or similar 8-bit character code to the corresponding\n\
1702Emacs character code.");
1703 nonascii_insert_offset = 0;
b0e3cf2b
KH
1704
1705 DEFVAR_INT ("min-composite-char", &min_composite_char,
1706 "Minimum character code of a composite character.");
1707 min_composite_char = MIN_CHAR_COMPOSITION;
4ed46869
KH
1708}
1709
1710#endif /* emacs */