(fix_submap_inheritance, get_keyelt, store_in_keymap,
[bpt/emacs.git] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #include <stdio.h>
26
27 #ifdef emacs
28
29 #include <sys/types.h>
30 #include <config.h>
31 #include "lisp.h"
32 #include "buffer.h"
33 #include "charset.h"
34 #include "coding.h"
35 #include "disptab.h"
36
37 #else /* not emacs */
38
39 #include "mulelib.h"
40
41 #endif /* emacs */
42
43 Lisp_Object Qcharset, Qascii, Qcomposition;
44
45 /* Declaration of special leading-codes. */
46 int leading_code_composition; /* for composite characters */
47 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
48 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
49 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
50 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
51
52 /* Declaration of special charsets. */
53 int charset_ascii; /* ASCII */
54 int charset_composition; /* for a composite character */
55 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
56 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
57 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
58 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
59 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
60 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
61 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
62
63 int min_composite_char;
64
65 Lisp_Object Qcharset_table;
66
67 /* A char-table containing information of each character set. */
68 Lisp_Object Vcharset_table;
69
70 /* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72 Lisp_Object Vcharset_symbol_table;
73
74 /* A list of charset symbols ever defined. */
75 Lisp_Object Vcharset_list;
76
77 /* Vector of unification table ever defined.
78 An ID of a unification table is an index of this vector. */
79 Lisp_Object Vcharacter_unification_table_vector;
80
81 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
82 int bytes_by_char_head[256];
83 int width_by_char_head[256];
84
85 /* Mapping table from ISO2022's charset (specified by DIMENSION,
86 CHARS, and FINAL-CHAR) to Emacs' charset. */
87 int iso_charset_table[2][2][128];
88
89 /* Table of pointers to the structure `cmpchar_info' indexed by
90 CMPCHAR-ID. */
91 struct cmpchar_info **cmpchar_table;
92 /* The current size of `cmpchar_table'. */
93 static int cmpchar_table_size;
94 /* Number of the current composite characters. */
95 int n_cmpchars;
96
97 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
98 unsigned char *_fetch_multibyte_char_p;
99 int _fetch_multibyte_char_len;
100
101 /* Offset to add to a non-ASCII value when inserting it. */
102 int nonascii_insert_offset;
103
104 /* Translation table for converting non-ASCII unibyte characters
105 to multibyte codes, or nil. */
106 Lisp_Object Vnonascii_translate_table;
107
108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
110 \f
111 void
112 invalid_character (c)
113 int c;
114 {
115 error ("Invalid character: %o, %d, 0x%x", c);
116 }
117
118
119 /* Set STR a pointer to the multi-byte form of the character C. If C
120 is not a composite character, the multi-byte form is set in WORKBUF
121 and STR points WORKBUF. The caller should allocate at least 4-byte
122 area at WORKBUF in advance. Returns the length of the multi-byte
123 form. If C is an invalid character to have a multi-byte form,
124 signal an error.
125
126 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
127 function directly if C can be an ASCII character. */
128
129 int
130 non_ascii_char_to_string (c, workbuf, str)
131 int c;
132 unsigned char *workbuf, **str;
133 {
134 int charset, c1, c2;
135
136 if (COMPOSITE_CHAR_P (c))
137 {
138 int cmpchar_id = COMPOSITE_CHAR_ID (c);
139
140 if (cmpchar_id < n_cmpchars)
141 {
142 *str = cmpchar_table[cmpchar_id]->data;
143 return cmpchar_table[cmpchar_id]->len;
144 }
145 else
146 {
147 invalid_character (c);
148 }
149 }
150
151 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
152 if (!charset
153 || ! CHARSET_DEFINED_P (charset)
154 || c1 >= 0 && c1 < 32
155 || c2 >= 0 && c2 < 32)
156 invalid_character (c);
157
158 *str = workbuf;
159 *workbuf++ = CHARSET_LEADING_CODE_BASE (charset);
160 if (*workbuf = CHARSET_LEADING_CODE_EXT (charset))
161 workbuf++;
162 *workbuf++ = c1 | 0x80;
163 if (c2 >= 0)
164 *workbuf++ = c2 | 0x80;
165
166 return (workbuf - *str);
167 }
168
169 /* Return a non-ASCII character of which multi-byte form is at STR of
170 length LEN. If ACTUAL_LEN is not NULL, the actual length of the
171 character is set to the address ACTUAL_LEN.
172
173 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
174 directly if STR can hold an ASCII character. */
175
176 string_to_non_ascii_char (str, len, actual_len)
177 unsigned char *str;
178 int len, *actual_len;
179 {
180 int charset;
181 unsigned char c1, c2;
182 register int c;
183
184 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
185 {
186 if (actual_len)
187 *actual_len = 1;
188 return (int) *str;
189 }
190
191 c = MAKE_NON_ASCII_CHAR (charset, c1, c2);
192
193 if (actual_len)
194 *actual_len = (charset == CHARSET_COMPOSITION
195 ? cmpchar_table[COMPOSITE_CHAR_ID (c)]->len
196 : BYTES_BY_CHAR_HEAD (*str));
197 return c;
198 }
199
200 /* Return the length of the multi-byte form at string STR of length LEN. */
201 int
202 multibyte_form_length (str, len)
203 unsigned char *str;
204 int len;
205 {
206 int charset;
207 unsigned char c1, c2;
208 register int c;
209
210 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
211 return 1;
212
213 return (charset == CHARSET_COMPOSITION
214 ? cmpchar_table[(c1 << 7) | c2]->len
215 : BYTES_BY_CHAR_HEAD (*str));
216 }
217
218 /* Check if string STR of length LEN contains valid multi-byte form of
219 a character. If valid, charset and position codes of the character
220 is set at *CHARSET, *C1, and *C2, and return 0. If not valid,
221 return -1. This should be used only in the macro SPLIT_STRING
222 which checks range of STR in advance. */
223
224 split_non_ascii_string (str, len, charset, c1, c2)
225 register unsigned char *str, *c1, *c2;
226 register int len, *charset;
227 {
228 register unsigned int cs = *str++;
229
230 if (cs == LEADING_CODE_COMPOSITION)
231 {
232 int cmpchar_id = str_cmpchar_id (str - 1, len);
233
234 if (cmpchar_id < 0)
235 return -1;
236 *charset = cs, *c1 = cmpchar_id >> 7, *c2 = cmpchar_id & 0x7F;
237 }
238 else if ((cs < LEADING_CODE_PRIVATE_11 || (cs = *str++) >= 0xA0)
239 && CHARSET_DEFINED_P (cs))
240 {
241 *charset = cs;
242 if (*str < 0xA0)
243 return -1;
244 *c1 = (*str++) & 0x7F;
245 if (CHARSET_DIMENSION (cs) == 2)
246 {
247 if (*str < 0xA0)
248 return -1;
249 *c2 = (*str++) & 0x7F;
250 }
251 }
252 else
253 return -1;
254 return 0;
255 }
256
257 /* Return a character unified with C (or a character made of CHARSET,
258 C1, and C2) in unification table TABLE. If no unification is found
259 in TABLE, return C. */
260 unify_char (table, c, charset, c1, c2)
261 Lisp_Object table;
262 int c, charset, c1, c2;
263 {
264 Lisp_Object ch;
265 int alt_charset, alt_c1, alt_c2, dimension;
266
267 if (c < 0) c = MAKE_CHAR (charset, c1, c2);
268 if (!CHAR_TABLE_P (table)
269 || (ch = Faref (table, make_number (c)), !INTEGERP (ch))
270 || XINT (ch) < 0)
271 return c;
272
273 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
274 dimension = CHARSET_DIMENSION (alt_charset);
275 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
276 /* CH is not a generic character, just return it. */
277 return XFASTINT (ch);
278
279 /* Since CH is a generic character, we must return a specific
280 charater which has the same position codes as C from CH. */
281 if (charset < 0)
282 SPLIT_CHAR (c, charset, c1, c2);
283 if (dimension != CHARSET_DIMENSION (charset))
284 /* We can't make such a character because of dimension mismatch. */
285 return c;
286 return MAKE_CHAR (alt_charset, c1, c2);
287 }
288
289 /* Convert the unibyte character C to multibyte based on
290 Vnonascii_translate_table or nonascii_insert_offset. If they can't
291 convert C to a valid multibyte character, convert it based on
292 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
293
294 unibyte_char_to_multibyte (c)
295 int c;
296 {
297 if (c >= 0240 && c < 0400)
298 {
299 int c_save = c;
300
301 if (! NILP (Vnonascii_translate_table))
302 c = XINT (Faref (Vnonascii_translate_table, make_number (c)));
303 else if (nonascii_insert_offset > 0)
304 c += nonascii_insert_offset;
305 if (c >= 0240 && (c < 0400 || ! VALID_MULTIBYTE_CHAR_P (c)))
306 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
307 }
308 return c;
309 }
310 \f
311 /* Update the table Vcharset_table with the given arguments (see the
312 document of `define-charset' for the meaning of each argument).
313 Several other table contents are also updated. The caller should
314 check the validity of CHARSET-ID and the remaining arguments in
315 advance. */
316
317 void
318 update_charset_table (charset_id, dimension, chars, width, direction,
319 iso_final_char, iso_graphic_plane,
320 short_name, long_name, description)
321 Lisp_Object charset_id, dimension, chars, width, direction;
322 Lisp_Object iso_final_char, iso_graphic_plane;
323 Lisp_Object short_name, long_name, description;
324 {
325 int charset = XINT (charset_id);
326 int bytes;
327 unsigned char leading_code_base, leading_code_ext;
328
329 if (NILP (CHARSET_TABLE_ENTRY (charset)))
330 CHARSET_TABLE_ENTRY (charset)
331 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
332
333 /* Get byte length of multibyte form, base leading-code, and
334 extended leading-code of the charset. See the comment under the
335 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
336 bytes = XINT (dimension);
337 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
338 {
339 /* Official charset, it doesn't have an extended leading-code. */
340 if (charset != CHARSET_ASCII)
341 bytes += 1; /* For a base leading-code. */
342 leading_code_base = charset;
343 leading_code_ext = 0;
344 }
345 else
346 {
347 /* Private charset. */
348 bytes += 2; /* For base and extended leading-codes. */
349 leading_code_base
350 = (charset < LEADING_CODE_EXT_12
351 ? LEADING_CODE_PRIVATE_11
352 : (charset < LEADING_CODE_EXT_21
353 ? LEADING_CODE_PRIVATE_12
354 : (charset < LEADING_CODE_EXT_22
355 ? LEADING_CODE_PRIVATE_21
356 : LEADING_CODE_PRIVATE_22)));
357 leading_code_ext = charset;
358 }
359
360 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
361 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
362 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
363 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
364 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
365 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
366 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
367 = make_number (leading_code_base);
368 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
369 = make_number (leading_code_ext);
370 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
371 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
372 = iso_graphic_plane;
373 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
374 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
375 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
376 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
377
378 {
379 /* If we have already defined a charset which has the same
380 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
381 DIRECTION, we must update the entry REVERSE-CHARSET of both
382 charsets. If there's no such charset, the value of the entry
383 is set to nil. */
384 int i;
385
386 for (i = 0; i <= MAX_CHARSET; i++)
387 if (!NILP (CHARSET_TABLE_ENTRY (i)))
388 {
389 if (CHARSET_DIMENSION (i) == XINT (dimension)
390 && CHARSET_CHARS (i) == XINT (chars)
391 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
392 && CHARSET_DIRECTION (i) != XINT (direction))
393 {
394 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
395 = make_number (i);
396 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
397 break;
398 }
399 }
400 if (i > MAX_CHARSET)
401 /* No such a charset. */
402 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
403 = make_number (-1);
404 }
405
406 if (charset != CHARSET_ASCII
407 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
408 {
409 /* Update tables bytes_by_char_head and width_by_char_head. */
410 bytes_by_char_head[leading_code_base] = bytes;
411 width_by_char_head[leading_code_base] = XINT (width);
412
413 /* Update table emacs_code_class. */
414 emacs_code_class[charset] = (bytes == 2
415 ? EMACS_leading_code_2
416 : (bytes == 3
417 ? EMACS_leading_code_3
418 : EMACS_leading_code_4));
419 }
420
421 /* Update table iso_charset_table. */
422 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
423 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
424 }
425
426 #ifdef emacs
427
428 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
429 is invalid. */
430 int
431 get_charset_id (charset_symbol)
432 Lisp_Object charset_symbol;
433 {
434 Lisp_Object val;
435 int charset;
436
437 return ((SYMBOLP (charset_symbol)
438 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
439 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
440 CHARSET_VALID_P (charset)))
441 ? charset : -1);
442 }
443
444 /* Return an identification number for a new private charset of
445 DIMENSION and WIDTH. If there's no more room for the new charset,
446 return 0. */
447 Lisp_Object
448 get_new_private_charset_id (dimension, width)
449 int dimension, width;
450 {
451 int charset, from, to;
452
453 if (dimension == 1)
454 {
455 if (width == 1)
456 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
457 else
458 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
459 }
460 else
461 {
462 if (width == 1)
463 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
464 else
465 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
466 }
467
468 for (charset = from; charset < to; charset++)
469 if (!CHARSET_DEFINED_P (charset)) break;
470
471 return make_number (charset < to ? charset : 0);
472 }
473
474 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
475 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
476 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
477 treated as a private charset.\n\
478 INFO-VECTOR is a vector of the format:\n\
479 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
480 SHORT-NAME LONG-NAME DESCRIPTION]\n\
481 The meanings of each elements is as follows:\n\
482 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
483 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
484 WIDTH (integer) is the number of columns a character in the charset\n\
485 occupies on the screen: one of 0, 1, and 2.\n\
486 \n\
487 DIRECTION (integer) is the rendering direction of characters in the\n\
488 charset when rendering. If 0, render from right to left, else\n\
489 render from left to right.\n\
490 \n\
491 ISO-FINAL-CHAR (character) is the final character of the\n\
492 corresponding ISO 2022 charset.\n\
493 \n\
494 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
495 while encoding to variants of ISO 2022 coding system, one of the\n\
496 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
497 \n\
498 SHORT-NAME (string) is the short name to refer to the charset.\n\
499 \n\
500 LONG-NAME (string) is the long name to refer to the charset.\n\
501 \n\
502 DESCRIPTION (string) is the description string of the charset.")
503 (charset_id, charset_symbol, info_vector)
504 Lisp_Object charset_id, charset_symbol, info_vector;
505 {
506 Lisp_Object *vec;
507
508 if (!NILP (charset_id))
509 CHECK_NUMBER (charset_id, 0);
510 CHECK_SYMBOL (charset_symbol, 1);
511 CHECK_VECTOR (info_vector, 2);
512
513 if (! NILP (charset_id))
514 {
515 if (! CHARSET_VALID_P (XINT (charset_id)))
516 error ("Invalid CHARSET: %d", XINT (charset_id));
517 else if (CHARSET_DEFINED_P (XINT (charset_id)))
518 error ("Already defined charset: %d", XINT (charset_id));
519 }
520
521 vec = XVECTOR (info_vector)->contents;
522 if (XVECTOR (info_vector)->size != 9
523 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
524 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
525 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
526 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
527 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
528 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
529 || !STRINGP (vec[6])
530 || !STRINGP (vec[7])
531 || !STRINGP (vec[8]))
532 error ("Invalid info-vector argument for defining charset %s",
533 XSYMBOL (charset_symbol)->name->data);
534
535 if (NILP (charset_id))
536 {
537 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
538 if (XINT (charset_id) == 0)
539 error ("There's no room for a new private charset %s",
540 XSYMBOL (charset_symbol)->name->data);
541 }
542
543 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
544 vec[4], vec[5], vec[6], vec[7], vec[8]);
545 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
546 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
547 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
548 return Qnil;
549 }
550
551 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
552 Sget_unused_iso_final_char, 2, 2, 0,
553 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
554 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
555 CHARS is the number of characters in a dimension: 94 or 96.\n\
556 \n\
557 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
558 If there's no unused final char for the specified kind of charset,\n\
559 return nil.")
560 (dimension, chars)
561 Lisp_Object dimension, chars;
562 {
563 int final_char;
564
565 CHECK_NUMBER (dimension, 0);
566 CHECK_NUMBER (chars, 1);
567 if (XINT (dimension) != 1 && XINT (dimension) != 2)
568 error ("Invalid charset dimension %d, it should be 1 or 2",
569 XINT (dimension));
570 if (XINT (chars) != 94 && XINT (chars) != 96)
571 error ("Invalid charset chars %d, it should be 94 or 96",
572 XINT (chars));
573 for (final_char = '0'; final_char <= '?'; final_char++)
574 {
575 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
576 break;
577 }
578 return (final_char <= '?' ? make_number (final_char) : Qnil);
579 }
580
581 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
582 4, 4, 0,
583 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
584 CHARSET should be defined by `defined-charset' in advance.")
585 (dimension, chars, final_char, charset_symbol)
586 Lisp_Object dimension, chars, final_char, charset_symbol;
587 {
588 int charset;
589
590 CHECK_NUMBER (dimension, 0);
591 CHECK_NUMBER (chars, 1);
592 CHECK_NUMBER (final_char, 2);
593 CHECK_SYMBOL (charset_symbol, 3);
594
595 if (XINT (dimension) != 1 && XINT (dimension) != 2)
596 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
597 if (XINT (chars) != 94 && XINT (chars) != 96)
598 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
599 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
600 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
601 if ((charset = get_charset_id (charset_symbol)) < 0)
602 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
603
604 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
605 return Qnil;
606 }
607
608 /* Return number of different charsets in STR of length LEN. In
609 addition, for each found charset N, CHARSETS[N] is set 1. The
610 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
611 It may lookup a unification table TABLE if supplied. */
612
613 int
614 find_charset_in_str (str, len, charsets, table)
615 unsigned char *str;
616 int len, *charsets;
617 Lisp_Object table;
618 {
619 register int num = 0, c;
620
621 if (! CHAR_TABLE_P (table))
622 table = Qnil;
623
624 while (len > 0)
625 {
626 int bytes, charset;
627 c = *str;
628
629 if (c == LEADING_CODE_COMPOSITION)
630 {
631 int cmpchar_id = str_cmpchar_id (str, len);
632 GLYPH *glyph;
633
634 if (cmpchar_id > 0)
635 {
636 struct cmpchar_info *cmpcharp = cmpchar_table[cmpchar_id];
637 int i;
638
639 for (i = 0; i < cmpcharp->glyph_len; i++)
640 {
641 c = cmpcharp->glyph[i];
642 if (!NILP (table))
643 {
644 if ((c = unify_char (table, c, 0, 0, 0)) < 0)
645 c = cmpcharp->glyph[i];
646 }
647 if ((charset = CHAR_CHARSET (c)) < 0)
648 charset = CHARSET_ASCII;
649 if (!charsets[charset])
650 {
651 charsets[charset] = 1;
652 num += 1;
653 }
654 }
655 str += cmpcharp->len;
656 len -= cmpcharp->len;
657 continue;
658 }
659
660 charset = CHARSET_ASCII;
661 bytes = 1;
662 }
663 else
664 {
665 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
666 if (! NILP (table))
667 {
668 int c1 = unify_char (table, c, 0, 0, 0);
669 if (c1 >= 0)
670 c = c1;
671 }
672 charset = CHAR_CHARSET (c);
673 }
674
675 if (!charsets[charset])
676 {
677 charsets[charset] = 1;
678 num += 1;
679 }
680 str += bytes;
681 len -= bytes;
682 }
683 return num;
684 }
685
686 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
687 2, 3, 0,
688 "Return a list of charsets in the region between BEG and END.\n\
689 BEG and END are buffer positions.\n\
690 Optional arg TABLE if non-nil is a unification table to look up.")
691 (beg, end, table)
692 Lisp_Object beg, end, table;
693 {
694 int charsets[MAX_CHARSET + 1];
695 int from, from_byte, to, stop, stop_byte, i;
696 Lisp_Object val;
697
698 validate_region (&beg, &end);
699 from = XFASTINT (beg);
700 stop = to = XFASTINT (end);
701
702 if (from < GPT && GPT < to)
703 {
704 stop = GPT;
705 stop_byte = GPT_BYTE;
706 }
707 else
708 stop_byte = CHAR_TO_BYTE (stop);
709
710 from_byte = CHAR_TO_BYTE (from);
711
712 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
713 while (1)
714 {
715 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
716 charsets, table);
717 if (stop < to)
718 {
719 from = stop, from_byte = stop_byte;
720 stop = to, stop_byte = CHAR_TO_BYTE (stop);
721 }
722 else
723 break;
724 }
725
726 val = Qnil;
727 for (i = MAX_CHARSET; i >= 0; i--)
728 if (charsets[i])
729 val = Fcons (CHARSET_SYMBOL (i), val);
730 return val;
731 }
732
733 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
734 1, 2, 0,
735 "Return a list of charsets in STR.\n\
736 Optional arg TABLE if non-nil is a unification table to look up.")
737 (str, table)
738 Lisp_Object str, table;
739 {
740 int charsets[MAX_CHARSET + 1];
741 int i;
742 Lisp_Object val;
743
744 CHECK_STRING (str, 0);
745
746 if (! STRING_MULTIBYTE (str))
747 return Qnil;
748
749 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
750 find_charset_in_str (XSTRING (str)->data, XSTRING (str)->size_byte,
751 charsets, table);
752 val = Qnil;
753 for (i = MAX_CHARSET; i >= 0; i--)
754 if (charsets[i])
755 val = Fcons (CHARSET_SYMBOL (i), val);
756 return val;
757 }
758 \f
759 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
760 "")
761 (charset, code1, code2)
762 Lisp_Object charset, code1, code2;
763 {
764 CHECK_NUMBER (charset, 0);
765
766 if (NILP (code1))
767 XSETFASTINT (code1, 0);
768 else
769 CHECK_NUMBER (code1, 1);
770 if (NILP (code2))
771 XSETFASTINT (code2, 0);
772 else
773 CHECK_NUMBER (code2, 2);
774
775 if (!CHARSET_DEFINED_P (XINT (charset)))
776 error ("Invalid charset: %d", XINT (charset));
777
778 return make_number (MAKE_CHAR (XINT (charset), XINT (code1), XINT (code2)));
779 }
780
781 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
782 "Return list of charset and one or two position-codes of CHAR.")
783 (ch)
784 Lisp_Object ch;
785 {
786 Lisp_Object val;
787 int charset, c1, c2;
788
789 CHECK_NUMBER (ch, 0);
790 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
791 return (c2 >= 0
792 ? Fcons (CHARSET_SYMBOL (charset),
793 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
794 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
795 }
796
797 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
798 "Return charset of CHAR.")
799 (ch)
800 Lisp_Object ch;
801 {
802 CHECK_NUMBER (ch, 0);
803
804 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
805 }
806
807 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
808 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
809 \n\
810 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
811 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
812 where as Emacs distinguishes them by charset symbol.\n\
813 See the documentation of the function `charset-info' for the meanings of\n\
814 DIMENSION, CHARS, and FINAL-CHAR.")
815 (dimension, chars, final_char)
816 Lisp_Object dimension, chars, final_char;
817 {
818 int charset;
819
820 CHECK_NUMBER (dimension, 0);
821 CHECK_NUMBER (chars, 1);
822 CHECK_NUMBER (final_char, 2);
823
824 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
825 return Qnil;
826 return CHARSET_SYMBOL (charset);
827 }
828
829 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
830 generic character. If GENERICP is zero, return nonzero iff C is a
831 valid normal character. Do not call this function directly,
832 instead use macro CHAR_VALID_P. */
833 int
834 char_valid_p (c, genericp)
835 int c, genericp;
836 {
837 int charset, c1, c2;
838
839 if (c < 0)
840 return 0;
841 if (SINGLE_BYTE_CHAR_P (c))
842 return 1;
843 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
844 if (!CHARSET_VALID_P (charset))
845 return 0;
846 return (c < MIN_CHAR_COMPOSITION
847 ? ((c & CHAR_FIELD1_MASK) /* i.e. dimension of C is two. */
848 ? (genericp && c1 == 0 && c2 == 0
849 || c1 >= 32 && c2 >= 32)
850 : (genericp && c1 == 0
851 || c1 >= 32))
852 : c < MIN_CHAR_COMPOSITION + n_cmpchars);
853 }
854
855 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
856 "Return t if OBJECT is a valid normal character.\n\
857 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
858 a valid generic character.")
859 (object, genericp)
860 Lisp_Object object, genericp;
861 {
862 if (! NATNUMP (object))
863 return Qnil;
864 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
865 }
866
867 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
868 Sunibyte_char_to_multibyte, 1, 1, 0,
869 "Convert the unibyte character CH to multibyte character.\n\
870 The conversion is done based on nonascii-translate-table (which see)\n\
871 or nonascii-insert-offset (which see).")
872 (ch)
873 Lisp_Object ch;
874 {
875 int c;
876
877 CHECK_NUMBER (ch, 0);
878 c = XINT (ch);
879 if (c < 0 || c >= 0400)
880 error ("Invalid unibyte character: %d", c);
881 c = unibyte_char_to_multibyte (c);
882 if (c < 0)
883 error ("Can't convert to multibyte character: %d", XINT (ch));
884 return make_number (c);
885 }
886
887 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
888 "Return byte length of multi-byte form of CHAR.")
889 (ch)
890 Lisp_Object ch;
891 {
892 Lisp_Object val;
893 int bytes;
894
895 CHECK_NUMBER (ch, 0);
896 if (COMPOSITE_CHAR_P (XFASTINT (ch)))
897 {
898 unsigned int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
899
900 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
901 }
902 else
903 {
904 int charset = CHAR_CHARSET (XFASTINT (ch));
905
906 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
907 }
908
909 XSETFASTINT (val, bytes);
910 return val;
911 }
912
913 /* Return the width of character of which multi-byte form starts with
914 C. The width is measured by how many columns occupied on the
915 screen when displayed in the current buffer. */
916
917 #define ONE_BYTE_CHAR_WIDTH(c) \
918 (c < 0x20 \
919 ? (c == '\t' \
920 ? XFASTINT (current_buffer->tab_width) \
921 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
922 : (c < 0x7f \
923 ? 1 \
924 : (c == 0x7F \
925 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
926 : ((! NILP (current_buffer->enable_multibyte_characters) \
927 && BASE_LEADING_CODE_P (c)) \
928 ? WIDTH_BY_CHAR_HEAD (c) \
929 : 4)))) \
930
931
932 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
933 "Return width of CHAR when displayed in the current buffer.\n\
934 The width is measured by how many columns it occupies on the screen.")
935 (ch)
936 Lisp_Object ch;
937 {
938 Lisp_Object val, disp;
939 int c;
940 struct Lisp_Char_Table *dp = buffer_display_table ();
941
942 CHECK_NUMBER (ch, 0);
943
944 c = XINT (ch);
945
946 /* Get the way the display table would display it. */
947 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
948
949 if (VECTORP (disp))
950 XSETINT (val, XVECTOR (disp)->size);
951 else if (SINGLE_BYTE_CHAR_P (c))
952 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
953 else if (COMPOSITE_CHAR_P (c))
954 {
955 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
956 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 0));
957 }
958 else
959 {
960 int charset = CHAR_CHARSET (c);
961
962 XSETFASTINT (val, CHARSET_WIDTH (charset));
963 }
964 return val;
965 }
966
967 /* Return width of string STR of length LEN when displayed in the
968 current buffer. The width is measured by how many columns it
969 occupies on the screen. */
970
971 int
972 strwidth (str, len)
973 unsigned char *str;
974 int len;
975 {
976 unsigned char *endp = str + len;
977 int width = 0;
978 struct Lisp_Char_Table *dp = buffer_display_table ();
979
980 while (str < endp)
981 {
982 if (*str == LEADING_CODE_COMPOSITION)
983 {
984 int id = str_cmpchar_id (str, endp - str);
985
986 if (id < 0)
987 {
988 width += 4;
989 str++;
990 }
991 else
992 {
993 width += cmpchar_table[id]->width;
994 str += cmpchar_table[id]->len;
995 }
996 }
997 else
998 {
999 Lisp_Object disp;
1000 int thislen;
1001 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1002
1003 /* Get the way the display table would display it. */
1004 if (dp)
1005 disp = DISP_CHAR_VECTOR (dp, c);
1006 else
1007 disp = Qnil;
1008
1009 if (VECTORP (disp))
1010 width += XVECTOR (disp)->size;
1011 else
1012 width += ONE_BYTE_CHAR_WIDTH (*str);
1013
1014 str += thislen;
1015 }
1016 }
1017 return width;
1018 }
1019
1020 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1021 "Return width of STRING when displayed in the current buffer.\n\
1022 Width is measured by how many columns it occupies on the screen.\n\
1023 When calculating width of a multibyte character in STRING,\n\
1024 only the base leading-code is considered; the validity of\n\
1025 the following bytes is not checked.")
1026 (str)
1027 Lisp_Object str;
1028 {
1029 Lisp_Object val;
1030
1031 CHECK_STRING (str, 0);
1032 XSETFASTINT (val, strwidth (XSTRING (str)->data, XSTRING (str)->size_byte));
1033 return val;
1034 }
1035
1036 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1037 "Return the direction of CHAR.\n\
1038 The returned value is 0 for left-to-right and 1 for right-to-left.")
1039 (ch)
1040 Lisp_Object ch;
1041 {
1042 int charset;
1043
1044 CHECK_NUMBER (ch, 0);
1045 charset = CHAR_CHARSET (XFASTINT (ch));
1046 if (!CHARSET_DEFINED_P (charset))
1047 invalid_character (XINT (ch));
1048 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1049 }
1050
1051 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1052 "Return number of characters between BEG and END.")
1053 (beg, end)
1054 Lisp_Object beg, end;
1055 {
1056 int from, to;
1057
1058 from = min (XFASTINT (beg), XFASTINT (end));
1059 to = max (XFASTINT (beg), XFASTINT (end));
1060
1061 return to - from;
1062 }
1063
1064 /* Return the number of characters in the NBYTES bytes at PTR.
1065 This works by looking at the contents and checking for multibyte sequences.
1066 However, if the current buffer has enable-multibyte-characters = nil,
1067 we treat each byte as a character. */
1068
1069 int
1070 chars_in_text (ptr, nbytes)
1071 unsigned char *ptr;
1072 int nbytes;
1073 {
1074 unsigned char *endp, c;
1075 int chars;
1076
1077 /* current_buffer is null at early stages of Emacs initialization. */
1078 if (current_buffer == 0
1079 || NILP (current_buffer->enable_multibyte_characters))
1080 return nbytes;
1081
1082 endp = ptr + nbytes;
1083 chars = 0;
1084
1085 while (ptr < endp)
1086 {
1087 c = *ptr++;
1088
1089 if (BASE_LEADING_CODE_P (c))
1090 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
1091 chars++;
1092 }
1093
1094 return chars;
1095 }
1096
1097 /* Return the number of characters in the NBYTES bytes at PTR.
1098 This works by looking at the contents and checking for multibyte sequences.
1099 It ignores enable-multibyte-characters. */
1100
1101 int
1102 multibyte_chars_in_text (ptr, nbytes)
1103 unsigned char *ptr;
1104 int nbytes;
1105 {
1106 unsigned char *endp, c;
1107 int chars;
1108
1109 endp = ptr + nbytes;
1110 chars = 0;
1111
1112 while (ptr < endp)
1113 {
1114 c = *ptr++;
1115
1116 if (BASE_LEADING_CODE_P (c))
1117 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
1118 chars++;
1119 }
1120
1121 return chars;
1122 }
1123
1124 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1125 "Concatenate all the argument characters and make the result a string.")
1126 (n, args)
1127 int n;
1128 Lisp_Object *args;
1129 {
1130 int i;
1131 unsigned char *buf
1132 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
1133 unsigned char *p = buf;
1134 Lisp_Object val;
1135
1136 for (i = 0; i < n; i++)
1137 {
1138 int c, len;
1139 unsigned char *str;
1140
1141 if (!INTEGERP (args[i]))
1142 CHECK_NUMBER (args[i], 0);
1143 c = XINT (args[i]);
1144 len = CHAR_STRING (c, p, str);
1145 if (p != str)
1146 /* C is a composite character. */
1147 bcopy (str, p, len);
1148 p += len;
1149 }
1150
1151 val = make_multibyte_string (buf, n, p - buf);
1152 return val;
1153 }
1154
1155 #endif /* emacs */
1156 \f
1157 /*** Composite characters staffs ***/
1158
1159 /* Each composite character is identified by CMPCHAR-ID which is
1160 assigned when Emacs needs the character code of the composite
1161 character (e.g. when displaying it on the screen). See the
1162 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1163 composite character is represented in Emacs. */
1164
1165 /* If `static' is defined, it means that it is defined to null string. */
1166 #ifndef static
1167 /* The following function is copied from lread.c. */
1168 static int
1169 hash_string (ptr, len)
1170 unsigned char *ptr;
1171 int len;
1172 {
1173 register unsigned char *p = ptr;
1174 register unsigned char *end = p + len;
1175 register unsigned char c;
1176 register int hash = 0;
1177
1178 while (p != end)
1179 {
1180 c = *p++;
1181 if (c >= 0140) c -= 40;
1182 hash = ((hash<<3) + (hash>>28) + c);
1183 }
1184 return hash & 07777777777;
1185 }
1186 #endif
1187
1188 #define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1189
1190 static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1191
1192 /* Each element of `cmpchar_hash_table' is a pointer to an array of
1193 integer, where the 1st element is the size of the array, the 2nd
1194 element is how many elements are actually used in the array, and
1195 the remaining elements are CMPCHAR-IDs of composite characters of
1196 the same hash value. */
1197 #define CMPCHAR_HASH_SIZE(table) table[0]
1198 #define CMPCHAR_HASH_USED(table) table[1]
1199 #define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1200
1201 /* Return CMPCHAR-ID of the composite character in STR of the length
1202 LEN. If the composite character has not yet been registered,
1203 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1204 is the sole function for assigning CMPCHAR-ID. */
1205 int
1206 str_cmpchar_id (str, len)
1207 unsigned char *str;
1208 int len;
1209 {
1210 int hash_idx, *hashp;
1211 unsigned char *buf;
1212 int embedded_rule; /* 1 if composition rule is embedded. */
1213 int chars; /* number of components. */
1214 int i;
1215 struct cmpchar_info *cmpcharp;
1216
1217 /* The second byte 0xFF means compostion rule is embedded. */
1218 embedded_rule = (str[1] == 0xFF);
1219
1220 /* At first, get the actual length of the composite character. */
1221 {
1222 unsigned char *p, *endp = str + 1, *lastp = str + len;
1223 int bytes;
1224
1225 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
1226 if (endp - str < 5)
1227 /* Any composite char have at least 5-byte length. */
1228 return -1;
1229
1230 chars = 0;
1231 p = str + 1;
1232 while (p < endp)
1233 {
1234 if (embedded_rule) p++;
1235 /* No need of checking if *P is 0xA0 because
1236 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1237 p += BYTES_BY_CHAR_HEAD (*p - 0x20);
1238 chars++;
1239 }
1240 if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1241 /* Invalid components. */
1242 return -1;
1243 len = p - str;
1244 }
1245 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1246 hashp = cmpchar_hash_table[hash_idx];
1247
1248 /* Then, look into the hash table. */
1249 if (hashp != NULL)
1250 /* Find the correct one among composite characters of the same
1251 hash value. */
1252 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1253 {
1254 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1255 if (len == cmpcharp->len
1256 && ! bcmp (str, cmpcharp->data, len))
1257 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1258 }
1259
1260 /* We have to register the composite character in cmpchar_table. */
1261 if (n_cmpchars > (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
1262 /* No, we have no more room for a new composite character. */
1263 return -1;
1264
1265 /* Make the entry in hash table. */
1266 if (hashp == NULL)
1267 {
1268 /* Make a table for 8 composite characters initially. */
1269 hashp = (cmpchar_hash_table[hash_idx]
1270 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1271 CMPCHAR_HASH_SIZE (hashp) = 10;
1272 CMPCHAR_HASH_USED (hashp) = 2;
1273 }
1274 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1275 {
1276 CMPCHAR_HASH_SIZE (hashp) += 8;
1277 hashp = (cmpchar_hash_table[hash_idx]
1278 = (int *) xrealloc (hashp,
1279 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1280 }
1281 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1282 CMPCHAR_HASH_USED (hashp)++;
1283
1284 /* Set information of the composite character in cmpchar_table. */
1285 if (cmpchar_table_size == 0)
1286 {
1287 /* This is the first composite character to be registered. */
1288 cmpchar_table_size = 256;
1289 cmpchar_table
1290 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1291 * cmpchar_table_size);
1292 }
1293 else if (cmpchar_table_size <= n_cmpchars)
1294 {
1295 cmpchar_table_size += 256;
1296 cmpchar_table
1297 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1298 sizeof (cmpchar_table[0])
1299 * cmpchar_table_size);
1300 }
1301
1302 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1303
1304 cmpcharp->len = len;
1305 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1306 bcopy (str, cmpcharp->data, len);
1307 cmpcharp->data[len] = 0;
1308 cmpcharp->glyph_len = chars;
1309 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1310 if (embedded_rule)
1311 {
1312 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1313 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1314 }
1315 else
1316 {
1317 cmpcharp->cmp_rule = NULL;
1318 cmpcharp->col_offset = NULL;
1319 }
1320
1321 /* Setup GLYPH data and composition rules (if any) so as not to make
1322 them every time on displaying. */
1323 {
1324 unsigned char *bufp;
1325 int width;
1326 float leftmost = 0.0, rightmost = 1.0;
1327
1328 if (embedded_rule)
1329 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1330 cmpcharp->col_offset[0] = 0;
1331
1332 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1333 {
1334 if (embedded_rule)
1335 cmpcharp->cmp_rule[i] = *bufp++;
1336
1337 if (*bufp == 0xA0) /* This is an ASCII character. */
1338 {
1339 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1340 width = 1;
1341 bufp++;
1342 }
1343 else /* Multibyte character. */
1344 {
1345 /* Make `bufp' point normal multi-byte form temporally. */
1346 *bufp -= 0x20;
1347 cmpcharp->glyph[i]
1348 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1349 width = WIDTH_BY_CHAR_HEAD (*bufp);
1350 *bufp += 0x20;
1351 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1352 }
1353
1354 if (embedded_rule && i > 0)
1355 {
1356 /* Reference points (global_ref and new_ref) are
1357 encoded as below:
1358
1359 0--1--2 -- ascent
1360 | |
1361 | |
1362 | 4 -+--- center
1363 -- 3 5 -- baseline
1364 | |
1365 6--7--8 -- descent
1366
1367 Now, we calculate the column offset of the new glyph
1368 from the left edge of the first glyph. This can avoid
1369 the same calculation everytime displaying this
1370 composite character. */
1371
1372 /* Reference points of global glyph and new glyph. */
1373 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1374 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1375 /* Column offset relative to the first glyph. */
1376 float left = (leftmost
1377 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1378 - (new_ref % 3) * width / 2.0);
1379
1380 cmpcharp->col_offset[i] = left;
1381 if (left < leftmost)
1382 leftmost = left;
1383 if (left + width > rightmost)
1384 rightmost = left + width;
1385 }
1386 else
1387 {
1388 if (width > rightmost)
1389 rightmost = width;
1390 }
1391 }
1392 if (embedded_rule)
1393 {
1394 /* Now col_offset[N] are relative to the left edge of the
1395 first component. Make them relative to the left edge of
1396 overall glyph. */
1397 for (i = 0; i < chars; i++)
1398 cmpcharp->col_offset[i] -= leftmost;
1399 /* Make rightmost holds width of overall glyph. */
1400 rightmost -= leftmost;
1401 }
1402
1403 cmpcharp->width = rightmost;
1404 if (cmpcharp->width < rightmost)
1405 /* To get a ceiling integer value. */
1406 cmpcharp->width++;
1407 }
1408
1409 cmpchar_table[n_cmpchars] = cmpcharp;
1410
1411 return n_cmpchars++;
1412 }
1413
1414 /* Return the Nth element of the composite character C. */
1415 int
1416 cmpchar_component (c, n)
1417 unsigned int c, n;
1418 {
1419 int id = COMPOSITE_CHAR_ID (c);
1420
1421 if (id >= n_cmpchars /* C is not a valid composite character. */
1422 || n >= cmpchar_table[id]->glyph_len) /* No such component. */
1423 return -1;
1424 /* No face data is stored in glyph code. */
1425 return ((int) (cmpchar_table[id]->glyph[n]));
1426 }
1427
1428 DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1429 "T if CHAR is a composite character.")
1430 (ch)
1431 Lisp_Object ch;
1432 {
1433 CHECK_NUMBER (ch, 0);
1434 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1435 }
1436
1437 DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1438 2, 2, 0,
1439 "Return the IDXth component character of composite character CHARACTER.")
1440 (character, idx)
1441 Lisp_Object character, idx;
1442 {
1443 int c;
1444
1445 CHECK_NUMBER (character, 0);
1446 CHECK_NUMBER (idx, 1);
1447
1448 if ((c = cmpchar_component (XINT (character), XINT (idx))) < 0)
1449 args_out_of_range (character, idx);
1450
1451 return make_number (c);
1452 }
1453
1454 DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1455 2, 2, 0,
1456 "Return the Nth composition rule embedded in composite character CHARACTER.\n\
1457 The returned rule is for composing the Nth component\n\
1458 on the (N-1)th component. If N is 0, the returned value is always 255.")
1459 (character, n)
1460 Lisp_Object character, n;
1461 {
1462 int id, i;
1463
1464 CHECK_NUMBER (character, 0);
1465 CHECK_NUMBER (n, 1);
1466
1467 id = COMPOSITE_CHAR_ID (XINT (character));
1468 if (id < 0 || id >= n_cmpchars)
1469 error ("Invalid composite character: %d", XINT (character));
1470 i = XINT (n);
1471 if (i > cmpchar_table[id]->glyph_len)
1472 args_out_of_range (character, n);
1473
1474 return make_number (cmpchar_table[id]->cmp_rule[i]);
1475 }
1476
1477 DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1478 Scmpchar_cmp_rule_p, 1, 1, 0,
1479 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1480 (character)
1481 Lisp_Object character;
1482 {
1483 int id;
1484
1485 CHECK_NUMBER (character, 0);
1486 id = COMPOSITE_CHAR_ID (XINT (character));
1487 if (id < 0 || id >= n_cmpchars)
1488 error ("Invalid composite character: %d", XINT (character));
1489
1490 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1491 }
1492
1493 DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1494 Scmpchar_cmp_count, 1, 1, 0,
1495 "Return number of compoents of composite character CHARACTER.")
1496 (character)
1497 Lisp_Object character;
1498 {
1499 int id;
1500
1501 CHECK_NUMBER (character, 0);
1502 id = COMPOSITE_CHAR_ID (XINT (character));
1503 if (id < 0 || id >= n_cmpchars)
1504 error ("Invalid composite character: %d", XINT (character));
1505
1506 return (make_number (cmpchar_table[id]->glyph_len));
1507 }
1508
1509 DEFUN ("compose-string", Fcompose_string, Scompose_string,
1510 1, 1, 0,
1511 "Return one char string composed from all characters in STRING.")
1512 (str)
1513 Lisp_Object str;
1514 {
1515 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1516 int len, i;
1517
1518 CHECK_STRING (str, 0);
1519
1520 buf[0] = LEADING_CODE_COMPOSITION;
1521 p = XSTRING (str)->data;
1522 pend = p + XSTRING (str)->size_byte;
1523 i = 1;
1524 while (p < pend)
1525 {
1526 if (*p < 0x20 || *p == 127) /* control code */
1527 error ("Invalid component character: %d", *p);
1528 else if (*p < 0x80) /* ASCII */
1529 {
1530 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1531 error ("Too long string to be composed: %s", XSTRING (str)->data);
1532 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1533 code itself. */
1534 buf[i++] = 0xA0;
1535 buf[i++] = *p++ + 0x80;
1536 }
1537 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1538 {
1539 /* Already composed. Eliminate the heading
1540 LEADING_CODE_COMPOSITION, keep the remaining bytes
1541 unchanged. */
1542 p++;
1543 ptemp = p;
1544 while (! CHAR_HEAD_P (*p)) p++;
1545 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1546 error ("Too long string to be composed: %s", XSTRING (str)->data);
1547 bcopy (ptemp, buf + i, p - ptemp);
1548 i += p - ptemp;
1549 }
1550 else /* multibyte char */
1551 {
1552 /* Add 0x20 to the base leading-code, keep the remaining
1553 bytes unchanged. */
1554 len = BYTES_BY_CHAR_HEAD (*p);
1555 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1556 error ("Too long string to be composed: %s", XSTRING (str)->data);
1557 bcopy (p, buf + i, len);
1558 buf[i] += 0x20;
1559 p += len, i += len;
1560 }
1561 }
1562
1563 if (i < 5)
1564 /* STR contains only one character, which can't be composed. */
1565 error ("Too short string to be composed: %s", XSTRING (str)->data);
1566
1567 return make_multibyte_string (buf, 1, i);
1568 }
1569
1570 \f
1571 charset_id_internal (charset_name)
1572 char *charset_name;
1573 {
1574 Lisp_Object val = Fget (intern (charset_name), Qcharset);
1575
1576 if (!VECTORP (val))
1577 error ("Charset %s is not defined", charset_name);
1578
1579 return (XINT (XVECTOR (val)->contents[0]));
1580 }
1581
1582 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1583 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1584 ()
1585 {
1586 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1587 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1588 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1589 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1590 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1591 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1592 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1593 return Qnil;
1594 }
1595
1596 init_charset_once ()
1597 {
1598 int i, j, k;
1599
1600 staticpro (&Vcharset_table);
1601 staticpro (&Vcharset_symbol_table);
1602
1603 /* This has to be done here, before we call Fmake_char_table. */
1604 Qcharset_table = intern ("charset-table");
1605 staticpro (&Qcharset_table);
1606
1607 /* Intern this now in case it isn't already done.
1608 Setting this variable twice is harmless.
1609 But don't staticpro it here--that is done in alloc.c. */
1610 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1611
1612 /* Now we are ready to set up this property, so we can
1613 create the charset table. */
1614 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1615 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1616
1617 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1), Qnil);
1618
1619 /* Setup tables. */
1620 for (i = 0; i < 2; i++)
1621 for (j = 0; j < 2; j++)
1622 for (k = 0; k < 128; k++)
1623 iso_charset_table [i][j][k] = -1;
1624
1625 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
1626 cmpchar_table_size = n_cmpchars = 0;
1627
1628 for (i = 0; i < 256; i++)
1629 BYTES_BY_CHAR_HEAD (i) = 1;
1630 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1631 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1632 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1633 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1634 /* The following doesn't reflect the actual bytes, but just to tell
1635 that it is a start of a multibyte character. */
1636 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
1637
1638 for (i = 0; i < 128; i++)
1639 WIDTH_BY_CHAR_HEAD (i) = 1;
1640 for (; i < 256; i++)
1641 WIDTH_BY_CHAR_HEAD (i) = 4;
1642 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1643 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1644 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1645 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1646 }
1647
1648 #ifdef emacs
1649
1650 syms_of_charset ()
1651 {
1652 Qascii = intern ("ascii");
1653 staticpro (&Qascii);
1654
1655 Qcharset = intern ("charset");
1656 staticpro (&Qcharset);
1657
1658 /* Define ASCII charset now. */
1659 update_charset_table (make_number (CHARSET_ASCII),
1660 make_number (1), make_number (94),
1661 make_number (1),
1662 make_number (0),
1663 make_number ('B'),
1664 make_number (0),
1665 build_string ("ASCII"),
1666 build_string ("ASCII"),
1667 build_string ("ASCII (ISO646 IRV)"));
1668 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1669 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1670
1671 Qcomposition = intern ("composition");
1672 staticpro (&Qcomposition);
1673 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
1674
1675 defsubr (&Sdefine_charset);
1676 defsubr (&Sget_unused_iso_final_char);
1677 defsubr (&Sdeclare_equiv_charset);
1678 defsubr (&Sfind_charset_region);
1679 defsubr (&Sfind_charset_string);
1680 defsubr (&Smake_char_internal);
1681 defsubr (&Ssplit_char);
1682 defsubr (&Schar_charset);
1683 defsubr (&Siso_charset);
1684 defsubr (&Schar_valid_p);
1685 defsubr (&Sunibyte_char_to_multibyte);
1686 defsubr (&Schar_bytes);
1687 defsubr (&Schar_width);
1688 defsubr (&Sstring_width);
1689 defsubr (&Schar_direction);
1690 defsubr (&Schars_in_region);
1691 defsubr (&Sstring);
1692 defsubr (&Scmpcharp);
1693 defsubr (&Scmpchar_component);
1694 defsubr (&Scmpchar_cmp_rule);
1695 defsubr (&Scmpchar_cmp_rule_p);
1696 defsubr (&Scmpchar_cmp_count);
1697 defsubr (&Scompose_string);
1698 defsubr (&Ssetup_special_charsets);
1699
1700 DEFVAR_LISP ("charset-list", &Vcharset_list,
1701 "List of charsets ever defined.");
1702 Vcharset_list = Fcons (Qascii, Qnil);
1703
1704 DEFVAR_LISP ("character-unification-table-vector",
1705 &Vcharacter_unification_table_vector,
1706 "Vector of cons cell of a symbol and unification table ever defined.\n\
1707 An ID of a unification table is an index of this vector.");
1708 Vcharacter_unification_table_vector = Fmake_vector (make_number (16), Qnil);
1709
1710 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
1711 "Leading-code of composite characters.");
1712 leading_code_composition = LEADING_CODE_COMPOSITION;
1713
1714 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1715 "Leading-code of private TYPE9N charset of column-width 1.");
1716 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1717
1718 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1719 "Leading-code of private TYPE9N charset of column-width 2.");
1720 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1721
1722 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1723 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1724 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1725
1726 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1727 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1728 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1729
1730 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1731 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1732 This is used for converting unibyte text to multibyte,\n\
1733 and for inserting character codes specified by number.\n\n\
1734 Conversion is performed only when multibyte characters are enabled,\n\
1735 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1736 to the corresponding Emacs character code.\n\
1737 If `nonascii-translate-table' is non-nil, it overrides this variable.");
1738 nonascii_insert_offset = 0;
1739
1740 DEFVAR_LISP ("nonascii-translate-table", &Vnonascii_translate_table,
1741 "Translate table for converting non-ASCII unibyte codes to multibyte.\n\
1742 This is used for converting unibyte text to multibyte,\n\
1743 and for inserting character codes specified by number.\n\n\
1744 Conversion is performed only when multibyte characters are enabled,\n\
1745 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1746 to the corresponding Emacs character code.\n\n\
1747 If this is nil, `nonascii-insert-offset' is used instead.");
1748 Vnonascii_translate_table = Qnil;
1749
1750 DEFVAR_INT ("min-composite-char", &min_composite_char,
1751 "Minimum character code of a composite character.");
1752 min_composite_char = MIN_CHAR_COMPOSITION;
1753 }
1754
1755 #endif /* emacs */