(string_to_non_ascii_char): Include garbage bytes (if
[bpt/emacs.git] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #include <stdio.h>
26
27 #ifdef emacs
28
29 #include <sys/types.h>
30 #include <config.h>
31 #include "lisp.h"
32 #include "buffer.h"
33 #include "charset.h"
34 #include "coding.h"
35 #include "disptab.h"
36
37 #else /* not emacs */
38
39 #include "mulelib.h"
40
41 #endif /* emacs */
42
43 Lisp_Object Qcharset, Qascii, Qcomposition;
44
45 /* Declaration of special leading-codes. */
46 int leading_code_composition; /* for composite characters */
47 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
48 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
49 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
50 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
51
52 /* Declaration of special charsets. */
53 int charset_ascii; /* ASCII */
54 int charset_composition; /* for a composite character */
55 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
56 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
57 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
58 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
59 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
60 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
61 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
62
63 int min_composite_char;
64
65 Lisp_Object Qcharset_table;
66
67 /* A char-table containing information of each character set. */
68 Lisp_Object Vcharset_table;
69
70 /* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72 Lisp_Object Vcharset_symbol_table;
73
74 /* A list of charset symbols ever defined. */
75 Lisp_Object Vcharset_list;
76
77 /* Vector of unification table ever defined.
78 An ID of a unification table is an index of this vector. */
79 Lisp_Object Vcharacter_unification_table_vector;
80
81 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
82 int bytes_by_char_head[256];
83 int width_by_char_head[256];
84
85 /* Mapping table from ISO2022's charset (specified by DIMENSION,
86 CHARS, and FINAL-CHAR) to Emacs' charset. */
87 int iso_charset_table[2][2][128];
88
89 /* Table of pointers to the structure `cmpchar_info' indexed by
90 CMPCHAR-ID. */
91 struct cmpchar_info **cmpchar_table;
92 /* The current size of `cmpchar_table'. */
93 static int cmpchar_table_size;
94 /* Number of the current composite characters. */
95 int n_cmpchars;
96
97 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
98 unsigned char *_fetch_multibyte_char_p;
99 int _fetch_multibyte_char_len;
100
101 /* Offset to add to a non-ASCII value when inserting it. */
102 int nonascii_insert_offset;
103
104 /* Translation table for converting non-ASCII unibyte characters
105 to multibyte codes, or nil. */
106 Lisp_Object Vnonascii_translate_table;
107
108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
110 \f
111 void
112 invalid_character (c)
113 int c;
114 {
115 error ("Invalid character: %o, %d, 0x%x", c);
116 }
117
118
119 /* Set STR a pointer to the multi-byte form of the character C. If C
120 is not a composite character, the multi-byte form is set in WORKBUF
121 and STR points WORKBUF. The caller should allocate at least 4-byte
122 area at WORKBUF in advance. Returns the length of the multi-byte
123 form. If C is an invalid character to have a multi-byte form,
124 signal an error.
125
126 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
127 function directly if C can be an ASCII character. */
128
129 int
130 non_ascii_char_to_string (c, workbuf, str)
131 int c;
132 unsigned char *workbuf, **str;
133 {
134 int charset, c1, c2;
135
136 if (COMPOSITE_CHAR_P (c))
137 {
138 int cmpchar_id = COMPOSITE_CHAR_ID (c);
139
140 if (cmpchar_id < n_cmpchars)
141 {
142 *str = cmpchar_table[cmpchar_id]->data;
143 return cmpchar_table[cmpchar_id]->len;
144 }
145 else
146 {
147 invalid_character (c);
148 }
149 }
150
151 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
152 if (!charset
153 || ! CHARSET_DEFINED_P (charset)
154 || c1 >= 0 && c1 < 32
155 || c2 >= 0 && c2 < 32)
156 invalid_character (c);
157
158 *str = workbuf;
159 *workbuf++ = CHARSET_LEADING_CODE_BASE (charset);
160 if (*workbuf = CHARSET_LEADING_CODE_EXT (charset))
161 workbuf++;
162 *workbuf++ = c1 | 0x80;
163 if (c2 >= 0)
164 *workbuf++ = c2 | 0x80;
165
166 return (workbuf - *str);
167 }
168
169 /* Return a non-ASCII character of which multi-byte form is at STR of
170 length LEN. If ACTUAL_LEN is not NULL, the actual length of the
171 character is set to the address ACTUAL_LEN.
172
173 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
174 directly if STR can hold an ASCII character. */
175
176 string_to_non_ascii_char (str, len, actual_len)
177 const unsigned char *str;
178 int len, *actual_len;
179 {
180 int charset;
181 unsigned char c1, c2;
182 register int c, bytes;
183
184 c = *str;
185 bytes = 1;
186
187 if (BASE_LEADING_CODE_P (c))
188 {
189 while (bytes < len && ! CHAR_HEAD_P (str[bytes])) bytes++;
190
191 if (c == LEADING_CODE_COMPOSITION)
192 {
193 int cmpchar_id = str_cmpchar_id (str, bytes);
194
195 if (cmpchar_id >= 0)
196 c = MAKE_COMPOSITE_CHAR (cmpchar_id);
197 }
198 else
199 {
200 int charset = c, c1, c2 = 0;
201
202 str++;
203 if (c >= LEADING_CODE_PRIVATE_11)
204 charset = *str++;
205 if (BYTES_BY_CHAR_HEAD (c) <= bytes && CHARSET_DEFINED_P (charset))
206 {
207 c1 = *str++ & 0x7f;
208 if (CHARSET_DIMENSION (charset) == 2)
209 c2 = *str & 0x7F;
210 c = MAKE_NON_ASCII_CHAR (charset, c1, c2);
211 }
212 }
213 }
214
215 if (actual_len)
216 *actual_len = bytes;
217 return c;
218 }
219
220 /* Return the length of the multi-byte form at string STR of length LEN. */
221 int
222 multibyte_form_length (str, len)
223 const unsigned char *str;
224 int len;
225 {
226 int bytes = 1;
227
228 if (BASE_LEADING_CODE_P (*str))
229 while (bytes < len && ! CHAR_HEAD_P (str[bytes])) bytes++;
230
231 return bytes;
232 }
233
234 /* Check if string STR of length LEN contains valid multi-byte form of
235 a character. If valid, charset and position codes of the character
236 is set at *CHARSET, *C1, and *C2, and return 0. If not valid,
237 return -1. This should be used only in the macro SPLIT_STRING
238 which checks range of STR in advance. */
239
240 split_non_ascii_string (str, len, charset, c1, c2)
241 register const unsigned char *str;
242 register unsigned char *c1, *c2;
243 register int len, *charset;
244 {
245 register unsigned int cs = *str++;
246
247 if (cs == LEADING_CODE_COMPOSITION)
248 {
249 int cmpchar_id = str_cmpchar_id (str - 1, len);
250
251 if (cmpchar_id < 0)
252 return -1;
253 *charset = cs, *c1 = cmpchar_id >> 7, *c2 = cmpchar_id & 0x7F;
254 }
255 else if ((cs < LEADING_CODE_PRIVATE_11 || (cs = *str++) >= 0xA0)
256 && CHARSET_DEFINED_P (cs))
257 {
258 *charset = cs;
259 if (*str < 0xA0)
260 return -1;
261 *c1 = (*str++) & 0x7F;
262 if (CHARSET_DIMENSION (cs) == 2)
263 {
264 if (*str < 0xA0)
265 return -1;
266 *c2 = (*str++) & 0x7F;
267 }
268 }
269 else
270 return -1;
271 return 0;
272 }
273
274 /* Return a character unified with C (or a character made of CHARSET,
275 C1, and C2) in unification table TABLE. If no unification is found
276 in TABLE, return C. */
277 unify_char (table, c, charset, c1, c2)
278 Lisp_Object table;
279 int c, charset, c1, c2;
280 {
281 Lisp_Object ch;
282 int alt_charset, alt_c1, alt_c2, dimension;
283
284 if (c < 0) c = MAKE_CHAR (charset, c1, c2);
285 if (!CHAR_TABLE_P (table)
286 || (ch = Faref (table, make_number (c)), !INTEGERP (ch))
287 || XINT (ch) < 0)
288 return c;
289
290 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
291 dimension = CHARSET_DIMENSION (alt_charset);
292 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
293 /* CH is not a generic character, just return it. */
294 return XFASTINT (ch);
295
296 /* Since CH is a generic character, we must return a specific
297 charater which has the same position codes as C from CH. */
298 if (charset < 0)
299 SPLIT_CHAR (c, charset, c1, c2);
300 if (dimension != CHARSET_DIMENSION (charset))
301 /* We can't make such a character because of dimension mismatch. */
302 return c;
303 return MAKE_CHAR (alt_charset, c1, c2);
304 }
305
306 /* Convert the unibyte character C to multibyte based on
307 Vnonascii_translate_table or nonascii_insert_offset. If they can't
308 convert C to a valid multibyte character, convert it based on
309 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
310
311 unibyte_char_to_multibyte (c)
312 int c;
313 {
314 if (c >= 0240 && c < 0400)
315 {
316 int c_save = c;
317
318 if (! NILP (Vnonascii_translate_table))
319 c = XINT (Faref (Vnonascii_translate_table, make_number (c)));
320 else if (nonascii_insert_offset > 0)
321 c += nonascii_insert_offset;
322 if (c >= 0240 && (c < 0400 || ! VALID_MULTIBYTE_CHAR_P (c)))
323 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
324 }
325 return c;
326 }
327 \f
328 /* Update the table Vcharset_table with the given arguments (see the
329 document of `define-charset' for the meaning of each argument).
330 Several other table contents are also updated. The caller should
331 check the validity of CHARSET-ID and the remaining arguments in
332 advance. */
333
334 void
335 update_charset_table (charset_id, dimension, chars, width, direction,
336 iso_final_char, iso_graphic_plane,
337 short_name, long_name, description)
338 Lisp_Object charset_id, dimension, chars, width, direction;
339 Lisp_Object iso_final_char, iso_graphic_plane;
340 Lisp_Object short_name, long_name, description;
341 {
342 int charset = XINT (charset_id);
343 int bytes;
344 unsigned char leading_code_base, leading_code_ext;
345
346 if (NILP (CHARSET_TABLE_ENTRY (charset)))
347 CHARSET_TABLE_ENTRY (charset)
348 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
349
350 /* Get byte length of multibyte form, base leading-code, and
351 extended leading-code of the charset. See the comment under the
352 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
353 bytes = XINT (dimension);
354 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
355 {
356 /* Official charset, it doesn't have an extended leading-code. */
357 if (charset != CHARSET_ASCII)
358 bytes += 1; /* For a base leading-code. */
359 leading_code_base = charset;
360 leading_code_ext = 0;
361 }
362 else
363 {
364 /* Private charset. */
365 bytes += 2; /* For base and extended leading-codes. */
366 leading_code_base
367 = (charset < LEADING_CODE_EXT_12
368 ? LEADING_CODE_PRIVATE_11
369 : (charset < LEADING_CODE_EXT_21
370 ? LEADING_CODE_PRIVATE_12
371 : (charset < LEADING_CODE_EXT_22
372 ? LEADING_CODE_PRIVATE_21
373 : LEADING_CODE_PRIVATE_22)));
374 leading_code_ext = charset;
375 }
376
377 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
378 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
379 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
380 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
381 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
382 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
383 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
384 = make_number (leading_code_base);
385 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
386 = make_number (leading_code_ext);
387 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
388 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
389 = iso_graphic_plane;
390 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
391 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
392 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
393 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
394
395 {
396 /* If we have already defined a charset which has the same
397 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
398 DIRECTION, we must update the entry REVERSE-CHARSET of both
399 charsets. If there's no such charset, the value of the entry
400 is set to nil. */
401 int i;
402
403 for (i = 0; i <= MAX_CHARSET; i++)
404 if (!NILP (CHARSET_TABLE_ENTRY (i)))
405 {
406 if (CHARSET_DIMENSION (i) == XINT (dimension)
407 && CHARSET_CHARS (i) == XINT (chars)
408 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
409 && CHARSET_DIRECTION (i) != XINT (direction))
410 {
411 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
412 = make_number (i);
413 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
414 break;
415 }
416 }
417 if (i > MAX_CHARSET)
418 /* No such a charset. */
419 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
420 = make_number (-1);
421 }
422
423 if (charset != CHARSET_ASCII
424 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
425 {
426 /* Update tables bytes_by_char_head and width_by_char_head. */
427 bytes_by_char_head[leading_code_base] = bytes;
428 width_by_char_head[leading_code_base] = XINT (width);
429
430 /* Update table emacs_code_class. */
431 emacs_code_class[charset] = (bytes == 2
432 ? EMACS_leading_code_2
433 : (bytes == 3
434 ? EMACS_leading_code_3
435 : EMACS_leading_code_4));
436 }
437
438 /* Update table iso_charset_table. */
439 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
440 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
441 }
442
443 #ifdef emacs
444
445 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
446 is invalid. */
447 int
448 get_charset_id (charset_symbol)
449 Lisp_Object charset_symbol;
450 {
451 Lisp_Object val;
452 int charset;
453
454 return ((SYMBOLP (charset_symbol)
455 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
456 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
457 CHARSET_VALID_P (charset)))
458 ? charset : -1);
459 }
460
461 /* Return an identification number for a new private charset of
462 DIMENSION and WIDTH. If there's no more room for the new charset,
463 return 0. */
464 Lisp_Object
465 get_new_private_charset_id (dimension, width)
466 int dimension, width;
467 {
468 int charset, from, to;
469
470 if (dimension == 1)
471 {
472 if (width == 1)
473 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
474 else
475 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
476 }
477 else
478 {
479 if (width == 1)
480 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
481 else
482 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
483 }
484
485 for (charset = from; charset < to; charset++)
486 if (!CHARSET_DEFINED_P (charset)) break;
487
488 return make_number (charset < to ? charset : 0);
489 }
490
491 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
492 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
493 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
494 treated as a private charset.\n\
495 INFO-VECTOR is a vector of the format:\n\
496 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
497 SHORT-NAME LONG-NAME DESCRIPTION]\n\
498 The meanings of each elements is as follows:\n\
499 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
500 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
501 WIDTH (integer) is the number of columns a character in the charset\n\
502 occupies on the screen: one of 0, 1, and 2.\n\
503 \n\
504 DIRECTION (integer) is the rendering direction of characters in the\n\
505 charset when rendering. If 0, render from right to left, else\n\
506 render from left to right.\n\
507 \n\
508 ISO-FINAL-CHAR (character) is the final character of the\n\
509 corresponding ISO 2022 charset.\n\
510 \n\
511 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
512 while encoding to variants of ISO 2022 coding system, one of the\n\
513 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
514 \n\
515 SHORT-NAME (string) is the short name to refer to the charset.\n\
516 \n\
517 LONG-NAME (string) is the long name to refer to the charset.\n\
518 \n\
519 DESCRIPTION (string) is the description string of the charset.")
520 (charset_id, charset_symbol, info_vector)
521 Lisp_Object charset_id, charset_symbol, info_vector;
522 {
523 Lisp_Object *vec;
524
525 if (!NILP (charset_id))
526 CHECK_NUMBER (charset_id, 0);
527 CHECK_SYMBOL (charset_symbol, 1);
528 CHECK_VECTOR (info_vector, 2);
529
530 if (! NILP (charset_id))
531 {
532 if (! CHARSET_VALID_P (XINT (charset_id)))
533 error ("Invalid CHARSET: %d", XINT (charset_id));
534 else if (CHARSET_DEFINED_P (XINT (charset_id)))
535 error ("Already defined charset: %d", XINT (charset_id));
536 }
537
538 vec = XVECTOR (info_vector)->contents;
539 if (XVECTOR (info_vector)->size != 9
540 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
541 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
542 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
543 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
544 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
545 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
546 || !STRINGP (vec[6])
547 || !STRINGP (vec[7])
548 || !STRINGP (vec[8]))
549 error ("Invalid info-vector argument for defining charset %s",
550 XSYMBOL (charset_symbol)->name->data);
551
552 if (NILP (charset_id))
553 {
554 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
555 if (XINT (charset_id) == 0)
556 error ("There's no room for a new private charset %s",
557 XSYMBOL (charset_symbol)->name->data);
558 }
559
560 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
561 vec[4], vec[5], vec[6], vec[7], vec[8]);
562 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
563 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
564 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
565 return Qnil;
566 }
567
568 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
569 Sget_unused_iso_final_char, 2, 2, 0,
570 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
571 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
572 CHARS is the number of characters in a dimension: 94 or 96.\n\
573 \n\
574 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
575 If there's no unused final char for the specified kind of charset,\n\
576 return nil.")
577 (dimension, chars)
578 Lisp_Object dimension, chars;
579 {
580 int final_char;
581
582 CHECK_NUMBER (dimension, 0);
583 CHECK_NUMBER (chars, 1);
584 if (XINT (dimension) != 1 && XINT (dimension) != 2)
585 error ("Invalid charset dimension %d, it should be 1 or 2",
586 XINT (dimension));
587 if (XINT (chars) != 94 && XINT (chars) != 96)
588 error ("Invalid charset chars %d, it should be 94 or 96",
589 XINT (chars));
590 for (final_char = '0'; final_char <= '?'; final_char++)
591 {
592 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
593 break;
594 }
595 return (final_char <= '?' ? make_number (final_char) : Qnil);
596 }
597
598 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
599 4, 4, 0,
600 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
601 CHARSET should be defined by `defined-charset' in advance.")
602 (dimension, chars, final_char, charset_symbol)
603 Lisp_Object dimension, chars, final_char, charset_symbol;
604 {
605 int charset;
606
607 CHECK_NUMBER (dimension, 0);
608 CHECK_NUMBER (chars, 1);
609 CHECK_NUMBER (final_char, 2);
610 CHECK_SYMBOL (charset_symbol, 3);
611
612 if (XINT (dimension) != 1 && XINT (dimension) != 2)
613 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
614 if (XINT (chars) != 94 && XINT (chars) != 96)
615 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
616 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
617 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
618 if ((charset = get_charset_id (charset_symbol)) < 0)
619 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
620
621 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
622 return Qnil;
623 }
624
625 /* Return number of different charsets in STR of length LEN. In
626 addition, for each found charset N, CHARSETS[N] is set 1. The
627 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
628 It may lookup a unification table TABLE if supplied. */
629
630 int
631 find_charset_in_str (str, len, charsets, table)
632 unsigned char *str;
633 int len, *charsets;
634 Lisp_Object table;
635 {
636 register int num = 0, c;
637
638 if (! CHAR_TABLE_P (table))
639 table = Qnil;
640
641 while (len > 0)
642 {
643 int bytes, charset;
644 c = *str;
645
646 if (c == LEADING_CODE_COMPOSITION)
647 {
648 int cmpchar_id = str_cmpchar_id (str, len);
649 GLYPH *glyph;
650
651 if (cmpchar_id > 0)
652 {
653 struct cmpchar_info *cmpcharp = cmpchar_table[cmpchar_id];
654 int i;
655
656 for (i = 0; i < cmpcharp->glyph_len; i++)
657 {
658 c = cmpcharp->glyph[i];
659 if (!NILP (table))
660 {
661 if ((c = unify_char (table, c, 0, 0, 0)) < 0)
662 c = cmpcharp->glyph[i];
663 }
664 if ((charset = CHAR_CHARSET (c)) < 0)
665 charset = CHARSET_ASCII;
666 if (!charsets[charset])
667 {
668 charsets[charset] = 1;
669 num += 1;
670 }
671 }
672 str += cmpcharp->len;
673 len -= cmpcharp->len;
674 continue;
675 }
676
677 charset = CHARSET_ASCII;
678 bytes = 1;
679 }
680 else
681 {
682 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
683 if (! NILP (table))
684 {
685 int c1 = unify_char (table, c, 0, 0, 0);
686 if (c1 >= 0)
687 c = c1;
688 }
689 charset = CHAR_CHARSET (c);
690 }
691
692 if (!charsets[charset])
693 {
694 charsets[charset] = 1;
695 num += 1;
696 }
697 str += bytes;
698 len -= bytes;
699 }
700 return num;
701 }
702
703 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
704 2, 3, 0,
705 "Return a list of charsets in the region between BEG and END.\n\
706 BEG and END are buffer positions.\n\
707 Optional arg TABLE if non-nil is a unification table to look up.")
708 (beg, end, table)
709 Lisp_Object beg, end, table;
710 {
711 int charsets[MAX_CHARSET + 1];
712 int from, from_byte, to, stop, stop_byte, i;
713 Lisp_Object val;
714
715 validate_region (&beg, &end);
716 from = XFASTINT (beg);
717 stop = to = XFASTINT (end);
718
719 if (from < GPT && GPT < to)
720 {
721 stop = GPT;
722 stop_byte = GPT_BYTE;
723 }
724 else
725 stop_byte = CHAR_TO_BYTE (stop);
726
727 from_byte = CHAR_TO_BYTE (from);
728
729 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
730 while (1)
731 {
732 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
733 charsets, table);
734 if (stop < to)
735 {
736 from = stop, from_byte = stop_byte;
737 stop = to, stop_byte = CHAR_TO_BYTE (stop);
738 }
739 else
740 break;
741 }
742
743 val = Qnil;
744 for (i = MAX_CHARSET; i >= 0; i--)
745 if (charsets[i])
746 val = Fcons (CHARSET_SYMBOL (i), val);
747 return val;
748 }
749
750 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
751 1, 2, 0,
752 "Return a list of charsets in STR.\n\
753 Optional arg TABLE if non-nil is a unification table to look up.")
754 (str, table)
755 Lisp_Object str, table;
756 {
757 int charsets[MAX_CHARSET + 1];
758 int i;
759 Lisp_Object val;
760
761 CHECK_STRING (str, 0);
762
763 if (! STRING_MULTIBYTE (str))
764 return Qnil;
765
766 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
767 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
768 charsets, table);
769 val = Qnil;
770 for (i = MAX_CHARSET; i >= 0; i--)
771 if (charsets[i])
772 val = Fcons (CHARSET_SYMBOL (i), val);
773 return val;
774 }
775 \f
776 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
777 "")
778 (charset, code1, code2)
779 Lisp_Object charset, code1, code2;
780 {
781 CHECK_NUMBER (charset, 0);
782
783 if (NILP (code1))
784 XSETFASTINT (code1, 0);
785 else
786 CHECK_NUMBER (code1, 1);
787 if (NILP (code2))
788 XSETFASTINT (code2, 0);
789 else
790 CHECK_NUMBER (code2, 2);
791
792 if (!CHARSET_DEFINED_P (XINT (charset)))
793 error ("Invalid charset: %d", XINT (charset));
794
795 return make_number (MAKE_CHAR (XINT (charset), XINT (code1), XINT (code2)));
796 }
797
798 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
799 "Return list of charset and one or two position-codes of CHAR.")
800 (ch)
801 Lisp_Object ch;
802 {
803 Lisp_Object val;
804 int charset, c1, c2;
805
806 CHECK_NUMBER (ch, 0);
807 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
808 return (c2 >= 0
809 ? Fcons (CHARSET_SYMBOL (charset),
810 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
811 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
812 }
813
814 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
815 "Return charset of CHAR.")
816 (ch)
817 Lisp_Object ch;
818 {
819 CHECK_NUMBER (ch, 0);
820
821 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
822 }
823
824 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
825 "Return charset of a character in current buffer at position POS.\n\
826 If POS is nil, it defauls to the current point.")
827 (pos)
828 Lisp_Object pos;
829 {
830 register int pos_byte, c, charset;
831 register unsigned char *p;
832
833 if (NILP (pos))
834 pos_byte = PT_BYTE;
835 else if (MARKERP (pos))
836 pos_byte = marker_byte_position (pos);
837 else
838 {
839 CHECK_NUMBER (pos, 0);
840 pos_byte = CHAR_TO_BYTE (XINT (pos));
841 }
842 p = BYTE_POS_ADDR (pos_byte);
843 c = STRING_CHAR (p, Z_BYTE - pos_byte);
844 charset = CHAR_CHARSET (c);
845 return CHARSET_SYMBOL (charset);
846 }
847
848 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
849 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
850 \n\
851 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
852 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
853 where as Emacs distinguishes them by charset symbol.\n\
854 See the documentation of the function `charset-info' for the meanings of\n\
855 DIMENSION, CHARS, and FINAL-CHAR.")
856 (dimension, chars, final_char)
857 Lisp_Object dimension, chars, final_char;
858 {
859 int charset;
860
861 CHECK_NUMBER (dimension, 0);
862 CHECK_NUMBER (chars, 1);
863 CHECK_NUMBER (final_char, 2);
864
865 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
866 return Qnil;
867 return CHARSET_SYMBOL (charset);
868 }
869
870 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
871 generic character. If GENERICP is zero, return nonzero iff C is a
872 valid normal character. Do not call this function directly,
873 instead use macro CHAR_VALID_P. */
874 int
875 char_valid_p (c, genericp)
876 int c, genericp;
877 {
878 int charset, c1, c2;
879
880 if (c < 0)
881 return 0;
882 if (SINGLE_BYTE_CHAR_P (c))
883 return 1;
884 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
885 if (!CHARSET_VALID_P (charset))
886 return 0;
887 return (c < MIN_CHAR_COMPOSITION
888 ? ((c & CHAR_FIELD1_MASK) /* i.e. dimension of C is two. */
889 ? (genericp && c1 == 0 && c2 == 0
890 || c1 >= 32 && c2 >= 32)
891 : (genericp && c1 == 0
892 || c1 >= 32))
893 : c < MIN_CHAR_COMPOSITION + n_cmpchars);
894 }
895
896 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
897 "Return t if OBJECT is a valid normal character.\n\
898 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
899 a valid generic character.")
900 (object, genericp)
901 Lisp_Object object, genericp;
902 {
903 if (! NATNUMP (object))
904 return Qnil;
905 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
906 }
907
908 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
909 Sunibyte_char_to_multibyte, 1, 1, 0,
910 "Convert the unibyte character CH to multibyte character.\n\
911 The conversion is done based on nonascii-translate-table (which see)\n\
912 or nonascii-insert-offset (which see).")
913 (ch)
914 Lisp_Object ch;
915 {
916 int c;
917
918 CHECK_NUMBER (ch, 0);
919 c = XINT (ch);
920 if (c < 0 || c >= 0400)
921 error ("Invalid unibyte character: %d", c);
922 c = unibyte_char_to_multibyte (c);
923 if (c < 0)
924 error ("Can't convert to multibyte character: %d", XINT (ch));
925 return make_number (c);
926 }
927
928 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
929 "Return byte length of multi-byte form of CHAR.")
930 (ch)
931 Lisp_Object ch;
932 {
933 Lisp_Object val;
934 int bytes;
935
936 CHECK_NUMBER (ch, 0);
937 if (COMPOSITE_CHAR_P (XFASTINT (ch)))
938 {
939 unsigned int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
940
941 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
942 }
943 else
944 {
945 int charset = CHAR_CHARSET (XFASTINT (ch));
946
947 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
948 }
949
950 XSETFASTINT (val, bytes);
951 return val;
952 }
953
954 /* Return the width of character of which multi-byte form starts with
955 C. The width is measured by how many columns occupied on the
956 screen when displayed in the current buffer. */
957
958 #define ONE_BYTE_CHAR_WIDTH(c) \
959 (c < 0x20 \
960 ? (c == '\t' \
961 ? XFASTINT (current_buffer->tab_width) \
962 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
963 : (c < 0x7f \
964 ? 1 \
965 : (c == 0x7F \
966 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
967 : ((! NILP (current_buffer->enable_multibyte_characters) \
968 && BASE_LEADING_CODE_P (c)) \
969 ? WIDTH_BY_CHAR_HEAD (c) \
970 : 4)))) \
971
972
973 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
974 "Return width of CHAR when displayed in the current buffer.\n\
975 The width is measured by how many columns it occupies on the screen.")
976 (ch)
977 Lisp_Object ch;
978 {
979 Lisp_Object val, disp;
980 int c;
981 struct Lisp_Char_Table *dp = buffer_display_table ();
982
983 CHECK_NUMBER (ch, 0);
984
985 c = XINT (ch);
986
987 /* Get the way the display table would display it. */
988 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
989
990 if (VECTORP (disp))
991 XSETINT (val, XVECTOR (disp)->size);
992 else if (SINGLE_BYTE_CHAR_P (c))
993 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
994 else if (COMPOSITE_CHAR_P (c))
995 {
996 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
997 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 0));
998 }
999 else
1000 {
1001 int charset = CHAR_CHARSET (c);
1002
1003 XSETFASTINT (val, CHARSET_WIDTH (charset));
1004 }
1005 return val;
1006 }
1007
1008 /* Return width of string STR of length LEN when displayed in the
1009 current buffer. The width is measured by how many columns it
1010 occupies on the screen. */
1011
1012 int
1013 strwidth (str, len)
1014 unsigned char *str;
1015 int len;
1016 {
1017 unsigned char *endp = str + len;
1018 int width = 0;
1019 struct Lisp_Char_Table *dp = buffer_display_table ();
1020
1021 while (str < endp)
1022 {
1023 if (*str == LEADING_CODE_COMPOSITION)
1024 {
1025 int id = str_cmpchar_id (str, endp - str);
1026
1027 if (id < 0)
1028 {
1029 width += 4;
1030 str++;
1031 }
1032 else
1033 {
1034 width += cmpchar_table[id]->width;
1035 str += cmpchar_table[id]->len;
1036 }
1037 }
1038 else
1039 {
1040 Lisp_Object disp;
1041 int thislen;
1042 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1043
1044 /* Get the way the display table would display it. */
1045 if (dp)
1046 disp = DISP_CHAR_VECTOR (dp, c);
1047 else
1048 disp = Qnil;
1049
1050 if (VECTORP (disp))
1051 width += XVECTOR (disp)->size;
1052 else
1053 width += ONE_BYTE_CHAR_WIDTH (*str);
1054
1055 str += thislen;
1056 }
1057 }
1058 return width;
1059 }
1060
1061 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1062 "Return width of STRING when displayed in the current buffer.\n\
1063 Width is measured by how many columns it occupies on the screen.\n\
1064 When calculating width of a multibyte character in STRING,\n\
1065 only the base leading-code is considered; the validity of\n\
1066 the following bytes is not checked.")
1067 (str)
1068 Lisp_Object str;
1069 {
1070 Lisp_Object val;
1071
1072 CHECK_STRING (str, 0);
1073 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1074 STRING_BYTES (XSTRING (str))));
1075 return val;
1076 }
1077
1078 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1079 "Return the direction of CHAR.\n\
1080 The returned value is 0 for left-to-right and 1 for right-to-left.")
1081 (ch)
1082 Lisp_Object ch;
1083 {
1084 int charset;
1085
1086 CHECK_NUMBER (ch, 0);
1087 charset = CHAR_CHARSET (XFASTINT (ch));
1088 if (!CHARSET_DEFINED_P (charset))
1089 invalid_character (XINT (ch));
1090 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1091 }
1092
1093 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1094 "Return number of characters between BEG and END.")
1095 (beg, end)
1096 Lisp_Object beg, end;
1097 {
1098 int from, to;
1099
1100 from = min (XFASTINT (beg), XFASTINT (end));
1101 to = max (XFASTINT (beg), XFASTINT (end));
1102
1103 return to - from;
1104 }
1105
1106 /* Return the number of characters in the NBYTES bytes at PTR.
1107 This works by looking at the contents and checking for multibyte sequences.
1108 However, if the current buffer has enable-multibyte-characters = nil,
1109 we treat each byte as a character. */
1110
1111 int
1112 chars_in_text (ptr, nbytes)
1113 unsigned char *ptr;
1114 int nbytes;
1115 {
1116 unsigned char *endp, c;
1117 int chars;
1118
1119 /* current_buffer is null at early stages of Emacs initialization. */
1120 if (current_buffer == 0
1121 || NILP (current_buffer->enable_multibyte_characters))
1122 return nbytes;
1123
1124 endp = ptr + nbytes;
1125 chars = 0;
1126
1127 while (ptr < endp)
1128 {
1129 c = *ptr++;
1130
1131 if (BASE_LEADING_CODE_P (c))
1132 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
1133 chars++;
1134 }
1135
1136 return chars;
1137 }
1138
1139 /* Return the number of characters in the NBYTES bytes at PTR.
1140 This works by looking at the contents and checking for multibyte sequences.
1141 It ignores enable-multibyte-characters. */
1142
1143 int
1144 multibyte_chars_in_text (ptr, nbytes)
1145 unsigned char *ptr;
1146 int nbytes;
1147 {
1148 unsigned char *endp, c;
1149 int chars;
1150
1151 endp = ptr + nbytes;
1152 chars = 0;
1153
1154 while (ptr < endp)
1155 {
1156 c = *ptr++;
1157
1158 if (BASE_LEADING_CODE_P (c))
1159 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
1160 chars++;
1161 }
1162
1163 return chars;
1164 }
1165
1166 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1167 "Concatenate all the argument characters and make the result a string.")
1168 (n, args)
1169 int n;
1170 Lisp_Object *args;
1171 {
1172 int i;
1173 unsigned char *buf
1174 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
1175 unsigned char *p = buf;
1176 Lisp_Object val;
1177
1178 for (i = 0; i < n; i++)
1179 {
1180 int c, len;
1181 unsigned char *str;
1182
1183 if (!INTEGERP (args[i]))
1184 CHECK_NUMBER (args[i], 0);
1185 c = XINT (args[i]);
1186 len = CHAR_STRING (c, p, str);
1187 if (p != str)
1188 /* C is a composite character. */
1189 bcopy (str, p, len);
1190 p += len;
1191 }
1192
1193 val = make_string_from_bytes (buf, n, p - buf);
1194 return val;
1195 }
1196
1197 #endif /* emacs */
1198 \f
1199 /*** Composite characters staffs ***/
1200
1201 /* Each composite character is identified by CMPCHAR-ID which is
1202 assigned when Emacs needs the character code of the composite
1203 character (e.g. when displaying it on the screen). See the
1204 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1205 composite character is represented in Emacs. */
1206
1207 /* If `static' is defined, it means that it is defined to null string. */
1208 #ifndef static
1209 /* The following function is copied from lread.c. */
1210 static int
1211 hash_string (ptr, len)
1212 unsigned char *ptr;
1213 int len;
1214 {
1215 register unsigned char *p = ptr;
1216 register unsigned char *end = p + len;
1217 register unsigned char c;
1218 register int hash = 0;
1219
1220 while (p != end)
1221 {
1222 c = *p++;
1223 if (c >= 0140) c -= 40;
1224 hash = ((hash<<3) + (hash>>28) + c);
1225 }
1226 return hash & 07777777777;
1227 }
1228 #endif
1229
1230 #define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1231
1232 static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1233
1234 /* Each element of `cmpchar_hash_table' is a pointer to an array of
1235 integer, where the 1st element is the size of the array, the 2nd
1236 element is how many elements are actually used in the array, and
1237 the remaining elements are CMPCHAR-IDs of composite characters of
1238 the same hash value. */
1239 #define CMPCHAR_HASH_SIZE(table) table[0]
1240 #define CMPCHAR_HASH_USED(table) table[1]
1241 #define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1242
1243 /* Return CMPCHAR-ID of the composite character in STR of the length
1244 LEN. If the composite character has not yet been registered,
1245 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1246 is the sole function for assigning CMPCHAR-ID. */
1247 int
1248 str_cmpchar_id (str, len)
1249 const unsigned char *str;
1250 int len;
1251 {
1252 int hash_idx, *hashp;
1253 unsigned char *buf;
1254 int embedded_rule; /* 1 if composition rule is embedded. */
1255 int chars; /* number of components. */
1256 int i;
1257 struct cmpchar_info *cmpcharp;
1258
1259 /* The second byte 0xFF means compostion rule is embedded. */
1260 embedded_rule = (str[1] == 0xFF);
1261
1262 /* At first, get the actual length of the composite character. */
1263 {
1264 const unsigned char *p, *endp = str + 1, *lastp = str + len;
1265 int bytes;
1266
1267 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
1268 if (endp - str < 5)
1269 /* Any composite char have at least 5-byte length. */
1270 return -1;
1271
1272 chars = 0;
1273 p = str + 1;
1274 while (p < endp)
1275 {
1276 if (embedded_rule) p++;
1277 /* No need of checking if *P is 0xA0 because
1278 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1279 p += BYTES_BY_CHAR_HEAD (*p - 0x20);
1280 chars++;
1281 }
1282 if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1283 /* Invalid components. */
1284 return -1;
1285 len = p - str;
1286 }
1287 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1288 hashp = cmpchar_hash_table[hash_idx];
1289
1290 /* Then, look into the hash table. */
1291 if (hashp != NULL)
1292 /* Find the correct one among composite characters of the same
1293 hash value. */
1294 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1295 {
1296 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1297 if (len == cmpcharp->len
1298 && ! bcmp (str, cmpcharp->data, len))
1299 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1300 }
1301
1302 /* We have to register the composite character in cmpchar_table. */
1303 if (n_cmpchars > (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
1304 /* No, we have no more room for a new composite character. */
1305 return -1;
1306
1307 /* Make the entry in hash table. */
1308 if (hashp == NULL)
1309 {
1310 /* Make a table for 8 composite characters initially. */
1311 hashp = (cmpchar_hash_table[hash_idx]
1312 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1313 CMPCHAR_HASH_SIZE (hashp) = 10;
1314 CMPCHAR_HASH_USED (hashp) = 2;
1315 }
1316 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1317 {
1318 CMPCHAR_HASH_SIZE (hashp) += 8;
1319 hashp = (cmpchar_hash_table[hash_idx]
1320 = (int *) xrealloc (hashp,
1321 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1322 }
1323 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1324 CMPCHAR_HASH_USED (hashp)++;
1325
1326 /* Set information of the composite character in cmpchar_table. */
1327 if (cmpchar_table_size == 0)
1328 {
1329 /* This is the first composite character to be registered. */
1330 cmpchar_table_size = 256;
1331 cmpchar_table
1332 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1333 * cmpchar_table_size);
1334 }
1335 else if (cmpchar_table_size <= n_cmpchars)
1336 {
1337 cmpchar_table_size += 256;
1338 cmpchar_table
1339 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1340 sizeof (cmpchar_table[0])
1341 * cmpchar_table_size);
1342 }
1343
1344 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1345
1346 cmpcharp->len = len;
1347 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1348 bcopy (str, cmpcharp->data, len);
1349 cmpcharp->data[len] = 0;
1350 cmpcharp->glyph_len = chars;
1351 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1352 if (embedded_rule)
1353 {
1354 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1355 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1356 }
1357 else
1358 {
1359 cmpcharp->cmp_rule = NULL;
1360 cmpcharp->col_offset = NULL;
1361 }
1362
1363 /* Setup GLYPH data and composition rules (if any) so as not to make
1364 them every time on displaying. */
1365 {
1366 unsigned char *bufp;
1367 int width;
1368 float leftmost = 0.0, rightmost = 1.0;
1369
1370 if (embedded_rule)
1371 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1372 cmpcharp->col_offset[0] = 0;
1373
1374 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1375 {
1376 if (embedded_rule)
1377 cmpcharp->cmp_rule[i] = *bufp++;
1378
1379 if (*bufp == 0xA0) /* This is an ASCII character. */
1380 {
1381 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1382 width = 1;
1383 bufp++;
1384 }
1385 else /* Multibyte character. */
1386 {
1387 /* Make `bufp' point normal multi-byte form temporally. */
1388 *bufp -= 0x20;
1389 cmpcharp->glyph[i]
1390 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1391 width = WIDTH_BY_CHAR_HEAD (*bufp);
1392 *bufp += 0x20;
1393 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1394 }
1395
1396 if (embedded_rule && i > 0)
1397 {
1398 /* Reference points (global_ref and new_ref) are
1399 encoded as below:
1400
1401 0--1--2 -- ascent
1402 | |
1403 | |
1404 | 4 -+--- center
1405 -- 3 5 -- baseline
1406 | |
1407 6--7--8 -- descent
1408
1409 Now, we calculate the column offset of the new glyph
1410 from the left edge of the first glyph. This can avoid
1411 the same calculation everytime displaying this
1412 composite character. */
1413
1414 /* Reference points of global glyph and new glyph. */
1415 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1416 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1417 /* Column offset relative to the first glyph. */
1418 float left = (leftmost
1419 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1420 - (new_ref % 3) * width / 2.0);
1421
1422 cmpcharp->col_offset[i] = left;
1423 if (left < leftmost)
1424 leftmost = left;
1425 if (left + width > rightmost)
1426 rightmost = left + width;
1427 }
1428 else
1429 {
1430 if (width > rightmost)
1431 rightmost = width;
1432 }
1433 }
1434 if (embedded_rule)
1435 {
1436 /* Now col_offset[N] are relative to the left edge of the
1437 first component. Make them relative to the left edge of
1438 overall glyph. */
1439 for (i = 0; i < chars; i++)
1440 cmpcharp->col_offset[i] -= leftmost;
1441 /* Make rightmost holds width of overall glyph. */
1442 rightmost -= leftmost;
1443 }
1444
1445 cmpcharp->width = rightmost;
1446 if (cmpcharp->width < rightmost)
1447 /* To get a ceiling integer value. */
1448 cmpcharp->width++;
1449 }
1450
1451 cmpchar_table[n_cmpchars] = cmpcharp;
1452
1453 return n_cmpchars++;
1454 }
1455
1456 /* Return the Nth element of the composite character C. */
1457 int
1458 cmpchar_component (c, n)
1459 unsigned int c, n;
1460 {
1461 int id = COMPOSITE_CHAR_ID (c);
1462
1463 if (id >= n_cmpchars /* C is not a valid composite character. */
1464 || n >= cmpchar_table[id]->glyph_len) /* No such component. */
1465 return -1;
1466 /* No face data is stored in glyph code. */
1467 return ((int) (cmpchar_table[id]->glyph[n]));
1468 }
1469
1470 DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1471 "T if CHAR is a composite character.")
1472 (ch)
1473 Lisp_Object ch;
1474 {
1475 CHECK_NUMBER (ch, 0);
1476 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1477 }
1478
1479 DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1480 2, 2, 0,
1481 "Return the IDXth component character of composite character CHARACTER.")
1482 (character, idx)
1483 Lisp_Object character, idx;
1484 {
1485 int c;
1486
1487 CHECK_NUMBER (character, 0);
1488 CHECK_NUMBER (idx, 1);
1489
1490 if ((c = cmpchar_component (XINT (character), XINT (idx))) < 0)
1491 args_out_of_range (character, idx);
1492
1493 return make_number (c);
1494 }
1495
1496 DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1497 2, 2, 0,
1498 "Return the Nth composition rule embedded in composite character CHARACTER.\n\
1499 The returned rule is for composing the Nth component\n\
1500 on the (N-1)th component. If N is 0, the returned value is always 255.")
1501 (character, n)
1502 Lisp_Object character, n;
1503 {
1504 int id, i;
1505
1506 CHECK_NUMBER (character, 0);
1507 CHECK_NUMBER (n, 1);
1508
1509 id = COMPOSITE_CHAR_ID (XINT (character));
1510 if (id < 0 || id >= n_cmpchars)
1511 error ("Invalid composite character: %d", XINT (character));
1512 i = XINT (n);
1513 if (i > cmpchar_table[id]->glyph_len)
1514 args_out_of_range (character, n);
1515
1516 return make_number (cmpchar_table[id]->cmp_rule[i]);
1517 }
1518
1519 DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1520 Scmpchar_cmp_rule_p, 1, 1, 0,
1521 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1522 (character)
1523 Lisp_Object character;
1524 {
1525 int id;
1526
1527 CHECK_NUMBER (character, 0);
1528 id = COMPOSITE_CHAR_ID (XINT (character));
1529 if (id < 0 || id >= n_cmpchars)
1530 error ("Invalid composite character: %d", XINT (character));
1531
1532 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1533 }
1534
1535 DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1536 Scmpchar_cmp_count, 1, 1, 0,
1537 "Return number of compoents of composite character CHARACTER.")
1538 (character)
1539 Lisp_Object character;
1540 {
1541 int id;
1542
1543 CHECK_NUMBER (character, 0);
1544 id = COMPOSITE_CHAR_ID (XINT (character));
1545 if (id < 0 || id >= n_cmpchars)
1546 error ("Invalid composite character: %d", XINT (character));
1547
1548 return (make_number (cmpchar_table[id]->glyph_len));
1549 }
1550
1551 DEFUN ("compose-string", Fcompose_string, Scompose_string,
1552 1, 1, 0,
1553 "Return one char string composed from all characters in STRING.")
1554 (str)
1555 Lisp_Object str;
1556 {
1557 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1558 int len, i;
1559
1560 CHECK_STRING (str, 0);
1561
1562 buf[0] = LEADING_CODE_COMPOSITION;
1563 p = XSTRING (str)->data;
1564 pend = p + STRING_BYTES (XSTRING (str));
1565 i = 1;
1566 while (p < pend)
1567 {
1568 if (*p < 0x20 || *p == 127) /* control code */
1569 error ("Invalid component character: %d", *p);
1570 else if (*p < 0x80) /* ASCII */
1571 {
1572 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1573 error ("Too long string to be composed: %s", XSTRING (str)->data);
1574 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1575 code itself. */
1576 buf[i++] = 0xA0;
1577 buf[i++] = *p++ + 0x80;
1578 }
1579 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1580 {
1581 /* Already composed. Eliminate the heading
1582 LEADING_CODE_COMPOSITION, keep the remaining bytes
1583 unchanged. */
1584 p++;
1585 ptemp = p;
1586 while (! CHAR_HEAD_P (*p)) p++;
1587 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1588 error ("Too long string to be composed: %s", XSTRING (str)->data);
1589 bcopy (ptemp, buf + i, p - ptemp);
1590 i += p - ptemp;
1591 }
1592 else /* multibyte char */
1593 {
1594 /* Add 0x20 to the base leading-code, keep the remaining
1595 bytes unchanged. */
1596 len = BYTES_BY_CHAR_HEAD (*p);
1597 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1598 error ("Too long string to be composed: %s", XSTRING (str)->data);
1599 bcopy (p, buf + i, len);
1600 buf[i] += 0x20;
1601 p += len, i += len;
1602 }
1603 }
1604
1605 if (i < 5)
1606 /* STR contains only one character, which can't be composed. */
1607 error ("Too short string to be composed: %s", XSTRING (str)->data);
1608
1609 return make_string_from_bytes (buf, 1, i);
1610 }
1611
1612 \f
1613 charset_id_internal (charset_name)
1614 char *charset_name;
1615 {
1616 Lisp_Object val = Fget (intern (charset_name), Qcharset);
1617
1618 if (!VECTORP (val))
1619 error ("Charset %s is not defined", charset_name);
1620
1621 return (XINT (XVECTOR (val)->contents[0]));
1622 }
1623
1624 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1625 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1626 ()
1627 {
1628 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1629 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1630 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1631 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1632 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1633 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1634 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1635 return Qnil;
1636 }
1637
1638 init_charset_once ()
1639 {
1640 int i, j, k;
1641
1642 staticpro (&Vcharset_table);
1643 staticpro (&Vcharset_symbol_table);
1644
1645 /* This has to be done here, before we call Fmake_char_table. */
1646 Qcharset_table = intern ("charset-table");
1647 staticpro (&Qcharset_table);
1648
1649 /* Intern this now in case it isn't already done.
1650 Setting this variable twice is harmless.
1651 But don't staticpro it here--that is done in alloc.c. */
1652 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1653
1654 /* Now we are ready to set up this property, so we can
1655 create the charset table. */
1656 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1657 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1658
1659 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1), Qnil);
1660
1661 /* Setup tables. */
1662 for (i = 0; i < 2; i++)
1663 for (j = 0; j < 2; j++)
1664 for (k = 0; k < 128; k++)
1665 iso_charset_table [i][j][k] = -1;
1666
1667 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
1668 cmpchar_table_size = n_cmpchars = 0;
1669
1670 for (i = 0; i < 256; i++)
1671 BYTES_BY_CHAR_HEAD (i) = 1;
1672 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1673 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1674 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1675 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1676 /* The following doesn't reflect the actual bytes, but just to tell
1677 that it is a start of a multibyte character. */
1678 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
1679
1680 for (i = 0; i < 128; i++)
1681 WIDTH_BY_CHAR_HEAD (i) = 1;
1682 for (; i < 256; i++)
1683 WIDTH_BY_CHAR_HEAD (i) = 4;
1684 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1685 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1686 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1687 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1688 }
1689
1690 #ifdef emacs
1691
1692 syms_of_charset ()
1693 {
1694 Qascii = intern ("ascii");
1695 staticpro (&Qascii);
1696
1697 Qcharset = intern ("charset");
1698 staticpro (&Qcharset);
1699
1700 /* Define ASCII charset now. */
1701 update_charset_table (make_number (CHARSET_ASCII),
1702 make_number (1), make_number (94),
1703 make_number (1),
1704 make_number (0),
1705 make_number ('B'),
1706 make_number (0),
1707 build_string ("ASCII"),
1708 build_string ("ASCII"),
1709 build_string ("ASCII (ISO646 IRV)"));
1710 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1711 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1712
1713 Qcomposition = intern ("composition");
1714 staticpro (&Qcomposition);
1715 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
1716
1717 defsubr (&Sdefine_charset);
1718 defsubr (&Sget_unused_iso_final_char);
1719 defsubr (&Sdeclare_equiv_charset);
1720 defsubr (&Sfind_charset_region);
1721 defsubr (&Sfind_charset_string);
1722 defsubr (&Smake_char_internal);
1723 defsubr (&Ssplit_char);
1724 defsubr (&Schar_charset);
1725 defsubr (&Scharset_after);
1726 defsubr (&Siso_charset);
1727 defsubr (&Schar_valid_p);
1728 defsubr (&Sunibyte_char_to_multibyte);
1729 defsubr (&Schar_bytes);
1730 defsubr (&Schar_width);
1731 defsubr (&Sstring_width);
1732 defsubr (&Schar_direction);
1733 defsubr (&Schars_in_region);
1734 defsubr (&Sstring);
1735 defsubr (&Scmpcharp);
1736 defsubr (&Scmpchar_component);
1737 defsubr (&Scmpchar_cmp_rule);
1738 defsubr (&Scmpchar_cmp_rule_p);
1739 defsubr (&Scmpchar_cmp_count);
1740 defsubr (&Scompose_string);
1741 defsubr (&Ssetup_special_charsets);
1742
1743 DEFVAR_LISP ("charset-list", &Vcharset_list,
1744 "List of charsets ever defined.");
1745 Vcharset_list = Fcons (Qascii, Qnil);
1746
1747 DEFVAR_LISP ("character-unification-table-vector",
1748 &Vcharacter_unification_table_vector,
1749 "Vector of cons cell of a symbol and unification table ever defined.\n\
1750 An ID of a unification table is an index of this vector.");
1751 Vcharacter_unification_table_vector = Fmake_vector (make_number (16), Qnil);
1752
1753 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
1754 "Leading-code of composite characters.");
1755 leading_code_composition = LEADING_CODE_COMPOSITION;
1756
1757 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1758 "Leading-code of private TYPE9N charset of column-width 1.");
1759 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1760
1761 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1762 "Leading-code of private TYPE9N charset of column-width 2.");
1763 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1764
1765 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1766 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1767 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1768
1769 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1770 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1771 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1772
1773 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1774 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1775 This is used for converting unibyte text to multibyte,\n\
1776 and for inserting character codes specified by number.\n\n\
1777 Conversion is performed only when multibyte characters are enabled,\n\
1778 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1779 to the corresponding Emacs character code.\n\
1780 If `nonascii-translate-table' is non-nil, it overrides this variable.");
1781 nonascii_insert_offset = 0;
1782
1783 DEFVAR_LISP ("nonascii-translate-table", &Vnonascii_translate_table,
1784 "Translate table for converting non-ASCII unibyte codes to multibyte.\n\
1785 This is used for converting unibyte text to multibyte,\n\
1786 and for inserting character codes specified by number.\n\n\
1787 Conversion is performed only when multibyte characters are enabled,\n\
1788 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1789 to the corresponding Emacs character code.\n\n\
1790 If this is nil, `nonascii-insert-offset' is used instead.");
1791 Vnonascii_translate_table = Qnil;
1792
1793 DEFVAR_INT ("min-composite-char", &min_composite_char,
1794 "Minimum character code of a composite character.");
1795 min_composite_char = MIN_CHAR_COMPOSITION;
1796 }
1797
1798 #endif /* emacs */