Use STRING_BYTES and SET_STRING_BYTES.
[bpt/emacs.git] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #include <stdio.h>
26
27 #ifdef emacs
28
29 #include <sys/types.h>
30 #include <config.h>
31 #include "lisp.h"
32 #include "buffer.h"
33 #include "charset.h"
34 #include "coding.h"
35 #include "disptab.h"
36
37 #else /* not emacs */
38
39 #include "mulelib.h"
40
41 #endif /* emacs */
42
43 Lisp_Object Qcharset, Qascii, Qcomposition;
44
45 /* Declaration of special leading-codes. */
46 int leading_code_composition; /* for composite characters */
47 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
48 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
49 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
50 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
51
52 /* Declaration of special charsets. */
53 int charset_ascii; /* ASCII */
54 int charset_composition; /* for a composite character */
55 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
56 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
57 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
58 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
59 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
60 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
61 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
62
63 int min_composite_char;
64
65 Lisp_Object Qcharset_table;
66
67 /* A char-table containing information of each character set. */
68 Lisp_Object Vcharset_table;
69
70 /* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72 Lisp_Object Vcharset_symbol_table;
73
74 /* A list of charset symbols ever defined. */
75 Lisp_Object Vcharset_list;
76
77 /* Vector of unification table ever defined.
78 An ID of a unification table is an index of this vector. */
79 Lisp_Object Vcharacter_unification_table_vector;
80
81 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
82 int bytes_by_char_head[256];
83 int width_by_char_head[256];
84
85 /* Mapping table from ISO2022's charset (specified by DIMENSION,
86 CHARS, and FINAL-CHAR) to Emacs' charset. */
87 int iso_charset_table[2][2][128];
88
89 /* Table of pointers to the structure `cmpchar_info' indexed by
90 CMPCHAR-ID. */
91 struct cmpchar_info **cmpchar_table;
92 /* The current size of `cmpchar_table'. */
93 static int cmpchar_table_size;
94 /* Number of the current composite characters. */
95 int n_cmpchars;
96
97 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
98 unsigned char *_fetch_multibyte_char_p;
99 int _fetch_multibyte_char_len;
100
101 /* Offset to add to a non-ASCII value when inserting it. */
102 int nonascii_insert_offset;
103
104 /* Translation table for converting non-ASCII unibyte characters
105 to multibyte codes, or nil. */
106 Lisp_Object Vnonascii_translate_table;
107
108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
110 \f
111 void
112 invalid_character (c)
113 int c;
114 {
115 error ("Invalid character: %o, %d, 0x%x", c);
116 }
117
118
119 /* Set STR a pointer to the multi-byte form of the character C. If C
120 is not a composite character, the multi-byte form is set in WORKBUF
121 and STR points WORKBUF. The caller should allocate at least 4-byte
122 area at WORKBUF in advance. Returns the length of the multi-byte
123 form. If C is an invalid character to have a multi-byte form,
124 signal an error.
125
126 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
127 function directly if C can be an ASCII character. */
128
129 int
130 non_ascii_char_to_string (c, workbuf, str)
131 int c;
132 unsigned char *workbuf, **str;
133 {
134 int charset, c1, c2;
135
136 if (COMPOSITE_CHAR_P (c))
137 {
138 int cmpchar_id = COMPOSITE_CHAR_ID (c);
139
140 if (cmpchar_id < n_cmpchars)
141 {
142 *str = cmpchar_table[cmpchar_id]->data;
143 return cmpchar_table[cmpchar_id]->len;
144 }
145 else
146 {
147 invalid_character (c);
148 }
149 }
150
151 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
152 if (!charset
153 || ! CHARSET_DEFINED_P (charset)
154 || c1 >= 0 && c1 < 32
155 || c2 >= 0 && c2 < 32)
156 invalid_character (c);
157
158 *str = workbuf;
159 *workbuf++ = CHARSET_LEADING_CODE_BASE (charset);
160 if (*workbuf = CHARSET_LEADING_CODE_EXT (charset))
161 workbuf++;
162 *workbuf++ = c1 | 0x80;
163 if (c2 >= 0)
164 *workbuf++ = c2 | 0x80;
165
166 return (workbuf - *str);
167 }
168
169 /* Return a non-ASCII character of which multi-byte form is at STR of
170 length LEN. If ACTUAL_LEN is not NULL, the actual length of the
171 character is set to the address ACTUAL_LEN.
172
173 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
174 directly if STR can hold an ASCII character. */
175
176 string_to_non_ascii_char (str, len, actual_len)
177 unsigned char *str;
178 int len, *actual_len;
179 {
180 int charset;
181 unsigned char c1, c2;
182 register int c;
183
184 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
185 {
186 if (actual_len)
187 *actual_len = 1;
188 return (int) *str;
189 }
190
191 c = MAKE_NON_ASCII_CHAR (charset, c1, c2);
192
193 if (actual_len)
194 *actual_len = (charset == CHARSET_COMPOSITION
195 ? cmpchar_table[COMPOSITE_CHAR_ID (c)]->len
196 : BYTES_BY_CHAR_HEAD (*str));
197 return c;
198 }
199
200 /* Return the length of the multi-byte form at string STR of length LEN. */
201 int
202 multibyte_form_length (str, len)
203 unsigned char *str;
204 int len;
205 {
206 int charset;
207 unsigned char c1, c2;
208 register int c;
209
210 if (SPLIT_STRING (str, len, charset, c1, c2) == CHARSET_ASCII)
211 return 1;
212
213 return (charset == CHARSET_COMPOSITION
214 ? cmpchar_table[(c1 << 7) | c2]->len
215 : BYTES_BY_CHAR_HEAD (*str));
216 }
217
218 /* Check if string STR of length LEN contains valid multi-byte form of
219 a character. If valid, charset and position codes of the character
220 is set at *CHARSET, *C1, and *C2, and return 0. If not valid,
221 return -1. This should be used only in the macro SPLIT_STRING
222 which checks range of STR in advance. */
223
224 split_non_ascii_string (str, len, charset, c1, c2)
225 register unsigned char *str, *c1, *c2;
226 register int len, *charset;
227 {
228 register unsigned int cs = *str++;
229
230 if (cs == LEADING_CODE_COMPOSITION)
231 {
232 int cmpchar_id = str_cmpchar_id (str - 1, len);
233
234 if (cmpchar_id < 0)
235 return -1;
236 *charset = cs, *c1 = cmpchar_id >> 7, *c2 = cmpchar_id & 0x7F;
237 }
238 else if ((cs < LEADING_CODE_PRIVATE_11 || (cs = *str++) >= 0xA0)
239 && CHARSET_DEFINED_P (cs))
240 {
241 *charset = cs;
242 if (*str < 0xA0)
243 return -1;
244 *c1 = (*str++) & 0x7F;
245 if (CHARSET_DIMENSION (cs) == 2)
246 {
247 if (*str < 0xA0)
248 return -1;
249 *c2 = (*str++) & 0x7F;
250 }
251 }
252 else
253 return -1;
254 return 0;
255 }
256
257 /* Return a character unified with C (or a character made of CHARSET,
258 C1, and C2) in unification table TABLE. If no unification is found
259 in TABLE, return C. */
260 unify_char (table, c, charset, c1, c2)
261 Lisp_Object table;
262 int c, charset, c1, c2;
263 {
264 Lisp_Object ch;
265 int alt_charset, alt_c1, alt_c2, dimension;
266
267 if (c < 0) c = MAKE_CHAR (charset, c1, c2);
268 if (!CHAR_TABLE_P (table)
269 || (ch = Faref (table, make_number (c)), !INTEGERP (ch))
270 || XINT (ch) < 0)
271 return c;
272
273 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
274 dimension = CHARSET_DIMENSION (alt_charset);
275 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
276 /* CH is not a generic character, just return it. */
277 return XFASTINT (ch);
278
279 /* Since CH is a generic character, we must return a specific
280 charater which has the same position codes as C from CH. */
281 if (charset < 0)
282 SPLIT_CHAR (c, charset, c1, c2);
283 if (dimension != CHARSET_DIMENSION (charset))
284 /* We can't make such a character because of dimension mismatch. */
285 return c;
286 return MAKE_CHAR (alt_charset, c1, c2);
287 }
288
289 /* Convert the unibyte character C to multibyte based on
290 Vnonascii_translate_table or nonascii_insert_offset. If they can't
291 convert C to a valid multibyte character, convert it based on
292 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
293
294 unibyte_char_to_multibyte (c)
295 int c;
296 {
297 if (c >= 0240 && c < 0400)
298 {
299 int c_save = c;
300
301 if (! NILP (Vnonascii_translate_table))
302 c = XINT (Faref (Vnonascii_translate_table, make_number (c)));
303 else if (nonascii_insert_offset > 0)
304 c += nonascii_insert_offset;
305 if (c >= 0240 && (c < 0400 || ! VALID_MULTIBYTE_CHAR_P (c)))
306 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
307 }
308 return c;
309 }
310 \f
311 /* Update the table Vcharset_table with the given arguments (see the
312 document of `define-charset' for the meaning of each argument).
313 Several other table contents are also updated. The caller should
314 check the validity of CHARSET-ID and the remaining arguments in
315 advance. */
316
317 void
318 update_charset_table (charset_id, dimension, chars, width, direction,
319 iso_final_char, iso_graphic_plane,
320 short_name, long_name, description)
321 Lisp_Object charset_id, dimension, chars, width, direction;
322 Lisp_Object iso_final_char, iso_graphic_plane;
323 Lisp_Object short_name, long_name, description;
324 {
325 int charset = XINT (charset_id);
326 int bytes;
327 unsigned char leading_code_base, leading_code_ext;
328
329 if (NILP (CHARSET_TABLE_ENTRY (charset)))
330 CHARSET_TABLE_ENTRY (charset)
331 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
332
333 /* Get byte length of multibyte form, base leading-code, and
334 extended leading-code of the charset. See the comment under the
335 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
336 bytes = XINT (dimension);
337 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
338 {
339 /* Official charset, it doesn't have an extended leading-code. */
340 if (charset != CHARSET_ASCII)
341 bytes += 1; /* For a base leading-code. */
342 leading_code_base = charset;
343 leading_code_ext = 0;
344 }
345 else
346 {
347 /* Private charset. */
348 bytes += 2; /* For base and extended leading-codes. */
349 leading_code_base
350 = (charset < LEADING_CODE_EXT_12
351 ? LEADING_CODE_PRIVATE_11
352 : (charset < LEADING_CODE_EXT_21
353 ? LEADING_CODE_PRIVATE_12
354 : (charset < LEADING_CODE_EXT_22
355 ? LEADING_CODE_PRIVATE_21
356 : LEADING_CODE_PRIVATE_22)));
357 leading_code_ext = charset;
358 }
359
360 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
361 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
362 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
363 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
364 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
365 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
366 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
367 = make_number (leading_code_base);
368 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
369 = make_number (leading_code_ext);
370 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
371 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
372 = iso_graphic_plane;
373 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
374 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
375 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
376 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
377
378 {
379 /* If we have already defined a charset which has the same
380 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
381 DIRECTION, we must update the entry REVERSE-CHARSET of both
382 charsets. If there's no such charset, the value of the entry
383 is set to nil. */
384 int i;
385
386 for (i = 0; i <= MAX_CHARSET; i++)
387 if (!NILP (CHARSET_TABLE_ENTRY (i)))
388 {
389 if (CHARSET_DIMENSION (i) == XINT (dimension)
390 && CHARSET_CHARS (i) == XINT (chars)
391 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
392 && CHARSET_DIRECTION (i) != XINT (direction))
393 {
394 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
395 = make_number (i);
396 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
397 break;
398 }
399 }
400 if (i > MAX_CHARSET)
401 /* No such a charset. */
402 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
403 = make_number (-1);
404 }
405
406 if (charset != CHARSET_ASCII
407 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
408 {
409 /* Update tables bytes_by_char_head and width_by_char_head. */
410 bytes_by_char_head[leading_code_base] = bytes;
411 width_by_char_head[leading_code_base] = XINT (width);
412
413 /* Update table emacs_code_class. */
414 emacs_code_class[charset] = (bytes == 2
415 ? EMACS_leading_code_2
416 : (bytes == 3
417 ? EMACS_leading_code_3
418 : EMACS_leading_code_4));
419 }
420
421 /* Update table iso_charset_table. */
422 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
423 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
424 }
425
426 #ifdef emacs
427
428 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
429 is invalid. */
430 int
431 get_charset_id (charset_symbol)
432 Lisp_Object charset_symbol;
433 {
434 Lisp_Object val;
435 int charset;
436
437 return ((SYMBOLP (charset_symbol)
438 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
439 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
440 CHARSET_VALID_P (charset)))
441 ? charset : -1);
442 }
443
444 /* Return an identification number for a new private charset of
445 DIMENSION and WIDTH. If there's no more room for the new charset,
446 return 0. */
447 Lisp_Object
448 get_new_private_charset_id (dimension, width)
449 int dimension, width;
450 {
451 int charset, from, to;
452
453 if (dimension == 1)
454 {
455 if (width == 1)
456 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
457 else
458 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
459 }
460 else
461 {
462 if (width == 1)
463 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
464 else
465 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
466 }
467
468 for (charset = from; charset < to; charset++)
469 if (!CHARSET_DEFINED_P (charset)) break;
470
471 return make_number (charset < to ? charset : 0);
472 }
473
474 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
475 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
476 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
477 treated as a private charset.\n\
478 INFO-VECTOR is a vector of the format:\n\
479 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
480 SHORT-NAME LONG-NAME DESCRIPTION]\n\
481 The meanings of each elements is as follows:\n\
482 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
483 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
484 WIDTH (integer) is the number of columns a character in the charset\n\
485 occupies on the screen: one of 0, 1, and 2.\n\
486 \n\
487 DIRECTION (integer) is the rendering direction of characters in the\n\
488 charset when rendering. If 0, render from right to left, else\n\
489 render from left to right.\n\
490 \n\
491 ISO-FINAL-CHAR (character) is the final character of the\n\
492 corresponding ISO 2022 charset.\n\
493 \n\
494 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
495 while encoding to variants of ISO 2022 coding system, one of the\n\
496 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
497 \n\
498 SHORT-NAME (string) is the short name to refer to the charset.\n\
499 \n\
500 LONG-NAME (string) is the long name to refer to the charset.\n\
501 \n\
502 DESCRIPTION (string) is the description string of the charset.")
503 (charset_id, charset_symbol, info_vector)
504 Lisp_Object charset_id, charset_symbol, info_vector;
505 {
506 Lisp_Object *vec;
507
508 if (!NILP (charset_id))
509 CHECK_NUMBER (charset_id, 0);
510 CHECK_SYMBOL (charset_symbol, 1);
511 CHECK_VECTOR (info_vector, 2);
512
513 if (! NILP (charset_id))
514 {
515 if (! CHARSET_VALID_P (XINT (charset_id)))
516 error ("Invalid CHARSET: %d", XINT (charset_id));
517 else if (CHARSET_DEFINED_P (XINT (charset_id)))
518 error ("Already defined charset: %d", XINT (charset_id));
519 }
520
521 vec = XVECTOR (info_vector)->contents;
522 if (XVECTOR (info_vector)->size != 9
523 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
524 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
525 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
526 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
527 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
528 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
529 || !STRINGP (vec[6])
530 || !STRINGP (vec[7])
531 || !STRINGP (vec[8]))
532 error ("Invalid info-vector argument for defining charset %s",
533 XSYMBOL (charset_symbol)->name->data);
534
535 if (NILP (charset_id))
536 {
537 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
538 if (XINT (charset_id) == 0)
539 error ("There's no room for a new private charset %s",
540 XSYMBOL (charset_symbol)->name->data);
541 }
542
543 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
544 vec[4], vec[5], vec[6], vec[7], vec[8]);
545 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
546 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
547 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
548 return Qnil;
549 }
550
551 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
552 Sget_unused_iso_final_char, 2, 2, 0,
553 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
554 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
555 CHARS is the number of characters in a dimension: 94 or 96.\n\
556 \n\
557 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
558 If there's no unused final char for the specified kind of charset,\n\
559 return nil.")
560 (dimension, chars)
561 Lisp_Object dimension, chars;
562 {
563 int final_char;
564
565 CHECK_NUMBER (dimension, 0);
566 CHECK_NUMBER (chars, 1);
567 if (XINT (dimension) != 1 && XINT (dimension) != 2)
568 error ("Invalid charset dimension %d, it should be 1 or 2",
569 XINT (dimension));
570 if (XINT (chars) != 94 && XINT (chars) != 96)
571 error ("Invalid charset chars %d, it should be 94 or 96",
572 XINT (chars));
573 for (final_char = '0'; final_char <= '?'; final_char++)
574 {
575 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
576 break;
577 }
578 return (final_char <= '?' ? make_number (final_char) : Qnil);
579 }
580
581 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
582 4, 4, 0,
583 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
584 CHARSET should be defined by `defined-charset' in advance.")
585 (dimension, chars, final_char, charset_symbol)
586 Lisp_Object dimension, chars, final_char, charset_symbol;
587 {
588 int charset;
589
590 CHECK_NUMBER (dimension, 0);
591 CHECK_NUMBER (chars, 1);
592 CHECK_NUMBER (final_char, 2);
593 CHECK_SYMBOL (charset_symbol, 3);
594
595 if (XINT (dimension) != 1 && XINT (dimension) != 2)
596 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
597 if (XINT (chars) != 94 && XINT (chars) != 96)
598 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
599 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
600 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
601 if ((charset = get_charset_id (charset_symbol)) < 0)
602 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
603
604 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
605 return Qnil;
606 }
607
608 /* Return number of different charsets in STR of length LEN. In
609 addition, for each found charset N, CHARSETS[N] is set 1. The
610 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
611 It may lookup a unification table TABLE if supplied. */
612
613 int
614 find_charset_in_str (str, len, charsets, table)
615 unsigned char *str;
616 int len, *charsets;
617 Lisp_Object table;
618 {
619 register int num = 0, c;
620
621 if (! CHAR_TABLE_P (table))
622 table = Qnil;
623
624 while (len > 0)
625 {
626 int bytes, charset;
627 c = *str;
628
629 if (c == LEADING_CODE_COMPOSITION)
630 {
631 int cmpchar_id = str_cmpchar_id (str, len);
632 GLYPH *glyph;
633
634 if (cmpchar_id > 0)
635 {
636 struct cmpchar_info *cmpcharp = cmpchar_table[cmpchar_id];
637 int i;
638
639 for (i = 0; i < cmpcharp->glyph_len; i++)
640 {
641 c = cmpcharp->glyph[i];
642 if (!NILP (table))
643 {
644 if ((c = unify_char (table, c, 0, 0, 0)) < 0)
645 c = cmpcharp->glyph[i];
646 }
647 if ((charset = CHAR_CHARSET (c)) < 0)
648 charset = CHARSET_ASCII;
649 if (!charsets[charset])
650 {
651 charsets[charset] = 1;
652 num += 1;
653 }
654 }
655 str += cmpcharp->len;
656 len -= cmpcharp->len;
657 continue;
658 }
659
660 charset = CHARSET_ASCII;
661 bytes = 1;
662 }
663 else
664 {
665 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
666 if (! NILP (table))
667 {
668 int c1 = unify_char (table, c, 0, 0, 0);
669 if (c1 >= 0)
670 c = c1;
671 }
672 charset = CHAR_CHARSET (c);
673 }
674
675 if (!charsets[charset])
676 {
677 charsets[charset] = 1;
678 num += 1;
679 }
680 str += bytes;
681 len -= bytes;
682 }
683 return num;
684 }
685
686 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
687 2, 3, 0,
688 "Return a list of charsets in the region between BEG and END.\n\
689 BEG and END are buffer positions.\n\
690 Optional arg TABLE if non-nil is a unification table to look up.")
691 (beg, end, table)
692 Lisp_Object beg, end, table;
693 {
694 int charsets[MAX_CHARSET + 1];
695 int from, from_byte, to, stop, stop_byte, i;
696 Lisp_Object val;
697
698 validate_region (&beg, &end);
699 from = XFASTINT (beg);
700 stop = to = XFASTINT (end);
701
702 if (from < GPT && GPT < to)
703 {
704 stop = GPT;
705 stop_byte = GPT_BYTE;
706 }
707 else
708 stop_byte = CHAR_TO_BYTE (stop);
709
710 from_byte = CHAR_TO_BYTE (from);
711
712 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
713 while (1)
714 {
715 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
716 charsets, table);
717 if (stop < to)
718 {
719 from = stop, from_byte = stop_byte;
720 stop = to, stop_byte = CHAR_TO_BYTE (stop);
721 }
722 else
723 break;
724 }
725
726 val = Qnil;
727 for (i = MAX_CHARSET; i >= 0; i--)
728 if (charsets[i])
729 val = Fcons (CHARSET_SYMBOL (i), val);
730 return val;
731 }
732
733 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
734 1, 2, 0,
735 "Return a list of charsets in STR.\n\
736 Optional arg TABLE if non-nil is a unification table to look up.")
737 (str, table)
738 Lisp_Object str, table;
739 {
740 int charsets[MAX_CHARSET + 1];
741 int i;
742 Lisp_Object val;
743
744 CHECK_STRING (str, 0);
745
746 if (! STRING_MULTIBYTE (str))
747 return Qnil;
748
749 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
750 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
751 charsets, table);
752 val = Qnil;
753 for (i = MAX_CHARSET; i >= 0; i--)
754 if (charsets[i])
755 val = Fcons (CHARSET_SYMBOL (i), val);
756 return val;
757 }
758 \f
759 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
760 "")
761 (charset, code1, code2)
762 Lisp_Object charset, code1, code2;
763 {
764 CHECK_NUMBER (charset, 0);
765
766 if (NILP (code1))
767 XSETFASTINT (code1, 0);
768 else
769 CHECK_NUMBER (code1, 1);
770 if (NILP (code2))
771 XSETFASTINT (code2, 0);
772 else
773 CHECK_NUMBER (code2, 2);
774
775 if (!CHARSET_DEFINED_P (XINT (charset)))
776 error ("Invalid charset: %d", XINT (charset));
777
778 return make_number (MAKE_CHAR (XINT (charset), XINT (code1), XINT (code2)));
779 }
780
781 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
782 "Return list of charset and one or two position-codes of CHAR.")
783 (ch)
784 Lisp_Object ch;
785 {
786 Lisp_Object val;
787 int charset, c1, c2;
788
789 CHECK_NUMBER (ch, 0);
790 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
791 return (c2 >= 0
792 ? Fcons (CHARSET_SYMBOL (charset),
793 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
794 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
795 }
796
797 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
798 "Return charset of CHAR.")
799 (ch)
800 Lisp_Object ch;
801 {
802 CHECK_NUMBER (ch, 0);
803
804 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
805 }
806
807 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
808 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
809 \n\
810 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
811 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
812 where as Emacs distinguishes them by charset symbol.\n\
813 See the documentation of the function `charset-info' for the meanings of\n\
814 DIMENSION, CHARS, and FINAL-CHAR.")
815 (dimension, chars, final_char)
816 Lisp_Object dimension, chars, final_char;
817 {
818 int charset;
819
820 CHECK_NUMBER (dimension, 0);
821 CHECK_NUMBER (chars, 1);
822 CHECK_NUMBER (final_char, 2);
823
824 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
825 return Qnil;
826 return CHARSET_SYMBOL (charset);
827 }
828
829 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
830 generic character. If GENERICP is zero, return nonzero iff C is a
831 valid normal character. Do not call this function directly,
832 instead use macro CHAR_VALID_P. */
833 int
834 char_valid_p (c, genericp)
835 int c, genericp;
836 {
837 int charset, c1, c2;
838
839 if (c < 0)
840 return 0;
841 if (SINGLE_BYTE_CHAR_P (c))
842 return 1;
843 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
844 if (!CHARSET_VALID_P (charset))
845 return 0;
846 return (c < MIN_CHAR_COMPOSITION
847 ? ((c & CHAR_FIELD1_MASK) /* i.e. dimension of C is two. */
848 ? (genericp && c1 == 0 && c2 == 0
849 || c1 >= 32 && c2 >= 32)
850 : (genericp && c1 == 0
851 || c1 >= 32))
852 : c < MIN_CHAR_COMPOSITION + n_cmpchars);
853 }
854
855 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
856 "Return t if OBJECT is a valid normal character.\n\
857 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
858 a valid generic character.")
859 (object, genericp)
860 Lisp_Object object, genericp;
861 {
862 if (! NATNUMP (object))
863 return Qnil;
864 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
865 }
866
867 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
868 Sunibyte_char_to_multibyte, 1, 1, 0,
869 "Convert the unibyte character CH to multibyte character.\n\
870 The conversion is done based on nonascii-translate-table (which see)\n\
871 or nonascii-insert-offset (which see).")
872 (ch)
873 Lisp_Object ch;
874 {
875 int c;
876
877 CHECK_NUMBER (ch, 0);
878 c = XINT (ch);
879 if (c < 0 || c >= 0400)
880 error ("Invalid unibyte character: %d", c);
881 c = unibyte_char_to_multibyte (c);
882 if (c < 0)
883 error ("Can't convert to multibyte character: %d", XINT (ch));
884 return make_number (c);
885 }
886
887 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
888 "Return byte length of multi-byte form of CHAR.")
889 (ch)
890 Lisp_Object ch;
891 {
892 Lisp_Object val;
893 int bytes;
894
895 CHECK_NUMBER (ch, 0);
896 if (COMPOSITE_CHAR_P (XFASTINT (ch)))
897 {
898 unsigned int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
899
900 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
901 }
902 else
903 {
904 int charset = CHAR_CHARSET (XFASTINT (ch));
905
906 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
907 }
908
909 XSETFASTINT (val, bytes);
910 return val;
911 }
912
913 /* Return the width of character of which multi-byte form starts with
914 C. The width is measured by how many columns occupied on the
915 screen when displayed in the current buffer. */
916
917 #define ONE_BYTE_CHAR_WIDTH(c) \
918 (c < 0x20 \
919 ? (c == '\t' \
920 ? XFASTINT (current_buffer->tab_width) \
921 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
922 : (c < 0x7f \
923 ? 1 \
924 : (c == 0x7F \
925 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
926 : ((! NILP (current_buffer->enable_multibyte_characters) \
927 && BASE_LEADING_CODE_P (c)) \
928 ? WIDTH_BY_CHAR_HEAD (c) \
929 : 4)))) \
930
931
932 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
933 "Return width of CHAR when displayed in the current buffer.\n\
934 The width is measured by how many columns it occupies on the screen.")
935 (ch)
936 Lisp_Object ch;
937 {
938 Lisp_Object val, disp;
939 int c;
940 struct Lisp_Char_Table *dp = buffer_display_table ();
941
942 CHECK_NUMBER (ch, 0);
943
944 c = XINT (ch);
945
946 /* Get the way the display table would display it. */
947 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
948
949 if (VECTORP (disp))
950 XSETINT (val, XVECTOR (disp)->size);
951 else if (SINGLE_BYTE_CHAR_P (c))
952 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
953 else if (COMPOSITE_CHAR_P (c))
954 {
955 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
956 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 0));
957 }
958 else
959 {
960 int charset = CHAR_CHARSET (c);
961
962 XSETFASTINT (val, CHARSET_WIDTH (charset));
963 }
964 return val;
965 }
966
967 /* Return width of string STR of length LEN when displayed in the
968 current buffer. The width is measured by how many columns it
969 occupies on the screen. */
970
971 int
972 strwidth (str, len)
973 unsigned char *str;
974 int len;
975 {
976 unsigned char *endp = str + len;
977 int width = 0;
978 struct Lisp_Char_Table *dp = buffer_display_table ();
979
980 while (str < endp)
981 {
982 if (*str == LEADING_CODE_COMPOSITION)
983 {
984 int id = str_cmpchar_id (str, endp - str);
985
986 if (id < 0)
987 {
988 width += 4;
989 str++;
990 }
991 else
992 {
993 width += cmpchar_table[id]->width;
994 str += cmpchar_table[id]->len;
995 }
996 }
997 else
998 {
999 Lisp_Object disp;
1000 int thislen;
1001 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1002
1003 /* Get the way the display table would display it. */
1004 if (dp)
1005 disp = DISP_CHAR_VECTOR (dp, c);
1006 else
1007 disp = Qnil;
1008
1009 if (VECTORP (disp))
1010 width += XVECTOR (disp)->size;
1011 else
1012 width += ONE_BYTE_CHAR_WIDTH (*str);
1013
1014 str += thislen;
1015 }
1016 }
1017 return width;
1018 }
1019
1020 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1021 "Return width of STRING when displayed in the current buffer.\n\
1022 Width is measured by how many columns it occupies on the screen.\n\
1023 When calculating width of a multibyte character in STRING,\n\
1024 only the base leading-code is considered; the validity of\n\
1025 the following bytes is not checked.")
1026 (str)
1027 Lisp_Object str;
1028 {
1029 Lisp_Object val;
1030
1031 CHECK_STRING (str, 0);
1032 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1033 STRING_BYTES (XSTRING (str))));
1034 return val;
1035 }
1036
1037 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1038 "Return the direction of CHAR.\n\
1039 The returned value is 0 for left-to-right and 1 for right-to-left.")
1040 (ch)
1041 Lisp_Object ch;
1042 {
1043 int charset;
1044
1045 CHECK_NUMBER (ch, 0);
1046 charset = CHAR_CHARSET (XFASTINT (ch));
1047 if (!CHARSET_DEFINED_P (charset))
1048 invalid_character (XINT (ch));
1049 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1050 }
1051
1052 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1053 "Return number of characters between BEG and END.")
1054 (beg, end)
1055 Lisp_Object beg, end;
1056 {
1057 int from, to;
1058
1059 from = min (XFASTINT (beg), XFASTINT (end));
1060 to = max (XFASTINT (beg), XFASTINT (end));
1061
1062 return to - from;
1063 }
1064
1065 /* Return the number of characters in the NBYTES bytes at PTR.
1066 This works by looking at the contents and checking for multibyte sequences.
1067 However, if the current buffer has enable-multibyte-characters = nil,
1068 we treat each byte as a character. */
1069
1070 int
1071 chars_in_text (ptr, nbytes)
1072 unsigned char *ptr;
1073 int nbytes;
1074 {
1075 unsigned char *endp, c;
1076 int chars;
1077
1078 /* current_buffer is null at early stages of Emacs initialization. */
1079 if (current_buffer == 0
1080 || NILP (current_buffer->enable_multibyte_characters))
1081 return nbytes;
1082
1083 endp = ptr + nbytes;
1084 chars = 0;
1085
1086 while (ptr < endp)
1087 {
1088 c = *ptr++;
1089
1090 if (BASE_LEADING_CODE_P (c))
1091 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
1092 chars++;
1093 }
1094
1095 return chars;
1096 }
1097
1098 /* Return the number of characters in the NBYTES bytes at PTR.
1099 This works by looking at the contents and checking for multibyte sequences.
1100 It ignores enable-multibyte-characters. */
1101
1102 int
1103 multibyte_chars_in_text (ptr, nbytes)
1104 unsigned char *ptr;
1105 int nbytes;
1106 {
1107 unsigned char *endp, c;
1108 int chars;
1109
1110 endp = ptr + nbytes;
1111 chars = 0;
1112
1113 while (ptr < endp)
1114 {
1115 c = *ptr++;
1116
1117 if (BASE_LEADING_CODE_P (c))
1118 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
1119 chars++;
1120 }
1121
1122 return chars;
1123 }
1124
1125 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1126 "Concatenate all the argument characters and make the result a string.")
1127 (n, args)
1128 int n;
1129 Lisp_Object *args;
1130 {
1131 int i;
1132 unsigned char *buf
1133 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
1134 unsigned char *p = buf;
1135 Lisp_Object val;
1136
1137 for (i = 0; i < n; i++)
1138 {
1139 int c, len;
1140 unsigned char *str;
1141
1142 if (!INTEGERP (args[i]))
1143 CHECK_NUMBER (args[i], 0);
1144 c = XINT (args[i]);
1145 len = CHAR_STRING (c, p, str);
1146 if (p != str)
1147 /* C is a composite character. */
1148 bcopy (str, p, len);
1149 p += len;
1150 }
1151
1152 val = make_multibyte_string (buf, n, p - buf);
1153 return val;
1154 }
1155
1156 #endif /* emacs */
1157 \f
1158 /*** Composite characters staffs ***/
1159
1160 /* Each composite character is identified by CMPCHAR-ID which is
1161 assigned when Emacs needs the character code of the composite
1162 character (e.g. when displaying it on the screen). See the
1163 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1164 composite character is represented in Emacs. */
1165
1166 /* If `static' is defined, it means that it is defined to null string. */
1167 #ifndef static
1168 /* The following function is copied from lread.c. */
1169 static int
1170 hash_string (ptr, len)
1171 unsigned char *ptr;
1172 int len;
1173 {
1174 register unsigned char *p = ptr;
1175 register unsigned char *end = p + len;
1176 register unsigned char c;
1177 register int hash = 0;
1178
1179 while (p != end)
1180 {
1181 c = *p++;
1182 if (c >= 0140) c -= 40;
1183 hash = ((hash<<3) + (hash>>28) + c);
1184 }
1185 return hash & 07777777777;
1186 }
1187 #endif
1188
1189 #define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1190
1191 static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1192
1193 /* Each element of `cmpchar_hash_table' is a pointer to an array of
1194 integer, where the 1st element is the size of the array, the 2nd
1195 element is how many elements are actually used in the array, and
1196 the remaining elements are CMPCHAR-IDs of composite characters of
1197 the same hash value. */
1198 #define CMPCHAR_HASH_SIZE(table) table[0]
1199 #define CMPCHAR_HASH_USED(table) table[1]
1200 #define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1201
1202 /* Return CMPCHAR-ID of the composite character in STR of the length
1203 LEN. If the composite character has not yet been registered,
1204 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1205 is the sole function for assigning CMPCHAR-ID. */
1206 int
1207 str_cmpchar_id (str, len)
1208 unsigned char *str;
1209 int len;
1210 {
1211 int hash_idx, *hashp;
1212 unsigned char *buf;
1213 int embedded_rule; /* 1 if composition rule is embedded. */
1214 int chars; /* number of components. */
1215 int i;
1216 struct cmpchar_info *cmpcharp;
1217
1218 /* The second byte 0xFF means compostion rule is embedded. */
1219 embedded_rule = (str[1] == 0xFF);
1220
1221 /* At first, get the actual length of the composite character. */
1222 {
1223 unsigned char *p, *endp = str + 1, *lastp = str + len;
1224 int bytes;
1225
1226 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
1227 if (endp - str < 5)
1228 /* Any composite char have at least 5-byte length. */
1229 return -1;
1230
1231 chars = 0;
1232 p = str + 1;
1233 while (p < endp)
1234 {
1235 if (embedded_rule) p++;
1236 /* No need of checking if *P is 0xA0 because
1237 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1238 p += BYTES_BY_CHAR_HEAD (*p - 0x20);
1239 chars++;
1240 }
1241 if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1242 /* Invalid components. */
1243 return -1;
1244 len = p - str;
1245 }
1246 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1247 hashp = cmpchar_hash_table[hash_idx];
1248
1249 /* Then, look into the hash table. */
1250 if (hashp != NULL)
1251 /* Find the correct one among composite characters of the same
1252 hash value. */
1253 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1254 {
1255 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1256 if (len == cmpcharp->len
1257 && ! bcmp (str, cmpcharp->data, len))
1258 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1259 }
1260
1261 /* We have to register the composite character in cmpchar_table. */
1262 if (n_cmpchars > (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
1263 /* No, we have no more room for a new composite character. */
1264 return -1;
1265
1266 /* Make the entry in hash table. */
1267 if (hashp == NULL)
1268 {
1269 /* Make a table for 8 composite characters initially. */
1270 hashp = (cmpchar_hash_table[hash_idx]
1271 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1272 CMPCHAR_HASH_SIZE (hashp) = 10;
1273 CMPCHAR_HASH_USED (hashp) = 2;
1274 }
1275 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1276 {
1277 CMPCHAR_HASH_SIZE (hashp) += 8;
1278 hashp = (cmpchar_hash_table[hash_idx]
1279 = (int *) xrealloc (hashp,
1280 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1281 }
1282 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1283 CMPCHAR_HASH_USED (hashp)++;
1284
1285 /* Set information of the composite character in cmpchar_table. */
1286 if (cmpchar_table_size == 0)
1287 {
1288 /* This is the first composite character to be registered. */
1289 cmpchar_table_size = 256;
1290 cmpchar_table
1291 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1292 * cmpchar_table_size);
1293 }
1294 else if (cmpchar_table_size <= n_cmpchars)
1295 {
1296 cmpchar_table_size += 256;
1297 cmpchar_table
1298 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1299 sizeof (cmpchar_table[0])
1300 * cmpchar_table_size);
1301 }
1302
1303 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1304
1305 cmpcharp->len = len;
1306 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1307 bcopy (str, cmpcharp->data, len);
1308 cmpcharp->data[len] = 0;
1309 cmpcharp->glyph_len = chars;
1310 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1311 if (embedded_rule)
1312 {
1313 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1314 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1315 }
1316 else
1317 {
1318 cmpcharp->cmp_rule = NULL;
1319 cmpcharp->col_offset = NULL;
1320 }
1321
1322 /* Setup GLYPH data and composition rules (if any) so as not to make
1323 them every time on displaying. */
1324 {
1325 unsigned char *bufp;
1326 int width;
1327 float leftmost = 0.0, rightmost = 1.0;
1328
1329 if (embedded_rule)
1330 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1331 cmpcharp->col_offset[0] = 0;
1332
1333 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1334 {
1335 if (embedded_rule)
1336 cmpcharp->cmp_rule[i] = *bufp++;
1337
1338 if (*bufp == 0xA0) /* This is an ASCII character. */
1339 {
1340 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1341 width = 1;
1342 bufp++;
1343 }
1344 else /* Multibyte character. */
1345 {
1346 /* Make `bufp' point normal multi-byte form temporally. */
1347 *bufp -= 0x20;
1348 cmpcharp->glyph[i]
1349 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1350 width = WIDTH_BY_CHAR_HEAD (*bufp);
1351 *bufp += 0x20;
1352 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1353 }
1354
1355 if (embedded_rule && i > 0)
1356 {
1357 /* Reference points (global_ref and new_ref) are
1358 encoded as below:
1359
1360 0--1--2 -- ascent
1361 | |
1362 | |
1363 | 4 -+--- center
1364 -- 3 5 -- baseline
1365 | |
1366 6--7--8 -- descent
1367
1368 Now, we calculate the column offset of the new glyph
1369 from the left edge of the first glyph. This can avoid
1370 the same calculation everytime displaying this
1371 composite character. */
1372
1373 /* Reference points of global glyph and new glyph. */
1374 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1375 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1376 /* Column offset relative to the first glyph. */
1377 float left = (leftmost
1378 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1379 - (new_ref % 3) * width / 2.0);
1380
1381 cmpcharp->col_offset[i] = left;
1382 if (left < leftmost)
1383 leftmost = left;
1384 if (left + width > rightmost)
1385 rightmost = left + width;
1386 }
1387 else
1388 {
1389 if (width > rightmost)
1390 rightmost = width;
1391 }
1392 }
1393 if (embedded_rule)
1394 {
1395 /* Now col_offset[N] are relative to the left edge of the
1396 first component. Make them relative to the left edge of
1397 overall glyph. */
1398 for (i = 0; i < chars; i++)
1399 cmpcharp->col_offset[i] -= leftmost;
1400 /* Make rightmost holds width of overall glyph. */
1401 rightmost -= leftmost;
1402 }
1403
1404 cmpcharp->width = rightmost;
1405 if (cmpcharp->width < rightmost)
1406 /* To get a ceiling integer value. */
1407 cmpcharp->width++;
1408 }
1409
1410 cmpchar_table[n_cmpchars] = cmpcharp;
1411
1412 return n_cmpchars++;
1413 }
1414
1415 /* Return the Nth element of the composite character C. */
1416 int
1417 cmpchar_component (c, n)
1418 unsigned int c, n;
1419 {
1420 int id = COMPOSITE_CHAR_ID (c);
1421
1422 if (id >= n_cmpchars /* C is not a valid composite character. */
1423 || n >= cmpchar_table[id]->glyph_len) /* No such component. */
1424 return -1;
1425 /* No face data is stored in glyph code. */
1426 return ((int) (cmpchar_table[id]->glyph[n]));
1427 }
1428
1429 DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1430 "T if CHAR is a composite character.")
1431 (ch)
1432 Lisp_Object ch;
1433 {
1434 CHECK_NUMBER (ch, 0);
1435 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1436 }
1437
1438 DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1439 2, 2, 0,
1440 "Return the IDXth component character of composite character CHARACTER.")
1441 (character, idx)
1442 Lisp_Object character, idx;
1443 {
1444 int c;
1445
1446 CHECK_NUMBER (character, 0);
1447 CHECK_NUMBER (idx, 1);
1448
1449 if ((c = cmpchar_component (XINT (character), XINT (idx))) < 0)
1450 args_out_of_range (character, idx);
1451
1452 return make_number (c);
1453 }
1454
1455 DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1456 2, 2, 0,
1457 "Return the Nth composition rule embedded in composite character CHARACTER.\n\
1458 The returned rule is for composing the Nth component\n\
1459 on the (N-1)th component. If N is 0, the returned value is always 255.")
1460 (character, n)
1461 Lisp_Object character, n;
1462 {
1463 int id, i;
1464
1465 CHECK_NUMBER (character, 0);
1466 CHECK_NUMBER (n, 1);
1467
1468 id = COMPOSITE_CHAR_ID (XINT (character));
1469 if (id < 0 || id >= n_cmpchars)
1470 error ("Invalid composite character: %d", XINT (character));
1471 i = XINT (n);
1472 if (i > cmpchar_table[id]->glyph_len)
1473 args_out_of_range (character, n);
1474
1475 return make_number (cmpchar_table[id]->cmp_rule[i]);
1476 }
1477
1478 DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1479 Scmpchar_cmp_rule_p, 1, 1, 0,
1480 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1481 (character)
1482 Lisp_Object character;
1483 {
1484 int id;
1485
1486 CHECK_NUMBER (character, 0);
1487 id = COMPOSITE_CHAR_ID (XINT (character));
1488 if (id < 0 || id >= n_cmpchars)
1489 error ("Invalid composite character: %d", XINT (character));
1490
1491 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1492 }
1493
1494 DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1495 Scmpchar_cmp_count, 1, 1, 0,
1496 "Return number of compoents of composite character CHARACTER.")
1497 (character)
1498 Lisp_Object character;
1499 {
1500 int id;
1501
1502 CHECK_NUMBER (character, 0);
1503 id = COMPOSITE_CHAR_ID (XINT (character));
1504 if (id < 0 || id >= n_cmpchars)
1505 error ("Invalid composite character: %d", XINT (character));
1506
1507 return (make_number (cmpchar_table[id]->glyph_len));
1508 }
1509
1510 DEFUN ("compose-string", Fcompose_string, Scompose_string,
1511 1, 1, 0,
1512 "Return one char string composed from all characters in STRING.")
1513 (str)
1514 Lisp_Object str;
1515 {
1516 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1517 int len, i;
1518
1519 CHECK_STRING (str, 0);
1520
1521 buf[0] = LEADING_CODE_COMPOSITION;
1522 p = XSTRING (str)->data;
1523 pend = p + STRING_BYTES (XSTRING (str));
1524 i = 1;
1525 while (p < pend)
1526 {
1527 if (*p < 0x20 || *p == 127) /* control code */
1528 error ("Invalid component character: %d", *p);
1529 else if (*p < 0x80) /* ASCII */
1530 {
1531 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1532 error ("Too long string to be composed: %s", XSTRING (str)->data);
1533 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1534 code itself. */
1535 buf[i++] = 0xA0;
1536 buf[i++] = *p++ + 0x80;
1537 }
1538 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1539 {
1540 /* Already composed. Eliminate the heading
1541 LEADING_CODE_COMPOSITION, keep the remaining bytes
1542 unchanged. */
1543 p++;
1544 ptemp = p;
1545 while (! CHAR_HEAD_P (*p)) p++;
1546 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1547 error ("Too long string to be composed: %s", XSTRING (str)->data);
1548 bcopy (ptemp, buf + i, p - ptemp);
1549 i += p - ptemp;
1550 }
1551 else /* multibyte char */
1552 {
1553 /* Add 0x20 to the base leading-code, keep the remaining
1554 bytes unchanged. */
1555 len = BYTES_BY_CHAR_HEAD (*p);
1556 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1557 error ("Too long string to be composed: %s", XSTRING (str)->data);
1558 bcopy (p, buf + i, len);
1559 buf[i] += 0x20;
1560 p += len, i += len;
1561 }
1562 }
1563
1564 if (i < 5)
1565 /* STR contains only one character, which can't be composed. */
1566 error ("Too short string to be composed: %s", XSTRING (str)->data);
1567
1568 return make_multibyte_string (buf, 1, i);
1569 }
1570
1571 \f
1572 charset_id_internal (charset_name)
1573 char *charset_name;
1574 {
1575 Lisp_Object val = Fget (intern (charset_name), Qcharset);
1576
1577 if (!VECTORP (val))
1578 error ("Charset %s is not defined", charset_name);
1579
1580 return (XINT (XVECTOR (val)->contents[0]));
1581 }
1582
1583 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1584 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1585 ()
1586 {
1587 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1588 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1589 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1590 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1591 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1592 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1593 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1594 return Qnil;
1595 }
1596
1597 init_charset_once ()
1598 {
1599 int i, j, k;
1600
1601 staticpro (&Vcharset_table);
1602 staticpro (&Vcharset_symbol_table);
1603
1604 /* This has to be done here, before we call Fmake_char_table. */
1605 Qcharset_table = intern ("charset-table");
1606 staticpro (&Qcharset_table);
1607
1608 /* Intern this now in case it isn't already done.
1609 Setting this variable twice is harmless.
1610 But don't staticpro it here--that is done in alloc.c. */
1611 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1612
1613 /* Now we are ready to set up this property, so we can
1614 create the charset table. */
1615 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1616 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1617
1618 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1), Qnil);
1619
1620 /* Setup tables. */
1621 for (i = 0; i < 2; i++)
1622 for (j = 0; j < 2; j++)
1623 for (k = 0; k < 128; k++)
1624 iso_charset_table [i][j][k] = -1;
1625
1626 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
1627 cmpchar_table_size = n_cmpchars = 0;
1628
1629 for (i = 0; i < 256; i++)
1630 BYTES_BY_CHAR_HEAD (i) = 1;
1631 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1632 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1633 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1634 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1635 /* The following doesn't reflect the actual bytes, but just to tell
1636 that it is a start of a multibyte character. */
1637 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
1638
1639 for (i = 0; i < 128; i++)
1640 WIDTH_BY_CHAR_HEAD (i) = 1;
1641 for (; i < 256; i++)
1642 WIDTH_BY_CHAR_HEAD (i) = 4;
1643 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1644 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1645 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1646 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1647 }
1648
1649 #ifdef emacs
1650
1651 syms_of_charset ()
1652 {
1653 Qascii = intern ("ascii");
1654 staticpro (&Qascii);
1655
1656 Qcharset = intern ("charset");
1657 staticpro (&Qcharset);
1658
1659 /* Define ASCII charset now. */
1660 update_charset_table (make_number (CHARSET_ASCII),
1661 make_number (1), make_number (94),
1662 make_number (1),
1663 make_number (0),
1664 make_number ('B'),
1665 make_number (0),
1666 build_string ("ASCII"),
1667 build_string ("ASCII"),
1668 build_string ("ASCII (ISO646 IRV)"));
1669 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1670 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1671
1672 Qcomposition = intern ("composition");
1673 staticpro (&Qcomposition);
1674 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
1675
1676 defsubr (&Sdefine_charset);
1677 defsubr (&Sget_unused_iso_final_char);
1678 defsubr (&Sdeclare_equiv_charset);
1679 defsubr (&Sfind_charset_region);
1680 defsubr (&Sfind_charset_string);
1681 defsubr (&Smake_char_internal);
1682 defsubr (&Ssplit_char);
1683 defsubr (&Schar_charset);
1684 defsubr (&Siso_charset);
1685 defsubr (&Schar_valid_p);
1686 defsubr (&Sunibyte_char_to_multibyte);
1687 defsubr (&Schar_bytes);
1688 defsubr (&Schar_width);
1689 defsubr (&Sstring_width);
1690 defsubr (&Schar_direction);
1691 defsubr (&Schars_in_region);
1692 defsubr (&Sstring);
1693 defsubr (&Scmpcharp);
1694 defsubr (&Scmpchar_component);
1695 defsubr (&Scmpchar_cmp_rule);
1696 defsubr (&Scmpchar_cmp_rule_p);
1697 defsubr (&Scmpchar_cmp_count);
1698 defsubr (&Scompose_string);
1699 defsubr (&Ssetup_special_charsets);
1700
1701 DEFVAR_LISP ("charset-list", &Vcharset_list,
1702 "List of charsets ever defined.");
1703 Vcharset_list = Fcons (Qascii, Qnil);
1704
1705 DEFVAR_LISP ("character-unification-table-vector",
1706 &Vcharacter_unification_table_vector,
1707 "Vector of cons cell of a symbol and unification table ever defined.\n\
1708 An ID of a unification table is an index of this vector.");
1709 Vcharacter_unification_table_vector = Fmake_vector (make_number (16), Qnil);
1710
1711 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
1712 "Leading-code of composite characters.");
1713 leading_code_composition = LEADING_CODE_COMPOSITION;
1714
1715 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1716 "Leading-code of private TYPE9N charset of column-width 1.");
1717 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1718
1719 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1720 "Leading-code of private TYPE9N charset of column-width 2.");
1721 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1722
1723 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1724 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1725 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1726
1727 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1728 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1729 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1730
1731 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1732 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1733 This is used for converting unibyte text to multibyte,\n\
1734 and for inserting character codes specified by number.\n\n\
1735 Conversion is performed only when multibyte characters are enabled,\n\
1736 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1737 to the corresponding Emacs character code.\n\
1738 If `nonascii-translate-table' is non-nil, it overrides this variable.");
1739 nonascii_insert_offset = 0;
1740
1741 DEFVAR_LISP ("nonascii-translate-table", &Vnonascii_translate_table,
1742 "Translate table for converting non-ASCII unibyte codes to multibyte.\n\
1743 This is used for converting unibyte text to multibyte,\n\
1744 and for inserting character codes specified by number.\n\n\
1745 Conversion is performed only when multibyte characters are enabled,\n\
1746 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1747 to the corresponding Emacs character code.\n\n\
1748 If this is nil, `nonascii-insert-offset' is used instead.");
1749 Vnonascii_translate_table = Qnil;
1750
1751 DEFVAR_INT ("min-composite-char", &min_composite_char,
1752 "Minimum character code of a composite character.");
1753 min_composite_char = MIN_CHAR_COMPOSITION;
1754 }
1755
1756 #endif /* emacs */