Can use linear algorithm for indentation if Emacs supports it.
[bpt/emacs.git] / src / charset.c
CommitLineData
75c8c592 1/* Basic multilingual character support.
35e623fb 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
75c8c592 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25#include <stdio.h>
26
27#ifdef emacs
28
29#include <sys/types.h>
30#include <config.h>
31#include "lisp.h"
32#include "buffer.h"
33#include "charset.h"
34#include "coding.h"
fc6b09bf 35#include "disptab.h"
4ed46869
KH
36
37#else /* not emacs */
38
39#include "mulelib.h"
40
41#endif /* emacs */
42
43Lisp_Object Qcharset, Qascii, Qcomposition;
0282eb69 44Lisp_Object Qunknown;
4ed46869
KH
45
46/* Declaration of special leading-codes. */
47int leading_code_composition; /* for composite characters */
48int leading_code_private_11; /* for private DIMENSION1 of 1-column */
49int leading_code_private_12; /* for private DIMENSION1 of 2-column */
50int leading_code_private_21; /* for private DIMENSION2 of 1-column */
51int leading_code_private_22; /* for private DIMENSION2 of 2-column */
52
53/* Declaration of special charsets. */
54int charset_ascii; /* ASCII */
55int charset_composition; /* for a composite character */
56int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
57int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
58int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
59int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
60int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
61int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
62int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
63
b0e3cf2b
KH
64int min_composite_char;
65
4ed46869
KH
66Lisp_Object Qcharset_table;
67
68/* A char-table containing information of each character set. */
69Lisp_Object Vcharset_table;
70
71/* A vector of charset symbol indexed by charset-id. This is used
72 only for returning charset symbol from C functions. */
73Lisp_Object Vcharset_symbol_table;
74
75/* A list of charset symbols ever defined. */
76Lisp_Object Vcharset_list;
77
537efd8d
KH
78/* Vector of translation table ever defined.
79 ID of a translation table is used to index this vector. */
80Lisp_Object Vtranslation_table_vector;
b0e3cf2b 81
c1a08b4c
KH
82/* A char-table for characters which may invoke auto-filling. */
83Lisp_Object Vauto_fill_chars;
84
85Lisp_Object Qauto_fill_chars;
86
4ed46869
KH
87/* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
88int bytes_by_char_head[256];
89int width_by_char_head[256];
90
91/* Mapping table from ISO2022's charset (specified by DIMENSION,
92 CHARS, and FINAL-CHAR) to Emacs' charset. */
93int iso_charset_table[2][2][128];
94
513ee442
KH
95/* Table of pointers to the structure `cmpchar_info' indexed by
96 CMPCHAR-ID. */
97struct cmpchar_info **cmpchar_table;
98/* The current size of `cmpchar_table'. */
99static int cmpchar_table_size;
100/* Number of the current composite characters. */
101int n_cmpchars;
102
4ed46869
KH
103/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
104unsigned char *_fetch_multibyte_char_p;
105int _fetch_multibyte_char_len;
106
35e623fb
RS
107/* Offset to add to a non-ASCII value when inserting it. */
108int nonascii_insert_offset;
109
4cf9710d
RS
110/* Translation table for converting non-ASCII unibyte characters
111 to multibyte codes, or nil. */
b4e9dd77 112Lisp_Object Vnonascii_translation_table;
4cf9710d 113
8a73a704
KH
114/* List of all possible generic characters. */
115Lisp_Object Vgeneric_character_list;
116
046b1f03
RS
117#define min(X, Y) ((X) < (Y) ? (X) : (Y))
118#define max(X, Y) ((X) > (Y) ? (X) : (Y))
119\f
93bcb785
KH
120void
121invalid_character (c)
122 int c;
123{
ba7434e5 124 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
93bcb785
KH
125}
126
127
4ed46869
KH
128/* Set STR a pointer to the multi-byte form of the character C. If C
129 is not a composite character, the multi-byte form is set in WORKBUF
130 and STR points WORKBUF. The caller should allocate at least 4-byte
131 area at WORKBUF in advance. Returns the length of the multi-byte
bd4c6dd0
KH
132 form. If C is an invalid character to have a multi-byte form,
133 signal an error.
4ed46869
KH
134
135 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
136 function directly if C can be an ASCII character. */
137
138int
139non_ascii_char_to_string (c, workbuf, str)
140 int c;
141 unsigned char *workbuf, **str;
142{
6dc0722d 143 int charset, c1, c2;
4ed46869 144
0282eb69 145 if (c & ~GLYPH_MASK_CHAR) /* This includes the case C is negative. */
8ac5a9cc
KH
146 {
147 if (c & CHAR_META)
148 /* Move the meta bit to the right place for a string. */
149 c |= 0x80;
150 if (c & CHAR_CTL)
151 c &= 0x9F;
152 else if (c & CHAR_SHIFT && (c & 0x7F) >= 'a' && (c & 0x7F) <= 'z')
153 c -= 'a' - 'A';
154 *str = workbuf;
155 *workbuf = c;
156 return 1;
157 }
158
da63a5fe
KH
159 if (c < 0)
160 invalid_character (c);
161
4ed46869
KH
162 if (COMPOSITE_CHAR_P (c))
163 {
164 int cmpchar_id = COMPOSITE_CHAR_ID (c);
165
166 if (cmpchar_id < n_cmpchars)
167 {
168 *str = cmpchar_table[cmpchar_id]->data;
169 return cmpchar_table[cmpchar_id]->len;
170 }
171 else
172 {
93bcb785 173 invalid_character (c);
4ed46869
KH
174 }
175 }
176
177 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
bd4c6dd0
KH
178 if (!charset
179 || ! CHARSET_DEFINED_P (charset)
180 || c1 >= 0 && c1 < 32
181 || c2 >= 0 && c2 < 32)
93bcb785 182 invalid_character (c);
4ed46869
KH
183
184 *str = workbuf;
185 *workbuf++ = CHARSET_LEADING_CODE_BASE (charset);
186 if (*workbuf = CHARSET_LEADING_CODE_EXT (charset))
187 workbuf++;
188 *workbuf++ = c1 | 0x80;
6dc0722d 189 if (c2 >= 0)
4ed46869
KH
190 *workbuf++ = c2 | 0x80;
191
192 return (workbuf - *str);
193}
194
195/* Return a non-ASCII character of which multi-byte form is at STR of
196 length LEN. If ACTUAL_LEN is not NULL, the actual length of the
537efd8d
KH
197 multibyte form is set to the address ACTUAL_LEN.
198
199 If exclude_tail_garbage is nonzero, ACTUAL_LEN excludes gabage
200 bytes following the non-ASCII character.
4ed46869
KH
201
202 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
203 directly if STR can hold an ASCII character. */
204
dfcf069d 205int
537efd8d 206string_to_non_ascii_char (str, len, actual_len, exclude_tail_garbage)
8867de67 207 const unsigned char *str;
537efd8d 208 int len, *actual_len, exclude_tail_garbage;
4ed46869
KH
209{
210 int charset;
211 unsigned char c1, c2;
bb63e573
KH
212 int c, bytes;
213 const unsigned char *begp = str;
4ed46869 214
bb63e573 215 c = *str++;
90d7b74e
KH
216 bytes = 1;
217
218 if (BASE_LEADING_CODE_P (c))
bb63e573
KH
219 do {
220 while (bytes < len && ! CHAR_HEAD_P (begp[bytes])) bytes++;
90d7b74e
KH
221
222 if (c == LEADING_CODE_COMPOSITION)
223 {
bb63e573 224 int cmpchar_id = str_cmpchar_id (begp, bytes);
90d7b74e
KH
225
226 if (cmpchar_id >= 0)
5d76bc89
KH
227 {
228 c = MAKE_COMPOSITE_CHAR (cmpchar_id);
1dca54f6 229 str += cmpchar_table[cmpchar_id]->len - 1;
5d76bc89 230 }
1dca54f6
KH
231 else
232 str += bytes - 1;
90d7b74e
KH
233 }
234 else
235 {
bb63e573 236 const unsigned char *endp = begp + bytes;
90d7b74e 237 int charset = c, c1, c2 = 0;
4ed46869 238
bb63e573 239 if (str >= endp) break;
a6c25326 240 if (c >= LEADING_CODE_PRIVATE_11 && c <= LEADING_CODE_PRIVATE_22)
90d7b74e 241 {
bb63e573
KH
242 charset = *str++;
243 if (str < endp)
244 c1 = *str++ & 0x7F;
245 else
246 c1 = charset, charset = c;
90d7b74e 247 }
bb63e573
KH
248 else
249 c1 = *str++ & 0x7f;
250 if (CHARSET_DEFINED_P (charset)
251 && CHARSET_DIMENSION (charset) == 2
252 && str < endp)
1dca54f6 253 c2 = *str++ & 0x7F;
bb63e573 254 c = MAKE_NON_ASCII_CHAR (charset, c1, c2);
90d7b74e 255 }
bb63e573 256 } while (0);
4ed46869
KH
257
258 if (actual_len)
bb63e573 259 *actual_len = exclude_tail_garbage ? str - begp : bytes;
4ed46869
KH
260 return c;
261}
262
263/* Return the length of the multi-byte form at string STR of length LEN. */
264int
265multibyte_form_length (str, len)
8867de67 266 const unsigned char *str;
4ed46869
KH
267 int len;
268{
90d7b74e 269 int bytes = 1;
4ed46869 270
90d7b74e
KH
271 if (BASE_LEADING_CODE_P (*str))
272 while (bytes < len && ! CHAR_HEAD_P (str[bytes])) bytes++;
4ed46869 273
90d7b74e 274 return bytes;
4ed46869
KH
275}
276
277/* Check if string STR of length LEN contains valid multi-byte form of
278 a character. If valid, charset and position codes of the character
279 is set at *CHARSET, *C1, and *C2, and return 0. If not valid,
280 return -1. This should be used only in the macro SPLIT_STRING
281 which checks range of STR in advance. */
282
dfcf069d 283int
4ed46869 284split_non_ascii_string (str, len, charset, c1, c2)
8867de67
KH
285 register const unsigned char *str;
286 register unsigned char *c1, *c2;
4ed46869
KH
287 register int len, *charset;
288{
289 register unsigned int cs = *str++;
290
291 if (cs == LEADING_CODE_COMPOSITION)
292 {
293 int cmpchar_id = str_cmpchar_id (str - 1, len);
294
295 if (cmpchar_id < 0)
296 return -1;
297 *charset = cs, *c1 = cmpchar_id >> 7, *c2 = cmpchar_id & 0x7F;
298 }
299 else if ((cs < LEADING_CODE_PRIVATE_11 || (cs = *str++) >= 0xA0)
300 && CHARSET_DEFINED_P (cs))
301 {
302 *charset = cs;
303 if (*str < 0xA0)
304 return -1;
305 *c1 = (*str++) & 0x7F;
306 if (CHARSET_DIMENSION (cs) == 2)
307 {
308 if (*str < 0xA0)
309 return -1;
310 *c2 = (*str++) & 0x7F;
311 }
312 }
313 else
314 return -1;
315 return 0;
316}
317
537efd8d 318/* Translate character C by translation table TABLE. If C
b4e9dd77
KH
319 is negative, translate a character specified by CHARSET, C1, and C2
320 (C1 and C2 are code points of the character). If no translation is
321 found in TABLE, return C. */
dfcf069d 322int
b4e9dd77 323translate_char (table, c, charset, c1, c2)
23d2a7f1
KH
324 Lisp_Object table;
325 int c, charset, c1, c2;
326{
327 Lisp_Object ch;
328 int alt_charset, alt_c1, alt_c2, dimension;
329
330 if (c < 0) c = MAKE_CHAR (charset, c1, c2);
331 if (!CHAR_TABLE_P (table)
332 || (ch = Faref (table, make_number (c)), !INTEGERP (ch))
333 || XINT (ch) < 0)
334 return c;
335
336 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
337 dimension = CHARSET_DIMENSION (alt_charset);
338 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
339 /* CH is not a generic character, just return it. */
340 return XFASTINT (ch);
341
342 /* Since CH is a generic character, we must return a specific
343 charater which has the same position codes as C from CH. */
344 if (charset < 0)
345 SPLIT_CHAR (c, charset, c1, c2);
346 if (dimension != CHARSET_DIMENSION (charset))
347 /* We can't make such a character because of dimension mismatch. */
348 return c;
23d2a7f1
KH
349 return MAKE_CHAR (alt_charset, c1, c2);
350}
351
d2665018 352/* Convert the unibyte character C to multibyte based on
b4e9dd77 353 Vnonascii_translation_table or nonascii_insert_offset. If they can't
d2665018
KH
354 convert C to a valid multibyte character, convert it based on
355 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
35e623fb 356
dfcf069d 357int
35e623fb
RS
358unibyte_char_to_multibyte (c)
359 int c;
360{
543b4f61 361 if (c < 0400 && c >= 0200)
35e623fb 362 {
d2665018
KH
363 int c_save = c;
364
b4e9dd77 365 if (! NILP (Vnonascii_translation_table))
bbf12bb3
KH
366 {
367 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
368 if (c >= 0400 && ! VALID_MULTIBYTE_CHAR_P (c))
369 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
370 }
371 else if (c >= 0240 && nonascii_insert_offset > 0)
372 {
373 c += nonascii_insert_offset;
374 if (c < 0400 || ! VALID_MULTIBYTE_CHAR_P (c))
375 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
376 }
377 else if (c >= 0240)
d2665018 378 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
35e623fb
RS
379 }
380 return c;
381}
76d7b829
KH
382
383
384/* Convert the multibyte character C to unibyte 8-bit character based
385 on Vnonascii_translation_table or nonascii_insert_offset. If
386 REV_TBL is non-nil, it should be a reverse table of
387 Vnonascii_translation_table, i.e. what given by:
388 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
389
390int
391multibyte_char_to_unibyte (c, rev_tbl)
392 int c;
393 Lisp_Object rev_tbl;
394{
395 if (!SINGLE_BYTE_CHAR_P (c))
396 {
397 int c_save = c;
398
399 if (! CHAR_TABLE_P (rev_tbl)
400 && CHAR_TABLE_P (Vnonascii_translation_table))
401 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
402 make_number (0));
403 if (CHAR_TABLE_P (rev_tbl))
404 {
405 Lisp_Object temp;
406 temp = Faref (rev_tbl, make_number (c));
407 if (INTEGERP (temp))
408 c = XINT (temp);
bbf12bb3
KH
409 if (c >= 256)
410 c = (c_save & 0177) + 0200;
411 }
412 else
413 {
414 if (nonascii_insert_offset > 0)
415 c -= nonascii_insert_offset;
416 if (c < 128 || c >= 256)
417 c = (c_save & 0177) + 0200;
76d7b829 418 }
76d7b829
KH
419 }
420
421 return c;
422}
423
35e623fb 424\f
4ed46869
KH
425/* Update the table Vcharset_table with the given arguments (see the
426 document of `define-charset' for the meaning of each argument).
427 Several other table contents are also updated. The caller should
428 check the validity of CHARSET-ID and the remaining arguments in
429 advance. */
430
431void
432update_charset_table (charset_id, dimension, chars, width, direction,
433 iso_final_char, iso_graphic_plane,
434 short_name, long_name, description)
435 Lisp_Object charset_id, dimension, chars, width, direction;
436 Lisp_Object iso_final_char, iso_graphic_plane;
437 Lisp_Object short_name, long_name, description;
438{
439 int charset = XINT (charset_id);
440 int bytes;
441 unsigned char leading_code_base, leading_code_ext;
442
6dc0722d
KH
443 if (NILP (CHARSET_TABLE_ENTRY (charset)))
444 CHARSET_TABLE_ENTRY (charset)
445 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
4ed46869
KH
446
447 /* Get byte length of multibyte form, base leading-code, and
448 extended leading-code of the charset. See the comment under the
449 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
450 bytes = XINT (dimension);
451 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
452 {
453 /* Official charset, it doesn't have an extended leading-code. */
454 if (charset != CHARSET_ASCII)
455 bytes += 1; /* For a base leading-code. */
456 leading_code_base = charset;
457 leading_code_ext = 0;
458 }
459 else
460 {
461 /* Private charset. */
462 bytes += 2; /* For base and extended leading-codes. */
463 leading_code_base
464 = (charset < LEADING_CODE_EXT_12
465 ? LEADING_CODE_PRIVATE_11
466 : (charset < LEADING_CODE_EXT_21
467 ? LEADING_CODE_PRIVATE_12
468 : (charset < LEADING_CODE_EXT_22
469 ? LEADING_CODE_PRIVATE_21
470 : LEADING_CODE_PRIVATE_22)));
471 leading_code_ext = charset;
472 }
473
6ef23ebb
KH
474 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
475 error ("Invalid dimension for the charset-ID %d", charset);
476
4ed46869
KH
477 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
478 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
479 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
480 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
481 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
482 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
483 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
484 = make_number (leading_code_base);
485 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
486 = make_number (leading_code_ext);
487 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
488 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
489 = iso_graphic_plane;
490 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
491 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
492 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
493 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
494
495 {
496 /* If we have already defined a charset which has the same
497 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
498 DIRECTION, we must update the entry REVERSE-CHARSET of both
499 charsets. If there's no such charset, the value of the entry
500 is set to nil. */
501 int i;
502
513ee442 503 for (i = 0; i <= MAX_CHARSET; i++)
4ed46869
KH
504 if (!NILP (CHARSET_TABLE_ENTRY (i)))
505 {
506 if (CHARSET_DIMENSION (i) == XINT (dimension)
507 && CHARSET_CHARS (i) == XINT (chars)
508 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
509 && CHARSET_DIRECTION (i) != XINT (direction))
510 {
511 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
512 = make_number (i);
513 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
514 break;
515 }
516 }
513ee442 517 if (i > MAX_CHARSET)
4ed46869
KH
518 /* No such a charset. */
519 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
520 = make_number (-1);
521 }
522
523 if (charset != CHARSET_ASCII
524 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
525 {
4ed46869
KH
526 width_by_char_head[leading_code_base] = XINT (width);
527
528 /* Update table emacs_code_class. */
529 emacs_code_class[charset] = (bytes == 2
530 ? EMACS_leading_code_2
531 : (bytes == 3
532 ? EMACS_leading_code_3
533 : EMACS_leading_code_4));
534 }
535
536 /* Update table iso_charset_table. */
537 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
538 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
539}
540
541#ifdef emacs
542
543/* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
544 is invalid. */
545int
546get_charset_id (charset_symbol)
547 Lisp_Object charset_symbol;
548{
549 Lisp_Object val;
550 int charset;
551
552 return ((SYMBOLP (charset_symbol)
553 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
554 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
555 CHARSET_VALID_P (charset)))
556 ? charset : -1);
557}
558
559/* Return an identification number for a new private charset of
560 DIMENSION and WIDTH. If there's no more room for the new charset,
561 return 0. */
562Lisp_Object
563get_new_private_charset_id (dimension, width)
564 int dimension, width;
565{
566 int charset, from, to;
567
568 if (dimension == 1)
569 {
570 if (width == 1)
571 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
572 else
573 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
574 }
575 else
576 {
577 if (width == 1)
578 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
579 else
b0e3cf2b 580 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
4ed46869
KH
581 }
582
583 for (charset = from; charset < to; charset++)
584 if (!CHARSET_DEFINED_P (charset)) break;
585
586 return make_number (charset < to ? charset : 0);
587}
588
589DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
590 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
23d2a7f1 591If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
4ed46869
KH
592 treated as a private charset.\n\
593INFO-VECTOR is a vector of the format:\n\
594 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
595 SHORT-NAME LONG-NAME DESCRIPTION]\n\
596The meanings of each elements is as follows:\n\
597DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
598CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
599WIDTH (integer) is the number of columns a character in the charset\n\
600occupies on the screen: one of 0, 1, and 2.\n\
601\n\
602DIRECTION (integer) is the rendering direction of characters in the\n\
277576f6
KH
603charset when rendering. If 0, render from left to right, else\n\
604render from right to left.\n\
4ed46869
KH
605\n\
606ISO-FINAL-CHAR (character) is the final character of the\n\
607corresponding ISO 2022 charset.\n\
608\n\
609ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
610while encoding to variants of ISO 2022 coding system, one of the\n\
611following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
612\n\
613SHORT-NAME (string) is the short name to refer to the charset.\n\
614\n\
615LONG-NAME (string) is the long name to refer to the charset.\n\
616\n\
617DESCRIPTION (string) is the description string of the charset.")
618 (charset_id, charset_symbol, info_vector)
619 Lisp_Object charset_id, charset_symbol, info_vector;
620{
621 Lisp_Object *vec;
622
623 if (!NILP (charset_id))
624 CHECK_NUMBER (charset_id, 0);
625 CHECK_SYMBOL (charset_symbol, 1);
626 CHECK_VECTOR (info_vector, 2);
627
628 if (! NILP (charset_id))
629 {
630 if (! CHARSET_VALID_P (XINT (charset_id)))
631 error ("Invalid CHARSET: %d", XINT (charset_id));
632 else if (CHARSET_DEFINED_P (XINT (charset_id)))
633 error ("Already defined charset: %d", XINT (charset_id));
634 }
635
636 vec = XVECTOR (info_vector)->contents;
637 if (XVECTOR (info_vector)->size != 9
638 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
639 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
640 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
641 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
642 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
643 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
644 || !STRINGP (vec[6])
645 || !STRINGP (vec[7])
646 || !STRINGP (vec[8]))
647 error ("Invalid info-vector argument for defining charset %s",
648 XSYMBOL (charset_symbol)->name->data);
649
650 if (NILP (charset_id))
651 {
652 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
653 if (XINT (charset_id) == 0)
654 error ("There's no room for a new private charset %s",
655 XSYMBOL (charset_symbol)->name->data);
656 }
657
658 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
659 vec[4], vec[5], vec[6], vec[7], vec[8]);
6dc0722d 660 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
4ed46869
KH
661 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
662 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
663 return Qnil;
664}
665
8a73a704
KH
666DEFUN ("generic-character-list", Fgeneric_character_list,
667 Sgeneric_character_list, 0, 0, 0,
668 "Return a list of all possible generic characters.\n\
669It includes a generic character for a charset not yet defined.")
670 ()
671{
672 return Vgeneric_character_list;
673}
674
3fac5a51
KH
675DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
676 Sget_unused_iso_final_char, 2, 2, 0,
677 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
678DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
679CHARS is the number of characters in a dimension: 94 or 96.\n\
680\n\
681This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
682If there's no unused final char for the specified kind of charset,\n\
683return nil.")
684 (dimension, chars)
685 Lisp_Object dimension, chars;
686{
687 int final_char;
688
689 CHECK_NUMBER (dimension, 0);
690 CHECK_NUMBER (chars, 1);
691 if (XINT (dimension) != 1 && XINT (dimension) != 2)
692 error ("Invalid charset dimension %d, it should be 1 or 2",
693 XINT (dimension));
694 if (XINT (chars) != 94 && XINT (chars) != 96)
695 error ("Invalid charset chars %d, it should be 94 or 96",
696 XINT (chars));
697 for (final_char = '0'; final_char <= '?'; final_char++)
698 {
699 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
700 break;
701 }
702 return (final_char <= '?' ? make_number (final_char) : Qnil);
703}
704
4ed46869
KH
705DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
706 4, 4, 0,
707 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
708CHARSET should be defined by `defined-charset' in advance.")
709 (dimension, chars, final_char, charset_symbol)
710 Lisp_Object dimension, chars, final_char, charset_symbol;
711{
712 int charset;
713
714 CHECK_NUMBER (dimension, 0);
715 CHECK_NUMBER (chars, 1);
716 CHECK_NUMBER (final_char, 2);
717 CHECK_SYMBOL (charset_symbol, 3);
718
719 if (XINT (dimension) != 1 && XINT (dimension) != 2)
720 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
721 if (XINT (chars) != 94 && XINT (chars) != 96)
722 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
723 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
724 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
725 if ((charset = get_charset_id (charset_symbol)) < 0)
726 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
727
728 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
729 return Qnil;
730}
731
732/* Return number of different charsets in STR of length LEN. In
733 addition, for each found charset N, CHARSETS[N] is set 1. The
a29e3b1b 734 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
1d67c29b
KH
735 It may lookup a translation table TABLE if supplied.
736
737 If CMPCHARP is nonzero and some composite character is found,
738 CHARSETS[128] is also set 1 and the returned number is incremented
0282eb69
KH
739 by 1.
740
741 If MULTIBYTE is zero, do not check multibyte characters, i.e. if
742 any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
743 8-bit codes are found CHARSET[1] is set to 1. */
4ed46869
KH
744
745int
0282eb69 746find_charset_in_str (str, len, charsets, table, cmpcharp, multibyte)
028d516b
KH
747 unsigned char *str;
748 int len, *charsets;
23d2a7f1 749 Lisp_Object table;
1d67c29b 750 int cmpcharp;
0282eb69 751 int multibyte;
4ed46869 752{
733eafd8 753 register int num = 0, c;
4ed46869 754
0282eb69
KH
755 if (! multibyte)
756 {
757 unsigned char *endp = str + len;
758 int maskbits = 0;
759
760 while (str < endp && maskbits != 3)
761 maskbits |= (*str++ < 0x80 ? 1 : 2);
762 if (maskbits & 1)
763 {
764 charsets[0] = 1;
765 num++;
766 }
767 if (maskbits & 2)
768 {
769 charsets[1] = 1;
770 num++;
771 }
772 return num;
773 }
774
23d2a7f1
KH
775 if (! CHAR_TABLE_P (table))
776 table = Qnil;
777
4ed46869
KH
778 while (len > 0)
779 {
05505664 780 int bytes, charset;
733eafd8 781 c = *str;
23d2a7f1 782
733eafd8 783 if (c == LEADING_CODE_COMPOSITION)
05505664 784 {
733eafd8
KH
785 int cmpchar_id = str_cmpchar_id (str, len);
786 GLYPH *glyph;
05505664 787
1d67c29b 788 if (cmpchar_id >= 0)
05505664 789 {
020da460 790 struct cmpchar_info *cmp_p = cmpchar_table[cmpchar_id];
733eafd8
KH
791 int i;
792
020da460 793 for (i = 0; i < cmp_p->glyph_len; i++)
733eafd8 794 {
020da460 795 c = cmp_p->glyph[i];
733eafd8
KH
796 if (!NILP (table))
797 {
b4e9dd77 798 if ((c = translate_char (table, c, 0, 0, 0)) < 0)
020da460 799 c = cmp_p->glyph[i];
733eafd8
KH
800 }
801 if ((charset = CHAR_CHARSET (c)) < 0)
802 charset = CHARSET_ASCII;
803 if (!charsets[charset])
804 {
805 charsets[charset] = 1;
806 num += 1;
807 }
808 }
020da460
KH
809 str += cmp_p->len;
810 len -= cmp_p->len;
811 if (cmpcharp && !charsets[CHARSET_COMPOSITION])
812 {
813 charsets[CHARSET_COMPOSITION] = 1;
814 num += 1;
815 }
733eafd8 816 continue;
05505664 817 }
05505664 818
0282eb69 819 charset = 1; /* This leads to `unknown' charset. */
733eafd8
KH
820 bytes = 1;
821 }
23d2a7f1
KH
822 else
823 {
733eafd8
KH
824 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
825 if (! NILP (table))
826 {
b4e9dd77 827 int c1 = translate_char (table, c, 0, 0, 0);
733eafd8
KH
828 if (c1 >= 0)
829 c = c1;
830 }
831 charset = CHAR_CHARSET (c);
23d2a7f1 832 }
4ed46869
KH
833
834 if (!charsets[charset])
835 {
836 charsets[charset] = 1;
837 num += 1;
838 }
839 str += bytes;
840 len -= bytes;
841 }
842 return num;
843}
844
845DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
23d2a7f1 846 2, 3, 0,
4ed46869 847 "Return a list of charsets in the region between BEG and END.\n\
23d2a7f1 848BEG and END are buffer positions.\n\
020da460
KH
849If the region contains any composite character,\n\
850`composition' is included in the returned list.\n\
0282eb69
KH
851Optional arg TABLE if non-nil is a translation table to look up.\n\
852\n\
853If the region contains invalid multiybte characters,\n\
854`unknown' is included in the returned list.
855\n\
856If the current buffer is unibyte, the returned list contains\n\
857`ascii' if any 7-bit characters are found,\n\
858and `unknown' if any 8-bit characters are found.")
23d2a7f1
KH
859 (beg, end, table)
860 Lisp_Object beg, end, table;
4ed46869 861{
028d516b 862 int charsets[MAX_CHARSET + 1];
6ae1f27e 863 int from, from_byte, to, stop, stop_byte, i;
4ed46869 864 Lisp_Object val;
0282eb69
KH
865 int undefined;
866 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4ed46869
KH
867
868 validate_region (&beg, &end);
869 from = XFASTINT (beg);
870 stop = to = XFASTINT (end);
6ae1f27e 871
4ed46869 872 if (from < GPT && GPT < to)
6ae1f27e
RS
873 {
874 stop = GPT;
875 stop_byte = GPT_BYTE;
876 }
877 else
878 stop_byte = CHAR_TO_BYTE (stop);
879
880 from_byte = CHAR_TO_BYTE (from);
881
028d516b 882 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
4ed46869
KH
883 while (1)
884 {
6ae1f27e 885 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
0282eb69 886 charsets, table, 1, multibyte);
4ed46869 887 if (stop < to)
6ae1f27e
RS
888 {
889 from = stop, from_byte = stop_byte;
890 stop = to, stop_byte = CHAR_TO_BYTE (stop);
891 }
4ed46869
KH
892 else
893 break;
894 }
6ae1f27e 895
4ed46869 896 val = Qnil;
0282eb69
KH
897 undefined = 0;
898 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
4ed46869 899 if (charsets[i])
0282eb69
KH
900 {
901 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
902 val = Fcons (CHARSET_SYMBOL (i), val);
903 else
904 undefined = 1;
905 }
906 if (undefined)
907 val = Fcons (Qunknown, val);
4ed46869
KH
908 return val;
909}
910
911DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
23d2a7f1
KH
912 1, 2, 0,
913 "Return a list of charsets in STR.\n\
020da460
KH
914If the string contains any composite characters,\n\
915`composition' is included in the returned list.\n\
0282eb69
KH
916Optional arg TABLE if non-nil is a translation table to look up.\n\
917\n\
918If the region contains invalid multiybte characters,\n\
919`unknown' is included in the returned list.\n\
920\n\
921If STR is unibyte, the returned list contains\n\
922`ascii' if any 7-bit characters are found,\n\
923and `unknown' if any 8-bit characters are found.")
23d2a7f1
KH
924 (str, table)
925 Lisp_Object str, table;
4ed46869 926{
a29e3b1b 927 int charsets[MAX_CHARSET + 1];
4ed46869
KH
928 int i;
929 Lisp_Object val;
0282eb69
KH
930 int undefined;
931 int multibyte;
4ed46869
KH
932
933 CHECK_STRING (str, 0);
0282eb69 934 multibyte = STRING_MULTIBYTE (str);
87b089ad 935
a29e3b1b 936 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
fc932ac6 937 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
0282eb69 938 charsets, table, 1, multibyte);
4ed46869 939 val = Qnil;
0282eb69
KH
940 undefined = 0;
941 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
4ed46869 942 if (charsets[i])
0282eb69
KH
943 {
944 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
945 val = Fcons (CHARSET_SYMBOL (i), val);
946 else
947 undefined = 1;
948 }
949 if (undefined)
950 val = Fcons (Qunknown, val);
4ed46869
KH
951 return val;
952}
953\f
954DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
513ee442 955 "")
4ed46869
KH
956 (charset, code1, code2)
957 Lisp_Object charset, code1, code2;
958{
959 CHECK_NUMBER (charset, 0);
960
961 if (NILP (code1))
962 XSETFASTINT (code1, 0);
963 else
964 CHECK_NUMBER (code1, 1);
965 if (NILP (code2))
966 XSETFASTINT (code2, 0);
967 else
968 CHECK_NUMBER (code2, 2);
969
970 if (!CHARSET_DEFINED_P (XINT (charset)))
971 error ("Invalid charset: %d", XINT (charset));
972
973 return make_number (MAKE_CHAR (XINT (charset), XINT (code1), XINT (code2)));
974}
975
976DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
0282eb69
KH
977 "Return list of charset and one or two position-codes of CHAR.\n\
978If CHAR is invalid as a character code,\n\
979return a list of symbol `unknown' and CHAR.")
4ed46869
KH
980 (ch)
981 Lisp_Object ch;
982{
983 Lisp_Object val;
0282eb69 984 int c, charset, c1, c2;
4ed46869
KH
985
986 CHECK_NUMBER (ch, 0);
0282eb69
KH
987 c = XFASTINT (ch);
988 if (!CHAR_VALID_P (c, 1))
989 return Fcons (Qunknown, Fcons (ch, Qnil));
4ed46869 990 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6dc0722d 991 return (c2 >= 0
4ed46869
KH
992 ? Fcons (CHARSET_SYMBOL (charset),
993 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
994 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
995}
996
997DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
998 "Return charset of CHAR.")
999 (ch)
1000 Lisp_Object ch;
1001{
1002 CHECK_NUMBER (ch, 0);
1003
1004 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1005}
1006
90d7b74e
KH
1007DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1008 "Return charset of a character in current buffer at position POS.\n\
e6e114f2
KH
1009If POS is nil, it defauls to the current point.\n\
1010If POS is out of range, the value is nil.")
90d7b74e
KH
1011 (pos)
1012 Lisp_Object pos;
1013{
1014 register int pos_byte, c, charset;
1015 register unsigned char *p;
1016
1017 if (NILP (pos))
1018 pos_byte = PT_BYTE;
1019 else if (MARKERP (pos))
e6e114f2
KH
1020 {
1021 pos_byte = marker_byte_position (pos);
1022 if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1023 return Qnil;
1024 }
90d7b74e
KH
1025 else
1026 {
1027 CHECK_NUMBER (pos, 0);
e6e114f2
KH
1028 if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1029 return Qnil;
90d7b74e
KH
1030 pos_byte = CHAR_TO_BYTE (XINT (pos));
1031 }
1032 p = BYTE_POS_ADDR (pos_byte);
1033 c = STRING_CHAR (p, Z_BYTE - pos_byte);
1034 charset = CHAR_CHARSET (c);
1035 return CHARSET_SYMBOL (charset);
1036}
1037
4ed46869 1038DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2b71bb78
KH
1039 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1040\n\
1041ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1042by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1043where as Emacs distinguishes them by charset symbol.\n\
1044See the documentation of the function `charset-info' for the meanings of\n\
1045DIMENSION, CHARS, and FINAL-CHAR.")
4ed46869
KH
1046 (dimension, chars, final_char)
1047 Lisp_Object dimension, chars, final_char;
1048{
1049 int charset;
1050
1051 CHECK_NUMBER (dimension, 0);
1052 CHECK_NUMBER (chars, 1);
1053 CHECK_NUMBER (final_char, 2);
1054
1055 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1056 return Qnil;
1057 return CHARSET_SYMBOL (charset);
1058}
1059
9d3d8cba
KH
1060/* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1061 generic character. If GENERICP is zero, return nonzero iff C is a
1062 valid normal character. Do not call this function directly,
1063 instead use macro CHAR_VALID_P. */
1064int
1065char_valid_p (c, genericp)
1066 int c, genericp;
1067{
1068 int charset, c1, c2;
1069
1070 if (c < 0)
1071 return 0;
1072 if (SINGLE_BYTE_CHAR_P (c))
1073 return 1;
1074 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
32278fd5 1075 if (charset != CHARSET_COMPOSITION && !CHARSET_DEFINED_P (charset))
9d3d8cba
KH
1076 return 0;
1077 return (c < MIN_CHAR_COMPOSITION
1078 ? ((c & CHAR_FIELD1_MASK) /* i.e. dimension of C is two. */
1079 ? (genericp && c1 == 0 && c2 == 0
1080 || c1 >= 32 && c2 >= 32)
1081 : (genericp && c1 == 0
1082 || c1 >= 32))
1083 : c < MIN_CHAR_COMPOSITION + n_cmpchars);
1084}
1085
1086DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
a9d02884
DL
1087 "Return t if OBJECT is a valid normal character.\n\
1088If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
9d3d8cba
KH
1089a valid generic character.")
1090 (object, genericp)
1091 Lisp_Object object, genericp;
1092{
1093 if (! NATNUMP (object))
1094 return Qnil;
1095 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1096}
1097
d2665018
KH
1098DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1099 Sunibyte_char_to_multibyte, 1, 1, 0,
1100 "Convert the unibyte character CH to multibyte character.\n\
537efd8d 1101The conversion is done based on `nonascii-translation-table' (which see)\n\
340b8d58 1102 or `nonascii-insert-offset' (which see).")
d2665018
KH
1103 (ch)
1104 Lisp_Object ch;
1105{
1106 int c;
1107
1108 CHECK_NUMBER (ch, 0);
1109 c = XINT (ch);
1110 if (c < 0 || c >= 0400)
1111 error ("Invalid unibyte character: %d", c);
1112 c = unibyte_char_to_multibyte (c);
1113 if (c < 0)
1114 error ("Can't convert to multibyte character: %d", XINT (ch));
1115 return make_number (c);
1116}
1117
1bcc1567
RS
1118DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1119 Smultibyte_char_to_unibyte, 1, 1, 0,
1120 "Convert the multibyte character CH to unibyte character.\n\
1121The conversion is done based on `nonascii-translation-table' (which see)\n\
1122 or `nonascii-insert-offset' (which see).")
1123 (ch)
1124 Lisp_Object ch;
1125{
1126 int c;
1127
1128 CHECK_NUMBER (ch, 0);
1129 c = XINT (ch);
1130 if (c < 0)
1131 error ("Invalid multibyte character: %d", c);
1132 c = multibyte_char_to_unibyte (c, Qnil);
1133 if (c < 0)
1134 error ("Can't convert to unibyte character: %d", XINT (ch));
1135 return make_number (c);
1136}
1137
4ed46869 1138DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
f78643ef 1139 "Return 1 regardless of the argument CHAR.\n\
60022cb7 1140This is now an obsolete function. We keep it just for backward compatibility.")
4ed46869
KH
1141 (ch)
1142 Lisp_Object ch;
1143{
1144 Lisp_Object val;
4ed46869
KH
1145
1146 CHECK_NUMBER (ch, 0);
9b6a601f
KH
1147 return make_number (1);
1148}
1149
1150/* Return how many bytes C will occupy in a multibyte buffer.
1151 Don't call this function directly, instead use macro CHAR_BYTES. */
1152int
1153char_bytes (c)
1154 int c;
1155{
1156 int bytes;
1157
8ac5a9cc
KH
1158 if (SINGLE_BYTE_CHAR_P (c) || (c & ~GLYPH_MASK_CHAR))
1159 return 1;
1160
9b6a601f 1161 if (COMPOSITE_CHAR_P (c))
4ed46869 1162 {
9b6a601f 1163 unsigned int id = COMPOSITE_CHAR_ID (c);
4ed46869
KH
1164
1165 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
1166 }
1167 else
1168 {
9b6a601f 1169 int charset = CHAR_CHARSET (c);
4ed46869
KH
1170
1171 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
1172 }
1173
60022cb7 1174 return bytes;
4ed46869
KH
1175}
1176
1177/* Return the width of character of which multi-byte form starts with
1178 C. The width is measured by how many columns occupied on the
1179 screen when displayed in the current buffer. */
1180
1181#define ONE_BYTE_CHAR_WIDTH(c) \
1182 (c < 0x20 \
1183 ? (c == '\t' \
53316e55 1184 ? XFASTINT (current_buffer->tab_width) \
4ed46869
KH
1185 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1186 : (c < 0x7f \
1187 ? 1 \
1188 : (c == 0x7F \
1189 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1190 : ((! NILP (current_buffer->enable_multibyte_characters) \
1191 && BASE_LEADING_CODE_P (c)) \
1192 ? WIDTH_BY_CHAR_HEAD (c) \
b4e9dd77 1193 : 4))))
4ed46869
KH
1194
1195DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1196 "Return width of CHAR when displayed in the current buffer.\n\
1197The width is measured by how many columns it occupies on the screen.")
1198 (ch)
1199 Lisp_Object ch;
1200{
859f2b3c 1201 Lisp_Object val, disp;
4ed46869 1202 int c;
51c4025f 1203 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869
KH
1204
1205 CHECK_NUMBER (ch, 0);
1206
859f2b3c
RS
1207 c = XINT (ch);
1208
1209 /* Get the way the display table would display it. */
51c4025f 1210 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
859f2b3c
RS
1211
1212 if (VECTORP (disp))
1213 XSETINT (val, XVECTOR (disp)->size);
1214 else if (SINGLE_BYTE_CHAR_P (c))
1215 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
4ed46869
KH
1216 else if (COMPOSITE_CHAR_P (c))
1217 {
1218 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
0282eb69 1219 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 1));
4ed46869
KH
1220 }
1221 else
1222 {
1223 int charset = CHAR_CHARSET (c);
1224
1225 XSETFASTINT (val, CHARSET_WIDTH (charset));
1226 }
1227 return val;
1228}
1229
1230/* Return width of string STR of length LEN when displayed in the
1231 current buffer. The width is measured by how many columns it
1232 occupies on the screen. */
859f2b3c 1233
4ed46869
KH
1234int
1235strwidth (str, len)
1236 unsigned char *str;
1237 int len;
1238{
1239 unsigned char *endp = str + len;
1240 int width = 0;
c4a4e28f 1241 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869 1242
859f2b3c
RS
1243 while (str < endp)
1244 {
1245 if (*str == LEADING_CODE_COMPOSITION)
1246 {
1247 int id = str_cmpchar_id (str, endp - str);
1248
1249 if (id < 0)
1250 {
1251 width += 4;
1252 str++;
1253 }
1254 else
1255 {
1256 width += cmpchar_table[id]->width;
1257 str += cmpchar_table[id]->len;
1258 }
1259 }
1260 else
1261 {
1262 Lisp_Object disp;
e515b0a9
KH
1263 int thislen;
1264 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
859f2b3c
RS
1265
1266 /* Get the way the display table would display it. */
acc35c36
RS
1267 if (dp)
1268 disp = DISP_CHAR_VECTOR (dp, c);
1269 else
1270 disp = Qnil;
859f2b3c
RS
1271
1272 if (VECTORP (disp))
e515b0a9 1273 width += XVECTOR (disp)->size;
859f2b3c 1274 else
e515b0a9 1275 width += ONE_BYTE_CHAR_WIDTH (*str);
859f2b3c 1276
e515b0a9 1277 str += thislen;
859f2b3c
RS
1278 }
1279 }
4ed46869
KH
1280 return width;
1281}
1282
1283DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1284 "Return width of STRING when displayed in the current buffer.\n\
1285Width is measured by how many columns it occupies on the screen.\n\
046b1f03
RS
1286When calculating width of a multibyte character in STRING,\n\
1287only the base leading-code is considered; the validity of\n\
1288the following bytes is not checked.")
4ed46869
KH
1289 (str)
1290 Lisp_Object str;
1291{
1292 Lisp_Object val;
1293
1294 CHECK_STRING (str, 0);
fc932ac6
RS
1295 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1296 STRING_BYTES (XSTRING (str))));
4ed46869
KH
1297 return val;
1298}
1299
1300DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1301 "Return the direction of CHAR.\n\
1302The returned value is 0 for left-to-right and 1 for right-to-left.")
1303 (ch)
1304 Lisp_Object ch;
1305{
1306 int charset;
1307
1308 CHECK_NUMBER (ch, 0);
1309 charset = CHAR_CHARSET (XFASTINT (ch));
1310 if (!CHARSET_DEFINED_P (charset))
93bcb785 1311 invalid_character (XINT (ch));
4ed46869
KH
1312 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1313}
1314
af4fecb4 1315DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
6ae1f27e 1316 "Return number of characters between BEG and END.")
046b1f03
RS
1317 (beg, end)
1318 Lisp_Object beg, end;
1319{
6ae1f27e 1320 int from, to;
046b1f03 1321
17e7ef1b
RS
1322 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1323 CHECK_NUMBER_COERCE_MARKER (end, 1);
1324
046b1f03 1325 from = min (XFASTINT (beg), XFASTINT (end));
a8a35e61 1326 to = max (XFASTINT (beg), XFASTINT (end));
046b1f03 1327
a8c21066 1328 return make_number (to - from);
6ae1f27e 1329}
9036eb45 1330
87b089ad
RS
1331/* Return the number of characters in the NBYTES bytes at PTR.
1332 This works by looking at the contents and checking for multibyte sequences.
1333 However, if the current buffer has enable-multibyte-characters = nil,
1334 we treat each byte as a character. */
1335
6ae1f27e
RS
1336int
1337chars_in_text (ptr, nbytes)
1338 unsigned char *ptr;
1339 int nbytes;
1340{
93bcb785 1341 unsigned char *endp, c;
6ae1f27e 1342 int chars;
046b1f03 1343
87b089ad
RS
1344 /* current_buffer is null at early stages of Emacs initialization. */
1345 if (current_buffer == 0
1346 || NILP (current_buffer->enable_multibyte_characters))
6ae1f27e 1347 return nbytes;
a8a35e61 1348
6ae1f27e
RS
1349 endp = ptr + nbytes;
1350 chars = 0;
046b1f03 1351
6ae1f27e
RS
1352 while (ptr < endp)
1353 {
93bcb785
KH
1354 c = *ptr++;
1355
1356 if (BASE_LEADING_CODE_P (c))
1357 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
046b1f03
RS
1358 chars++;
1359 }
1360
6ae1f27e 1361 return chars;
046b1f03
RS
1362}
1363
87b089ad
RS
1364/* Return the number of characters in the NBYTES bytes at PTR.
1365 This works by looking at the contents and checking for multibyte sequences.
1366 It ignores enable-multibyte-characters. */
1367
1368int
1369multibyte_chars_in_text (ptr, nbytes)
1370 unsigned char *ptr;
1371 int nbytes;
1372{
93bcb785 1373 unsigned char *endp, c;
87b089ad
RS
1374 int chars;
1375
1376 endp = ptr + nbytes;
1377 chars = 0;
1378
1379 while (ptr < endp)
1380 {
93bcb785
KH
1381 c = *ptr++;
1382
1383 if (BASE_LEADING_CODE_P (c))
1384 while (ptr < endp && ! CHAR_HEAD_P (*ptr)) ptr++;
87b089ad
RS
1385 chars++;
1386 }
1387
1388 return chars;
1389}
1390
1391DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
4ed46869 1392 "Concatenate all the argument characters and make the result a string.")
53316e55
KH
1393 (n, args)
1394 int n;
4ed46869
KH
1395 Lisp_Object *args;
1396{
53316e55 1397 int i;
4ed46869 1398 unsigned char *buf
bd4c6dd0 1399 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
4ed46869
KH
1400 unsigned char *p = buf;
1401 Lisp_Object val;
1402
1403 for (i = 0; i < n; i++)
1404 {
1405 int c, len;
1406 unsigned char *str;
1407
1408 if (!INTEGERP (args[i]))
b0e3cf2b 1409 CHECK_NUMBER (args[i], 0);
4ed46869
KH
1410 c = XINT (args[i]);
1411 len = CHAR_STRING (c, p, str);
1412 if (p != str)
1413 /* C is a composite character. */
1414 bcopy (str, p, len);
1415 p += len;
1416 }
1417
020da460
KH
1418 /* Here, we can't use make_string_from_bytes because of byte
1419 combining problem. */
1420 val = make_string (buf, p - buf);
4ed46869
KH
1421 return val;
1422}
1423
1424#endif /* emacs */
1425\f
1426/*** Composite characters staffs ***/
1427
1428/* Each composite character is identified by CMPCHAR-ID which is
1429 assigned when Emacs needs the character code of the composite
1430 character (e.g. when displaying it on the screen). See the
1431 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1432 composite character is represented in Emacs. */
1433
1434/* If `static' is defined, it means that it is defined to null string. */
1435#ifndef static
1436/* The following function is copied from lread.c. */
1437static int
1438hash_string (ptr, len)
1439 unsigned char *ptr;
1440 int len;
1441{
1442 register unsigned char *p = ptr;
1443 register unsigned char *end = p + len;
1444 register unsigned char c;
1445 register int hash = 0;
1446
1447 while (p != end)
1448 {
1449 c = *p++;
1450 if (c >= 0140) c -= 40;
1451 hash = ((hash<<3) + (hash>>28) + c);
1452 }
1453 return hash & 07777777777;
1454}
1455#endif
1456
4ed46869
KH
1457#define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1458
1459static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1460
1461/* Each element of `cmpchar_hash_table' is a pointer to an array of
1462 integer, where the 1st element is the size of the array, the 2nd
1463 element is how many elements are actually used in the array, and
1464 the remaining elements are CMPCHAR-IDs of composite characters of
1465 the same hash value. */
1466#define CMPCHAR_HASH_SIZE(table) table[0]
1467#define CMPCHAR_HASH_USED(table) table[1]
1468#define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1469
1470/* Return CMPCHAR-ID of the composite character in STR of the length
1471 LEN. If the composite character has not yet been registered,
1472 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1473 is the sole function for assigning CMPCHAR-ID. */
1474int
1475str_cmpchar_id (str, len)
8867de67 1476 const unsigned char *str;
4ed46869
KH
1477 int len;
1478{
1479 int hash_idx, *hashp;
1480 unsigned char *buf;
1481 int embedded_rule; /* 1 if composition rule is embedded. */
1482 int chars; /* number of components. */
1483 int i;
1484 struct cmpchar_info *cmpcharp;
1485
4ed46869
KH
1486 /* The second byte 0xFF means compostion rule is embedded. */
1487 embedded_rule = (str[1] == 0xFF);
1488
1489 /* At first, get the actual length of the composite character. */
1490 {
8867de67 1491 const unsigned char *p, *endp = str + 1, *lastp = str + len;
4ed46869
KH
1492 int bytes;
1493
6ae1f27e 1494 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
93bcb785
KH
1495 if (endp - str < 5)
1496 /* Any composite char have at least 5-byte length. */
1497 return -1;
1498
4ed46869 1499 chars = 0;
93bcb785 1500 p = str + 1;
4ed46869
KH
1501 while (p < endp)
1502 {
9b4d1fe6
KH
1503 if (embedded_rule)
1504 {
1505 p++;
1506 if (p >= endp)
1507 return -1;
1508 }
4ed46869 1509 /* No need of checking if *P is 0xA0 because
93bcb785
KH
1510 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1511 p += BYTES_BY_CHAR_HEAD (*p - 0x20);
4ed46869
KH
1512 chars++;
1513 }
93bcb785
KH
1514 if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1515 /* Invalid components. */
4ed46869 1516 return -1;
93bcb785 1517 len = p - str;
4ed46869
KH
1518 }
1519 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1520 hashp = cmpchar_hash_table[hash_idx];
1521
1522 /* Then, look into the hash table. */
1523 if (hashp != NULL)
1524 /* Find the correct one among composite characters of the same
1525 hash value. */
1526 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1527 {
1528 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1529 if (len == cmpcharp->len
1530 && ! bcmp (str, cmpcharp->data, len))
1531 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1532 }
1533
1534 /* We have to register the composite character in cmpchar_table. */
0282eb69 1535 if (n_cmpchars >= (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
513ee442
KH
1536 /* No, we have no more room for a new composite character. */
1537 return -1;
1538
4ed46869
KH
1539 /* Make the entry in hash table. */
1540 if (hashp == NULL)
1541 {
1542 /* Make a table for 8 composite characters initially. */
1543 hashp = (cmpchar_hash_table[hash_idx]
1544 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1545 CMPCHAR_HASH_SIZE (hashp) = 10;
1546 CMPCHAR_HASH_USED (hashp) = 2;
1547 }
1548 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1549 {
1550 CMPCHAR_HASH_SIZE (hashp) += 8;
1551 hashp = (cmpchar_hash_table[hash_idx]
1552 = (int *) xrealloc (hashp,
1553 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1554 }
1555 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1556 CMPCHAR_HASH_USED (hashp)++;
1557
1558 /* Set information of the composite character in cmpchar_table. */
1559 if (cmpchar_table_size == 0)
1560 {
1561 /* This is the first composite character to be registered. */
1562 cmpchar_table_size = 256;
1563 cmpchar_table
1564 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1565 * cmpchar_table_size);
1566 }
1567 else if (cmpchar_table_size <= n_cmpchars)
1568 {
1569 cmpchar_table_size += 256;
1570 cmpchar_table
1571 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1572 sizeof (cmpchar_table[0])
1573 * cmpchar_table_size);
1574 }
1575
1576 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1577
1578 cmpcharp->len = len;
1579 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1580 bcopy (str, cmpcharp->data, len);
1581 cmpcharp->data[len] = 0;
1582 cmpcharp->glyph_len = chars;
1583 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1584 if (embedded_rule)
1585 {
1586 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1587 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1588 }
1589 else
1590 {
1591 cmpcharp->cmp_rule = NULL;
1592 cmpcharp->col_offset = NULL;
1593 }
1594
1595 /* Setup GLYPH data and composition rules (if any) so as not to make
1596 them every time on displaying. */
1597 {
1598 unsigned char *bufp;
1599 int width;
1600 float leftmost = 0.0, rightmost = 1.0;
1601
1602 if (embedded_rule)
1603 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1604 cmpcharp->col_offset[0] = 0;
1605
1606 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1607 {
1608 if (embedded_rule)
1609 cmpcharp->cmp_rule[i] = *bufp++;
1610
1611 if (*bufp == 0xA0) /* This is an ASCII character. */
1612 {
1613 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1614 width = 1;
1615 bufp++;
1616 }
1617 else /* Multibyte character. */
1618 {
1619 /* Make `bufp' point normal multi-byte form temporally. */
1620 *bufp -= 0x20;
1621 cmpcharp->glyph[i]
537efd8d 1622 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0, 0), 0);
4ed46869
KH
1623 width = WIDTH_BY_CHAR_HEAD (*bufp);
1624 *bufp += 0x20;
1625 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1626 }
1627
1628 if (embedded_rule && i > 0)
1629 {
1630 /* Reference points (global_ref and new_ref) are
1631 encoded as below:
1632
1633 0--1--2 -- ascent
1634 | |
1635 | |
1636 | 4 -+--- center
1637 -- 3 5 -- baseline
1638 | |
1639 6--7--8 -- descent
1640
1641 Now, we calculate the column offset of the new glyph
1642 from the left edge of the first glyph. This can avoid
1643 the same calculation everytime displaying this
1644 composite character. */
1645
1646 /* Reference points of global glyph and new glyph. */
1647 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1648 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1649 /* Column offset relative to the first glyph. */
1650 float left = (leftmost
1651 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1652 - (new_ref % 3) * width / 2.0);
1653
1654 cmpcharp->col_offset[i] = left;
1655 if (left < leftmost)
1656 leftmost = left;
1657 if (left + width > rightmost)
1658 rightmost = left + width;
1659 }
1660 else
1661 {
1662 if (width > rightmost)
1663 rightmost = width;
1664 }
1665 }
1666 if (embedded_rule)
1667 {
1668 /* Now col_offset[N] are relative to the left edge of the
1669 first component. Make them relative to the left edge of
1670 overall glyph. */
1671 for (i = 0; i < chars; i++)
1672 cmpcharp->col_offset[i] -= leftmost;
1673 /* Make rightmost holds width of overall glyph. */
1674 rightmost -= leftmost;
1675 }
1676
1677 cmpcharp->width = rightmost;
1678 if (cmpcharp->width < rightmost)
1679 /* To get a ceiling integer value. */
1680 cmpcharp->width++;
1681 }
1682
1683 cmpchar_table[n_cmpchars] = cmpcharp;
1684
1685 return n_cmpchars++;
1686}
1687
de54b0d5
KH
1688/* Return the Nth element of the composite character C. If NOERROR is
1689 nonzero, return 0 on error condition (C is an invalid composite
1690 charcter, or N is out of range). */
4ed46869 1691int
de54b0d5
KH
1692cmpchar_component (c, n, noerror)
1693 int c, n, noerror;
4ed46869
KH
1694{
1695 int id = COMPOSITE_CHAR_ID (c);
1696
de54b0d5
KH
1697 if (id < 0 || id >= n_cmpchars)
1698 {
1699 /* C is not a valid composite character. */
1700 if (noerror) return 0;
1701 error ("Invalid composite character: %d", c) ;
1702 }
1703 if (n >= cmpchar_table[id]->glyph_len)
1704 {
1705 /* No such component. */
1706 if (noerror) return 0;
1707 args_out_of_range (make_number (c), make_number (n));
1708 }
4ed46869
KH
1709 /* No face data is stored in glyph code. */
1710 return ((int) (cmpchar_table[id]->glyph[n]));
1711}
1712
1713DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1714 "T if CHAR is a composite character.")
1715 (ch)
1716 Lisp_Object ch;
1717{
1718 CHECK_NUMBER (ch, 0);
1719 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1720}
1721
1722DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1723 2, 2, 0,
de54b0d5
KH
1724 "Return the Nth component character of composite character CHARACTER.")
1725 (character, n)
1726 Lisp_Object character, n;
4ed46869 1727{
de54b0d5 1728 int id;
4ed46869
KH
1729
1730 CHECK_NUMBER (character, 0);
de54b0d5 1731 CHECK_NUMBER (n, 1);
4ed46869 1732
de54b0d5 1733 return (make_number (cmpchar_component (XINT (character), XINT (n), 0)));
4ed46869
KH
1734}
1735
1736DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1737 2, 2, 0,
de54b0d5 1738 "Return the Nth composition rule of composite character CHARACTER.\n\
55001746 1739The returned rule is for composing the Nth component\n\
de54b0d5
KH
1740on the (N-1)th component.\n\
1741If CHARACTER should be composed relatively or N is 0, return 255.")
55001746
KH
1742 (character, n)
1743 Lisp_Object character, n;
4ed46869 1744{
de54b0d5 1745 int id;
4ed46869
KH
1746
1747 CHECK_NUMBER (character, 0);
55001746 1748 CHECK_NUMBER (n, 1);
4ed46869
KH
1749
1750 id = COMPOSITE_CHAR_ID (XINT (character));
1751 if (id < 0 || id >= n_cmpchars)
1752 error ("Invalid composite character: %d", XINT (character));
de54b0d5 1753 if (XINT (n) < 0 || XINT (n) >= cmpchar_table[id]->glyph_len)
55001746 1754 args_out_of_range (character, n);
4ed46869 1755
de54b0d5
KH
1756 return make_number (cmpchar_table[id]->cmp_rule
1757 ? cmpchar_table[id]->cmp_rule[XINT (n)]
1758 : 255);
4ed46869
KH
1759}
1760
1761DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1762 Scmpchar_cmp_rule_p, 1, 1, 0,
1763 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1764 (character)
1765 Lisp_Object character;
1766{
1767 int id;
1768
1769 CHECK_NUMBER (character, 0);
1770 id = COMPOSITE_CHAR_ID (XINT (character));
1771 if (id < 0 || id >= n_cmpchars)
1772 error ("Invalid composite character: %d", XINT (character));
1773
1774 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1775}
1776
1777DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1778 Scmpchar_cmp_count, 1, 1, 0,
1779 "Return number of compoents of composite character CHARACTER.")
1780 (character)
1781 Lisp_Object character;
1782{
1783 int id;
1784
1785 CHECK_NUMBER (character, 0);
1786 id = COMPOSITE_CHAR_ID (XINT (character));
1787 if (id < 0 || id >= n_cmpchars)
1788 error ("Invalid composite character: %d", XINT (character));
1789
1790 return (make_number (cmpchar_table[id]->glyph_len));
1791}
1792
1793DEFUN ("compose-string", Fcompose_string, Scompose_string,
1794 1, 1, 0,
1795 "Return one char string composed from all characters in STRING.")
1796 (str)
1797 Lisp_Object str;
1798{
1799 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1800 int len, i;
1801
1802 CHECK_STRING (str, 0);
1803
1804 buf[0] = LEADING_CODE_COMPOSITION;
1805 p = XSTRING (str)->data;
fc932ac6 1806 pend = p + STRING_BYTES (XSTRING (str));
4ed46869
KH
1807 i = 1;
1808 while (p < pend)
1809 {
9b4d1fe6 1810 if (*p < 0x20) /* control code */
4ed46869
KH
1811 error ("Invalid component character: %d", *p);
1812 else if (*p < 0x80) /* ASCII */
1813 {
1814 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1815 error ("Too long string to be composed: %s", XSTRING (str)->data);
1816 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1817 code itself. */
1818 buf[i++] = 0xA0;
1819 buf[i++] = *p++ + 0x80;
1820 }
1821 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1822 {
1823 /* Already composed. Eliminate the heading
1824 LEADING_CODE_COMPOSITION, keep the remaining bytes
1825 unchanged. */
1826 p++;
de54b0d5
KH
1827 if (*p == 255)
1828 error ("Can't compose a rule-based composition character");
4ed46869 1829 ptemp = p;
6ae1f27e 1830 while (! CHAR_HEAD_P (*p)) p++;
9b4d1fe6
KH
1831 if (str_cmpchar_id (ptemp - 1, p - ptemp + 1) < 0)
1832 error ("Can't compose an invalid composition character");
4ed46869
KH
1833 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1834 error ("Too long string to be composed: %s", XSTRING (str)->data);
1835 bcopy (ptemp, buf + i, p - ptemp);
1836 i += p - ptemp;
1837 }
1838 else /* multibyte char */
1839 {
1840 /* Add 0x20 to the base leading-code, keep the remaining
1841 bytes unchanged. */
9b4d1fe6
KH
1842 int c = STRING_CHAR_AND_CHAR_LENGTH (p, pend - p, len);
1843
1844 if (len <= 1 || ! CHAR_VALID_P (c, 0))
1845 error ("Can't compose an invalid character");
4ed46869
KH
1846 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1847 error ("Too long string to be composed: %s", XSTRING (str)->data);
1848 bcopy (p, buf + i, len);
1849 buf[i] += 0x20;
1850 p += len, i += len;
1851 }
1852 }
1853
1854 if (i < 5)
1855 /* STR contains only one character, which can't be composed. */
1856 error ("Too short string to be composed: %s", XSTRING (str)->data);
1857
27802600 1858 return make_string_from_bytes (buf, 1, i);
4ed46869
KH
1859}
1860
1861\f
dfcf069d 1862int
4ed46869
KH
1863charset_id_internal (charset_name)
1864 char *charset_name;
1865{
76d7b829 1866 Lisp_Object val;
4ed46869 1867
76d7b829 1868 val= Fget (intern (charset_name), Qcharset);
4ed46869
KH
1869 if (!VECTORP (val))
1870 error ("Charset %s is not defined", charset_name);
1871
1872 return (XINT (XVECTOR (val)->contents[0]));
1873}
1874
1875DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1876 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1877 ()
1878{
1879 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1880 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1881 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1882 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1883 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1884 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1885 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1886 return Qnil;
1887}
1888
dfcf069d 1889void
4ed46869
KH
1890init_charset_once ()
1891{
1892 int i, j, k;
1893
1894 staticpro (&Vcharset_table);
1895 staticpro (&Vcharset_symbol_table);
8a73a704 1896 staticpro (&Vgeneric_character_list);
4ed46869
KH
1897
1898 /* This has to be done here, before we call Fmake_char_table. */
1899 Qcharset_table = intern ("charset-table");
1900 staticpro (&Qcharset_table);
1901
1902 /* Intern this now in case it isn't already done.
1903 Setting this variable twice is harmless.
1904 But don't staticpro it here--that is done in alloc.c. */
1905 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1906
1907 /* Now we are ready to set up this property, so we can
1908 create the charset table. */
1909 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1910 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1911
0282eb69
KH
1912 Qunknown = intern ("unknown");
1913 staticpro (&Qunknown);
1914 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1915 Qunknown);
4ed46869
KH
1916
1917 /* Setup tables. */
1918 for (i = 0; i < 2; i++)
1919 for (j = 0; j < 2; j++)
1920 for (k = 0; k < 128; k++)
1921 iso_charset_table [i][j][k] = -1;
1922
1923 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
1924 cmpchar_table_size = n_cmpchars = 0;
1925
60383934 1926 for (i = 0; i < 256; i++)
4ed46869 1927 BYTES_BY_CHAR_HEAD (i) = 1;
6ef23ebb
KH
1928 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
1929 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
1930 BYTES_BY_CHAR_HEAD (i) = 2;
1931 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
1932 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1933 BYTES_BY_CHAR_HEAD (i) = 3;
4ed46869
KH
1934 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1935 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1936 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1937 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
6ef23ebb 1938 /* The followings don't reflect the actual bytes, but just to tell
4ed46869
KH
1939 that it is a start of a multibyte character. */
1940 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
6ef23ebb
KH
1941 BYTES_BY_CHAR_HEAD (0x9E) = 2;
1942 BYTES_BY_CHAR_HEAD (0x9F) = 2;
4ed46869
KH
1943
1944 for (i = 0; i < 128; i++)
1945 WIDTH_BY_CHAR_HEAD (i) = 1;
1946 for (; i < 256; i++)
1947 WIDTH_BY_CHAR_HEAD (i) = 4;
1948 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1949 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1950 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1951 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
8a73a704
KH
1952
1953 {
76d7b829 1954 Lisp_Object val;
8a73a704 1955
76d7b829 1956 val = Qnil;
8a73a704
KH
1957 for (i = 0x81; i < 0x90; i++)
1958 val = Fcons (make_number ((i - 0x70) << 7), val);
1959 for (; i < 0x9A; i++)
1960 val = Fcons (make_number ((i - 0x8F) << 14), val);
1961 for (i = 0xA0; i < 0xF0; i++)
1962 val = Fcons (make_number ((i - 0x70) << 7), val);
1963 for (; i < 0xFF; i++)
1964 val = Fcons (make_number ((i - 0xE0) << 14), val);
1965 val = Fcons (make_number (GENERIC_COMPOSITION_CHAR), val);
1966 Vgeneric_character_list = Fnreverse (val);
1967 }
bbf12bb3
KH
1968
1969 nonascii_insert_offset = 0;
1970 Vnonascii_translation_table = Qnil;
4ed46869
KH
1971}
1972
1973#ifdef emacs
1974
dfcf069d 1975void
4ed46869
KH
1976syms_of_charset ()
1977{
1978 Qascii = intern ("ascii");
1979 staticpro (&Qascii);
1980
1981 Qcharset = intern ("charset");
1982 staticpro (&Qcharset);
1983
1984 /* Define ASCII charset now. */
1985 update_charset_table (make_number (CHARSET_ASCII),
1986 make_number (1), make_number (94),
1987 make_number (1),
1988 make_number (0),
1989 make_number ('B'),
1990 make_number (0),
1991 build_string ("ASCII"),
1992 build_string ("ASCII"),
1993 build_string ("ASCII (ISO646 IRV)"));
1994 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1995 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1996
1997 Qcomposition = intern ("composition");
1998 staticpro (&Qcomposition);
1999 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
2000
c1a08b4c
KH
2001 Qauto_fill_chars = intern ("auto-fill-chars");
2002 staticpro (&Qauto_fill_chars);
2003 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
2004
4ed46869 2005 defsubr (&Sdefine_charset);
8a73a704 2006 defsubr (&Sgeneric_character_list);
3fac5a51 2007 defsubr (&Sget_unused_iso_final_char);
4ed46869
KH
2008 defsubr (&Sdeclare_equiv_charset);
2009 defsubr (&Sfind_charset_region);
2010 defsubr (&Sfind_charset_string);
2011 defsubr (&Smake_char_internal);
2012 defsubr (&Ssplit_char);
2013 defsubr (&Schar_charset);
90d7b74e 2014 defsubr (&Scharset_after);
4ed46869 2015 defsubr (&Siso_charset);
9d3d8cba 2016 defsubr (&Schar_valid_p);
d2665018 2017 defsubr (&Sunibyte_char_to_multibyte);
1bcc1567 2018 defsubr (&Smultibyte_char_to_unibyte);
4ed46869
KH
2019 defsubr (&Schar_bytes);
2020 defsubr (&Schar_width);
2021 defsubr (&Sstring_width);
2022 defsubr (&Schar_direction);
af4fecb4 2023 defsubr (&Schars_in_region);
87b089ad 2024 defsubr (&Sstring);
4ed46869
KH
2025 defsubr (&Scmpcharp);
2026 defsubr (&Scmpchar_component);
2027 defsubr (&Scmpchar_cmp_rule);
2028 defsubr (&Scmpchar_cmp_rule_p);
2029 defsubr (&Scmpchar_cmp_count);
2030 defsubr (&Scompose_string);
2031 defsubr (&Ssetup_special_charsets);
2032
2033 DEFVAR_LISP ("charset-list", &Vcharset_list,
2034 "List of charsets ever defined.");
2035 Vcharset_list = Fcons (Qascii, Qnil);
2036
537efd8d 2037 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
b4e9dd77
KH
2038 "Vector of cons cell of a symbol and translation table ever defined.\n\
2039An ID of a translation table is an index of this vector.");
537efd8d 2040 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
b0e3cf2b 2041
4ed46869
KH
2042 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
2043 "Leading-code of composite characters.");
2044 leading_code_composition = LEADING_CODE_COMPOSITION;
2045
2046 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
2047 "Leading-code of private TYPE9N charset of column-width 1.");
2048 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
2049
2050 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
2051 "Leading-code of private TYPE9N charset of column-width 2.");
2052 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
2053
2054 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
2055 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
2056 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
2057
2058 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
2059 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
2060 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
35e623fb
RS
2061
2062 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
d2665018 2063 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
4cf9710d
RS
2064This is used for converting unibyte text to multibyte,\n\
2065and for inserting character codes specified by number.\n\n\
3e8ceaac
RS
2066This serves to convert a Latin-1 or similar 8-bit character code\n\
2067to the corresponding Emacs multibyte character code.\n\
2068Typically the value should be (- (make-char CHARSET 0) 128),\n\
2069for your choice of character set.\n\
537efd8d 2070If `nonascii-translation-table' is non-nil, it overrides this variable.");
35e623fb 2071 nonascii_insert_offset = 0;
b0e3cf2b 2072
b4e9dd77 2073 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
537efd8d 2074 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
4cf9710d
RS
2075This is used for converting unibyte text to multibyte,\n\
2076and for inserting character codes specified by number.\n\n\
2077Conversion is performed only when multibyte characters are enabled,\n\
2078and it serves to convert a Latin-1 or similar 8-bit character code\n\
2079to the corresponding Emacs character code.\n\n\
da4d65af 2080If this is nil, `nonascii-insert-offset' is used instead.\n\
b4e9dd77
KH
2081See also the docstring of `make-translation-table'.");
2082 Vnonascii_translation_table = Qnil;
4cf9710d 2083
b0e3cf2b
KH
2084 DEFVAR_INT ("min-composite-char", &min_composite_char,
2085 "Minimum character code of a composite character.");
2086 min_composite_char = MIN_CHAR_COMPOSITION;
c1a08b4c
KH
2087
2088 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
2089 "A char-table for characters which invoke auto-filling.\n\
2090Such characters has value t in this table.");
2091 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
60022cb7
AS
2092 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
2093 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
4ed46869
KH
2094}
2095
2096#endif /* emacs */