Clear up previous change.
[bpt/emacs.git] / src / charset.c
CommitLineData
75c8c592 1/* Basic multilingual character support.
35e623fb 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
75c8c592 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25#include <stdio.h>
26
27#ifdef emacs
28
29#include <sys/types.h>
30#include <config.h>
31#include "lisp.h"
32#include "buffer.h"
33#include "charset.h"
34#include "coding.h"
fc6b09bf 35#include "disptab.h"
4ed46869
KH
36
37#else /* not emacs */
38
39#include "mulelib.h"
40
41#endif /* emacs */
42
43Lisp_Object Qcharset, Qascii, Qcomposition;
0282eb69 44Lisp_Object Qunknown;
4ed46869
KH
45
46/* Declaration of special leading-codes. */
47int leading_code_composition; /* for composite characters */
48int leading_code_private_11; /* for private DIMENSION1 of 1-column */
49int leading_code_private_12; /* for private DIMENSION1 of 2-column */
50int leading_code_private_21; /* for private DIMENSION2 of 1-column */
51int leading_code_private_22; /* for private DIMENSION2 of 2-column */
52
53/* Declaration of special charsets. */
54int charset_ascii; /* ASCII */
55int charset_composition; /* for a composite character */
56int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
57int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
58int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
59int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
60int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
61int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
62int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
63
b0e3cf2b
KH
64int min_composite_char;
65
4ed46869
KH
66Lisp_Object Qcharset_table;
67
68/* A char-table containing information of each character set. */
69Lisp_Object Vcharset_table;
70
71/* A vector of charset symbol indexed by charset-id. This is used
72 only for returning charset symbol from C functions. */
73Lisp_Object Vcharset_symbol_table;
74
75/* A list of charset symbols ever defined. */
76Lisp_Object Vcharset_list;
77
537efd8d
KH
78/* Vector of translation table ever defined.
79 ID of a translation table is used to index this vector. */
80Lisp_Object Vtranslation_table_vector;
b0e3cf2b 81
c1a08b4c
KH
82/* A char-table for characters which may invoke auto-filling. */
83Lisp_Object Vauto_fill_chars;
84
85Lisp_Object Qauto_fill_chars;
86
4ed46869
KH
87/* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
88int bytes_by_char_head[256];
89int width_by_char_head[256];
90
91/* Mapping table from ISO2022's charset (specified by DIMENSION,
92 CHARS, and FINAL-CHAR) to Emacs' charset. */
93int iso_charset_table[2][2][128];
94
513ee442
KH
95/* Table of pointers to the structure `cmpchar_info' indexed by
96 CMPCHAR-ID. */
97struct cmpchar_info **cmpchar_table;
98/* The current size of `cmpchar_table'. */
99static int cmpchar_table_size;
100/* Number of the current composite characters. */
101int n_cmpchars;
102
4ed46869
KH
103/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
104unsigned char *_fetch_multibyte_char_p;
105int _fetch_multibyte_char_len;
106
35e623fb
RS
107/* Offset to add to a non-ASCII value when inserting it. */
108int nonascii_insert_offset;
109
4cf9710d
RS
110/* Translation table for converting non-ASCII unibyte characters
111 to multibyte codes, or nil. */
b4e9dd77 112Lisp_Object Vnonascii_translation_table;
4cf9710d 113
8a73a704
KH
114/* List of all possible generic characters. */
115Lisp_Object Vgeneric_character_list;
116
046b1f03
RS
117#define min(X, Y) ((X) < (Y) ? (X) : (Y))
118#define max(X, Y) ((X) > (Y) ? (X) : (Y))
119\f
93bcb785
KH
120void
121invalid_character (c)
122 int c;
123{
ba7434e5 124 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
93bcb785
KH
125}
126
44c6492d
KH
127/* Parse composite character string STR of length LENGTH (>= 2) and
128 set BYTES, CHARSET, C1, and C2 as below.
129
130 It is assumed that *STR is LEADING_CODE_COMPOSITION and the
131 following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
132
133 If there is a valid composite character, set CHARSET, C1, and C2 to
134 such values that MAKE_CHAR can make the composite character from
135 them. Otherwise, set CHARSET to CHARSET_COMPOSITION, set C1 to the
136 second byte of the sequence, C2 to -1 so that MAKE_CHAR can make
137 the invalid multibyte character whose string representation is two
138 bytes of STR[0] and STR[1]. In any case, set BYTES to LENGTH.
139
140 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */
ac4137cc
KH
141
142#define SPLIT_COMPOSITE_SEQ(str, length, bytes, charset, c1, c2) \
143 do { \
144 int cmpchar_id = str_cmpchar_id ((str), (length)); \
145 \
146 (charset) = CHARSET_COMPOSITION; \
147 (bytes) = (length); \
148 if (cmpchar_id >= 0) \
149 { \
150 (c1) = CHAR_FIELD2 (cmpchar_id); \
151 (c2) = CHAR_FIELD3 (cmpchar_id); \
152 } \
153 else \
154 { \
155 (c1) = (str)[1] & 0x7F; \
156 (c2) = -1; \
157 } \
158 } while (0)
159
44c6492d
KH
160/* Parse non-composite multibyte character string STR of length LENGTH
161 (>= 2) and set BYTES to the length of actual multibyte sequence,
162 CHARSET, C1, and C2 to such values that MAKE_CHAR can make the
163 multibyte character from them.
164
165 It is assumed that *STR is one of base leading codes (excluding
166 LEADING_CODE_COMPOSITION) and the following (LENGTH - 1) bytes
167 satisfy !CHAR_HEAD_P.
168
169 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */
ac4137cc
KH
170
171#define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \
172 do { \
173 (bytes) = 1; \
174 (charset) = (str)[0]; \
175 if ((charset) >= LEADING_CODE_PRIVATE_11 \
176 && (charset) <= LEADING_CODE_PRIVATE_22) \
177 (charset) = (str)[(bytes)++]; \
178 if ((bytes) < (length)) \
179 { \
180 (c1) = (str)[(bytes)++] & 0x7F; \
181 if ((bytes) < (length)) \
182 (c2) = (str)[(bytes)++] & 0x7F; \
183 else \
184 (c2) = -1; \
185 } \
186 else \
187 (c1) = (c2) = -1; \
188 } while (0)
189
190/* Parse string STR of length LENGTH and check if a multibyte
191 characters is at STR. set BYTES to the actual length, CHARSET, C1,
192 C2 to proper values for that character. */
193
194#define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
195 do { \
196 int i; \
44c6492d
KH
197 if (ASCII_BYTE_P ((str)[0])) \
198 i = 1; \
199 else \
200 for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \
ac4137cc
KH
201 if (i == 1) \
202 (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \
203 else if ((str)[0] == LEADING_CODE_COMPOSITION) \
204 SPLIT_COMPOSITE_SEQ (str, i, bytes, charset, c1, c2); \
205 else \
206 { \
207 if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \
208 i = BYTES_BY_CHAR_HEAD ((str)[0]); \
209 SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \
210 } \
211 } while (0)
212
213/* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
44c6492d 214#define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
ac4137cc
KH
215 (CHARSET_DIMENSION (charset) == 1 \
216 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
217 : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F))
93bcb785 218
4ed46869
KH
219/* Set STR a pointer to the multi-byte form of the character C. If C
220 is not a composite character, the multi-byte form is set in WORKBUF
221 and STR points WORKBUF. The caller should allocate at least 4-byte
222 area at WORKBUF in advance. Returns the length of the multi-byte
ac4137cc
KH
223 form. If C is an invalid character, store (C & 0xFF) in WORKBUF[0]
224 and return 1.
4ed46869
KH
225
226 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
227 function directly if C can be an ASCII character. */
228
229int
230non_ascii_char_to_string (c, workbuf, str)
231 int c;
232 unsigned char *workbuf, **str;
233{
6662e69b 234 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
8ac5a9cc 235 {
6662e69b
KH
236 /* Multibyte character can't have a modifier bit. */
237 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
238 invalid_character (c);
239
240 /* For Meta, Shift, and Control modifiers, we need special care. */
8ac5a9cc 241 if (c & CHAR_META)
6662e69b
KH
242 {
243 /* Move the meta bit to the right place for a string. */
244 c = (c & ~CHAR_META) | 0x80;
245 }
246 if (c & CHAR_SHIFT)
247 {
248 /* Shift modifier is valid only with [A-Za-z]. */
249 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
250 c &= ~CHAR_SHIFT;
251 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
252 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
253 }
8ac5a9cc 254 if (c & CHAR_CTL)
6662e69b
KH
255 {
256 /* Simulate the code in lread.c. */
257 /* Allow `\C- ' and `\C-?'. */
258 if (c == (CHAR_CTL | ' '))
259 c = 0;
260 else if (c == (CHAR_CTL | '?'))
261 c = 127;
262 /* ASCII control chars are made from letters (both cases),
263 as well as the non-letters within 0100...0137. */
264 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
265 c &= (037 | (~0177 & ~CHAR_CTL));
266 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
267 c &= (037 | (~0177 & ~CHAR_CTL));
268 }
269
270 /* If C still has any modifier bits, it is an invalid character. */
271 if (c & CHAR_MODIFIER_MASK)
272 invalid_character (c);
273
8ac5a9cc 274 *str = workbuf;
ac4137cc 275 *workbuf++ = c;
8ac5a9cc 276 }
ac4137cc 277 else
4ed46869 278 {
ac4137cc 279 int charset, c1, c2;
4ed46869 280
ac4137cc
KH
281 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
282 if (charset == CHARSET_COMPOSITION)
4ed46869 283 {
ac4137cc
KH
284 if (c >= MAX_CHAR)
285 invalid_character (c);
286 if (c >= MIN_CHAR_COMPOSITION)
287 {
288 /* Valid composite character. */
289 *str = cmpchar_table[COMPOSITE_CHAR_ID (c)]->data;
290 workbuf = *str + cmpchar_table[COMPOSITE_CHAR_ID (c)]->len;
291 }
292 else
293 {
294 /* Invalid but can have multibyte form. */
295 *str = workbuf;
296 *workbuf++ = LEADING_CODE_COMPOSITION;
297 *workbuf++ = c1 | 0x80;
298 }
4ed46869 299 }
ac4137cc 300 else if (charset > CHARSET_COMPOSITION)
4ed46869 301 {
ac4137cc
KH
302 *str = workbuf;
303 if (charset >= LEADING_CODE_EXT_11)
304 *workbuf++ = (charset < LEADING_CODE_EXT_12
305 ? LEADING_CODE_PRIVATE_11
306 : (charset < LEADING_CODE_EXT_21
307 ? LEADING_CODE_PRIVATE_12
308 : (charset < LEADING_CODE_EXT_22
309 ? LEADING_CODE_PRIVATE_21
310 : LEADING_CODE_PRIVATE_22)));
311 *workbuf++ = charset;
312 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
313 invalid_character (c);
314 if (c1)
315 {
316 *workbuf++ = c1 | 0x80;
317 if (c2 > 0)
318 *workbuf++ = c2 | 0x80;
319 }
4ed46869 320 }
ac4137cc
KH
321 else if (charset == CHARSET_ASCII)
322 *workbuf++= c & 0x7F;
323 else
324 invalid_character (c);
4ed46869
KH
325 }
326
4ed46869
KH
327 return (workbuf - *str);
328}
329
44c6492d
KH
330/* Return the non-ASCII character corresponding to multi-byte form at
331 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
332 length of the multibyte form in *ACTUAL_LEN.
537efd8d 333
4ed46869 334 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
44c6492d 335 directly if you want ot handle ASCII characters as well. */
4ed46869 336
dfcf069d 337int
ac4137cc 338string_to_non_ascii_char (str, len, actual_len)
8867de67 339 const unsigned char *str;
ac4137cc 340 int len, *actual_len;
4ed46869 341{
ac4137cc 342 int c, bytes, charset, c1, c2;
4ed46869 343
ac4137cc
KH
344 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
345 c = MAKE_CHAR (charset, c1, c2);
4ed46869 346 if (actual_len)
ac4137cc 347 *actual_len = bytes;
4ed46869
KH
348 return c;
349}
350
44c6492d
KH
351/* Return the length of the multi-byte form at string STR of length LEN.
352 Use the macro MULTIBYTE_FORM_LENGTH instead. */
4ed46869
KH
353int
354multibyte_form_length (str, len)
8867de67 355 const unsigned char *str;
4ed46869
KH
356 int len;
357{
ac4137cc 358 int bytes;
4ed46869 359
ac4137cc 360 PARSE_MULTIBYTE_SEQ (str, len, bytes);
90d7b74e 361 return bytes;
4ed46869
KH
362}
363
ac4137cc
KH
364/* Check multibyte form at string STR of length LEN and set variables
365 pointed by CHARSET, C1, and C2 to charset and position codes of the
366 character at STR, and return 0. If there's no multibyte character,
4ed46869
KH
367 return -1. This should be used only in the macro SPLIT_STRING
368 which checks range of STR in advance. */
369
dfcf069d 370int
4ed46869 371split_non_ascii_string (str, len, charset, c1, c2)
ac4137cc
KH
372 const unsigned char *str;
373 unsigned char *c1, *c2;
374 int len, *charset;
4ed46869 375{
ac4137cc 376 register int bytes, cs, code1, code2 = -1;
4ed46869 377
ac4137cc
KH
378 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
379 if (cs == CHARSET_ASCII)
4ed46869 380 return -1;
ac4137cc
KH
381 *charset = cs;
382 *c1 = code1;
383 *c2 = code2;
384}
385
44c6492d
KH
386/* Return 1 iff character C has valid printable glyph.
387 Use the macro CHAR_PRINTABLE_P instead. */
ac4137cc
KH
388int
389char_printable_p (c)
390 int c;
391{
392 int charset, c1, c2, chars;
393
394 if (SINGLE_BYTE_CHAR_P (c))
395 return 1;
396 if (c >= MIN_CHAR_COMPOSITION)
397 return (c < MAX_CHAR);
398
399 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
400 if (! CHARSET_DEFINED_P (charset))
401 return 0;
402 if (CHARSET_CHARS (charset) == 94
403 ? c1 <= 32 || c1 >= 127
404 : c1 < 32)
405 return 0;
406 if (CHARSET_DIMENSION (charset) == 2
407 && (CHARSET_CHARS (charset) == 94
408 ? c2 <= 32 || c2 >= 127
409 : c2 < 32))
410 return 0;
411 return 1;
4ed46869
KH
412}
413
537efd8d 414/* Translate character C by translation table TABLE. If C
b4e9dd77
KH
415 is negative, translate a character specified by CHARSET, C1, and C2
416 (C1 and C2 are code points of the character). If no translation is
417 found in TABLE, return C. */
dfcf069d 418int
b4e9dd77 419translate_char (table, c, charset, c1, c2)
23d2a7f1
KH
420 Lisp_Object table;
421 int c, charset, c1, c2;
422{
423 Lisp_Object ch;
424 int alt_charset, alt_c1, alt_c2, dimension;
425
426 if (c < 0) c = MAKE_CHAR (charset, c1, c2);
427 if (!CHAR_TABLE_P (table)
ac4137cc 428 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
23d2a7f1
KH
429 return c;
430
431 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
432 dimension = CHARSET_DIMENSION (alt_charset);
433 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
434 /* CH is not a generic character, just return it. */
435 return XFASTINT (ch);
436
437 /* Since CH is a generic character, we must return a specific
438 charater which has the same position codes as C from CH. */
439 if (charset < 0)
440 SPLIT_CHAR (c, charset, c1, c2);
441 if (dimension != CHARSET_DIMENSION (charset))
442 /* We can't make such a character because of dimension mismatch. */
443 return c;
23d2a7f1
KH
444 return MAKE_CHAR (alt_charset, c1, c2);
445}
446
d2665018 447/* Convert the unibyte character C to multibyte based on
b4e9dd77 448 Vnonascii_translation_table or nonascii_insert_offset. If they can't
d2665018
KH
449 convert C to a valid multibyte character, convert it based on
450 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
35e623fb 451
dfcf069d 452int
35e623fb
RS
453unibyte_char_to_multibyte (c)
454 int c;
455{
543b4f61 456 if (c < 0400 && c >= 0200)
35e623fb 457 {
d2665018
KH
458 int c_save = c;
459
b4e9dd77 460 if (! NILP (Vnonascii_translation_table))
bbf12bb3
KH
461 {
462 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
ac4137cc 463 if (c >= 0400 && ! char_valid_p (c, 0))
bbf12bb3
KH
464 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
465 }
466 else if (c >= 0240 && nonascii_insert_offset > 0)
467 {
468 c += nonascii_insert_offset;
ac4137cc 469 if (c < 0400 || ! char_valid_p (c, 0))
bbf12bb3
KH
470 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
471 }
472 else if (c >= 0240)
d2665018 473 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
35e623fb
RS
474 }
475 return c;
476}
76d7b829
KH
477
478
479/* Convert the multibyte character C to unibyte 8-bit character based
480 on Vnonascii_translation_table or nonascii_insert_offset. If
481 REV_TBL is non-nil, it should be a reverse table of
482 Vnonascii_translation_table, i.e. what given by:
483 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
484
485int
486multibyte_char_to_unibyte (c, rev_tbl)
487 int c;
488 Lisp_Object rev_tbl;
489{
490 if (!SINGLE_BYTE_CHAR_P (c))
491 {
492 int c_save = c;
493
494 if (! CHAR_TABLE_P (rev_tbl)
495 && CHAR_TABLE_P (Vnonascii_translation_table))
496 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
497 make_number (0));
498 if (CHAR_TABLE_P (rev_tbl))
499 {
500 Lisp_Object temp;
501 temp = Faref (rev_tbl, make_number (c));
502 if (INTEGERP (temp))
503 c = XINT (temp);
bbf12bb3
KH
504 if (c >= 256)
505 c = (c_save & 0177) + 0200;
506 }
507 else
508 {
509 if (nonascii_insert_offset > 0)
510 c -= nonascii_insert_offset;
511 if (c < 128 || c >= 256)
512 c = (c_save & 0177) + 0200;
76d7b829 513 }
76d7b829
KH
514 }
515
516 return c;
517}
518
35e623fb 519\f
4ed46869
KH
520/* Update the table Vcharset_table with the given arguments (see the
521 document of `define-charset' for the meaning of each argument).
522 Several other table contents are also updated. The caller should
523 check the validity of CHARSET-ID and the remaining arguments in
524 advance. */
525
526void
527update_charset_table (charset_id, dimension, chars, width, direction,
528 iso_final_char, iso_graphic_plane,
529 short_name, long_name, description)
530 Lisp_Object charset_id, dimension, chars, width, direction;
531 Lisp_Object iso_final_char, iso_graphic_plane;
532 Lisp_Object short_name, long_name, description;
533{
534 int charset = XINT (charset_id);
535 int bytes;
536 unsigned char leading_code_base, leading_code_ext;
537
6dc0722d
KH
538 if (NILP (CHARSET_TABLE_ENTRY (charset)))
539 CHARSET_TABLE_ENTRY (charset)
540 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
4ed46869
KH
541
542 /* Get byte length of multibyte form, base leading-code, and
543 extended leading-code of the charset. See the comment under the
544 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
545 bytes = XINT (dimension);
546 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
547 {
548 /* Official charset, it doesn't have an extended leading-code. */
549 if (charset != CHARSET_ASCII)
550 bytes += 1; /* For a base leading-code. */
551 leading_code_base = charset;
552 leading_code_ext = 0;
553 }
554 else
555 {
556 /* Private charset. */
557 bytes += 2; /* For base and extended leading-codes. */
558 leading_code_base
559 = (charset < LEADING_CODE_EXT_12
560 ? LEADING_CODE_PRIVATE_11
561 : (charset < LEADING_CODE_EXT_21
562 ? LEADING_CODE_PRIVATE_12
563 : (charset < LEADING_CODE_EXT_22
564 ? LEADING_CODE_PRIVATE_21
565 : LEADING_CODE_PRIVATE_22)));
566 leading_code_ext = charset;
567 }
568
6ef23ebb
KH
569 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
570 error ("Invalid dimension for the charset-ID %d", charset);
571
4ed46869
KH
572 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
573 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
574 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
575 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
576 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
577 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
578 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
579 = make_number (leading_code_base);
580 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
581 = make_number (leading_code_ext);
582 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
583 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
584 = iso_graphic_plane;
585 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
586 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
587 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
588 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
589
590 {
591 /* If we have already defined a charset which has the same
592 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
593 DIRECTION, we must update the entry REVERSE-CHARSET of both
594 charsets. If there's no such charset, the value of the entry
595 is set to nil. */
596 int i;
597
513ee442 598 for (i = 0; i <= MAX_CHARSET; i++)
4ed46869
KH
599 if (!NILP (CHARSET_TABLE_ENTRY (i)))
600 {
601 if (CHARSET_DIMENSION (i) == XINT (dimension)
602 && CHARSET_CHARS (i) == XINT (chars)
603 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
604 && CHARSET_DIRECTION (i) != XINT (direction))
605 {
606 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
607 = make_number (i);
608 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
609 break;
610 }
611 }
513ee442 612 if (i > MAX_CHARSET)
4ed46869
KH
613 /* No such a charset. */
614 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
615 = make_number (-1);
616 }
617
618 if (charset != CHARSET_ASCII
619 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
620 {
4ed46869
KH
621 width_by_char_head[leading_code_base] = XINT (width);
622
623 /* Update table emacs_code_class. */
624 emacs_code_class[charset] = (bytes == 2
625 ? EMACS_leading_code_2
626 : (bytes == 3
627 ? EMACS_leading_code_3
628 : EMACS_leading_code_4));
629 }
630
631 /* Update table iso_charset_table. */
632 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
633 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
634}
635
636#ifdef emacs
637
638/* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
639 is invalid. */
640int
641get_charset_id (charset_symbol)
642 Lisp_Object charset_symbol;
643{
644 Lisp_Object val;
645 int charset;
646
647 return ((SYMBOLP (charset_symbol)
648 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
649 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
650 CHARSET_VALID_P (charset)))
651 ? charset : -1);
652}
653
654/* Return an identification number for a new private charset of
655 DIMENSION and WIDTH. If there's no more room for the new charset,
656 return 0. */
657Lisp_Object
658get_new_private_charset_id (dimension, width)
659 int dimension, width;
660{
661 int charset, from, to;
662
663 if (dimension == 1)
664 {
665 if (width == 1)
666 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
667 else
668 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
669 }
670 else
671 {
672 if (width == 1)
673 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
674 else
b0e3cf2b 675 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
4ed46869
KH
676 }
677
678 for (charset = from; charset < to; charset++)
679 if (!CHARSET_DEFINED_P (charset)) break;
680
681 return make_number (charset < to ? charset : 0);
682}
683
684DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
685 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
23d2a7f1 686If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
4ed46869
KH
687 treated as a private charset.\n\
688INFO-VECTOR is a vector of the format:\n\
689 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
690 SHORT-NAME LONG-NAME DESCRIPTION]\n\
691The meanings of each elements is as follows:\n\
692DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
693CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
694WIDTH (integer) is the number of columns a character in the charset\n\
695occupies on the screen: one of 0, 1, and 2.\n\
696\n\
697DIRECTION (integer) is the rendering direction of characters in the\n\
277576f6
KH
698charset when rendering. If 0, render from left to right, else\n\
699render from right to left.\n\
4ed46869
KH
700\n\
701ISO-FINAL-CHAR (character) is the final character of the\n\
702corresponding ISO 2022 charset.\n\
703\n\
704ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
705while encoding to variants of ISO 2022 coding system, one of the\n\
706following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
707\n\
708SHORT-NAME (string) is the short name to refer to the charset.\n\
709\n\
710LONG-NAME (string) is the long name to refer to the charset.\n\
711\n\
712DESCRIPTION (string) is the description string of the charset.")
713 (charset_id, charset_symbol, info_vector)
714 Lisp_Object charset_id, charset_symbol, info_vector;
715{
716 Lisp_Object *vec;
717
718 if (!NILP (charset_id))
719 CHECK_NUMBER (charset_id, 0);
720 CHECK_SYMBOL (charset_symbol, 1);
721 CHECK_VECTOR (info_vector, 2);
722
723 if (! NILP (charset_id))
724 {
725 if (! CHARSET_VALID_P (XINT (charset_id)))
726 error ("Invalid CHARSET: %d", XINT (charset_id));
727 else if (CHARSET_DEFINED_P (XINT (charset_id)))
728 error ("Already defined charset: %d", XINT (charset_id));
729 }
730
731 vec = XVECTOR (info_vector)->contents;
732 if (XVECTOR (info_vector)->size != 9
733 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
734 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
735 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
736 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
737 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
738 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
739 || !STRINGP (vec[6])
740 || !STRINGP (vec[7])
741 || !STRINGP (vec[8]))
742 error ("Invalid info-vector argument for defining charset %s",
743 XSYMBOL (charset_symbol)->name->data);
744
745 if (NILP (charset_id))
746 {
747 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
748 if (XINT (charset_id) == 0)
749 error ("There's no room for a new private charset %s",
750 XSYMBOL (charset_symbol)->name->data);
751 }
752
753 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
754 vec[4], vec[5], vec[6], vec[7], vec[8]);
6dc0722d 755 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
4ed46869
KH
756 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
757 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
758 return Qnil;
759}
760
8a73a704
KH
761DEFUN ("generic-character-list", Fgeneric_character_list,
762 Sgeneric_character_list, 0, 0, 0,
763 "Return a list of all possible generic characters.\n\
764It includes a generic character for a charset not yet defined.")
765 ()
766{
767 return Vgeneric_character_list;
768}
769
3fac5a51
KH
770DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
771 Sget_unused_iso_final_char, 2, 2, 0,
772 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
773DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
774CHARS is the number of characters in a dimension: 94 or 96.\n\
775\n\
776This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
777If there's no unused final char for the specified kind of charset,\n\
778return nil.")
779 (dimension, chars)
780 Lisp_Object dimension, chars;
781{
782 int final_char;
783
784 CHECK_NUMBER (dimension, 0);
785 CHECK_NUMBER (chars, 1);
786 if (XINT (dimension) != 1 && XINT (dimension) != 2)
787 error ("Invalid charset dimension %d, it should be 1 or 2",
788 XINT (dimension));
789 if (XINT (chars) != 94 && XINT (chars) != 96)
790 error ("Invalid charset chars %d, it should be 94 or 96",
791 XINT (chars));
792 for (final_char = '0'; final_char <= '?'; final_char++)
793 {
794 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
795 break;
796 }
797 return (final_char <= '?' ? make_number (final_char) : Qnil);
798}
799
4ed46869
KH
800DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
801 4, 4, 0,
802 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
803CHARSET should be defined by `defined-charset' in advance.")
804 (dimension, chars, final_char, charset_symbol)
805 Lisp_Object dimension, chars, final_char, charset_symbol;
806{
807 int charset;
808
809 CHECK_NUMBER (dimension, 0);
810 CHECK_NUMBER (chars, 1);
811 CHECK_NUMBER (final_char, 2);
812 CHECK_SYMBOL (charset_symbol, 3);
813
814 if (XINT (dimension) != 1 && XINT (dimension) != 2)
815 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
816 if (XINT (chars) != 94 && XINT (chars) != 96)
817 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
818 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
819 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
820 if ((charset = get_charset_id (charset_symbol)) < 0)
821 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
822
823 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
824 return Qnil;
825}
826
827/* Return number of different charsets in STR of length LEN. In
828 addition, for each found charset N, CHARSETS[N] is set 1. The
a29e3b1b 829 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
1d67c29b
KH
830 It may lookup a translation table TABLE if supplied.
831
832 If CMPCHARP is nonzero and some composite character is found,
833 CHARSETS[128] is also set 1 and the returned number is incremented
0282eb69
KH
834 by 1.
835
836 If MULTIBYTE is zero, do not check multibyte characters, i.e. if
837 any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
838 8-bit codes are found CHARSET[1] is set to 1. */
4ed46869
KH
839
840int
0282eb69 841find_charset_in_str (str, len, charsets, table, cmpcharp, multibyte)
028d516b
KH
842 unsigned char *str;
843 int len, *charsets;
23d2a7f1 844 Lisp_Object table;
1d67c29b 845 int cmpcharp;
0282eb69 846 int multibyte;
4ed46869 847{
733eafd8 848 register int num = 0, c;
4ed46869 849
0282eb69
KH
850 if (! multibyte)
851 {
852 unsigned char *endp = str + len;
853 int maskbits = 0;
854
855 while (str < endp && maskbits != 3)
856 maskbits |= (*str++ < 0x80 ? 1 : 2);
857 if (maskbits & 1)
858 {
859 charsets[0] = 1;
860 num++;
861 }
862 if (maskbits & 2)
863 {
864 charsets[1] = 1;
865 num++;
866 }
867 return num;
868 }
869
23d2a7f1
KH
870 if (! CHAR_TABLE_P (table))
871 table = Qnil;
872
4ed46869
KH
873 while (len > 0)
874 {
05505664 875 int bytes, charset;
733eafd8 876 c = *str;
23d2a7f1 877
733eafd8 878 if (c == LEADING_CODE_COMPOSITION)
05505664 879 {
733eafd8
KH
880 int cmpchar_id = str_cmpchar_id (str, len);
881 GLYPH *glyph;
05505664 882
1d67c29b 883 if (cmpchar_id >= 0)
05505664 884 {
020da460 885 struct cmpchar_info *cmp_p = cmpchar_table[cmpchar_id];
733eafd8
KH
886 int i;
887
020da460 888 for (i = 0; i < cmp_p->glyph_len; i++)
733eafd8 889 {
020da460 890 c = cmp_p->glyph[i];
733eafd8
KH
891 if (!NILP (table))
892 {
b4e9dd77 893 if ((c = translate_char (table, c, 0, 0, 0)) < 0)
020da460 894 c = cmp_p->glyph[i];
733eafd8
KH
895 }
896 if ((charset = CHAR_CHARSET (c)) < 0)
897 charset = CHARSET_ASCII;
898 if (!charsets[charset])
899 {
900 charsets[charset] = 1;
901 num += 1;
902 }
903 }
020da460
KH
904 str += cmp_p->len;
905 len -= cmp_p->len;
906 if (cmpcharp && !charsets[CHARSET_COMPOSITION])
907 {
908 charsets[CHARSET_COMPOSITION] = 1;
909 num += 1;
910 }
733eafd8 911 continue;
05505664 912 }
05505664 913
0282eb69 914 charset = 1; /* This leads to `unknown' charset. */
733eafd8
KH
915 bytes = 1;
916 }
23d2a7f1
KH
917 else
918 {
733eafd8
KH
919 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
920 if (! NILP (table))
921 {
b4e9dd77 922 int c1 = translate_char (table, c, 0, 0, 0);
733eafd8
KH
923 if (c1 >= 0)
924 c = c1;
925 }
926 charset = CHAR_CHARSET (c);
23d2a7f1 927 }
4ed46869
KH
928
929 if (!charsets[charset])
930 {
931 charsets[charset] = 1;
932 num += 1;
933 }
934 str += bytes;
935 len -= bytes;
936 }
937 return num;
938}
939
940DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
23d2a7f1 941 2, 3, 0,
4ed46869 942 "Return a list of charsets in the region between BEG and END.\n\
23d2a7f1 943BEG and END are buffer positions.\n\
020da460
KH
944If the region contains any composite character,\n\
945`composition' is included in the returned list.\n\
0282eb69
KH
946Optional arg TABLE if non-nil is a translation table to look up.\n\
947\n\
948If the region contains invalid multiybte characters,\n\
38f02ede 949`unknown' is included in the returned list.\n\
0282eb69
KH
950\n\
951If the current buffer is unibyte, the returned list contains\n\
952`ascii' if any 7-bit characters are found,\n\
953and `unknown' if any 8-bit characters are found.")
23d2a7f1
KH
954 (beg, end, table)
955 Lisp_Object beg, end, table;
4ed46869 956{
028d516b 957 int charsets[MAX_CHARSET + 1];
6ae1f27e 958 int from, from_byte, to, stop, stop_byte, i;
4ed46869 959 Lisp_Object val;
0282eb69
KH
960 int undefined;
961 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4ed46869
KH
962
963 validate_region (&beg, &end);
964 from = XFASTINT (beg);
965 stop = to = XFASTINT (end);
6ae1f27e 966
4ed46869 967 if (from < GPT && GPT < to)
6ae1f27e
RS
968 {
969 stop = GPT;
970 stop_byte = GPT_BYTE;
971 }
972 else
973 stop_byte = CHAR_TO_BYTE (stop);
974
975 from_byte = CHAR_TO_BYTE (from);
976
028d516b 977 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
4ed46869
KH
978 while (1)
979 {
6ae1f27e 980 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
0282eb69 981 charsets, table, 1, multibyte);
4ed46869 982 if (stop < to)
6ae1f27e
RS
983 {
984 from = stop, from_byte = stop_byte;
985 stop = to, stop_byte = CHAR_TO_BYTE (stop);
986 }
4ed46869
KH
987 else
988 break;
989 }
6ae1f27e 990
4ed46869 991 val = Qnil;
0282eb69
KH
992 undefined = 0;
993 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
4ed46869 994 if (charsets[i])
0282eb69
KH
995 {
996 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
997 val = Fcons (CHARSET_SYMBOL (i), val);
998 else
999 undefined = 1;
1000 }
1001 if (undefined)
1002 val = Fcons (Qunknown, val);
4ed46869
KH
1003 return val;
1004}
1005
1006DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
23d2a7f1
KH
1007 1, 2, 0,
1008 "Return a list of charsets in STR.\n\
020da460
KH
1009If the string contains any composite characters,\n\
1010`composition' is included in the returned list.\n\
0282eb69
KH
1011Optional arg TABLE if non-nil is a translation table to look up.\n\
1012\n\
1013If the region contains invalid multiybte characters,\n\
1014`unknown' is included in the returned list.\n\
1015\n\
1016If STR is unibyte, the returned list contains\n\
1017`ascii' if any 7-bit characters are found,\n\
1018and `unknown' if any 8-bit characters are found.")
23d2a7f1
KH
1019 (str, table)
1020 Lisp_Object str, table;
4ed46869 1021{
a29e3b1b 1022 int charsets[MAX_CHARSET + 1];
4ed46869
KH
1023 int i;
1024 Lisp_Object val;
0282eb69
KH
1025 int undefined;
1026 int multibyte;
4ed46869
KH
1027
1028 CHECK_STRING (str, 0);
0282eb69 1029 multibyte = STRING_MULTIBYTE (str);
87b089ad 1030
a29e3b1b 1031 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
fc932ac6 1032 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
0282eb69 1033 charsets, table, 1, multibyte);
4ed46869 1034 val = Qnil;
0282eb69
KH
1035 undefined = 0;
1036 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
4ed46869 1037 if (charsets[i])
0282eb69
KH
1038 {
1039 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
1040 val = Fcons (CHARSET_SYMBOL (i), val);
1041 else
1042 undefined = 1;
1043 }
1044 if (undefined)
1045 val = Fcons (Qunknown, val);
4ed46869
KH
1046 return val;
1047}
1048\f
1049DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
513ee442 1050 "")
4ed46869
KH
1051 (charset, code1, code2)
1052 Lisp_Object charset, code1, code2;
1053{
ac4137cc
KH
1054 int charset_id, c1, c2;
1055
4ed46869 1056 CHECK_NUMBER (charset, 0);
ac4137cc
KH
1057 charset_id = XINT (charset);
1058 if (!CHARSET_DEFINED_P (charset_id))
1059 error ("Invalid charset ID: %d", XINT (charset));
4ed46869
KH
1060
1061 if (NILP (code1))
ac4137cc 1062 c1 = 0;
4ed46869 1063 else
ac4137cc
KH
1064 {
1065 CHECK_NUMBER (code1, 1);
1066 c1 = XINT (code1);
1067 }
4ed46869 1068 if (NILP (code2))
ac4137cc 1069 c2 = 0;
4ed46869 1070 else
ac4137cc
KH
1071 {
1072 CHECK_NUMBER (code2, 2);
1073 c2 = XINT (code2);
1074 }
4ed46869 1075
ac4137cc
KH
1076 if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1077 error ("Invalid code points: %d %d", c1, c2);
1078 c1 &= 0x7F;
1079 c2 &= 0x7F;
1080 if (c1 == 0
1081 ? c2 != 0
1082 : (c2 == 0
44c6492d
KH
1083 ? !CHAR_COMPONENTS_VALID_P (charset, c1, 0x20)
1084 : !CHAR_COMPONENTS_VALID_P (charset, c1, c2)))
ac4137cc
KH
1085 error ("Invalid code points: %d %d", c1, c2);
1086
1087 return make_number (MAKE_CHAR (charset_id, c1, c2));
4ed46869
KH
1088}
1089
1090DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
0282eb69
KH
1091 "Return list of charset and one or two position-codes of CHAR.\n\
1092If CHAR is invalid as a character code,\n\
1093return a list of symbol `unknown' and CHAR.")
4ed46869
KH
1094 (ch)
1095 Lisp_Object ch;
1096{
1097 Lisp_Object val;
0282eb69 1098 int c, charset, c1, c2;
4ed46869
KH
1099
1100 CHECK_NUMBER (ch, 0);
0282eb69
KH
1101 c = XFASTINT (ch);
1102 if (!CHAR_VALID_P (c, 1))
1103 return Fcons (Qunknown, Fcons (ch, Qnil));
4ed46869 1104 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6dc0722d 1105 return (c2 >= 0
4ed46869
KH
1106 ? Fcons (CHARSET_SYMBOL (charset),
1107 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1108 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1109}
1110
1111DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1112 "Return charset of CHAR.")
1113 (ch)
1114 Lisp_Object ch;
1115{
1116 CHECK_NUMBER (ch, 0);
1117
1118 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1119}
1120
90d7b74e 1121DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
ac4137cc 1122 "Return charset of a character in the current buffer at position POS.\n\
e6e114f2
KH
1123If POS is nil, it defauls to the current point.\n\
1124If POS is out of range, the value is nil.")
90d7b74e
KH
1125 (pos)
1126 Lisp_Object pos;
1127{
ac4137cc 1128 register int pos_byte, bytes, charset, c1, c2;
90d7b74e
KH
1129 register unsigned char *p;
1130
1131 if (NILP (pos))
1132 pos_byte = PT_BYTE;
1133 else if (MARKERP (pos))
e6e114f2
KH
1134 {
1135 pos_byte = marker_byte_position (pos);
1136 if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1137 return Qnil;
1138 }
90d7b74e
KH
1139 else
1140 {
1141 CHECK_NUMBER (pos, 0);
e6e114f2
KH
1142 if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1143 return Qnil;
90d7b74e
KH
1144 pos_byte = CHAR_TO_BYTE (XINT (pos));
1145 }
1146 p = BYTE_POS_ADDR (pos_byte);
ac4137cc
KH
1147 if (BASE_LEADING_CODE_P (*p))
1148 {
1149 SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1150 if (charset < 0)
1151 charset = 1;
1152 }
1153 else
1154 charset = CHARSET_ASCII;
1155
90d7b74e
KH
1156 return CHARSET_SYMBOL (charset);
1157}
1158
4ed46869 1159DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2b71bb78
KH
1160 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1161\n\
1162ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1163by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1164where as Emacs distinguishes them by charset symbol.\n\
1165See the documentation of the function `charset-info' for the meanings of\n\
1166DIMENSION, CHARS, and FINAL-CHAR.")
4ed46869
KH
1167 (dimension, chars, final_char)
1168 Lisp_Object dimension, chars, final_char;
1169{
1170 int charset;
1171
1172 CHECK_NUMBER (dimension, 0);
1173 CHECK_NUMBER (chars, 1);
1174 CHECK_NUMBER (final_char, 2);
1175
1176 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1177 return Qnil;
1178 return CHARSET_SYMBOL (charset);
1179}
1180
9d3d8cba
KH
1181/* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1182 generic character. If GENERICP is zero, return nonzero iff C is a
1183 valid normal character. Do not call this function directly,
1184 instead use macro CHAR_VALID_P. */
1185int
1186char_valid_p (c, genericp)
1187 int c, genericp;
1188{
1189 int charset, c1, c2;
1190
1191 if (c < 0)
1192 return 0;
1193 if (SINGLE_BYTE_CHAR_P (c))
1194 return 1;
1195 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
ac4137cc
KH
1196 if (charset == CHARSET_COMPOSITION)
1197 return ((c >= MIN_CHAR_COMPOSITION
1198 && c < MIN_CHAR_COMPOSITION + n_cmpchars)
1199 || (genericp && c == GENERIC_COMPOSITION_CHAR));
1200 if (genericp)
1201 {
1202 if (c1)
1203 {
1204 if (c2 <= 0) c2 = 0x20;
1205 }
1206 else
1207 {
1208 if (c2 <= 0) c1 = c2 = 0x20;
1209 }
1210 }
1211 return (CHARSET_DEFINED_P (charset)
44c6492d 1212 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
9d3d8cba
KH
1213}
1214
1215DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
a9d02884
DL
1216 "Return t if OBJECT is a valid normal character.\n\
1217If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
9d3d8cba
KH
1218a valid generic character.")
1219 (object, genericp)
1220 Lisp_Object object, genericp;
1221{
1222 if (! NATNUMP (object))
1223 return Qnil;
1224 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1225}
1226
d2665018
KH
1227DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1228 Sunibyte_char_to_multibyte, 1, 1, 0,
1229 "Convert the unibyte character CH to multibyte character.\n\
537efd8d 1230The conversion is done based on `nonascii-translation-table' (which see)\n\
340b8d58 1231 or `nonascii-insert-offset' (which see).")
d2665018
KH
1232 (ch)
1233 Lisp_Object ch;
1234{
1235 int c;
1236
1237 CHECK_NUMBER (ch, 0);
1238 c = XINT (ch);
1239 if (c < 0 || c >= 0400)
1240 error ("Invalid unibyte character: %d", c);
1241 c = unibyte_char_to_multibyte (c);
1242 if (c < 0)
1243 error ("Can't convert to multibyte character: %d", XINT (ch));
1244 return make_number (c);
1245}
1246
1bcc1567
RS
1247DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1248 Smultibyte_char_to_unibyte, 1, 1, 0,
1249 "Convert the multibyte character CH to unibyte character.\n\
1250The conversion is done based on `nonascii-translation-table' (which see)\n\
1251 or `nonascii-insert-offset' (which see).")
1252 (ch)
1253 Lisp_Object ch;
1254{
1255 int c;
1256
1257 CHECK_NUMBER (ch, 0);
1258 c = XINT (ch);
ac4137cc 1259 if (! CHAR_VALID_P (c, 0))
1bcc1567
RS
1260 error ("Invalid multibyte character: %d", c);
1261 c = multibyte_char_to_unibyte (c, Qnil);
1262 if (c < 0)
1263 error ("Can't convert to unibyte character: %d", XINT (ch));
1264 return make_number (c);
1265}
1266
4ed46869 1267DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
f78643ef 1268 "Return 1 regardless of the argument CHAR.\n\
60022cb7 1269This is now an obsolete function. We keep it just for backward compatibility.")
4ed46869
KH
1270 (ch)
1271 Lisp_Object ch;
1272{
1273 Lisp_Object val;
4ed46869
KH
1274
1275 CHECK_NUMBER (ch, 0);
9b6a601f
KH
1276 return make_number (1);
1277}
1278
1279/* Return how many bytes C will occupy in a multibyte buffer.
1280 Don't call this function directly, instead use macro CHAR_BYTES. */
1281int
1282char_bytes (c)
1283 int c;
1284{
1285 int bytes;
1286
8ac5a9cc
KH
1287 if (SINGLE_BYTE_CHAR_P (c) || (c & ~GLYPH_MASK_CHAR))
1288 return 1;
1289
9b6a601f 1290 if (COMPOSITE_CHAR_P (c))
4ed46869 1291 {
9b6a601f 1292 unsigned int id = COMPOSITE_CHAR_ID (c);
4ed46869
KH
1293
1294 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
1295 }
1296 else
1297 {
9b6a601f 1298 int charset = CHAR_CHARSET (c);
4ed46869
KH
1299
1300 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
1301 }
1302
60022cb7 1303 return bytes;
4ed46869
KH
1304}
1305
1306/* Return the width of character of which multi-byte form starts with
1307 C. The width is measured by how many columns occupied on the
1308 screen when displayed in the current buffer. */
1309
1310#define ONE_BYTE_CHAR_WIDTH(c) \
1311 (c < 0x20 \
1312 ? (c == '\t' \
53316e55 1313 ? XFASTINT (current_buffer->tab_width) \
4ed46869
KH
1314 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1315 : (c < 0x7f \
1316 ? 1 \
1317 : (c == 0x7F \
1318 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1319 : ((! NILP (current_buffer->enable_multibyte_characters) \
1320 && BASE_LEADING_CODE_P (c)) \
1321 ? WIDTH_BY_CHAR_HEAD (c) \
b4e9dd77 1322 : 4))))
4ed46869
KH
1323
1324DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1325 "Return width of CHAR when displayed in the current buffer.\n\
1326The width is measured by how many columns it occupies on the screen.")
1327 (ch)
1328 Lisp_Object ch;
1329{
859f2b3c 1330 Lisp_Object val, disp;
4ed46869 1331 int c;
51c4025f 1332 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869
KH
1333
1334 CHECK_NUMBER (ch, 0);
1335
859f2b3c
RS
1336 c = XINT (ch);
1337
1338 /* Get the way the display table would display it. */
51c4025f 1339 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
859f2b3c
RS
1340
1341 if (VECTORP (disp))
1342 XSETINT (val, XVECTOR (disp)->size);
1343 else if (SINGLE_BYTE_CHAR_P (c))
1344 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
4ed46869
KH
1345 else if (COMPOSITE_CHAR_P (c))
1346 {
1347 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
0282eb69 1348 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 1));
4ed46869
KH
1349 }
1350 else
1351 {
1352 int charset = CHAR_CHARSET (c);
1353
1354 XSETFASTINT (val, CHARSET_WIDTH (charset));
1355 }
1356 return val;
1357}
1358
1359/* Return width of string STR of length LEN when displayed in the
1360 current buffer. The width is measured by how many columns it
1361 occupies on the screen. */
859f2b3c 1362
4ed46869
KH
1363int
1364strwidth (str, len)
1365 unsigned char *str;
1366 int len;
1367{
1368 unsigned char *endp = str + len;
1369 int width = 0;
c4a4e28f 1370 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869 1371
859f2b3c
RS
1372 while (str < endp)
1373 {
1374 if (*str == LEADING_CODE_COMPOSITION)
1375 {
1376 int id = str_cmpchar_id (str, endp - str);
1377
1378 if (id < 0)
1379 {
1380 width += 4;
1381 str++;
1382 }
1383 else
1384 {
1385 width += cmpchar_table[id]->width;
1386 str += cmpchar_table[id]->len;
1387 }
1388 }
1389 else
1390 {
1391 Lisp_Object disp;
e515b0a9
KH
1392 int thislen;
1393 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
859f2b3c
RS
1394
1395 /* Get the way the display table would display it. */
acc35c36
RS
1396 if (dp)
1397 disp = DISP_CHAR_VECTOR (dp, c);
1398 else
1399 disp = Qnil;
859f2b3c
RS
1400
1401 if (VECTORP (disp))
e515b0a9 1402 width += XVECTOR (disp)->size;
859f2b3c 1403 else
e515b0a9 1404 width += ONE_BYTE_CHAR_WIDTH (*str);
859f2b3c 1405
e515b0a9 1406 str += thislen;
859f2b3c
RS
1407 }
1408 }
4ed46869
KH
1409 return width;
1410}
1411
1412DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1413 "Return width of STRING when displayed in the current buffer.\n\
1414Width is measured by how many columns it occupies on the screen.\n\
046b1f03
RS
1415When calculating width of a multibyte character in STRING,\n\
1416only the base leading-code is considered; the validity of\n\
1417the following bytes is not checked.")
4ed46869
KH
1418 (str)
1419 Lisp_Object str;
1420{
1421 Lisp_Object val;
1422
1423 CHECK_STRING (str, 0);
fc932ac6
RS
1424 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1425 STRING_BYTES (XSTRING (str))));
4ed46869
KH
1426 return val;
1427}
1428
1429DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1430 "Return the direction of CHAR.\n\
1431The returned value is 0 for left-to-right and 1 for right-to-left.")
1432 (ch)
1433 Lisp_Object ch;
1434{
1435 int charset;
1436
1437 CHECK_NUMBER (ch, 0);
1438 charset = CHAR_CHARSET (XFASTINT (ch));
1439 if (!CHARSET_DEFINED_P (charset))
93bcb785 1440 invalid_character (XINT (ch));
4ed46869
KH
1441 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1442}
1443
af4fecb4 1444DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
6ae1f27e 1445 "Return number of characters between BEG and END.")
046b1f03
RS
1446 (beg, end)
1447 Lisp_Object beg, end;
1448{
6ae1f27e 1449 int from, to;
046b1f03 1450
17e7ef1b
RS
1451 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1452 CHECK_NUMBER_COERCE_MARKER (end, 1);
1453
046b1f03 1454 from = min (XFASTINT (beg), XFASTINT (end));
a8a35e61 1455 to = max (XFASTINT (beg), XFASTINT (end));
046b1f03 1456
a8c21066 1457 return make_number (to - from);
6ae1f27e 1458}
9036eb45 1459
87b089ad
RS
1460/* Return the number of characters in the NBYTES bytes at PTR.
1461 This works by looking at the contents and checking for multibyte sequences.
1462 However, if the current buffer has enable-multibyte-characters = nil,
1463 we treat each byte as a character. */
1464
6ae1f27e
RS
1465int
1466chars_in_text (ptr, nbytes)
1467 unsigned char *ptr;
1468 int nbytes;
1469{
87b089ad
RS
1470 /* current_buffer is null at early stages of Emacs initialization. */
1471 if (current_buffer == 0
1472 || NILP (current_buffer->enable_multibyte_characters))
6ae1f27e 1473 return nbytes;
a8a35e61 1474
ac4137cc 1475 return multibyte_chars_in_text (ptr, nbytes);
046b1f03
RS
1476}
1477
87b089ad
RS
1478/* Return the number of characters in the NBYTES bytes at PTR.
1479 This works by looking at the contents and checking for multibyte sequences.
1480 It ignores enable-multibyte-characters. */
1481
1482int
1483multibyte_chars_in_text (ptr, nbytes)
1484 unsigned char *ptr;
1485 int nbytes;
1486{
ac4137cc
KH
1487 unsigned char *endp;
1488 int chars, bytes;
87b089ad
RS
1489
1490 endp = ptr + nbytes;
1491 chars = 0;
1492
1493 while (ptr < endp)
1494 {
ac4137cc
KH
1495 if (BASE_LEADING_CODE_P (*ptr))
1496 {
1497 PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1498 ptr += bytes;
1499 nbytes -= bytes;
1500 }
1501 else
1502 {
1503 ptr++;
1504 nbytes--;
1505 }
87b089ad
RS
1506 chars++;
1507 }
1508
1509 return chars;
1510}
1511
1512DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
4ed46869 1513 "Concatenate all the argument characters and make the result a string.")
53316e55
KH
1514 (n, args)
1515 int n;
4ed46869
KH
1516 Lisp_Object *args;
1517{
53316e55 1518 int i;
4ed46869 1519 unsigned char *buf
bd4c6dd0 1520 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
4ed46869
KH
1521 unsigned char *p = buf;
1522 Lisp_Object val;
1523
1524 for (i = 0; i < n; i++)
1525 {
1526 int c, len;
1527 unsigned char *str;
1528
1529 if (!INTEGERP (args[i]))
b0e3cf2b 1530 CHECK_NUMBER (args[i], 0);
4ed46869
KH
1531 c = XINT (args[i]);
1532 len = CHAR_STRING (c, p, str);
1533 if (p != str)
1534 /* C is a composite character. */
1535 bcopy (str, p, len);
1536 p += len;
1537 }
1538
020da460
KH
1539 /* Here, we can't use make_string_from_bytes because of byte
1540 combining problem. */
1541 val = make_string (buf, p - buf);
4ed46869
KH
1542 return val;
1543}
1544
1545#endif /* emacs */
1546\f
1547/*** Composite characters staffs ***/
1548
1549/* Each composite character is identified by CMPCHAR-ID which is
1550 assigned when Emacs needs the character code of the composite
1551 character (e.g. when displaying it on the screen). See the
1552 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1553 composite character is represented in Emacs. */
1554
1555/* If `static' is defined, it means that it is defined to null string. */
1556#ifndef static
1557/* The following function is copied from lread.c. */
1558static int
1559hash_string (ptr, len)
1560 unsigned char *ptr;
1561 int len;
1562{
1563 register unsigned char *p = ptr;
1564 register unsigned char *end = p + len;
1565 register unsigned char c;
1566 register int hash = 0;
1567
1568 while (p != end)
1569 {
1570 c = *p++;
1571 if (c >= 0140) c -= 40;
1572 hash = ((hash<<3) + (hash>>28) + c);
1573 }
1574 return hash & 07777777777;
1575}
1576#endif
1577
4ed46869
KH
1578#define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1579
1580static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1581
1582/* Each element of `cmpchar_hash_table' is a pointer to an array of
1583 integer, where the 1st element is the size of the array, the 2nd
1584 element is how many elements are actually used in the array, and
1585 the remaining elements are CMPCHAR-IDs of composite characters of
1586 the same hash value. */
1587#define CMPCHAR_HASH_SIZE(table) table[0]
1588#define CMPCHAR_HASH_USED(table) table[1]
1589#define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1590
1591/* Return CMPCHAR-ID of the composite character in STR of the length
1592 LEN. If the composite character has not yet been registered,
1593 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1594 is the sole function for assigning CMPCHAR-ID. */
1595int
1596str_cmpchar_id (str, len)
8867de67 1597 const unsigned char *str;
4ed46869
KH
1598 int len;
1599{
1600 int hash_idx, *hashp;
1601 unsigned char *buf;
1602 int embedded_rule; /* 1 if composition rule is embedded. */
1603 int chars; /* number of components. */
1604 int i;
1605 struct cmpchar_info *cmpcharp;
1606
ac4137cc 1607 /* The second byte 0xFF means COMPOSITION rule is embedded. */
4ed46869
KH
1608 embedded_rule = (str[1] == 0xFF);
1609
1610 /* At first, get the actual length of the composite character. */
1611 {
8867de67 1612 const unsigned char *p, *endp = str + 1, *lastp = str + len;
4ed46869
KH
1613 int bytes;
1614
6ae1f27e 1615 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
93bcb785
KH
1616 if (endp - str < 5)
1617 /* Any composite char have at least 5-byte length. */
1618 return -1;
1619
4ed46869 1620 chars = 0;
93bcb785 1621 p = str + 1;
4ed46869
KH
1622 while (p < endp)
1623 {
9b4d1fe6
KH
1624 if (embedded_rule)
1625 {
1626 p++;
1627 if (p >= endp)
1628 return -1;
1629 }
4ed46869 1630 /* No need of checking if *P is 0xA0 because
93bcb785
KH
1631 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1632 p += BYTES_BY_CHAR_HEAD (*p - 0x20);
4ed46869
KH
1633 chars++;
1634 }
93bcb785
KH
1635 if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1636 /* Invalid components. */
4ed46869 1637 return -1;
93bcb785 1638 len = p - str;
4ed46869
KH
1639 }
1640 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1641 hashp = cmpchar_hash_table[hash_idx];
1642
1643 /* Then, look into the hash table. */
1644 if (hashp != NULL)
1645 /* Find the correct one among composite characters of the same
1646 hash value. */
1647 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1648 {
1649 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1650 if (len == cmpcharp->len
1651 && ! bcmp (str, cmpcharp->data, len))
1652 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1653 }
1654
1655 /* We have to register the composite character in cmpchar_table. */
0282eb69 1656 if (n_cmpchars >= (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
513ee442
KH
1657 /* No, we have no more room for a new composite character. */
1658 return -1;
1659
4ed46869
KH
1660 /* Make the entry in hash table. */
1661 if (hashp == NULL)
1662 {
1663 /* Make a table for 8 composite characters initially. */
1664 hashp = (cmpchar_hash_table[hash_idx]
1665 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1666 CMPCHAR_HASH_SIZE (hashp) = 10;
1667 CMPCHAR_HASH_USED (hashp) = 2;
1668 }
1669 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1670 {
1671 CMPCHAR_HASH_SIZE (hashp) += 8;
1672 hashp = (cmpchar_hash_table[hash_idx]
1673 = (int *) xrealloc (hashp,
1674 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1675 }
1676 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1677 CMPCHAR_HASH_USED (hashp)++;
1678
1679 /* Set information of the composite character in cmpchar_table. */
1680 if (cmpchar_table_size == 0)
1681 {
1682 /* This is the first composite character to be registered. */
1683 cmpchar_table_size = 256;
1684 cmpchar_table
1685 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1686 * cmpchar_table_size);
1687 }
1688 else if (cmpchar_table_size <= n_cmpchars)
1689 {
1690 cmpchar_table_size += 256;
1691 cmpchar_table
1692 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1693 sizeof (cmpchar_table[0])
1694 * cmpchar_table_size);
1695 }
1696
1697 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1698
1699 cmpcharp->len = len;
1700 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1701 bcopy (str, cmpcharp->data, len);
1702 cmpcharp->data[len] = 0;
1703 cmpcharp->glyph_len = chars;
1704 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1705 if (embedded_rule)
1706 {
1707 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1708 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1709 }
1710 else
1711 {
1712 cmpcharp->cmp_rule = NULL;
1713 cmpcharp->col_offset = NULL;
1714 }
1715
1716 /* Setup GLYPH data and composition rules (if any) so as not to make
1717 them every time on displaying. */
1718 {
1719 unsigned char *bufp;
1720 int width;
1721 float leftmost = 0.0, rightmost = 1.0;
1722
1723 if (embedded_rule)
1724 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1725 cmpcharp->col_offset[0] = 0;
1726
1727 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1728 {
1729 if (embedded_rule)
1730 cmpcharp->cmp_rule[i] = *bufp++;
1731
1732 if (*bufp == 0xA0) /* This is an ASCII character. */
1733 {
1734 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1735 width = 1;
1736 bufp++;
1737 }
1738 else /* Multibyte character. */
1739 {
1740 /* Make `bufp' point normal multi-byte form temporally. */
1741 *bufp -= 0x20;
1742 cmpcharp->glyph[i]
ac4137cc 1743 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
4ed46869
KH
1744 width = WIDTH_BY_CHAR_HEAD (*bufp);
1745 *bufp += 0x20;
1746 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1747 }
1748
1749 if (embedded_rule && i > 0)
1750 {
1751 /* Reference points (global_ref and new_ref) are
1752 encoded as below:
1753
1754 0--1--2 -- ascent
1755 | |
1756 | |
1757 | 4 -+--- center
1758 -- 3 5 -- baseline
1759 | |
1760 6--7--8 -- descent
1761
1762 Now, we calculate the column offset of the new glyph
1763 from the left edge of the first glyph. This can avoid
1764 the same calculation everytime displaying this
1765 composite character. */
1766
1767 /* Reference points of global glyph and new glyph. */
1768 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1769 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1770 /* Column offset relative to the first glyph. */
1771 float left = (leftmost
1772 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1773 - (new_ref % 3) * width / 2.0);
1774
1775 cmpcharp->col_offset[i] = left;
1776 if (left < leftmost)
1777 leftmost = left;
1778 if (left + width > rightmost)
1779 rightmost = left + width;
1780 }
1781 else
1782 {
1783 if (width > rightmost)
1784 rightmost = width;
1785 }
1786 }
1787 if (embedded_rule)
1788 {
1789 /* Now col_offset[N] are relative to the left edge of the
1790 first component. Make them relative to the left edge of
1791 overall glyph. */
1792 for (i = 0; i < chars; i++)
1793 cmpcharp->col_offset[i] -= leftmost;
1794 /* Make rightmost holds width of overall glyph. */
1795 rightmost -= leftmost;
1796 }
1797
1798 cmpcharp->width = rightmost;
1799 if (cmpcharp->width < rightmost)
1800 /* To get a ceiling integer value. */
1801 cmpcharp->width++;
1802 }
1803
1804 cmpchar_table[n_cmpchars] = cmpcharp;
1805
1806 return n_cmpchars++;
1807}
1808
de54b0d5
KH
1809/* Return the Nth element of the composite character C. If NOERROR is
1810 nonzero, return 0 on error condition (C is an invalid composite
1811 charcter, or N is out of range). */
4ed46869 1812int
de54b0d5
KH
1813cmpchar_component (c, n, noerror)
1814 int c, n, noerror;
4ed46869
KH
1815{
1816 int id = COMPOSITE_CHAR_ID (c);
1817
de54b0d5
KH
1818 if (id < 0 || id >= n_cmpchars)
1819 {
1820 /* C is not a valid composite character. */
1821 if (noerror) return 0;
1822 error ("Invalid composite character: %d", c) ;
1823 }
1824 if (n >= cmpchar_table[id]->glyph_len)
1825 {
1826 /* No such component. */
1827 if (noerror) return 0;
1828 args_out_of_range (make_number (c), make_number (n));
1829 }
4ed46869
KH
1830 /* No face data is stored in glyph code. */
1831 return ((int) (cmpchar_table[id]->glyph[n]));
1832}
1833
1834DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1835 "T if CHAR is a composite character.")
1836 (ch)
1837 Lisp_Object ch;
1838{
1839 CHECK_NUMBER (ch, 0);
1840 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1841}
1842
1843DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1844 2, 2, 0,
de54b0d5
KH
1845 "Return the Nth component character of composite character CHARACTER.")
1846 (character, n)
1847 Lisp_Object character, n;
4ed46869 1848{
de54b0d5 1849 int id;
4ed46869
KH
1850
1851 CHECK_NUMBER (character, 0);
de54b0d5 1852 CHECK_NUMBER (n, 1);
4ed46869 1853
de54b0d5 1854 return (make_number (cmpchar_component (XINT (character), XINT (n), 0)));
4ed46869
KH
1855}
1856
1857DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1858 2, 2, 0,
de54b0d5 1859 "Return the Nth composition rule of composite character CHARACTER.\n\
55001746 1860The returned rule is for composing the Nth component\n\
de54b0d5
KH
1861on the (N-1)th component.\n\
1862If CHARACTER should be composed relatively or N is 0, return 255.")
55001746
KH
1863 (character, n)
1864 Lisp_Object character, n;
4ed46869 1865{
de54b0d5 1866 int id;
4ed46869
KH
1867
1868 CHECK_NUMBER (character, 0);
55001746 1869 CHECK_NUMBER (n, 1);
4ed46869
KH
1870
1871 id = COMPOSITE_CHAR_ID (XINT (character));
1872 if (id < 0 || id >= n_cmpchars)
1873 error ("Invalid composite character: %d", XINT (character));
de54b0d5 1874 if (XINT (n) < 0 || XINT (n) >= cmpchar_table[id]->glyph_len)
55001746 1875 args_out_of_range (character, n);
4ed46869 1876
de54b0d5
KH
1877 return make_number (cmpchar_table[id]->cmp_rule
1878 ? cmpchar_table[id]->cmp_rule[XINT (n)]
1879 : 255);
4ed46869
KH
1880}
1881
1882DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1883 Scmpchar_cmp_rule_p, 1, 1, 0,
1884 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1885 (character)
1886 Lisp_Object character;
1887{
1888 int id;
1889
1890 CHECK_NUMBER (character, 0);
1891 id = COMPOSITE_CHAR_ID (XINT (character));
1892 if (id < 0 || id >= n_cmpchars)
1893 error ("Invalid composite character: %d", XINT (character));
1894
1895 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1896}
1897
1898DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1899 Scmpchar_cmp_count, 1, 1, 0,
1900 "Return number of compoents of composite character CHARACTER.")
1901 (character)
1902 Lisp_Object character;
1903{
1904 int id;
1905
1906 CHECK_NUMBER (character, 0);
1907 id = COMPOSITE_CHAR_ID (XINT (character));
1908 if (id < 0 || id >= n_cmpchars)
1909 error ("Invalid composite character: %d", XINT (character));
1910
1911 return (make_number (cmpchar_table[id]->glyph_len));
1912}
1913
1914DEFUN ("compose-string", Fcompose_string, Scompose_string,
1915 1, 1, 0,
1916 "Return one char string composed from all characters in STRING.")
1917 (str)
1918 Lisp_Object str;
1919{
1920 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1921 int len, i;
1922
1923 CHECK_STRING (str, 0);
1924
1925 buf[0] = LEADING_CODE_COMPOSITION;
1926 p = XSTRING (str)->data;
fc932ac6 1927 pend = p + STRING_BYTES (XSTRING (str));
4ed46869
KH
1928 i = 1;
1929 while (p < pend)
1930 {
9b4d1fe6 1931 if (*p < 0x20) /* control code */
4ed46869
KH
1932 error ("Invalid component character: %d", *p);
1933 else if (*p < 0x80) /* ASCII */
1934 {
1935 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1936 error ("Too long string to be composed: %s", XSTRING (str)->data);
1937 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1938 code itself. */
1939 buf[i++] = 0xA0;
1940 buf[i++] = *p++ + 0x80;
1941 }
1942 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1943 {
1944 /* Already composed. Eliminate the heading
1945 LEADING_CODE_COMPOSITION, keep the remaining bytes
1946 unchanged. */
1947 p++;
de54b0d5
KH
1948 if (*p == 255)
1949 error ("Can't compose a rule-based composition character");
4ed46869 1950 ptemp = p;
6ae1f27e 1951 while (! CHAR_HEAD_P (*p)) p++;
9b4d1fe6
KH
1952 if (str_cmpchar_id (ptemp - 1, p - ptemp + 1) < 0)
1953 error ("Can't compose an invalid composition character");
4ed46869
KH
1954 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1955 error ("Too long string to be composed: %s", XSTRING (str)->data);
1956 bcopy (ptemp, buf + i, p - ptemp);
1957 i += p - ptemp;
1958 }
1959 else /* multibyte char */
1960 {
1961 /* Add 0x20 to the base leading-code, keep the remaining
1962 bytes unchanged. */
ac4137cc 1963 int c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
9b4d1fe6
KH
1964
1965 if (len <= 1 || ! CHAR_VALID_P (c, 0))
1966 error ("Can't compose an invalid character");
4ed46869
KH
1967 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1968 error ("Too long string to be composed: %s", XSTRING (str)->data);
1969 bcopy (p, buf + i, len);
1970 buf[i] += 0x20;
1971 p += len, i += len;
1972 }
1973 }
1974
1975 if (i < 5)
1976 /* STR contains only one character, which can't be composed. */
1977 error ("Too short string to be composed: %s", XSTRING (str)->data);
1978
27802600 1979 return make_string_from_bytes (buf, 1, i);
4ed46869
KH
1980}
1981
1982\f
dfcf069d 1983int
4ed46869
KH
1984charset_id_internal (charset_name)
1985 char *charset_name;
1986{
76d7b829 1987 Lisp_Object val;
4ed46869 1988
76d7b829 1989 val= Fget (intern (charset_name), Qcharset);
4ed46869
KH
1990 if (!VECTORP (val))
1991 error ("Charset %s is not defined", charset_name);
1992
1993 return (XINT (XVECTOR (val)->contents[0]));
1994}
1995
1996DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1997 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1998 ()
1999{
2000 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
2001 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
2002 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
2003 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
2004 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
2005 charset_big5_1 = charset_id_internal ("chinese-big5-1");
2006 charset_big5_2 = charset_id_internal ("chinese-big5-2");
2007 return Qnil;
2008}
2009
dfcf069d 2010void
4ed46869
KH
2011init_charset_once ()
2012{
2013 int i, j, k;
2014
2015 staticpro (&Vcharset_table);
2016 staticpro (&Vcharset_symbol_table);
8a73a704 2017 staticpro (&Vgeneric_character_list);
4ed46869
KH
2018
2019 /* This has to be done here, before we call Fmake_char_table. */
2020 Qcharset_table = intern ("charset-table");
2021 staticpro (&Qcharset_table);
2022
2023 /* Intern this now in case it isn't already done.
2024 Setting this variable twice is harmless.
2025 But don't staticpro it here--that is done in alloc.c. */
2026 Qchar_table_extra_slots = intern ("char-table-extra-slots");
2027
2028 /* Now we are ready to set up this property, so we can
2029 create the charset table. */
2030 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
2031 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
2032
0282eb69
KH
2033 Qunknown = intern ("unknown");
2034 staticpro (&Qunknown);
2035 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
2036 Qunknown);
4ed46869
KH
2037
2038 /* Setup tables. */
2039 for (i = 0; i < 2; i++)
2040 for (j = 0; j < 2; j++)
2041 for (k = 0; k < 128; k++)
2042 iso_charset_table [i][j][k] = -1;
2043
2044 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
2045 cmpchar_table_size = n_cmpchars = 0;
2046
60383934 2047 for (i = 0; i < 256; i++)
4ed46869 2048 BYTES_BY_CHAR_HEAD (i) = 1;
6ef23ebb
KH
2049 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
2050 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
2051 BYTES_BY_CHAR_HEAD (i) = 2;
2052 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
2053 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
2054 BYTES_BY_CHAR_HEAD (i) = 3;
4ed46869
KH
2055 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
2056 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
2057 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
2058 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
6ef23ebb 2059 /* The followings don't reflect the actual bytes, but just to tell
4ed46869
KH
2060 that it is a start of a multibyte character. */
2061 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
6ef23ebb
KH
2062 BYTES_BY_CHAR_HEAD (0x9E) = 2;
2063 BYTES_BY_CHAR_HEAD (0x9F) = 2;
4ed46869
KH
2064
2065 for (i = 0; i < 128; i++)
2066 WIDTH_BY_CHAR_HEAD (i) = 1;
2067 for (; i < 256; i++)
2068 WIDTH_BY_CHAR_HEAD (i) = 4;
2069 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
2070 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
2071 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
2072 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
8a73a704
KH
2073
2074 {
76d7b829 2075 Lisp_Object val;
8a73a704 2076
76d7b829 2077 val = Qnil;
8a73a704
KH
2078 for (i = 0x81; i < 0x90; i++)
2079 val = Fcons (make_number ((i - 0x70) << 7), val);
2080 for (; i < 0x9A; i++)
2081 val = Fcons (make_number ((i - 0x8F) << 14), val);
2082 for (i = 0xA0; i < 0xF0; i++)
2083 val = Fcons (make_number ((i - 0x70) << 7), val);
2084 for (; i < 0xFF; i++)
2085 val = Fcons (make_number ((i - 0xE0) << 14), val);
2086 val = Fcons (make_number (GENERIC_COMPOSITION_CHAR), val);
2087 Vgeneric_character_list = Fnreverse (val);
2088 }
bbf12bb3
KH
2089
2090 nonascii_insert_offset = 0;
2091 Vnonascii_translation_table = Qnil;
4ed46869
KH
2092}
2093
2094#ifdef emacs
2095
dfcf069d 2096void
4ed46869
KH
2097syms_of_charset ()
2098{
2099 Qascii = intern ("ascii");
2100 staticpro (&Qascii);
2101
2102 Qcharset = intern ("charset");
2103 staticpro (&Qcharset);
2104
2105 /* Define ASCII charset now. */
2106 update_charset_table (make_number (CHARSET_ASCII),
2107 make_number (1), make_number (94),
2108 make_number (1),
2109 make_number (0),
2110 make_number ('B'),
2111 make_number (0),
2112 build_string ("ASCII"),
2113 build_string ("ASCII"),
2114 build_string ("ASCII (ISO646 IRV)"));
2115 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
2116 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
2117
2118 Qcomposition = intern ("composition");
2119 staticpro (&Qcomposition);
2120 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
2121
c1a08b4c
KH
2122 Qauto_fill_chars = intern ("auto-fill-chars");
2123 staticpro (&Qauto_fill_chars);
2124 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
2125
4ed46869 2126 defsubr (&Sdefine_charset);
8a73a704 2127 defsubr (&Sgeneric_character_list);
3fac5a51 2128 defsubr (&Sget_unused_iso_final_char);
4ed46869
KH
2129 defsubr (&Sdeclare_equiv_charset);
2130 defsubr (&Sfind_charset_region);
2131 defsubr (&Sfind_charset_string);
2132 defsubr (&Smake_char_internal);
2133 defsubr (&Ssplit_char);
2134 defsubr (&Schar_charset);
90d7b74e 2135 defsubr (&Scharset_after);
4ed46869 2136 defsubr (&Siso_charset);
9d3d8cba 2137 defsubr (&Schar_valid_p);
d2665018 2138 defsubr (&Sunibyte_char_to_multibyte);
1bcc1567 2139 defsubr (&Smultibyte_char_to_unibyte);
4ed46869
KH
2140 defsubr (&Schar_bytes);
2141 defsubr (&Schar_width);
2142 defsubr (&Sstring_width);
2143 defsubr (&Schar_direction);
af4fecb4 2144 defsubr (&Schars_in_region);
87b089ad 2145 defsubr (&Sstring);
4ed46869
KH
2146 defsubr (&Scmpcharp);
2147 defsubr (&Scmpchar_component);
2148 defsubr (&Scmpchar_cmp_rule);
2149 defsubr (&Scmpchar_cmp_rule_p);
2150 defsubr (&Scmpchar_cmp_count);
2151 defsubr (&Scompose_string);
2152 defsubr (&Ssetup_special_charsets);
2153
2154 DEFVAR_LISP ("charset-list", &Vcharset_list,
2155 "List of charsets ever defined.");
2156 Vcharset_list = Fcons (Qascii, Qnil);
2157
537efd8d 2158 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
b4e9dd77
KH
2159 "Vector of cons cell of a symbol and translation table ever defined.\n\
2160An ID of a translation table is an index of this vector.");
537efd8d 2161 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
b0e3cf2b 2162
4ed46869
KH
2163 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
2164 "Leading-code of composite characters.");
2165 leading_code_composition = LEADING_CODE_COMPOSITION;
2166
2167 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
2168 "Leading-code of private TYPE9N charset of column-width 1.");
2169 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
2170
2171 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
2172 "Leading-code of private TYPE9N charset of column-width 2.");
2173 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
2174
2175 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
2176 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
2177 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
2178
2179 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
2180 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
2181 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
35e623fb
RS
2182
2183 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
d2665018 2184 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
4cf9710d
RS
2185This is used for converting unibyte text to multibyte,\n\
2186and for inserting character codes specified by number.\n\n\
3e8ceaac
RS
2187This serves to convert a Latin-1 or similar 8-bit character code\n\
2188to the corresponding Emacs multibyte character code.\n\
2189Typically the value should be (- (make-char CHARSET 0) 128),\n\
2190for your choice of character set.\n\
537efd8d 2191If `nonascii-translation-table' is non-nil, it overrides this variable.");
35e623fb 2192 nonascii_insert_offset = 0;
b0e3cf2b 2193
b4e9dd77 2194 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
537efd8d 2195 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
4cf9710d
RS
2196This is used for converting unibyte text to multibyte,\n\
2197and for inserting character codes specified by number.\n\n\
2198Conversion is performed only when multibyte characters are enabled,\n\
2199and it serves to convert a Latin-1 or similar 8-bit character code\n\
2200to the corresponding Emacs character code.\n\n\
da4d65af 2201If this is nil, `nonascii-insert-offset' is used instead.\n\
b4e9dd77
KH
2202See also the docstring of `make-translation-table'.");
2203 Vnonascii_translation_table = Qnil;
4cf9710d 2204
b0e3cf2b
KH
2205 DEFVAR_INT ("min-composite-char", &min_composite_char,
2206 "Minimum character code of a composite character.");
2207 min_composite_char = MIN_CHAR_COMPOSITION;
c1a08b4c
KH
2208
2209 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
2210 "A char-table for characters which invoke auto-filling.\n\
2211Such characters has value t in this table.");
2212 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
60022cb7
AS
2213 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
2214 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
4ed46869
KH
2215}
2216
2217#endif /* emacs */