(timezone-parse-date): Use < 69 not < 70 to distinguish 20YY from 19YY.
[bpt/emacs.git] / src / charset.c
CommitLineData
75c8c592 1/* Basic multilingual character support.
35e623fb 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
75c8c592 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25#include <stdio.h>
26
27#ifdef emacs
28
29#include <sys/types.h>
30#include <config.h>
31#include "lisp.h"
32#include "buffer.h"
33#include "charset.h"
34#include "coding.h"
fc6b09bf 35#include "disptab.h"
4ed46869
KH
36
37#else /* not emacs */
38
39#include "mulelib.h"
40
41#endif /* emacs */
42
43Lisp_Object Qcharset, Qascii, Qcomposition;
0282eb69 44Lisp_Object Qunknown;
4ed46869
KH
45
46/* Declaration of special leading-codes. */
47int leading_code_composition; /* for composite characters */
48int leading_code_private_11; /* for private DIMENSION1 of 1-column */
49int leading_code_private_12; /* for private DIMENSION1 of 2-column */
50int leading_code_private_21; /* for private DIMENSION2 of 1-column */
51int leading_code_private_22; /* for private DIMENSION2 of 2-column */
52
53/* Declaration of special charsets. */
54int charset_ascii; /* ASCII */
55int charset_composition; /* for a composite character */
56int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
57int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
58int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
59int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
60int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
61int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
62int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
63
b0e3cf2b
KH
64int min_composite_char;
65
4ed46869
KH
66Lisp_Object Qcharset_table;
67
68/* A char-table containing information of each character set. */
69Lisp_Object Vcharset_table;
70
71/* A vector of charset symbol indexed by charset-id. This is used
72 only for returning charset symbol from C functions. */
73Lisp_Object Vcharset_symbol_table;
74
75/* A list of charset symbols ever defined. */
76Lisp_Object Vcharset_list;
77
537efd8d
KH
78/* Vector of translation table ever defined.
79 ID of a translation table is used to index this vector. */
80Lisp_Object Vtranslation_table_vector;
b0e3cf2b 81
c1a08b4c
KH
82/* A char-table for characters which may invoke auto-filling. */
83Lisp_Object Vauto_fill_chars;
84
85Lisp_Object Qauto_fill_chars;
86
4ed46869
KH
87/* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
88int bytes_by_char_head[256];
89int width_by_char_head[256];
90
91/* Mapping table from ISO2022's charset (specified by DIMENSION,
92 CHARS, and FINAL-CHAR) to Emacs' charset. */
93int iso_charset_table[2][2][128];
94
513ee442
KH
95/* Table of pointers to the structure `cmpchar_info' indexed by
96 CMPCHAR-ID. */
97struct cmpchar_info **cmpchar_table;
98/* The current size of `cmpchar_table'. */
99static int cmpchar_table_size;
100/* Number of the current composite characters. */
101int n_cmpchars;
102
4ed46869
KH
103/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
104unsigned char *_fetch_multibyte_char_p;
105int _fetch_multibyte_char_len;
106
35e623fb
RS
107/* Offset to add to a non-ASCII value when inserting it. */
108int nonascii_insert_offset;
109
4cf9710d
RS
110/* Translation table for converting non-ASCII unibyte characters
111 to multibyte codes, or nil. */
b4e9dd77 112Lisp_Object Vnonascii_translation_table;
4cf9710d 113
8a73a704
KH
114/* List of all possible generic characters. */
115Lisp_Object Vgeneric_character_list;
116
046b1f03
RS
117#define min(X, Y) ((X) < (Y) ? (X) : (Y))
118#define max(X, Y) ((X) > (Y) ? (X) : (Y))
119\f
93bcb785
KH
120void
121invalid_character (c)
122 int c;
123{
ba7434e5 124 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
93bcb785
KH
125}
126
ac4137cc
KH
127/* Parse string STR of length LENGTH (>= 2) and check if a composite
128 character is at STR. If there is a valid composite character, set
129 CHARSET, C1, and C2 to proper values so that MAKE_CHAR can compose
130 the composite character from them. Otherwise, set CHARSET to
131 CHARSET_COMPOSITION, but set C1 to the second byte of the sequence,
132 C2 to -1 so that MAKE_CHAR can compose the invalid multibyte
133 character whose string representation is two bytes of STR[0] and
134 STR[1]. In any case, set BYTES to LENGTH. */
135
136#define SPLIT_COMPOSITE_SEQ(str, length, bytes, charset, c1, c2) \
137 do { \
138 int cmpchar_id = str_cmpchar_id ((str), (length)); \
139 \
140 (charset) = CHARSET_COMPOSITION; \
141 (bytes) = (length); \
142 if (cmpchar_id >= 0) \
143 { \
144 (c1) = CHAR_FIELD2 (cmpchar_id); \
145 (c2) = CHAR_FIELD3 (cmpchar_id); \
146 } \
147 else \
148 { \
149 (c1) = (str)[1] & 0x7F; \
150 (c2) = -1; \
151 } \
152 } while (0)
153
154/* Parse string STR of length LENGTH (>= 2) and check if a
155 non-composite multibyte character is at STR. Set BYTES to the
156 actual length, CHARSET, C1, and C2 to proper values so that
157 MAKE_CHAR can compose the multibyte character from them. */
158
159#define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \
160 do { \
161 (bytes) = 1; \
162 (charset) = (str)[0]; \
163 if ((charset) >= LEADING_CODE_PRIVATE_11 \
164 && (charset) <= LEADING_CODE_PRIVATE_22) \
165 (charset) = (str)[(bytes)++]; \
166 if ((bytes) < (length)) \
167 { \
168 (c1) = (str)[(bytes)++] & 0x7F; \
169 if ((bytes) < (length)) \
170 (c2) = (str)[(bytes)++] & 0x7F; \
171 else \
172 (c2) = -1; \
173 } \
174 else \
175 (c1) = (c2) = -1; \
176 } while (0)
177
178/* Parse string STR of length LENGTH and check if a multibyte
179 characters is at STR. set BYTES to the actual length, CHARSET, C1,
180 C2 to proper values for that character. */
181
182#define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
183 do { \
184 int i; \
185 for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \
186 if (i == 1) \
187 (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \
188 else if ((str)[0] == LEADING_CODE_COMPOSITION) \
189 SPLIT_COMPOSITE_SEQ (str, i, bytes, charset, c1, c2); \
190 else \
191 { \
192 if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \
193 i = BYTES_BY_CHAR_HEAD ((str)[0]); \
194 SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \
195 } \
196 } while (0)
197
198/* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
199#define CHAR_COMPONENT_VALID_P(charset, c1, c2) \
200 (CHARSET_DIMENSION (charset) == 1 \
201 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
202 : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F))
93bcb785 203
4ed46869
KH
204/* Set STR a pointer to the multi-byte form of the character C. If C
205 is not a composite character, the multi-byte form is set in WORKBUF
206 and STR points WORKBUF. The caller should allocate at least 4-byte
207 area at WORKBUF in advance. Returns the length of the multi-byte
ac4137cc
KH
208 form. If C is an invalid character, store (C & 0xFF) in WORKBUF[0]
209 and return 1.
4ed46869
KH
210
211 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
212 function directly if C can be an ASCII character. */
213
214int
215non_ascii_char_to_string (c, workbuf, str)
216 int c;
217 unsigned char *workbuf, **str;
218{
6662e69b 219 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
8ac5a9cc 220 {
6662e69b
KH
221 /* Multibyte character can't have a modifier bit. */
222 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
223 invalid_character (c);
224
225 /* For Meta, Shift, and Control modifiers, we need special care. */
8ac5a9cc 226 if (c & CHAR_META)
6662e69b
KH
227 {
228 /* Move the meta bit to the right place for a string. */
229 c = (c & ~CHAR_META) | 0x80;
230 }
231 if (c & CHAR_SHIFT)
232 {
233 /* Shift modifier is valid only with [A-Za-z]. */
234 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
235 c &= ~CHAR_SHIFT;
236 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
237 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
238 }
8ac5a9cc 239 if (c & CHAR_CTL)
6662e69b
KH
240 {
241 /* Simulate the code in lread.c. */
242 /* Allow `\C- ' and `\C-?'. */
243 if (c == (CHAR_CTL | ' '))
244 c = 0;
245 else if (c == (CHAR_CTL | '?'))
246 c = 127;
247 /* ASCII control chars are made from letters (both cases),
248 as well as the non-letters within 0100...0137. */
249 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
250 c &= (037 | (~0177 & ~CHAR_CTL));
251 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
252 c &= (037 | (~0177 & ~CHAR_CTL));
253 }
254
255 /* If C still has any modifier bits, it is an invalid character. */
256 if (c & CHAR_MODIFIER_MASK)
257 invalid_character (c);
258
8ac5a9cc 259 *str = workbuf;
ac4137cc 260 *workbuf++ = c;
8ac5a9cc 261 }
ac4137cc 262 else
4ed46869 263 {
ac4137cc 264 int charset, c1, c2;
4ed46869 265
ac4137cc
KH
266 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
267 if (charset == CHARSET_COMPOSITION)
4ed46869 268 {
ac4137cc
KH
269 if (c >= MAX_CHAR)
270 invalid_character (c);
271 if (c >= MIN_CHAR_COMPOSITION)
272 {
273 /* Valid composite character. */
274 *str = cmpchar_table[COMPOSITE_CHAR_ID (c)]->data;
275 workbuf = *str + cmpchar_table[COMPOSITE_CHAR_ID (c)]->len;
276 }
277 else
278 {
279 /* Invalid but can have multibyte form. */
280 *str = workbuf;
281 *workbuf++ = LEADING_CODE_COMPOSITION;
282 *workbuf++ = c1 | 0x80;
283 }
4ed46869 284 }
ac4137cc 285 else if (charset > CHARSET_COMPOSITION)
4ed46869 286 {
ac4137cc
KH
287 *str = workbuf;
288 if (charset >= LEADING_CODE_EXT_11)
289 *workbuf++ = (charset < LEADING_CODE_EXT_12
290 ? LEADING_CODE_PRIVATE_11
291 : (charset < LEADING_CODE_EXT_21
292 ? LEADING_CODE_PRIVATE_12
293 : (charset < LEADING_CODE_EXT_22
294 ? LEADING_CODE_PRIVATE_21
295 : LEADING_CODE_PRIVATE_22)));
296 *workbuf++ = charset;
297 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
298 invalid_character (c);
299 if (c1)
300 {
301 *workbuf++ = c1 | 0x80;
302 if (c2 > 0)
303 *workbuf++ = c2 | 0x80;
304 }
4ed46869 305 }
ac4137cc
KH
306 else if (charset == CHARSET_ASCII)
307 *workbuf++= c & 0x7F;
308 else
309 invalid_character (c);
4ed46869
KH
310 }
311
4ed46869
KH
312 return (workbuf - *str);
313}
314
315/* Return a non-ASCII character of which multi-byte form is at STR of
ac4137cc 316 length LEN. If ACTUAL_LEN is not NULL, the byte length of the
537efd8d
KH
317 multibyte form is set to the address ACTUAL_LEN.
318
4ed46869
KH
319 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
320 directly if STR can hold an ASCII character. */
321
dfcf069d 322int
ac4137cc 323string_to_non_ascii_char (str, len, actual_len)
8867de67 324 const unsigned char *str;
ac4137cc 325 int len, *actual_len;
4ed46869 326{
ac4137cc 327 int c, bytes, charset, c1, c2;
4ed46869 328
ac4137cc
KH
329 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
330 c = MAKE_CHAR (charset, c1, c2);
4ed46869 331 if (actual_len)
ac4137cc 332 *actual_len = bytes;
4ed46869
KH
333 return c;
334}
335
336/* Return the length of the multi-byte form at string STR of length LEN. */
337int
338multibyte_form_length (str, len)
8867de67 339 const unsigned char *str;
4ed46869
KH
340 int len;
341{
ac4137cc 342 int bytes;
4ed46869 343
ac4137cc 344 PARSE_MULTIBYTE_SEQ (str, len, bytes);
90d7b74e 345 return bytes;
4ed46869
KH
346}
347
ac4137cc
KH
348/* Check multibyte form at string STR of length LEN and set variables
349 pointed by CHARSET, C1, and C2 to charset and position codes of the
350 character at STR, and return 0. If there's no multibyte character,
4ed46869
KH
351 return -1. This should be used only in the macro SPLIT_STRING
352 which checks range of STR in advance. */
353
dfcf069d 354int
4ed46869 355split_non_ascii_string (str, len, charset, c1, c2)
ac4137cc
KH
356 const unsigned char *str;
357 unsigned char *c1, *c2;
358 int len, *charset;
4ed46869 359{
ac4137cc 360 register int bytes, cs, code1, code2 = -1;
4ed46869 361
ac4137cc
KH
362 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
363 if (cs == CHARSET_ASCII)
4ed46869 364 return -1;
ac4137cc
KH
365 *charset = cs;
366 *c1 = code1;
367 *c2 = code2;
368}
369
370/* Return 1 iff character C has valid printable glyph. */
371int
372char_printable_p (c)
373 int c;
374{
375 int charset, c1, c2, chars;
376
377 if (SINGLE_BYTE_CHAR_P (c))
378 return 1;
379 if (c >= MIN_CHAR_COMPOSITION)
380 return (c < MAX_CHAR);
381
382 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
383 if (! CHARSET_DEFINED_P (charset))
384 return 0;
385 if (CHARSET_CHARS (charset) == 94
386 ? c1 <= 32 || c1 >= 127
387 : c1 < 32)
388 return 0;
389 if (CHARSET_DIMENSION (charset) == 2
390 && (CHARSET_CHARS (charset) == 94
391 ? c2 <= 32 || c2 >= 127
392 : c2 < 32))
393 return 0;
394 return 1;
4ed46869
KH
395}
396
537efd8d 397/* Translate character C by translation table TABLE. If C
b4e9dd77
KH
398 is negative, translate a character specified by CHARSET, C1, and C2
399 (C1 and C2 are code points of the character). If no translation is
400 found in TABLE, return C. */
dfcf069d 401int
b4e9dd77 402translate_char (table, c, charset, c1, c2)
23d2a7f1
KH
403 Lisp_Object table;
404 int c, charset, c1, c2;
405{
406 Lisp_Object ch;
407 int alt_charset, alt_c1, alt_c2, dimension;
408
409 if (c < 0) c = MAKE_CHAR (charset, c1, c2);
410 if (!CHAR_TABLE_P (table)
ac4137cc 411 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
23d2a7f1
KH
412 return c;
413
414 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
415 dimension = CHARSET_DIMENSION (alt_charset);
416 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
417 /* CH is not a generic character, just return it. */
418 return XFASTINT (ch);
419
420 /* Since CH is a generic character, we must return a specific
421 charater which has the same position codes as C from CH. */
422 if (charset < 0)
423 SPLIT_CHAR (c, charset, c1, c2);
424 if (dimension != CHARSET_DIMENSION (charset))
425 /* We can't make such a character because of dimension mismatch. */
426 return c;
23d2a7f1
KH
427 return MAKE_CHAR (alt_charset, c1, c2);
428}
429
d2665018 430/* Convert the unibyte character C to multibyte based on
b4e9dd77 431 Vnonascii_translation_table or nonascii_insert_offset. If they can't
d2665018
KH
432 convert C to a valid multibyte character, convert it based on
433 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
35e623fb 434
dfcf069d 435int
35e623fb
RS
436unibyte_char_to_multibyte (c)
437 int c;
438{
543b4f61 439 if (c < 0400 && c >= 0200)
35e623fb 440 {
d2665018
KH
441 int c_save = c;
442
b4e9dd77 443 if (! NILP (Vnonascii_translation_table))
bbf12bb3
KH
444 {
445 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
ac4137cc 446 if (c >= 0400 && ! char_valid_p (c, 0))
bbf12bb3
KH
447 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
448 }
449 else if (c >= 0240 && nonascii_insert_offset > 0)
450 {
451 c += nonascii_insert_offset;
ac4137cc 452 if (c < 0400 || ! char_valid_p (c, 0))
bbf12bb3
KH
453 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
454 }
455 else if (c >= 0240)
d2665018 456 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
35e623fb
RS
457 }
458 return c;
459}
76d7b829
KH
460
461
462/* Convert the multibyte character C to unibyte 8-bit character based
463 on Vnonascii_translation_table or nonascii_insert_offset. If
464 REV_TBL is non-nil, it should be a reverse table of
465 Vnonascii_translation_table, i.e. what given by:
466 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
467
468int
469multibyte_char_to_unibyte (c, rev_tbl)
470 int c;
471 Lisp_Object rev_tbl;
472{
473 if (!SINGLE_BYTE_CHAR_P (c))
474 {
475 int c_save = c;
476
477 if (! CHAR_TABLE_P (rev_tbl)
478 && CHAR_TABLE_P (Vnonascii_translation_table))
479 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
480 make_number (0));
481 if (CHAR_TABLE_P (rev_tbl))
482 {
483 Lisp_Object temp;
484 temp = Faref (rev_tbl, make_number (c));
485 if (INTEGERP (temp))
486 c = XINT (temp);
bbf12bb3
KH
487 if (c >= 256)
488 c = (c_save & 0177) + 0200;
489 }
490 else
491 {
492 if (nonascii_insert_offset > 0)
493 c -= nonascii_insert_offset;
494 if (c < 128 || c >= 256)
495 c = (c_save & 0177) + 0200;
76d7b829 496 }
76d7b829
KH
497 }
498
499 return c;
500}
501
35e623fb 502\f
4ed46869
KH
503/* Update the table Vcharset_table with the given arguments (see the
504 document of `define-charset' for the meaning of each argument).
505 Several other table contents are also updated. The caller should
506 check the validity of CHARSET-ID and the remaining arguments in
507 advance. */
508
509void
510update_charset_table (charset_id, dimension, chars, width, direction,
511 iso_final_char, iso_graphic_plane,
512 short_name, long_name, description)
513 Lisp_Object charset_id, dimension, chars, width, direction;
514 Lisp_Object iso_final_char, iso_graphic_plane;
515 Lisp_Object short_name, long_name, description;
516{
517 int charset = XINT (charset_id);
518 int bytes;
519 unsigned char leading_code_base, leading_code_ext;
520
6dc0722d
KH
521 if (NILP (CHARSET_TABLE_ENTRY (charset)))
522 CHARSET_TABLE_ENTRY (charset)
523 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
4ed46869
KH
524
525 /* Get byte length of multibyte form, base leading-code, and
526 extended leading-code of the charset. See the comment under the
527 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
528 bytes = XINT (dimension);
529 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
530 {
531 /* Official charset, it doesn't have an extended leading-code. */
532 if (charset != CHARSET_ASCII)
533 bytes += 1; /* For a base leading-code. */
534 leading_code_base = charset;
535 leading_code_ext = 0;
536 }
537 else
538 {
539 /* Private charset. */
540 bytes += 2; /* For base and extended leading-codes. */
541 leading_code_base
542 = (charset < LEADING_CODE_EXT_12
543 ? LEADING_CODE_PRIVATE_11
544 : (charset < LEADING_CODE_EXT_21
545 ? LEADING_CODE_PRIVATE_12
546 : (charset < LEADING_CODE_EXT_22
547 ? LEADING_CODE_PRIVATE_21
548 : LEADING_CODE_PRIVATE_22)));
549 leading_code_ext = charset;
550 }
551
6ef23ebb
KH
552 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
553 error ("Invalid dimension for the charset-ID %d", charset);
554
4ed46869
KH
555 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
556 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
557 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
558 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
559 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
560 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
561 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
562 = make_number (leading_code_base);
563 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
564 = make_number (leading_code_ext);
565 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
566 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
567 = iso_graphic_plane;
568 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
569 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
570 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
571 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
572
573 {
574 /* If we have already defined a charset which has the same
575 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
576 DIRECTION, we must update the entry REVERSE-CHARSET of both
577 charsets. If there's no such charset, the value of the entry
578 is set to nil. */
579 int i;
580
513ee442 581 for (i = 0; i <= MAX_CHARSET; i++)
4ed46869
KH
582 if (!NILP (CHARSET_TABLE_ENTRY (i)))
583 {
584 if (CHARSET_DIMENSION (i) == XINT (dimension)
585 && CHARSET_CHARS (i) == XINT (chars)
586 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
587 && CHARSET_DIRECTION (i) != XINT (direction))
588 {
589 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
590 = make_number (i);
591 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
592 break;
593 }
594 }
513ee442 595 if (i > MAX_CHARSET)
4ed46869
KH
596 /* No such a charset. */
597 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
598 = make_number (-1);
599 }
600
601 if (charset != CHARSET_ASCII
602 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
603 {
4ed46869
KH
604 width_by_char_head[leading_code_base] = XINT (width);
605
606 /* Update table emacs_code_class. */
607 emacs_code_class[charset] = (bytes == 2
608 ? EMACS_leading_code_2
609 : (bytes == 3
610 ? EMACS_leading_code_3
611 : EMACS_leading_code_4));
612 }
613
614 /* Update table iso_charset_table. */
615 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
616 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
617}
618
619#ifdef emacs
620
621/* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
622 is invalid. */
623int
624get_charset_id (charset_symbol)
625 Lisp_Object charset_symbol;
626{
627 Lisp_Object val;
628 int charset;
629
630 return ((SYMBOLP (charset_symbol)
631 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
632 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
633 CHARSET_VALID_P (charset)))
634 ? charset : -1);
635}
636
637/* Return an identification number for a new private charset of
638 DIMENSION and WIDTH. If there's no more room for the new charset,
639 return 0. */
640Lisp_Object
641get_new_private_charset_id (dimension, width)
642 int dimension, width;
643{
644 int charset, from, to;
645
646 if (dimension == 1)
647 {
648 if (width == 1)
649 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
650 else
651 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
652 }
653 else
654 {
655 if (width == 1)
656 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
657 else
b0e3cf2b 658 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
4ed46869
KH
659 }
660
661 for (charset = from; charset < to; charset++)
662 if (!CHARSET_DEFINED_P (charset)) break;
663
664 return make_number (charset < to ? charset : 0);
665}
666
667DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
668 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
23d2a7f1 669If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
4ed46869
KH
670 treated as a private charset.\n\
671INFO-VECTOR is a vector of the format:\n\
672 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
673 SHORT-NAME LONG-NAME DESCRIPTION]\n\
674The meanings of each elements is as follows:\n\
675DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
676CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
677WIDTH (integer) is the number of columns a character in the charset\n\
678occupies on the screen: one of 0, 1, and 2.\n\
679\n\
680DIRECTION (integer) is the rendering direction of characters in the\n\
277576f6
KH
681charset when rendering. If 0, render from left to right, else\n\
682render from right to left.\n\
4ed46869
KH
683\n\
684ISO-FINAL-CHAR (character) is the final character of the\n\
685corresponding ISO 2022 charset.\n\
686\n\
687ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
688while encoding to variants of ISO 2022 coding system, one of the\n\
689following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
690\n\
691SHORT-NAME (string) is the short name to refer to the charset.\n\
692\n\
693LONG-NAME (string) is the long name to refer to the charset.\n\
694\n\
695DESCRIPTION (string) is the description string of the charset.")
696 (charset_id, charset_symbol, info_vector)
697 Lisp_Object charset_id, charset_symbol, info_vector;
698{
699 Lisp_Object *vec;
700
701 if (!NILP (charset_id))
702 CHECK_NUMBER (charset_id, 0);
703 CHECK_SYMBOL (charset_symbol, 1);
704 CHECK_VECTOR (info_vector, 2);
705
706 if (! NILP (charset_id))
707 {
708 if (! CHARSET_VALID_P (XINT (charset_id)))
709 error ("Invalid CHARSET: %d", XINT (charset_id));
710 else if (CHARSET_DEFINED_P (XINT (charset_id)))
711 error ("Already defined charset: %d", XINT (charset_id));
712 }
713
714 vec = XVECTOR (info_vector)->contents;
715 if (XVECTOR (info_vector)->size != 9
716 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
717 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
718 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
719 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
720 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
721 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
722 || !STRINGP (vec[6])
723 || !STRINGP (vec[7])
724 || !STRINGP (vec[8]))
725 error ("Invalid info-vector argument for defining charset %s",
726 XSYMBOL (charset_symbol)->name->data);
727
728 if (NILP (charset_id))
729 {
730 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
731 if (XINT (charset_id) == 0)
732 error ("There's no room for a new private charset %s",
733 XSYMBOL (charset_symbol)->name->data);
734 }
735
736 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
737 vec[4], vec[5], vec[6], vec[7], vec[8]);
6dc0722d 738 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
4ed46869
KH
739 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
740 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
741 return Qnil;
742}
743
8a73a704
KH
744DEFUN ("generic-character-list", Fgeneric_character_list,
745 Sgeneric_character_list, 0, 0, 0,
746 "Return a list of all possible generic characters.\n\
747It includes a generic character for a charset not yet defined.")
748 ()
749{
750 return Vgeneric_character_list;
751}
752
3fac5a51
KH
753DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
754 Sget_unused_iso_final_char, 2, 2, 0,
755 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
756DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
757CHARS is the number of characters in a dimension: 94 or 96.\n\
758\n\
759This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
760If there's no unused final char for the specified kind of charset,\n\
761return nil.")
762 (dimension, chars)
763 Lisp_Object dimension, chars;
764{
765 int final_char;
766
767 CHECK_NUMBER (dimension, 0);
768 CHECK_NUMBER (chars, 1);
769 if (XINT (dimension) != 1 && XINT (dimension) != 2)
770 error ("Invalid charset dimension %d, it should be 1 or 2",
771 XINT (dimension));
772 if (XINT (chars) != 94 && XINT (chars) != 96)
773 error ("Invalid charset chars %d, it should be 94 or 96",
774 XINT (chars));
775 for (final_char = '0'; final_char <= '?'; final_char++)
776 {
777 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
778 break;
779 }
780 return (final_char <= '?' ? make_number (final_char) : Qnil);
781}
782
4ed46869
KH
783DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
784 4, 4, 0,
785 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
786CHARSET should be defined by `defined-charset' in advance.")
787 (dimension, chars, final_char, charset_symbol)
788 Lisp_Object dimension, chars, final_char, charset_symbol;
789{
790 int charset;
791
792 CHECK_NUMBER (dimension, 0);
793 CHECK_NUMBER (chars, 1);
794 CHECK_NUMBER (final_char, 2);
795 CHECK_SYMBOL (charset_symbol, 3);
796
797 if (XINT (dimension) != 1 && XINT (dimension) != 2)
798 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
799 if (XINT (chars) != 94 && XINT (chars) != 96)
800 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
801 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
802 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
803 if ((charset = get_charset_id (charset_symbol)) < 0)
804 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
805
806 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
807 return Qnil;
808}
809
810/* Return number of different charsets in STR of length LEN. In
811 addition, for each found charset N, CHARSETS[N] is set 1. The
a29e3b1b 812 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
1d67c29b
KH
813 It may lookup a translation table TABLE if supplied.
814
815 If CMPCHARP is nonzero and some composite character is found,
816 CHARSETS[128] is also set 1 and the returned number is incremented
0282eb69
KH
817 by 1.
818
819 If MULTIBYTE is zero, do not check multibyte characters, i.e. if
820 any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
821 8-bit codes are found CHARSET[1] is set to 1. */
4ed46869
KH
822
823int
0282eb69 824find_charset_in_str (str, len, charsets, table, cmpcharp, multibyte)
028d516b
KH
825 unsigned char *str;
826 int len, *charsets;
23d2a7f1 827 Lisp_Object table;
1d67c29b 828 int cmpcharp;
0282eb69 829 int multibyte;
4ed46869 830{
733eafd8 831 register int num = 0, c;
4ed46869 832
0282eb69
KH
833 if (! multibyte)
834 {
835 unsigned char *endp = str + len;
836 int maskbits = 0;
837
838 while (str < endp && maskbits != 3)
839 maskbits |= (*str++ < 0x80 ? 1 : 2);
840 if (maskbits & 1)
841 {
842 charsets[0] = 1;
843 num++;
844 }
845 if (maskbits & 2)
846 {
847 charsets[1] = 1;
848 num++;
849 }
850 return num;
851 }
852
23d2a7f1
KH
853 if (! CHAR_TABLE_P (table))
854 table = Qnil;
855
4ed46869
KH
856 while (len > 0)
857 {
05505664 858 int bytes, charset;
733eafd8 859 c = *str;
23d2a7f1 860
733eafd8 861 if (c == LEADING_CODE_COMPOSITION)
05505664 862 {
733eafd8
KH
863 int cmpchar_id = str_cmpchar_id (str, len);
864 GLYPH *glyph;
05505664 865
1d67c29b 866 if (cmpchar_id >= 0)
05505664 867 {
020da460 868 struct cmpchar_info *cmp_p = cmpchar_table[cmpchar_id];
733eafd8
KH
869 int i;
870
020da460 871 for (i = 0; i < cmp_p->glyph_len; i++)
733eafd8 872 {
020da460 873 c = cmp_p->glyph[i];
733eafd8
KH
874 if (!NILP (table))
875 {
b4e9dd77 876 if ((c = translate_char (table, c, 0, 0, 0)) < 0)
020da460 877 c = cmp_p->glyph[i];
733eafd8
KH
878 }
879 if ((charset = CHAR_CHARSET (c)) < 0)
880 charset = CHARSET_ASCII;
881 if (!charsets[charset])
882 {
883 charsets[charset] = 1;
884 num += 1;
885 }
886 }
020da460
KH
887 str += cmp_p->len;
888 len -= cmp_p->len;
889 if (cmpcharp && !charsets[CHARSET_COMPOSITION])
890 {
891 charsets[CHARSET_COMPOSITION] = 1;
892 num += 1;
893 }
733eafd8 894 continue;
05505664 895 }
05505664 896
0282eb69 897 charset = 1; /* This leads to `unknown' charset. */
733eafd8
KH
898 bytes = 1;
899 }
23d2a7f1
KH
900 else
901 {
733eafd8
KH
902 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
903 if (! NILP (table))
904 {
b4e9dd77 905 int c1 = translate_char (table, c, 0, 0, 0);
733eafd8
KH
906 if (c1 >= 0)
907 c = c1;
908 }
909 charset = CHAR_CHARSET (c);
23d2a7f1 910 }
4ed46869
KH
911
912 if (!charsets[charset])
913 {
914 charsets[charset] = 1;
915 num += 1;
916 }
917 str += bytes;
918 len -= bytes;
919 }
920 return num;
921}
922
923DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
23d2a7f1 924 2, 3, 0,
4ed46869 925 "Return a list of charsets in the region between BEG and END.\n\
23d2a7f1 926BEG and END are buffer positions.\n\
020da460
KH
927If the region contains any composite character,\n\
928`composition' is included in the returned list.\n\
0282eb69
KH
929Optional arg TABLE if non-nil is a translation table to look up.\n\
930\n\
931If the region contains invalid multiybte characters,\n\
38f02ede 932`unknown' is included in the returned list.\n\
0282eb69
KH
933\n\
934If the current buffer is unibyte, the returned list contains\n\
935`ascii' if any 7-bit characters are found,\n\
936and `unknown' if any 8-bit characters are found.")
23d2a7f1
KH
937 (beg, end, table)
938 Lisp_Object beg, end, table;
4ed46869 939{
028d516b 940 int charsets[MAX_CHARSET + 1];
6ae1f27e 941 int from, from_byte, to, stop, stop_byte, i;
4ed46869 942 Lisp_Object val;
0282eb69
KH
943 int undefined;
944 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4ed46869
KH
945
946 validate_region (&beg, &end);
947 from = XFASTINT (beg);
948 stop = to = XFASTINT (end);
6ae1f27e 949
4ed46869 950 if (from < GPT && GPT < to)
6ae1f27e
RS
951 {
952 stop = GPT;
953 stop_byte = GPT_BYTE;
954 }
955 else
956 stop_byte = CHAR_TO_BYTE (stop);
957
958 from_byte = CHAR_TO_BYTE (from);
959
028d516b 960 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
4ed46869
KH
961 while (1)
962 {
6ae1f27e 963 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
0282eb69 964 charsets, table, 1, multibyte);
4ed46869 965 if (stop < to)
6ae1f27e
RS
966 {
967 from = stop, from_byte = stop_byte;
968 stop = to, stop_byte = CHAR_TO_BYTE (stop);
969 }
4ed46869
KH
970 else
971 break;
972 }
6ae1f27e 973
4ed46869 974 val = Qnil;
0282eb69
KH
975 undefined = 0;
976 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
4ed46869 977 if (charsets[i])
0282eb69
KH
978 {
979 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
980 val = Fcons (CHARSET_SYMBOL (i), val);
981 else
982 undefined = 1;
983 }
984 if (undefined)
985 val = Fcons (Qunknown, val);
4ed46869
KH
986 return val;
987}
988
989DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
23d2a7f1
KH
990 1, 2, 0,
991 "Return a list of charsets in STR.\n\
020da460
KH
992If the string contains any composite characters,\n\
993`composition' is included in the returned list.\n\
0282eb69
KH
994Optional arg TABLE if non-nil is a translation table to look up.\n\
995\n\
996If the region contains invalid multiybte characters,\n\
997`unknown' is included in the returned list.\n\
998\n\
999If STR is unibyte, the returned list contains\n\
1000`ascii' if any 7-bit characters are found,\n\
1001and `unknown' if any 8-bit characters are found.")
23d2a7f1
KH
1002 (str, table)
1003 Lisp_Object str, table;
4ed46869 1004{
a29e3b1b 1005 int charsets[MAX_CHARSET + 1];
4ed46869
KH
1006 int i;
1007 Lisp_Object val;
0282eb69
KH
1008 int undefined;
1009 int multibyte;
4ed46869
KH
1010
1011 CHECK_STRING (str, 0);
0282eb69 1012 multibyte = STRING_MULTIBYTE (str);
87b089ad 1013
a29e3b1b 1014 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
fc932ac6 1015 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
0282eb69 1016 charsets, table, 1, multibyte);
4ed46869 1017 val = Qnil;
0282eb69
KH
1018 undefined = 0;
1019 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
4ed46869 1020 if (charsets[i])
0282eb69
KH
1021 {
1022 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
1023 val = Fcons (CHARSET_SYMBOL (i), val);
1024 else
1025 undefined = 1;
1026 }
1027 if (undefined)
1028 val = Fcons (Qunknown, val);
4ed46869
KH
1029 return val;
1030}
1031\f
1032DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
513ee442 1033 "")
4ed46869
KH
1034 (charset, code1, code2)
1035 Lisp_Object charset, code1, code2;
1036{
ac4137cc
KH
1037 int charset_id, c1, c2;
1038
4ed46869 1039 CHECK_NUMBER (charset, 0);
ac4137cc
KH
1040 charset_id = XINT (charset);
1041 if (!CHARSET_DEFINED_P (charset_id))
1042 error ("Invalid charset ID: %d", XINT (charset));
4ed46869
KH
1043
1044 if (NILP (code1))
ac4137cc 1045 c1 = 0;
4ed46869 1046 else
ac4137cc
KH
1047 {
1048 CHECK_NUMBER (code1, 1);
1049 c1 = XINT (code1);
1050 }
4ed46869 1051 if (NILP (code2))
ac4137cc 1052 c2 = 0;
4ed46869 1053 else
ac4137cc
KH
1054 {
1055 CHECK_NUMBER (code2, 2);
1056 c2 = XINT (code2);
1057 }
4ed46869 1058
ac4137cc
KH
1059 if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1060 error ("Invalid code points: %d %d", c1, c2);
1061 c1 &= 0x7F;
1062 c2 &= 0x7F;
1063 if (c1 == 0
1064 ? c2 != 0
1065 : (c2 == 0
1066 ? !CHAR_COMPONENT_VALID_P (charset, c1, 0x20)
1067 : !CHAR_COMPONENT_VALID_P (charset, c1, c2)))
1068 error ("Invalid code points: %d %d", c1, c2);
1069
1070 return make_number (MAKE_CHAR (charset_id, c1, c2));
4ed46869
KH
1071}
1072
1073DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
0282eb69
KH
1074 "Return list of charset and one or two position-codes of CHAR.\n\
1075If CHAR is invalid as a character code,\n\
1076return a list of symbol `unknown' and CHAR.")
4ed46869
KH
1077 (ch)
1078 Lisp_Object ch;
1079{
1080 Lisp_Object val;
0282eb69 1081 int c, charset, c1, c2;
4ed46869
KH
1082
1083 CHECK_NUMBER (ch, 0);
0282eb69
KH
1084 c = XFASTINT (ch);
1085 if (!CHAR_VALID_P (c, 1))
1086 return Fcons (Qunknown, Fcons (ch, Qnil));
4ed46869 1087 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6dc0722d 1088 return (c2 >= 0
4ed46869
KH
1089 ? Fcons (CHARSET_SYMBOL (charset),
1090 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1091 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1092}
1093
1094DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1095 "Return charset of CHAR.")
1096 (ch)
1097 Lisp_Object ch;
1098{
1099 CHECK_NUMBER (ch, 0);
1100
1101 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1102}
1103
90d7b74e 1104DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
ac4137cc 1105 "Return charset of a character in the current buffer at position POS.\n\
e6e114f2
KH
1106If POS is nil, it defauls to the current point.\n\
1107If POS is out of range, the value is nil.")
90d7b74e
KH
1108 (pos)
1109 Lisp_Object pos;
1110{
ac4137cc 1111 register int pos_byte, bytes, charset, c1, c2;
90d7b74e
KH
1112 register unsigned char *p;
1113
1114 if (NILP (pos))
1115 pos_byte = PT_BYTE;
1116 else if (MARKERP (pos))
e6e114f2
KH
1117 {
1118 pos_byte = marker_byte_position (pos);
1119 if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1120 return Qnil;
1121 }
90d7b74e
KH
1122 else
1123 {
1124 CHECK_NUMBER (pos, 0);
e6e114f2
KH
1125 if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1126 return Qnil;
90d7b74e
KH
1127 pos_byte = CHAR_TO_BYTE (XINT (pos));
1128 }
1129 p = BYTE_POS_ADDR (pos_byte);
ac4137cc
KH
1130 if (BASE_LEADING_CODE_P (*p))
1131 {
1132 SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1133 if (charset < 0)
1134 charset = 1;
1135 }
1136 else
1137 charset = CHARSET_ASCII;
1138
90d7b74e
KH
1139 return CHARSET_SYMBOL (charset);
1140}
1141
4ed46869 1142DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2b71bb78
KH
1143 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1144\n\
1145ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1146by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1147where as Emacs distinguishes them by charset symbol.\n\
1148See the documentation of the function `charset-info' for the meanings of\n\
1149DIMENSION, CHARS, and FINAL-CHAR.")
4ed46869
KH
1150 (dimension, chars, final_char)
1151 Lisp_Object dimension, chars, final_char;
1152{
1153 int charset;
1154
1155 CHECK_NUMBER (dimension, 0);
1156 CHECK_NUMBER (chars, 1);
1157 CHECK_NUMBER (final_char, 2);
1158
1159 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1160 return Qnil;
1161 return CHARSET_SYMBOL (charset);
1162}
1163
9d3d8cba
KH
1164/* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1165 generic character. If GENERICP is zero, return nonzero iff C is a
1166 valid normal character. Do not call this function directly,
1167 instead use macro CHAR_VALID_P. */
1168int
1169char_valid_p (c, genericp)
1170 int c, genericp;
1171{
1172 int charset, c1, c2;
1173
1174 if (c < 0)
1175 return 0;
1176 if (SINGLE_BYTE_CHAR_P (c))
1177 return 1;
1178 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
ac4137cc
KH
1179 if (charset == CHARSET_COMPOSITION)
1180 return ((c >= MIN_CHAR_COMPOSITION
1181 && c < MIN_CHAR_COMPOSITION + n_cmpchars)
1182 || (genericp && c == GENERIC_COMPOSITION_CHAR));
1183 if (genericp)
1184 {
1185 if (c1)
1186 {
1187 if (c2 <= 0) c2 = 0x20;
1188 }
1189 else
1190 {
1191 if (c2 <= 0) c1 = c2 = 0x20;
1192 }
1193 }
1194 return (CHARSET_DEFINED_P (charset)
1195 && CHAR_COMPONENT_VALID_P (charset, c1, c2));
9d3d8cba
KH
1196}
1197
1198DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
a9d02884
DL
1199 "Return t if OBJECT is a valid normal character.\n\
1200If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
9d3d8cba
KH
1201a valid generic character.")
1202 (object, genericp)
1203 Lisp_Object object, genericp;
1204{
1205 if (! NATNUMP (object))
1206 return Qnil;
1207 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1208}
1209
d2665018
KH
1210DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1211 Sunibyte_char_to_multibyte, 1, 1, 0,
1212 "Convert the unibyte character CH to multibyte character.\n\
537efd8d 1213The conversion is done based on `nonascii-translation-table' (which see)\n\
340b8d58 1214 or `nonascii-insert-offset' (which see).")
d2665018
KH
1215 (ch)
1216 Lisp_Object ch;
1217{
1218 int c;
1219
1220 CHECK_NUMBER (ch, 0);
1221 c = XINT (ch);
1222 if (c < 0 || c >= 0400)
1223 error ("Invalid unibyte character: %d", c);
1224 c = unibyte_char_to_multibyte (c);
1225 if (c < 0)
1226 error ("Can't convert to multibyte character: %d", XINT (ch));
1227 return make_number (c);
1228}
1229
1bcc1567
RS
1230DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1231 Smultibyte_char_to_unibyte, 1, 1, 0,
1232 "Convert the multibyte character CH to unibyte character.\n\
1233The conversion is done based on `nonascii-translation-table' (which see)\n\
1234 or `nonascii-insert-offset' (which see).")
1235 (ch)
1236 Lisp_Object ch;
1237{
1238 int c;
1239
1240 CHECK_NUMBER (ch, 0);
1241 c = XINT (ch);
ac4137cc 1242 if (! CHAR_VALID_P (c, 0))
1bcc1567
RS
1243 error ("Invalid multibyte character: %d", c);
1244 c = multibyte_char_to_unibyte (c, Qnil);
1245 if (c < 0)
1246 error ("Can't convert to unibyte character: %d", XINT (ch));
1247 return make_number (c);
1248}
1249
4ed46869 1250DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
f78643ef 1251 "Return 1 regardless of the argument CHAR.\n\
60022cb7 1252This is now an obsolete function. We keep it just for backward compatibility.")
4ed46869
KH
1253 (ch)
1254 Lisp_Object ch;
1255{
1256 Lisp_Object val;
4ed46869
KH
1257
1258 CHECK_NUMBER (ch, 0);
9b6a601f
KH
1259 return make_number (1);
1260}
1261
1262/* Return how many bytes C will occupy in a multibyte buffer.
1263 Don't call this function directly, instead use macro CHAR_BYTES. */
1264int
1265char_bytes (c)
1266 int c;
1267{
1268 int bytes;
1269
8ac5a9cc
KH
1270 if (SINGLE_BYTE_CHAR_P (c) || (c & ~GLYPH_MASK_CHAR))
1271 return 1;
1272
9b6a601f 1273 if (COMPOSITE_CHAR_P (c))
4ed46869 1274 {
9b6a601f 1275 unsigned int id = COMPOSITE_CHAR_ID (c);
4ed46869
KH
1276
1277 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
1278 }
1279 else
1280 {
9b6a601f 1281 int charset = CHAR_CHARSET (c);
4ed46869
KH
1282
1283 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
1284 }
1285
60022cb7 1286 return bytes;
4ed46869
KH
1287}
1288
1289/* Return the width of character of which multi-byte form starts with
1290 C. The width is measured by how many columns occupied on the
1291 screen when displayed in the current buffer. */
1292
1293#define ONE_BYTE_CHAR_WIDTH(c) \
1294 (c < 0x20 \
1295 ? (c == '\t' \
53316e55 1296 ? XFASTINT (current_buffer->tab_width) \
4ed46869
KH
1297 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1298 : (c < 0x7f \
1299 ? 1 \
1300 : (c == 0x7F \
1301 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1302 : ((! NILP (current_buffer->enable_multibyte_characters) \
1303 && BASE_LEADING_CODE_P (c)) \
1304 ? WIDTH_BY_CHAR_HEAD (c) \
b4e9dd77 1305 : 4))))
4ed46869
KH
1306
1307DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1308 "Return width of CHAR when displayed in the current buffer.\n\
1309The width is measured by how many columns it occupies on the screen.")
1310 (ch)
1311 Lisp_Object ch;
1312{
859f2b3c 1313 Lisp_Object val, disp;
4ed46869 1314 int c;
51c4025f 1315 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869
KH
1316
1317 CHECK_NUMBER (ch, 0);
1318
859f2b3c
RS
1319 c = XINT (ch);
1320
1321 /* Get the way the display table would display it. */
51c4025f 1322 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
859f2b3c
RS
1323
1324 if (VECTORP (disp))
1325 XSETINT (val, XVECTOR (disp)->size);
1326 else if (SINGLE_BYTE_CHAR_P (c))
1327 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
4ed46869
KH
1328 else if (COMPOSITE_CHAR_P (c))
1329 {
1330 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
0282eb69 1331 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 1));
4ed46869
KH
1332 }
1333 else
1334 {
1335 int charset = CHAR_CHARSET (c);
1336
1337 XSETFASTINT (val, CHARSET_WIDTH (charset));
1338 }
1339 return val;
1340}
1341
1342/* Return width of string STR of length LEN when displayed in the
1343 current buffer. The width is measured by how many columns it
1344 occupies on the screen. */
859f2b3c 1345
4ed46869
KH
1346int
1347strwidth (str, len)
1348 unsigned char *str;
1349 int len;
1350{
1351 unsigned char *endp = str + len;
1352 int width = 0;
c4a4e28f 1353 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869 1354
859f2b3c
RS
1355 while (str < endp)
1356 {
1357 if (*str == LEADING_CODE_COMPOSITION)
1358 {
1359 int id = str_cmpchar_id (str, endp - str);
1360
1361 if (id < 0)
1362 {
1363 width += 4;
1364 str++;
1365 }
1366 else
1367 {
1368 width += cmpchar_table[id]->width;
1369 str += cmpchar_table[id]->len;
1370 }
1371 }
1372 else
1373 {
1374 Lisp_Object disp;
e515b0a9
KH
1375 int thislen;
1376 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
859f2b3c
RS
1377
1378 /* Get the way the display table would display it. */
acc35c36
RS
1379 if (dp)
1380 disp = DISP_CHAR_VECTOR (dp, c);
1381 else
1382 disp = Qnil;
859f2b3c
RS
1383
1384 if (VECTORP (disp))
e515b0a9 1385 width += XVECTOR (disp)->size;
859f2b3c 1386 else
e515b0a9 1387 width += ONE_BYTE_CHAR_WIDTH (*str);
859f2b3c 1388
e515b0a9 1389 str += thislen;
859f2b3c
RS
1390 }
1391 }
4ed46869
KH
1392 return width;
1393}
1394
1395DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1396 "Return width of STRING when displayed in the current buffer.\n\
1397Width is measured by how many columns it occupies on the screen.\n\
046b1f03
RS
1398When calculating width of a multibyte character in STRING,\n\
1399only the base leading-code is considered; the validity of\n\
1400the following bytes is not checked.")
4ed46869
KH
1401 (str)
1402 Lisp_Object str;
1403{
1404 Lisp_Object val;
1405
1406 CHECK_STRING (str, 0);
fc932ac6
RS
1407 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1408 STRING_BYTES (XSTRING (str))));
4ed46869
KH
1409 return val;
1410}
1411
1412DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1413 "Return the direction of CHAR.\n\
1414The returned value is 0 for left-to-right and 1 for right-to-left.")
1415 (ch)
1416 Lisp_Object ch;
1417{
1418 int charset;
1419
1420 CHECK_NUMBER (ch, 0);
1421 charset = CHAR_CHARSET (XFASTINT (ch));
1422 if (!CHARSET_DEFINED_P (charset))
93bcb785 1423 invalid_character (XINT (ch));
4ed46869
KH
1424 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1425}
1426
af4fecb4 1427DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
6ae1f27e 1428 "Return number of characters between BEG and END.")
046b1f03
RS
1429 (beg, end)
1430 Lisp_Object beg, end;
1431{
6ae1f27e 1432 int from, to;
046b1f03 1433
17e7ef1b
RS
1434 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1435 CHECK_NUMBER_COERCE_MARKER (end, 1);
1436
046b1f03 1437 from = min (XFASTINT (beg), XFASTINT (end));
a8a35e61 1438 to = max (XFASTINT (beg), XFASTINT (end));
046b1f03 1439
a8c21066 1440 return make_number (to - from);
6ae1f27e 1441}
9036eb45 1442
87b089ad
RS
1443/* Return the number of characters in the NBYTES bytes at PTR.
1444 This works by looking at the contents and checking for multibyte sequences.
1445 However, if the current buffer has enable-multibyte-characters = nil,
1446 we treat each byte as a character. */
1447
6ae1f27e
RS
1448int
1449chars_in_text (ptr, nbytes)
1450 unsigned char *ptr;
1451 int nbytes;
1452{
87b089ad
RS
1453 /* current_buffer is null at early stages of Emacs initialization. */
1454 if (current_buffer == 0
1455 || NILP (current_buffer->enable_multibyte_characters))
6ae1f27e 1456 return nbytes;
a8a35e61 1457
ac4137cc 1458 return multibyte_chars_in_text (ptr, nbytes);
046b1f03
RS
1459}
1460
87b089ad
RS
1461/* Return the number of characters in the NBYTES bytes at PTR.
1462 This works by looking at the contents and checking for multibyte sequences.
1463 It ignores enable-multibyte-characters. */
1464
1465int
1466multibyte_chars_in_text (ptr, nbytes)
1467 unsigned char *ptr;
1468 int nbytes;
1469{
ac4137cc
KH
1470 unsigned char *endp;
1471 int chars, bytes;
87b089ad
RS
1472
1473 endp = ptr + nbytes;
1474 chars = 0;
1475
1476 while (ptr < endp)
1477 {
ac4137cc
KH
1478 if (BASE_LEADING_CODE_P (*ptr))
1479 {
1480 PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1481 ptr += bytes;
1482 nbytes -= bytes;
1483 }
1484 else
1485 {
1486 ptr++;
1487 nbytes--;
1488 }
87b089ad
RS
1489 chars++;
1490 }
1491
1492 return chars;
1493}
1494
1495DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
4ed46869 1496 "Concatenate all the argument characters and make the result a string.")
53316e55
KH
1497 (n, args)
1498 int n;
4ed46869
KH
1499 Lisp_Object *args;
1500{
53316e55 1501 int i;
4ed46869 1502 unsigned char *buf
bd4c6dd0 1503 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
4ed46869
KH
1504 unsigned char *p = buf;
1505 Lisp_Object val;
1506
1507 for (i = 0; i < n; i++)
1508 {
1509 int c, len;
1510 unsigned char *str;
1511
1512 if (!INTEGERP (args[i]))
b0e3cf2b 1513 CHECK_NUMBER (args[i], 0);
4ed46869
KH
1514 c = XINT (args[i]);
1515 len = CHAR_STRING (c, p, str);
1516 if (p != str)
1517 /* C is a composite character. */
1518 bcopy (str, p, len);
1519 p += len;
1520 }
1521
020da460
KH
1522 /* Here, we can't use make_string_from_bytes because of byte
1523 combining problem. */
1524 val = make_string (buf, p - buf);
4ed46869
KH
1525 return val;
1526}
1527
1528#endif /* emacs */
1529\f
1530/*** Composite characters staffs ***/
1531
1532/* Each composite character is identified by CMPCHAR-ID which is
1533 assigned when Emacs needs the character code of the composite
1534 character (e.g. when displaying it on the screen). See the
1535 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1536 composite character is represented in Emacs. */
1537
1538/* If `static' is defined, it means that it is defined to null string. */
1539#ifndef static
1540/* The following function is copied from lread.c. */
1541static int
1542hash_string (ptr, len)
1543 unsigned char *ptr;
1544 int len;
1545{
1546 register unsigned char *p = ptr;
1547 register unsigned char *end = p + len;
1548 register unsigned char c;
1549 register int hash = 0;
1550
1551 while (p != end)
1552 {
1553 c = *p++;
1554 if (c >= 0140) c -= 40;
1555 hash = ((hash<<3) + (hash>>28) + c);
1556 }
1557 return hash & 07777777777;
1558}
1559#endif
1560
4ed46869
KH
1561#define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1562
1563static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1564
1565/* Each element of `cmpchar_hash_table' is a pointer to an array of
1566 integer, where the 1st element is the size of the array, the 2nd
1567 element is how many elements are actually used in the array, and
1568 the remaining elements are CMPCHAR-IDs of composite characters of
1569 the same hash value. */
1570#define CMPCHAR_HASH_SIZE(table) table[0]
1571#define CMPCHAR_HASH_USED(table) table[1]
1572#define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1573
1574/* Return CMPCHAR-ID of the composite character in STR of the length
1575 LEN. If the composite character has not yet been registered,
1576 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1577 is the sole function for assigning CMPCHAR-ID. */
1578int
1579str_cmpchar_id (str, len)
8867de67 1580 const unsigned char *str;
4ed46869
KH
1581 int len;
1582{
1583 int hash_idx, *hashp;
1584 unsigned char *buf;
1585 int embedded_rule; /* 1 if composition rule is embedded. */
1586 int chars; /* number of components. */
1587 int i;
1588 struct cmpchar_info *cmpcharp;
1589
ac4137cc 1590 /* The second byte 0xFF means COMPOSITION rule is embedded. */
4ed46869
KH
1591 embedded_rule = (str[1] == 0xFF);
1592
1593 /* At first, get the actual length of the composite character. */
1594 {
8867de67 1595 const unsigned char *p, *endp = str + 1, *lastp = str + len;
4ed46869
KH
1596 int bytes;
1597
6ae1f27e 1598 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
93bcb785
KH
1599 if (endp - str < 5)
1600 /* Any composite char have at least 5-byte length. */
1601 return -1;
1602
4ed46869 1603 chars = 0;
93bcb785 1604 p = str + 1;
4ed46869
KH
1605 while (p < endp)
1606 {
9b4d1fe6
KH
1607 if (embedded_rule)
1608 {
1609 p++;
1610 if (p >= endp)
1611 return -1;
1612 }
4ed46869 1613 /* No need of checking if *P is 0xA0 because
93bcb785
KH
1614 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1615 p += BYTES_BY_CHAR_HEAD (*p - 0x20);
4ed46869
KH
1616 chars++;
1617 }
93bcb785
KH
1618 if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1619 /* Invalid components. */
4ed46869 1620 return -1;
93bcb785 1621 len = p - str;
4ed46869
KH
1622 }
1623 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1624 hashp = cmpchar_hash_table[hash_idx];
1625
1626 /* Then, look into the hash table. */
1627 if (hashp != NULL)
1628 /* Find the correct one among composite characters of the same
1629 hash value. */
1630 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1631 {
1632 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1633 if (len == cmpcharp->len
1634 && ! bcmp (str, cmpcharp->data, len))
1635 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1636 }
1637
1638 /* We have to register the composite character in cmpchar_table. */
0282eb69 1639 if (n_cmpchars >= (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
513ee442
KH
1640 /* No, we have no more room for a new composite character. */
1641 return -1;
1642
4ed46869
KH
1643 /* Make the entry in hash table. */
1644 if (hashp == NULL)
1645 {
1646 /* Make a table for 8 composite characters initially. */
1647 hashp = (cmpchar_hash_table[hash_idx]
1648 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1649 CMPCHAR_HASH_SIZE (hashp) = 10;
1650 CMPCHAR_HASH_USED (hashp) = 2;
1651 }
1652 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1653 {
1654 CMPCHAR_HASH_SIZE (hashp) += 8;
1655 hashp = (cmpchar_hash_table[hash_idx]
1656 = (int *) xrealloc (hashp,
1657 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1658 }
1659 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1660 CMPCHAR_HASH_USED (hashp)++;
1661
1662 /* Set information of the composite character in cmpchar_table. */
1663 if (cmpchar_table_size == 0)
1664 {
1665 /* This is the first composite character to be registered. */
1666 cmpchar_table_size = 256;
1667 cmpchar_table
1668 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1669 * cmpchar_table_size);
1670 }
1671 else if (cmpchar_table_size <= n_cmpchars)
1672 {
1673 cmpchar_table_size += 256;
1674 cmpchar_table
1675 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1676 sizeof (cmpchar_table[0])
1677 * cmpchar_table_size);
1678 }
1679
1680 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1681
1682 cmpcharp->len = len;
1683 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1684 bcopy (str, cmpcharp->data, len);
1685 cmpcharp->data[len] = 0;
1686 cmpcharp->glyph_len = chars;
1687 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1688 if (embedded_rule)
1689 {
1690 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1691 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1692 }
1693 else
1694 {
1695 cmpcharp->cmp_rule = NULL;
1696 cmpcharp->col_offset = NULL;
1697 }
1698
1699 /* Setup GLYPH data and composition rules (if any) so as not to make
1700 them every time on displaying. */
1701 {
1702 unsigned char *bufp;
1703 int width;
1704 float leftmost = 0.0, rightmost = 1.0;
1705
1706 if (embedded_rule)
1707 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1708 cmpcharp->col_offset[0] = 0;
1709
1710 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1711 {
1712 if (embedded_rule)
1713 cmpcharp->cmp_rule[i] = *bufp++;
1714
1715 if (*bufp == 0xA0) /* This is an ASCII character. */
1716 {
1717 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1718 width = 1;
1719 bufp++;
1720 }
1721 else /* Multibyte character. */
1722 {
1723 /* Make `bufp' point normal multi-byte form temporally. */
1724 *bufp -= 0x20;
1725 cmpcharp->glyph[i]
ac4137cc 1726 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
4ed46869
KH
1727 width = WIDTH_BY_CHAR_HEAD (*bufp);
1728 *bufp += 0x20;
1729 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1730 }
1731
1732 if (embedded_rule && i > 0)
1733 {
1734 /* Reference points (global_ref and new_ref) are
1735 encoded as below:
1736
1737 0--1--2 -- ascent
1738 | |
1739 | |
1740 | 4 -+--- center
1741 -- 3 5 -- baseline
1742 | |
1743 6--7--8 -- descent
1744
1745 Now, we calculate the column offset of the new glyph
1746 from the left edge of the first glyph. This can avoid
1747 the same calculation everytime displaying this
1748 composite character. */
1749
1750 /* Reference points of global glyph and new glyph. */
1751 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1752 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1753 /* Column offset relative to the first glyph. */
1754 float left = (leftmost
1755 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1756 - (new_ref % 3) * width / 2.0);
1757
1758 cmpcharp->col_offset[i] = left;
1759 if (left < leftmost)
1760 leftmost = left;
1761 if (left + width > rightmost)
1762 rightmost = left + width;
1763 }
1764 else
1765 {
1766 if (width > rightmost)
1767 rightmost = width;
1768 }
1769 }
1770 if (embedded_rule)
1771 {
1772 /* Now col_offset[N] are relative to the left edge of the
1773 first component. Make them relative to the left edge of
1774 overall glyph. */
1775 for (i = 0; i < chars; i++)
1776 cmpcharp->col_offset[i] -= leftmost;
1777 /* Make rightmost holds width of overall glyph. */
1778 rightmost -= leftmost;
1779 }
1780
1781 cmpcharp->width = rightmost;
1782 if (cmpcharp->width < rightmost)
1783 /* To get a ceiling integer value. */
1784 cmpcharp->width++;
1785 }
1786
1787 cmpchar_table[n_cmpchars] = cmpcharp;
1788
1789 return n_cmpchars++;
1790}
1791
de54b0d5
KH
1792/* Return the Nth element of the composite character C. If NOERROR is
1793 nonzero, return 0 on error condition (C is an invalid composite
1794 charcter, or N is out of range). */
4ed46869 1795int
de54b0d5
KH
1796cmpchar_component (c, n, noerror)
1797 int c, n, noerror;
4ed46869
KH
1798{
1799 int id = COMPOSITE_CHAR_ID (c);
1800
de54b0d5
KH
1801 if (id < 0 || id >= n_cmpchars)
1802 {
1803 /* C is not a valid composite character. */
1804 if (noerror) return 0;
1805 error ("Invalid composite character: %d", c) ;
1806 }
1807 if (n >= cmpchar_table[id]->glyph_len)
1808 {
1809 /* No such component. */
1810 if (noerror) return 0;
1811 args_out_of_range (make_number (c), make_number (n));
1812 }
4ed46869
KH
1813 /* No face data is stored in glyph code. */
1814 return ((int) (cmpchar_table[id]->glyph[n]));
1815}
1816
1817DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1818 "T if CHAR is a composite character.")
1819 (ch)
1820 Lisp_Object ch;
1821{
1822 CHECK_NUMBER (ch, 0);
1823 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1824}
1825
1826DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1827 2, 2, 0,
de54b0d5
KH
1828 "Return the Nth component character of composite character CHARACTER.")
1829 (character, n)
1830 Lisp_Object character, n;
4ed46869 1831{
de54b0d5 1832 int id;
4ed46869
KH
1833
1834 CHECK_NUMBER (character, 0);
de54b0d5 1835 CHECK_NUMBER (n, 1);
4ed46869 1836
de54b0d5 1837 return (make_number (cmpchar_component (XINT (character), XINT (n), 0)));
4ed46869
KH
1838}
1839
1840DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1841 2, 2, 0,
de54b0d5 1842 "Return the Nth composition rule of composite character CHARACTER.\n\
55001746 1843The returned rule is for composing the Nth component\n\
de54b0d5
KH
1844on the (N-1)th component.\n\
1845If CHARACTER should be composed relatively or N is 0, return 255.")
55001746
KH
1846 (character, n)
1847 Lisp_Object character, n;
4ed46869 1848{
de54b0d5 1849 int id;
4ed46869
KH
1850
1851 CHECK_NUMBER (character, 0);
55001746 1852 CHECK_NUMBER (n, 1);
4ed46869
KH
1853
1854 id = COMPOSITE_CHAR_ID (XINT (character));
1855 if (id < 0 || id >= n_cmpchars)
1856 error ("Invalid composite character: %d", XINT (character));
de54b0d5 1857 if (XINT (n) < 0 || XINT (n) >= cmpchar_table[id]->glyph_len)
55001746 1858 args_out_of_range (character, n);
4ed46869 1859
de54b0d5
KH
1860 return make_number (cmpchar_table[id]->cmp_rule
1861 ? cmpchar_table[id]->cmp_rule[XINT (n)]
1862 : 255);
4ed46869
KH
1863}
1864
1865DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1866 Scmpchar_cmp_rule_p, 1, 1, 0,
1867 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1868 (character)
1869 Lisp_Object character;
1870{
1871 int id;
1872
1873 CHECK_NUMBER (character, 0);
1874 id = COMPOSITE_CHAR_ID (XINT (character));
1875 if (id < 0 || id >= n_cmpchars)
1876 error ("Invalid composite character: %d", XINT (character));
1877
1878 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1879}
1880
1881DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1882 Scmpchar_cmp_count, 1, 1, 0,
1883 "Return number of compoents of composite character CHARACTER.")
1884 (character)
1885 Lisp_Object character;
1886{
1887 int id;
1888
1889 CHECK_NUMBER (character, 0);
1890 id = COMPOSITE_CHAR_ID (XINT (character));
1891 if (id < 0 || id >= n_cmpchars)
1892 error ("Invalid composite character: %d", XINT (character));
1893
1894 return (make_number (cmpchar_table[id]->glyph_len));
1895}
1896
1897DEFUN ("compose-string", Fcompose_string, Scompose_string,
1898 1, 1, 0,
1899 "Return one char string composed from all characters in STRING.")
1900 (str)
1901 Lisp_Object str;
1902{
1903 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1904 int len, i;
1905
1906 CHECK_STRING (str, 0);
1907
1908 buf[0] = LEADING_CODE_COMPOSITION;
1909 p = XSTRING (str)->data;
fc932ac6 1910 pend = p + STRING_BYTES (XSTRING (str));
4ed46869
KH
1911 i = 1;
1912 while (p < pend)
1913 {
9b4d1fe6 1914 if (*p < 0x20) /* control code */
4ed46869
KH
1915 error ("Invalid component character: %d", *p);
1916 else if (*p < 0x80) /* ASCII */
1917 {
1918 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1919 error ("Too long string to be composed: %s", XSTRING (str)->data);
1920 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1921 code itself. */
1922 buf[i++] = 0xA0;
1923 buf[i++] = *p++ + 0x80;
1924 }
1925 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1926 {
1927 /* Already composed. Eliminate the heading
1928 LEADING_CODE_COMPOSITION, keep the remaining bytes
1929 unchanged. */
1930 p++;
de54b0d5
KH
1931 if (*p == 255)
1932 error ("Can't compose a rule-based composition character");
4ed46869 1933 ptemp = p;
6ae1f27e 1934 while (! CHAR_HEAD_P (*p)) p++;
9b4d1fe6
KH
1935 if (str_cmpchar_id (ptemp - 1, p - ptemp + 1) < 0)
1936 error ("Can't compose an invalid composition character");
4ed46869
KH
1937 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1938 error ("Too long string to be composed: %s", XSTRING (str)->data);
1939 bcopy (ptemp, buf + i, p - ptemp);
1940 i += p - ptemp;
1941 }
1942 else /* multibyte char */
1943 {
1944 /* Add 0x20 to the base leading-code, keep the remaining
1945 bytes unchanged. */
ac4137cc 1946 int c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
9b4d1fe6
KH
1947
1948 if (len <= 1 || ! CHAR_VALID_P (c, 0))
1949 error ("Can't compose an invalid character");
4ed46869
KH
1950 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1951 error ("Too long string to be composed: %s", XSTRING (str)->data);
1952 bcopy (p, buf + i, len);
1953 buf[i] += 0x20;
1954 p += len, i += len;
1955 }
1956 }
1957
1958 if (i < 5)
1959 /* STR contains only one character, which can't be composed. */
1960 error ("Too short string to be composed: %s", XSTRING (str)->data);
1961
27802600 1962 return make_string_from_bytes (buf, 1, i);
4ed46869
KH
1963}
1964
1965\f
dfcf069d 1966int
4ed46869
KH
1967charset_id_internal (charset_name)
1968 char *charset_name;
1969{
76d7b829 1970 Lisp_Object val;
4ed46869 1971
76d7b829 1972 val= Fget (intern (charset_name), Qcharset);
4ed46869
KH
1973 if (!VECTORP (val))
1974 error ("Charset %s is not defined", charset_name);
1975
1976 return (XINT (XVECTOR (val)->contents[0]));
1977}
1978
1979DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1980 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1981 ()
1982{
1983 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1984 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1985 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1986 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1987 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1988 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1989 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1990 return Qnil;
1991}
1992
dfcf069d 1993void
4ed46869
KH
1994init_charset_once ()
1995{
1996 int i, j, k;
1997
1998 staticpro (&Vcharset_table);
1999 staticpro (&Vcharset_symbol_table);
8a73a704 2000 staticpro (&Vgeneric_character_list);
4ed46869
KH
2001
2002 /* This has to be done here, before we call Fmake_char_table. */
2003 Qcharset_table = intern ("charset-table");
2004 staticpro (&Qcharset_table);
2005
2006 /* Intern this now in case it isn't already done.
2007 Setting this variable twice is harmless.
2008 But don't staticpro it here--that is done in alloc.c. */
2009 Qchar_table_extra_slots = intern ("char-table-extra-slots");
2010
2011 /* Now we are ready to set up this property, so we can
2012 create the charset table. */
2013 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
2014 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
2015
0282eb69
KH
2016 Qunknown = intern ("unknown");
2017 staticpro (&Qunknown);
2018 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
2019 Qunknown);
4ed46869
KH
2020
2021 /* Setup tables. */
2022 for (i = 0; i < 2; i++)
2023 for (j = 0; j < 2; j++)
2024 for (k = 0; k < 128; k++)
2025 iso_charset_table [i][j][k] = -1;
2026
2027 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
2028 cmpchar_table_size = n_cmpchars = 0;
2029
60383934 2030 for (i = 0; i < 256; i++)
4ed46869 2031 BYTES_BY_CHAR_HEAD (i) = 1;
6ef23ebb
KH
2032 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
2033 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
2034 BYTES_BY_CHAR_HEAD (i) = 2;
2035 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
2036 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
2037 BYTES_BY_CHAR_HEAD (i) = 3;
4ed46869
KH
2038 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
2039 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
2040 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
2041 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
6ef23ebb 2042 /* The followings don't reflect the actual bytes, but just to tell
4ed46869
KH
2043 that it is a start of a multibyte character. */
2044 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
6ef23ebb
KH
2045 BYTES_BY_CHAR_HEAD (0x9E) = 2;
2046 BYTES_BY_CHAR_HEAD (0x9F) = 2;
4ed46869
KH
2047
2048 for (i = 0; i < 128; i++)
2049 WIDTH_BY_CHAR_HEAD (i) = 1;
2050 for (; i < 256; i++)
2051 WIDTH_BY_CHAR_HEAD (i) = 4;
2052 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
2053 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
2054 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
2055 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
8a73a704
KH
2056
2057 {
76d7b829 2058 Lisp_Object val;
8a73a704 2059
76d7b829 2060 val = Qnil;
8a73a704
KH
2061 for (i = 0x81; i < 0x90; i++)
2062 val = Fcons (make_number ((i - 0x70) << 7), val);
2063 for (; i < 0x9A; i++)
2064 val = Fcons (make_number ((i - 0x8F) << 14), val);
2065 for (i = 0xA0; i < 0xF0; i++)
2066 val = Fcons (make_number ((i - 0x70) << 7), val);
2067 for (; i < 0xFF; i++)
2068 val = Fcons (make_number ((i - 0xE0) << 14), val);
2069 val = Fcons (make_number (GENERIC_COMPOSITION_CHAR), val);
2070 Vgeneric_character_list = Fnreverse (val);
2071 }
bbf12bb3
KH
2072
2073 nonascii_insert_offset = 0;
2074 Vnonascii_translation_table = Qnil;
4ed46869
KH
2075}
2076
2077#ifdef emacs
2078
dfcf069d 2079void
4ed46869
KH
2080syms_of_charset ()
2081{
2082 Qascii = intern ("ascii");
2083 staticpro (&Qascii);
2084
2085 Qcharset = intern ("charset");
2086 staticpro (&Qcharset);
2087
2088 /* Define ASCII charset now. */
2089 update_charset_table (make_number (CHARSET_ASCII),
2090 make_number (1), make_number (94),
2091 make_number (1),
2092 make_number (0),
2093 make_number ('B'),
2094 make_number (0),
2095 build_string ("ASCII"),
2096 build_string ("ASCII"),
2097 build_string ("ASCII (ISO646 IRV)"));
2098 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
2099 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
2100
2101 Qcomposition = intern ("composition");
2102 staticpro (&Qcomposition);
2103 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
2104
c1a08b4c
KH
2105 Qauto_fill_chars = intern ("auto-fill-chars");
2106 staticpro (&Qauto_fill_chars);
2107 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
2108
4ed46869 2109 defsubr (&Sdefine_charset);
8a73a704 2110 defsubr (&Sgeneric_character_list);
3fac5a51 2111 defsubr (&Sget_unused_iso_final_char);
4ed46869
KH
2112 defsubr (&Sdeclare_equiv_charset);
2113 defsubr (&Sfind_charset_region);
2114 defsubr (&Sfind_charset_string);
2115 defsubr (&Smake_char_internal);
2116 defsubr (&Ssplit_char);
2117 defsubr (&Schar_charset);
90d7b74e 2118 defsubr (&Scharset_after);
4ed46869 2119 defsubr (&Siso_charset);
9d3d8cba 2120 defsubr (&Schar_valid_p);
d2665018 2121 defsubr (&Sunibyte_char_to_multibyte);
1bcc1567 2122 defsubr (&Smultibyte_char_to_unibyte);
4ed46869
KH
2123 defsubr (&Schar_bytes);
2124 defsubr (&Schar_width);
2125 defsubr (&Sstring_width);
2126 defsubr (&Schar_direction);
af4fecb4 2127 defsubr (&Schars_in_region);
87b089ad 2128 defsubr (&Sstring);
4ed46869
KH
2129 defsubr (&Scmpcharp);
2130 defsubr (&Scmpchar_component);
2131 defsubr (&Scmpchar_cmp_rule);
2132 defsubr (&Scmpchar_cmp_rule_p);
2133 defsubr (&Scmpchar_cmp_count);
2134 defsubr (&Scompose_string);
2135 defsubr (&Ssetup_special_charsets);
2136
2137 DEFVAR_LISP ("charset-list", &Vcharset_list,
2138 "List of charsets ever defined.");
2139 Vcharset_list = Fcons (Qascii, Qnil);
2140
537efd8d 2141 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
b4e9dd77
KH
2142 "Vector of cons cell of a symbol and translation table ever defined.\n\
2143An ID of a translation table is an index of this vector.");
537efd8d 2144 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
b0e3cf2b 2145
4ed46869
KH
2146 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
2147 "Leading-code of composite characters.");
2148 leading_code_composition = LEADING_CODE_COMPOSITION;
2149
2150 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
2151 "Leading-code of private TYPE9N charset of column-width 1.");
2152 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
2153
2154 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
2155 "Leading-code of private TYPE9N charset of column-width 2.");
2156 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
2157
2158 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
2159 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
2160 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
2161
2162 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
2163 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
2164 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
35e623fb
RS
2165
2166 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
d2665018 2167 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
4cf9710d
RS
2168This is used for converting unibyte text to multibyte,\n\
2169and for inserting character codes specified by number.\n\n\
3e8ceaac
RS
2170This serves to convert a Latin-1 or similar 8-bit character code\n\
2171to the corresponding Emacs multibyte character code.\n\
2172Typically the value should be (- (make-char CHARSET 0) 128),\n\
2173for your choice of character set.\n\
537efd8d 2174If `nonascii-translation-table' is non-nil, it overrides this variable.");
35e623fb 2175 nonascii_insert_offset = 0;
b0e3cf2b 2176
b4e9dd77 2177 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
537efd8d 2178 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
4cf9710d
RS
2179This is used for converting unibyte text to multibyte,\n\
2180and for inserting character codes specified by number.\n\n\
2181Conversion is performed only when multibyte characters are enabled,\n\
2182and it serves to convert a Latin-1 or similar 8-bit character code\n\
2183to the corresponding Emacs character code.\n\n\
da4d65af 2184If this is nil, `nonascii-insert-offset' is used instead.\n\
b4e9dd77
KH
2185See also the docstring of `make-translation-table'.");
2186 Vnonascii_translation_table = Qnil;
4cf9710d 2187
b0e3cf2b
KH
2188 DEFVAR_INT ("min-composite-char", &min_composite_char,
2189 "Minimum character code of a composite character.");
2190 min_composite_char = MIN_CHAR_COMPOSITION;
c1a08b4c
KH
2191
2192 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
2193 "A char-table for characters which invoke auto-filling.\n\
2194Such characters has value t in this table.");
2195 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
60022cb7
AS
2196 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
2197 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
4ed46869
KH
2198}
2199
2200#endif /* emacs */