*** empty log message ***
[bpt/emacs.git] / src / charset.c
CommitLineData
75c8c592 1/* Basic multilingual character support.
35e623fb 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
75c8c592 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
68c45bf0
PE
25#ifdef emacs
26#include <config.h>
27#endif
28
4ed46869
KH
29#include <stdio.h>
30
31#ifdef emacs
32
33#include <sys/types.h>
4ed46869
KH
34#include "lisp.h"
35#include "buffer.h"
36#include "charset.h"
37#include "coding.h"
fc6b09bf 38#include "disptab.h"
4ed46869
KH
39
40#else /* not emacs */
41
42#include "mulelib.h"
43
44#endif /* emacs */
45
99529c2c 46Lisp_Object Qcharset, Qascii;
0282eb69 47Lisp_Object Qunknown;
4ed46869
KH
48
49/* Declaration of special leading-codes. */
4ed46869
KH
50int leading_code_private_11; /* for private DIMENSION1 of 1-column */
51int leading_code_private_12; /* for private DIMENSION1 of 2-column */
52int leading_code_private_21; /* for private DIMENSION2 of 1-column */
53int leading_code_private_22; /* for private DIMENSION2 of 2-column */
54
55/* Declaration of special charsets. */
56int charset_ascii; /* ASCII */
4ed46869
KH
57int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
58int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
59int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
60int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
61int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
62int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
63int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
64
65Lisp_Object Qcharset_table;
66
67/* A char-table containing information of each character set. */
68Lisp_Object Vcharset_table;
69
70/* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72Lisp_Object Vcharset_symbol_table;
73
74/* A list of charset symbols ever defined. */
75Lisp_Object Vcharset_list;
76
537efd8d
KH
77/* Vector of translation table ever defined.
78 ID of a translation table is used to index this vector. */
79Lisp_Object Vtranslation_table_vector;
b0e3cf2b 80
c1a08b4c
KH
81/* A char-table for characters which may invoke auto-filling. */
82Lisp_Object Vauto_fill_chars;
83
84Lisp_Object Qauto_fill_chars;
85
4ed46869
KH
86/* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
87int bytes_by_char_head[256];
88int width_by_char_head[256];
89
90/* Mapping table from ISO2022's charset (specified by DIMENSION,
91 CHARS, and FINAL-CHAR) to Emacs' charset. */
92int iso_charset_table[2][2][128];
93
94/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
95unsigned char *_fetch_multibyte_char_p;
96int _fetch_multibyte_char_len;
97
35e623fb
RS
98/* Offset to add to a non-ASCII value when inserting it. */
99int nonascii_insert_offset;
100
4cf9710d
RS
101/* Translation table for converting non-ASCII unibyte characters
102 to multibyte codes, or nil. */
b4e9dd77 103Lisp_Object Vnonascii_translation_table;
4cf9710d 104
8a73a704
KH
105/* List of all possible generic characters. */
106Lisp_Object Vgeneric_character_list;
107
046b1f03
RS
108#define min(X, Y) ((X) < (Y) ? (X) : (Y))
109#define max(X, Y) ((X) > (Y) ? (X) : (Y))
110\f
93bcb785
KH
111void
112invalid_character (c)
113 int c;
114{
ba7434e5 115 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
93bcb785
KH
116}
117
99529c2c
KH
118/* Parse a multibyte character string STR of length LENGTH (>= 2) set
119 BYTES to the length of actual multibyte sequence, CHARSET, C1, and
120 C2 to such values that MAKE_CHAR can make the multibyte character
121 from them.
44c6492d 122
99529c2c 123 It is assumed that *STR is one of base leading codes and the
44c6492d
KH
124 following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
125
44c6492d 126 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */
ac4137cc
KH
127
128#define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \
129 do { \
130 (bytes) = 1; \
131 (charset) = (str)[0]; \
132 if ((charset) >= LEADING_CODE_PRIVATE_11 \
133 && (charset) <= LEADING_CODE_PRIVATE_22) \
134 (charset) = (str)[(bytes)++]; \
135 if ((bytes) < (length)) \
136 { \
137 (c1) = (str)[(bytes)++] & 0x7F; \
138 if ((bytes) < (length)) \
139 (c2) = (str)[(bytes)++] & 0x7F; \
140 else \
141 (c2) = -1; \
142 } \
143 else \
144 (c1) = (c2) = -1; \
145 } while (0)
146
147/* Parse string STR of length LENGTH and check if a multibyte
99529c2c 148 characters is at STR. Set BYTES to the actual length, CHARSET, C1,
ac4137cc
KH
149 C2 to proper values for that character. */
150
151#define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
152 do { \
153 int i; \
44c6492d
KH
154 if (ASCII_BYTE_P ((str)[0])) \
155 i = 1; \
156 else \
157 for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \
ac4137cc
KH
158 if (i == 1) \
159 (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \
ac4137cc
KH
160 else \
161 { \
162 if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \
163 i = BYTES_BY_CHAR_HEAD ((str)[0]); \
164 SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \
165 } \
166 } while (0)
167
168/* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
44c6492d 169#define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
63f4d579
KH
170 (charset == CHARSET_ASCII \
171 ? ((c1) >= 0 && (c1) <= 0x7F) \
172 : (CHARSET_DIMENSION (charset) == 1 \
173 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
174 : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F)))
93bcb785 175
99529c2c
KH
176/* Store multi-byte form of the character C in STR. The caller should
177 allocate at least 4-byte area at STR in advance. Returns the
178 length of the multi-byte form. If C is an invalid character code,
179 signal an error.
4ed46869 180
99529c2c
KH
181 Use macro `CHAR_STRING (C, STR)' instead of calling this function
182 directly if C can be an ASCII character. */
4ed46869
KH
183
184int
99529c2c 185char_to_string (c, str)
4ed46869 186 int c;
99529c2c 187 unsigned char *str;
4ed46869 188{
99529c2c
KH
189 unsigned char *p = str;
190
6662e69b 191 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
8ac5a9cc 192 {
6662e69b
KH
193 /* Multibyte character can't have a modifier bit. */
194 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
195 invalid_character (c);
196
197 /* For Meta, Shift, and Control modifiers, we need special care. */
8ac5a9cc 198 if (c & CHAR_META)
6662e69b
KH
199 {
200 /* Move the meta bit to the right place for a string. */
201 c = (c & ~CHAR_META) | 0x80;
202 }
203 if (c & CHAR_SHIFT)
204 {
205 /* Shift modifier is valid only with [A-Za-z]. */
206 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
207 c &= ~CHAR_SHIFT;
208 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
209 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
210 }
8ac5a9cc 211 if (c & CHAR_CTL)
6662e69b
KH
212 {
213 /* Simulate the code in lread.c. */
214 /* Allow `\C- ' and `\C-?'. */
215 if (c == (CHAR_CTL | ' '))
216 c = 0;
217 else if (c == (CHAR_CTL | '?'))
218 c = 127;
219 /* ASCII control chars are made from letters (both cases),
220 as well as the non-letters within 0100...0137. */
221 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
222 c &= (037 | (~0177 & ~CHAR_CTL));
223 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
224 c &= (037 | (~0177 & ~CHAR_CTL));
225 }
226
227 /* If C still has any modifier bits, it is an invalid character. */
228 if (c & CHAR_MODIFIER_MASK)
229 invalid_character (c);
230
99529c2c 231 *p++ = c;
8ac5a9cc 232 }
99529c2c 233 else if (c < MAX_CHAR)
4ed46869 234 {
ac4137cc 235 int charset, c1, c2;
4ed46869 236
ac4137cc 237 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
99529c2c
KH
238
239 if (charset >= LEADING_CODE_EXT_11)
240 *p++ = (charset < LEADING_CODE_EXT_12
241 ? LEADING_CODE_PRIVATE_11
242 : (charset < LEADING_CODE_EXT_21
243 ? LEADING_CODE_PRIVATE_12
244 : (charset < LEADING_CODE_EXT_22
245 ? LEADING_CODE_PRIVATE_21
246 : LEADING_CODE_PRIVATE_22)));
247 *p++ = charset;
248 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
249 invalid_character (c);
250 if (c1)
4ed46869 251 {
99529c2c
KH
252 *p++ = c1 | 0x80;
253 if (c2 > 0)
254 *p++ = c2 | 0x80;
4ed46869
KH
255 }
256 }
257
99529c2c 258 return (p -str);
4ed46869
KH
259}
260
44c6492d
KH
261/* Return the non-ASCII character corresponding to multi-byte form at
262 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
263 length of the multibyte form in *ACTUAL_LEN.
537efd8d 264
99529c2c
KH
265 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
266 this function directly if you want ot handle ASCII characters as
267 well. */
4ed46869 268
dfcf069d 269int
99529c2c 270string_to_char (str, len, actual_len)
8867de67 271 const unsigned char *str;
ac4137cc 272 int len, *actual_len;
4ed46869 273{
ac4137cc 274 int c, bytes, charset, c1, c2;
4ed46869 275
ac4137cc
KH
276 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
277 c = MAKE_CHAR (charset, c1, c2);
4ed46869 278 if (actual_len)
ac4137cc 279 *actual_len = bytes;
4ed46869
KH
280 return c;
281}
282
44c6492d
KH
283/* Return the length of the multi-byte form at string STR of length LEN.
284 Use the macro MULTIBYTE_FORM_LENGTH instead. */
4ed46869
KH
285int
286multibyte_form_length (str, len)
8867de67 287 const unsigned char *str;
4ed46869
KH
288 int len;
289{
ac4137cc 290 int bytes;
4ed46869 291
ac4137cc 292 PARSE_MULTIBYTE_SEQ (str, len, bytes);
90d7b74e 293 return bytes;
4ed46869
KH
294}
295
ac4137cc
KH
296/* Check multibyte form at string STR of length LEN and set variables
297 pointed by CHARSET, C1, and C2 to charset and position codes of the
298 character at STR, and return 0. If there's no multibyte character,
4ed46869
KH
299 return -1. This should be used only in the macro SPLIT_STRING
300 which checks range of STR in advance. */
301
dfcf069d 302int
99529c2c 303split_string (str, len, charset, c1, c2)
ac4137cc
KH
304 const unsigned char *str;
305 unsigned char *c1, *c2;
306 int len, *charset;
4ed46869 307{
ac4137cc 308 register int bytes, cs, code1, code2 = -1;
4ed46869 309
ac4137cc
KH
310 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
311 if (cs == CHARSET_ASCII)
4ed46869 312 return -1;
ac4137cc
KH
313 *charset = cs;
314 *c1 = code1;
315 *c2 = code2;
5865af0d 316 return 0;
ac4137cc
KH
317}
318
44c6492d
KH
319/* Return 1 iff character C has valid printable glyph.
320 Use the macro CHAR_PRINTABLE_P instead. */
ac4137cc
KH
321int
322char_printable_p (c)
323 int c;
324{
325 int charset, c1, c2, chars;
326
327 if (SINGLE_BYTE_CHAR_P (c))
328 return 1;
99529c2c
KH
329 if (c >= MAX_CHAR)
330 return 0;
ac4137cc
KH
331
332 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
333 if (! CHARSET_DEFINED_P (charset))
334 return 0;
335 if (CHARSET_CHARS (charset) == 94
336 ? c1 <= 32 || c1 >= 127
337 : c1 < 32)
338 return 0;
339 if (CHARSET_DIMENSION (charset) == 2
340 && (CHARSET_CHARS (charset) == 94
341 ? c2 <= 32 || c2 >= 127
342 : c2 < 32))
343 return 0;
344 return 1;
4ed46869
KH
345}
346
537efd8d 347/* Translate character C by translation table TABLE. If C
b4e9dd77
KH
348 is negative, translate a character specified by CHARSET, C1, and C2
349 (C1 and C2 are code points of the character). If no translation is
350 found in TABLE, return C. */
dfcf069d 351int
b4e9dd77 352translate_char (table, c, charset, c1, c2)
23d2a7f1
KH
353 Lisp_Object table;
354 int c, charset, c1, c2;
355{
356 Lisp_Object ch;
357 int alt_charset, alt_c1, alt_c2, dimension;
358
0ad3f83d 359 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
23d2a7f1 360 if (!CHAR_TABLE_P (table)
ac4137cc 361 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
23d2a7f1
KH
362 return c;
363
364 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
365 dimension = CHARSET_DIMENSION (alt_charset);
366 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
367 /* CH is not a generic character, just return it. */
368 return XFASTINT (ch);
369
370 /* Since CH is a generic character, we must return a specific
371 charater which has the same position codes as C from CH. */
372 if (charset < 0)
373 SPLIT_CHAR (c, charset, c1, c2);
374 if (dimension != CHARSET_DIMENSION (charset))
375 /* We can't make such a character because of dimension mismatch. */
376 return c;
23d2a7f1
KH
377 return MAKE_CHAR (alt_charset, c1, c2);
378}
379
d2665018 380/* Convert the unibyte character C to multibyte based on
b4e9dd77 381 Vnonascii_translation_table or nonascii_insert_offset. If they can't
d2665018
KH
382 convert C to a valid multibyte character, convert it based on
383 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
35e623fb 384
dfcf069d 385int
35e623fb
RS
386unibyte_char_to_multibyte (c)
387 int c;
388{
543b4f61 389 if (c < 0400 && c >= 0200)
35e623fb 390 {
d2665018
KH
391 int c_save = c;
392
b4e9dd77 393 if (! NILP (Vnonascii_translation_table))
bbf12bb3
KH
394 {
395 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
ac4137cc 396 if (c >= 0400 && ! char_valid_p (c, 0))
bbf12bb3
KH
397 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
398 }
399 else if (c >= 0240 && nonascii_insert_offset > 0)
400 {
401 c += nonascii_insert_offset;
ac4137cc 402 if (c < 0400 || ! char_valid_p (c, 0))
bbf12bb3
KH
403 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
404 }
405 else if (c >= 0240)
d2665018 406 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
35e623fb
RS
407 }
408 return c;
409}
76d7b829
KH
410
411
412/* Convert the multibyte character C to unibyte 8-bit character based
413 on Vnonascii_translation_table or nonascii_insert_offset. If
414 REV_TBL is non-nil, it should be a reverse table of
415 Vnonascii_translation_table, i.e. what given by:
416 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
417
418int
419multibyte_char_to_unibyte (c, rev_tbl)
420 int c;
421 Lisp_Object rev_tbl;
422{
423 if (!SINGLE_BYTE_CHAR_P (c))
424 {
425 int c_save = c;
426
427 if (! CHAR_TABLE_P (rev_tbl)
428 && CHAR_TABLE_P (Vnonascii_translation_table))
429 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
430 make_number (0));
431 if (CHAR_TABLE_P (rev_tbl))
432 {
433 Lisp_Object temp;
434 temp = Faref (rev_tbl, make_number (c));
435 if (INTEGERP (temp))
436 c = XINT (temp);
bbf12bb3
KH
437 if (c >= 256)
438 c = (c_save & 0177) + 0200;
439 }
440 else
441 {
442 if (nonascii_insert_offset > 0)
443 c -= nonascii_insert_offset;
444 if (c < 128 || c >= 256)
445 c = (c_save & 0177) + 0200;
76d7b829 446 }
76d7b829
KH
447 }
448
449 return c;
450}
451
35e623fb 452\f
4ed46869
KH
453/* Update the table Vcharset_table with the given arguments (see the
454 document of `define-charset' for the meaning of each argument).
455 Several other table contents are also updated. The caller should
456 check the validity of CHARSET-ID and the remaining arguments in
457 advance. */
458
459void
460update_charset_table (charset_id, dimension, chars, width, direction,
461 iso_final_char, iso_graphic_plane,
462 short_name, long_name, description)
463 Lisp_Object charset_id, dimension, chars, width, direction;
464 Lisp_Object iso_final_char, iso_graphic_plane;
465 Lisp_Object short_name, long_name, description;
466{
467 int charset = XINT (charset_id);
468 int bytes;
469 unsigned char leading_code_base, leading_code_ext;
470
6dc0722d
KH
471 if (NILP (CHARSET_TABLE_ENTRY (charset)))
472 CHARSET_TABLE_ENTRY (charset)
473 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
4ed46869
KH
474
475 /* Get byte length of multibyte form, base leading-code, and
476 extended leading-code of the charset. See the comment under the
477 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
478 bytes = XINT (dimension);
479 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
480 {
481 /* Official charset, it doesn't have an extended leading-code. */
482 if (charset != CHARSET_ASCII)
483 bytes += 1; /* For a base leading-code. */
484 leading_code_base = charset;
485 leading_code_ext = 0;
486 }
487 else
488 {
489 /* Private charset. */
490 bytes += 2; /* For base and extended leading-codes. */
491 leading_code_base
492 = (charset < LEADING_CODE_EXT_12
493 ? LEADING_CODE_PRIVATE_11
494 : (charset < LEADING_CODE_EXT_21
495 ? LEADING_CODE_PRIVATE_12
496 : (charset < LEADING_CODE_EXT_22
497 ? LEADING_CODE_PRIVATE_21
498 : LEADING_CODE_PRIVATE_22)));
499 leading_code_ext = charset;
500 }
501
6ef23ebb
KH
502 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
503 error ("Invalid dimension for the charset-ID %d", charset);
504
4ed46869
KH
505 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
506 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
507 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
508 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
509 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
510 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
511 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
512 = make_number (leading_code_base);
513 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
514 = make_number (leading_code_ext);
515 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
516 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
517 = iso_graphic_plane;
518 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
519 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
520 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
521 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
522
523 {
524 /* If we have already defined a charset which has the same
525 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
526 DIRECTION, we must update the entry REVERSE-CHARSET of both
527 charsets. If there's no such charset, the value of the entry
528 is set to nil. */
529 int i;
530
513ee442 531 for (i = 0; i <= MAX_CHARSET; i++)
4ed46869
KH
532 if (!NILP (CHARSET_TABLE_ENTRY (i)))
533 {
534 if (CHARSET_DIMENSION (i) == XINT (dimension)
535 && CHARSET_CHARS (i) == XINT (chars)
536 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
537 && CHARSET_DIRECTION (i) != XINT (direction))
538 {
539 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
540 = make_number (i);
541 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
542 break;
543 }
544 }
513ee442 545 if (i > MAX_CHARSET)
4ed46869
KH
546 /* No such a charset. */
547 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
548 = make_number (-1);
549 }
550
551 if (charset != CHARSET_ASCII
552 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
553 {
4ed46869
KH
554 width_by_char_head[leading_code_base] = XINT (width);
555
556 /* Update table emacs_code_class. */
557 emacs_code_class[charset] = (bytes == 2
558 ? EMACS_leading_code_2
559 : (bytes == 3
560 ? EMACS_leading_code_3
561 : EMACS_leading_code_4));
562 }
563
564 /* Update table iso_charset_table. */
565 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
566 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
567}
568
569#ifdef emacs
570
571/* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
572 is invalid. */
573int
574get_charset_id (charset_symbol)
575 Lisp_Object charset_symbol;
576{
577 Lisp_Object val;
578 int charset;
579
580 return ((SYMBOLP (charset_symbol)
581 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
582 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
583 CHARSET_VALID_P (charset)))
584 ? charset : -1);
585}
586
587/* Return an identification number for a new private charset of
588 DIMENSION and WIDTH. If there's no more room for the new charset,
589 return 0. */
590Lisp_Object
591get_new_private_charset_id (dimension, width)
592 int dimension, width;
593{
594 int charset, from, to;
595
596 if (dimension == 1)
597 {
598 if (width == 1)
599 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
600 else
601 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
602 }
603 else
604 {
605 if (width == 1)
606 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
607 else
b0e3cf2b 608 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
4ed46869
KH
609 }
610
611 for (charset = from; charset < to; charset++)
612 if (!CHARSET_DEFINED_P (charset)) break;
613
614 return make_number (charset < to ? charset : 0);
615}
616
617DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
618 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
23d2a7f1 619If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
4ed46869
KH
620 treated as a private charset.\n\
621INFO-VECTOR is a vector of the format:\n\
622 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
623 SHORT-NAME LONG-NAME DESCRIPTION]\n\
624The meanings of each elements is as follows:\n\
625DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
626CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
627WIDTH (integer) is the number of columns a character in the charset\n\
628occupies on the screen: one of 0, 1, and 2.\n\
629\n\
630DIRECTION (integer) is the rendering direction of characters in the\n\
277576f6
KH
631charset when rendering. If 0, render from left to right, else\n\
632render from right to left.\n\
4ed46869
KH
633\n\
634ISO-FINAL-CHAR (character) is the final character of the\n\
635corresponding ISO 2022 charset.\n\
636\n\
637ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
638while encoding to variants of ISO 2022 coding system, one of the\n\
639following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
640\n\
641SHORT-NAME (string) is the short name to refer to the charset.\n\
642\n\
643LONG-NAME (string) is the long name to refer to the charset.\n\
644\n\
645DESCRIPTION (string) is the description string of the charset.")
646 (charset_id, charset_symbol, info_vector)
647 Lisp_Object charset_id, charset_symbol, info_vector;
648{
649 Lisp_Object *vec;
650
651 if (!NILP (charset_id))
652 CHECK_NUMBER (charset_id, 0);
653 CHECK_SYMBOL (charset_symbol, 1);
654 CHECK_VECTOR (info_vector, 2);
655
656 if (! NILP (charset_id))
657 {
658 if (! CHARSET_VALID_P (XINT (charset_id)))
659 error ("Invalid CHARSET: %d", XINT (charset_id));
660 else if (CHARSET_DEFINED_P (XINT (charset_id)))
661 error ("Already defined charset: %d", XINT (charset_id));
662 }
663
664 vec = XVECTOR (info_vector)->contents;
665 if (XVECTOR (info_vector)->size != 9
666 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
667 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
668 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
669 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
670 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
671 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
672 || !STRINGP (vec[6])
673 || !STRINGP (vec[7])
674 || !STRINGP (vec[8]))
675 error ("Invalid info-vector argument for defining charset %s",
676 XSYMBOL (charset_symbol)->name->data);
677
678 if (NILP (charset_id))
679 {
680 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
681 if (XINT (charset_id) == 0)
682 error ("There's no room for a new private charset %s",
683 XSYMBOL (charset_symbol)->name->data);
684 }
685
686 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
687 vec[4], vec[5], vec[6], vec[7], vec[8]);
6dc0722d 688 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
4ed46869
KH
689 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
690 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
691 return Qnil;
692}
693
8a73a704
KH
694DEFUN ("generic-character-list", Fgeneric_character_list,
695 Sgeneric_character_list, 0, 0, 0,
696 "Return a list of all possible generic characters.\n\
697It includes a generic character for a charset not yet defined.")
698 ()
699{
700 return Vgeneric_character_list;
701}
702
3fac5a51
KH
703DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
704 Sget_unused_iso_final_char, 2, 2, 0,
705 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
706DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
707CHARS is the number of characters in a dimension: 94 or 96.\n\
708\n\
709This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
710If there's no unused final char for the specified kind of charset,\n\
711return nil.")
712 (dimension, chars)
713 Lisp_Object dimension, chars;
714{
715 int final_char;
716
717 CHECK_NUMBER (dimension, 0);
718 CHECK_NUMBER (chars, 1);
719 if (XINT (dimension) != 1 && XINT (dimension) != 2)
720 error ("Invalid charset dimension %d, it should be 1 or 2",
721 XINT (dimension));
722 if (XINT (chars) != 94 && XINT (chars) != 96)
723 error ("Invalid charset chars %d, it should be 94 or 96",
724 XINT (chars));
725 for (final_char = '0'; final_char <= '?'; final_char++)
726 {
727 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
728 break;
729 }
730 return (final_char <= '?' ? make_number (final_char) : Qnil);
731}
732
4ed46869
KH
733DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
734 4, 4, 0,
735 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
736CHARSET should be defined by `defined-charset' in advance.")
737 (dimension, chars, final_char, charset_symbol)
738 Lisp_Object dimension, chars, final_char, charset_symbol;
739{
740 int charset;
741
742 CHECK_NUMBER (dimension, 0);
743 CHECK_NUMBER (chars, 1);
744 CHECK_NUMBER (final_char, 2);
745 CHECK_SYMBOL (charset_symbol, 3);
746
747 if (XINT (dimension) != 1 && XINT (dimension) != 2)
748 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
749 if (XINT (chars) != 94 && XINT (chars) != 96)
750 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
751 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
752 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
753 if ((charset = get_charset_id (charset_symbol)) < 0)
754 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
755
756 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
757 return Qnil;
758}
759
760/* Return number of different charsets in STR of length LEN. In
761 addition, for each found charset N, CHARSETS[N] is set 1. The
a29e3b1b 762 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
1d67c29b
KH
763 It may lookup a translation table TABLE if supplied.
764
0282eb69
KH
765 If MULTIBYTE is zero, do not check multibyte characters, i.e. if
766 any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
767 8-bit codes are found CHARSET[1] is set to 1. */
4ed46869
KH
768
769int
99529c2c 770find_charset_in_str (str, len, charsets, table, multibyte)
028d516b
KH
771 unsigned char *str;
772 int len, *charsets;
23d2a7f1 773 Lisp_Object table;
0282eb69 774 int multibyte;
4ed46869 775{
65d0af29 776 register int num = 0;
4ed46869 777
0282eb69
KH
778 if (! multibyte)
779 {
780 unsigned char *endp = str + len;
781 int maskbits = 0;
782
783 while (str < endp && maskbits != 3)
784 maskbits |= (*str++ < 0x80 ? 1 : 2);
785 if (maskbits & 1)
786 {
787 charsets[0] = 1;
788 num++;
789 }
790 if (maskbits & 2)
791 {
792 charsets[1] = 1;
793 num++;
794 }
795 return num;
796 }
797
23d2a7f1
KH
798 if (! CHAR_TABLE_P (table))
799 table = Qnil;
800
4ed46869
KH
801 while (len > 0)
802 {
99529c2c 803 int bytes, charset, c1, c2;
05505664 804
99529c2c 805 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
05505664 806
99529c2c 807 if (! NILP (table))
23d2a7f1 808 {
99529c2c
KH
809 int c1 = translate_char (table, -1, charset, c1, c2);
810 if (c1 >= 0)
65d0af29 811 charset = CHAR_CHARSET (c1);
23d2a7f1 812 }
4ed46869
KH
813
814 if (!charsets[charset])
815 {
816 charsets[charset] = 1;
817 num += 1;
818 }
819 str += bytes;
820 len -= bytes;
821 }
822 return num;
823}
824
825DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
23d2a7f1 826 2, 3, 0,
4ed46869 827 "Return a list of charsets in the region between BEG and END.\n\
23d2a7f1 828BEG and END are buffer positions.\n\
0282eb69
KH
829Optional arg TABLE if non-nil is a translation table to look up.\n\
830\n\
831If the region contains invalid multiybte characters,\n\
38f02ede 832`unknown' is included in the returned list.\n\
0282eb69
KH
833\n\
834If the current buffer is unibyte, the returned list contains\n\
835`ascii' if any 7-bit characters are found,\n\
836and `unknown' if any 8-bit characters are found.")
23d2a7f1
KH
837 (beg, end, table)
838 Lisp_Object beg, end, table;
4ed46869 839{
028d516b 840 int charsets[MAX_CHARSET + 1];
6ae1f27e 841 int from, from_byte, to, stop, stop_byte, i;
4ed46869 842 Lisp_Object val;
0282eb69
KH
843 int undefined;
844 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4ed46869
KH
845
846 validate_region (&beg, &end);
847 from = XFASTINT (beg);
848 stop = to = XFASTINT (end);
6ae1f27e 849
4ed46869 850 if (from < GPT && GPT < to)
6ae1f27e
RS
851 {
852 stop = GPT;
853 stop_byte = GPT_BYTE;
854 }
855 else
856 stop_byte = CHAR_TO_BYTE (stop);
857
858 from_byte = CHAR_TO_BYTE (from);
859
028d516b 860 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
4ed46869
KH
861 while (1)
862 {
6ae1f27e 863 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
99529c2c 864 charsets, table, multibyte);
4ed46869 865 if (stop < to)
6ae1f27e
RS
866 {
867 from = stop, from_byte = stop_byte;
868 stop = to, stop_byte = CHAR_TO_BYTE (stop);
869 }
4ed46869
KH
870 else
871 break;
872 }
6ae1f27e 873
4ed46869 874 val = Qnil;
0282eb69
KH
875 undefined = 0;
876 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
4ed46869 877 if (charsets[i])
0282eb69 878 {
99529c2c 879 if (CHARSET_DEFINED_P (i))
0282eb69
KH
880 val = Fcons (CHARSET_SYMBOL (i), val);
881 else
882 undefined = 1;
883 }
884 if (undefined)
885 val = Fcons (Qunknown, val);
4ed46869
KH
886 return val;
887}
888
889DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
23d2a7f1
KH
890 1, 2, 0,
891 "Return a list of charsets in STR.\n\
0282eb69
KH
892Optional arg TABLE if non-nil is a translation table to look up.\n\
893\n\
894If the region contains invalid multiybte characters,\n\
895`unknown' is included in the returned list.\n\
896\n\
897If STR is unibyte, the returned list contains\n\
898`ascii' if any 7-bit characters are found,\n\
899and `unknown' if any 8-bit characters are found.")
23d2a7f1
KH
900 (str, table)
901 Lisp_Object str, table;
4ed46869 902{
a29e3b1b 903 int charsets[MAX_CHARSET + 1];
4ed46869
KH
904 int i;
905 Lisp_Object val;
0282eb69
KH
906 int undefined;
907 int multibyte;
4ed46869
KH
908
909 CHECK_STRING (str, 0);
0282eb69 910 multibyte = STRING_MULTIBYTE (str);
87b089ad 911
a29e3b1b 912 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
fc932ac6 913 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
99529c2c 914 charsets, table, multibyte);
4ed46869 915 val = Qnil;
0282eb69
KH
916 undefined = 0;
917 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
4ed46869 918 if (charsets[i])
0282eb69 919 {
99529c2c 920 if (CHARSET_DEFINED_P (i))
0282eb69
KH
921 val = Fcons (CHARSET_SYMBOL (i), val);
922 else
923 undefined = 1;
924 }
925 if (undefined)
926 val = Fcons (Qunknown, val);
4ed46869
KH
927 return val;
928}
929\f
930DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
513ee442 931 "")
4ed46869
KH
932 (charset, code1, code2)
933 Lisp_Object charset, code1, code2;
934{
ac4137cc
KH
935 int charset_id, c1, c2;
936
4ed46869 937 CHECK_NUMBER (charset, 0);
ac4137cc
KH
938 charset_id = XINT (charset);
939 if (!CHARSET_DEFINED_P (charset_id))
940 error ("Invalid charset ID: %d", XINT (charset));
4ed46869
KH
941
942 if (NILP (code1))
ac4137cc 943 c1 = 0;
4ed46869 944 else
ac4137cc
KH
945 {
946 CHECK_NUMBER (code1, 1);
947 c1 = XINT (code1);
948 }
4ed46869 949 if (NILP (code2))
ac4137cc 950 c2 = 0;
4ed46869 951 else
ac4137cc
KH
952 {
953 CHECK_NUMBER (code2, 2);
954 c2 = XINT (code2);
955 }
4ed46869 956
ac4137cc 957 if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
613a8346 958 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
ac4137cc
KH
959 c1 &= 0x7F;
960 c2 &= 0x7F;
961 if (c1 == 0
962 ? c2 != 0
963 : (c2 == 0
87f67317
KR
964 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
965 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
613a8346 966 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
ac4137cc
KH
967
968 return make_number (MAKE_CHAR (charset_id, c1, c2));
4ed46869
KH
969}
970
971DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
0282eb69
KH
972 "Return list of charset and one or two position-codes of CHAR.\n\
973If CHAR is invalid as a character code,\n\
974return a list of symbol `unknown' and CHAR.")
4ed46869
KH
975 (ch)
976 Lisp_Object ch;
977{
978 Lisp_Object val;
0282eb69 979 int c, charset, c1, c2;
4ed46869
KH
980
981 CHECK_NUMBER (ch, 0);
0282eb69
KH
982 c = XFASTINT (ch);
983 if (!CHAR_VALID_P (c, 1))
984 return Fcons (Qunknown, Fcons (ch, Qnil));
4ed46869 985 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6dc0722d 986 return (c2 >= 0
4ed46869
KH
987 ? Fcons (CHARSET_SYMBOL (charset),
988 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
989 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
990}
991
992DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
993 "Return charset of CHAR.")
994 (ch)
995 Lisp_Object ch;
996{
997 CHECK_NUMBER (ch, 0);
998
999 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1000}
1001
90d7b74e 1002DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
ac4137cc 1003 "Return charset of a character in the current buffer at position POS.\n\
e6e114f2
KH
1004If POS is nil, it defauls to the current point.\n\
1005If POS is out of range, the value is nil.")
90d7b74e
KH
1006 (pos)
1007 Lisp_Object pos;
1008{
ac4137cc 1009 register int pos_byte, bytes, charset, c1, c2;
90d7b74e
KH
1010 register unsigned char *p;
1011
1012 if (NILP (pos))
1013 pos_byte = PT_BYTE;
1014 else if (MARKERP (pos))
e6e114f2
KH
1015 {
1016 pos_byte = marker_byte_position (pos);
1017 if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1018 return Qnil;
1019 }
90d7b74e
KH
1020 else
1021 {
1022 CHECK_NUMBER (pos, 0);
e6e114f2
KH
1023 if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1024 return Qnil;
90d7b74e
KH
1025 pos_byte = CHAR_TO_BYTE (XINT (pos));
1026 }
1027 p = BYTE_POS_ADDR (pos_byte);
ac4137cc
KH
1028 if (BASE_LEADING_CODE_P (*p))
1029 {
1030 SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1031 if (charset < 0)
1032 charset = 1;
1033 }
1034 else
1035 charset = CHARSET_ASCII;
1036
90d7b74e
KH
1037 return CHARSET_SYMBOL (charset);
1038}
1039
4ed46869 1040DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2b71bb78
KH
1041 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1042\n\
1043ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1044by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1045where as Emacs distinguishes them by charset symbol.\n\
1046See the documentation of the function `charset-info' for the meanings of\n\
1047DIMENSION, CHARS, and FINAL-CHAR.")
4ed46869
KH
1048 (dimension, chars, final_char)
1049 Lisp_Object dimension, chars, final_char;
1050{
1051 int charset;
1052
1053 CHECK_NUMBER (dimension, 0);
1054 CHECK_NUMBER (chars, 1);
1055 CHECK_NUMBER (final_char, 2);
1056
1057 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1058 return Qnil;
1059 return CHARSET_SYMBOL (charset);
1060}
1061
9d3d8cba
KH
1062/* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1063 generic character. If GENERICP is zero, return nonzero iff C is a
1064 valid normal character. Do not call this function directly,
1065 instead use macro CHAR_VALID_P. */
1066int
1067char_valid_p (c, genericp)
1068 int c, genericp;
1069{
1070 int charset, c1, c2;
1071
1072 if (c < 0)
1073 return 0;
1074 if (SINGLE_BYTE_CHAR_P (c))
1075 return 1;
1076 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
ac4137cc
KH
1077 if (genericp)
1078 {
1079 if (c1)
1080 {
1081 if (c2 <= 0) c2 = 0x20;
1082 }
1083 else
1084 {
1085 if (c2 <= 0) c1 = c2 = 0x20;
1086 }
1087 }
1088 return (CHARSET_DEFINED_P (charset)
44c6492d 1089 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
9d3d8cba
KH
1090}
1091
1092DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
a9d02884
DL
1093 "Return t if OBJECT is a valid normal character.\n\
1094If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
9d3d8cba
KH
1095a valid generic character.")
1096 (object, genericp)
1097 Lisp_Object object, genericp;
1098{
1099 if (! NATNUMP (object))
1100 return Qnil;
1101 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1102}
1103
d2665018
KH
1104DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1105 Sunibyte_char_to_multibyte, 1, 1, 0,
1106 "Convert the unibyte character CH to multibyte character.\n\
537efd8d 1107The conversion is done based on `nonascii-translation-table' (which see)\n\
340b8d58 1108 or `nonascii-insert-offset' (which see).")
d2665018
KH
1109 (ch)
1110 Lisp_Object ch;
1111{
1112 int c;
1113
1114 CHECK_NUMBER (ch, 0);
1115 c = XINT (ch);
1116 if (c < 0 || c >= 0400)
1117 error ("Invalid unibyte character: %d", c);
1118 c = unibyte_char_to_multibyte (c);
1119 if (c < 0)
1120 error ("Can't convert to multibyte character: %d", XINT (ch));
1121 return make_number (c);
1122}
1123
1bcc1567
RS
1124DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1125 Smultibyte_char_to_unibyte, 1, 1, 0,
1126 "Convert the multibyte character CH to unibyte character.\n\
1127The conversion is done based on `nonascii-translation-table' (which see)\n\
1128 or `nonascii-insert-offset' (which see).")
1129 (ch)
1130 Lisp_Object ch;
1131{
1132 int c;
1133
1134 CHECK_NUMBER (ch, 0);
1135 c = XINT (ch);
ac4137cc 1136 if (! CHAR_VALID_P (c, 0))
1bcc1567
RS
1137 error ("Invalid multibyte character: %d", c);
1138 c = multibyte_char_to_unibyte (c, Qnil);
1139 if (c < 0)
1140 error ("Can't convert to unibyte character: %d", XINT (ch));
1141 return make_number (c);
1142}
1143
4ed46869 1144DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
f78643ef 1145 "Return 1 regardless of the argument CHAR.\n\
60022cb7 1146This is now an obsolete function. We keep it just for backward compatibility.")
4ed46869
KH
1147 (ch)
1148 Lisp_Object ch;
1149{
1150 Lisp_Object val;
4ed46869
KH
1151
1152 CHECK_NUMBER (ch, 0);
9b6a601f
KH
1153 return make_number (1);
1154}
1155
1156/* Return how many bytes C will occupy in a multibyte buffer.
1157 Don't call this function directly, instead use macro CHAR_BYTES. */
1158int
1159char_bytes (c)
1160 int c;
1161{
99529c2c 1162 int charset;
9b6a601f 1163
ae87e396 1164 if (SINGLE_BYTE_CHAR_P (c) || (c & ~((1 << CHARACTERBITS) - 1)))
8ac5a9cc
KH
1165 return 1;
1166
99529c2c
KH
1167 charset = CHAR_CHARSET (c);
1168 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
4ed46869
KH
1169}
1170
1171/* Return the width of character of which multi-byte form starts with
1172 C. The width is measured by how many columns occupied on the
1173 screen when displayed in the current buffer. */
1174
1175#define ONE_BYTE_CHAR_WIDTH(c) \
1176 (c < 0x20 \
1177 ? (c == '\t' \
53316e55 1178 ? XFASTINT (current_buffer->tab_width) \
4ed46869
KH
1179 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1180 : (c < 0x7f \
1181 ? 1 \
1182 : (c == 0x7F \
1183 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1184 : ((! NILP (current_buffer->enable_multibyte_characters) \
1185 && BASE_LEADING_CODE_P (c)) \
1186 ? WIDTH_BY_CHAR_HEAD (c) \
b4e9dd77 1187 : 4))))
4ed46869
KH
1188
1189DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1190 "Return width of CHAR when displayed in the current buffer.\n\
1191The width is measured by how many columns it occupies on the screen.")
1192 (ch)
1193 Lisp_Object ch;
1194{
859f2b3c 1195 Lisp_Object val, disp;
4ed46869 1196 int c;
51c4025f 1197 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869
KH
1198
1199 CHECK_NUMBER (ch, 0);
1200
859f2b3c
RS
1201 c = XINT (ch);
1202
1203 /* Get the way the display table would display it. */
51c4025f 1204 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
859f2b3c
RS
1205
1206 if (VECTORP (disp))
1207 XSETINT (val, XVECTOR (disp)->size);
1208 else if (SINGLE_BYTE_CHAR_P (c))
1209 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
4ed46869
KH
1210 else
1211 {
1212 int charset = CHAR_CHARSET (c);
1213
1214 XSETFASTINT (val, CHARSET_WIDTH (charset));
1215 }
1216 return val;
1217}
1218
1219/* Return width of string STR of length LEN when displayed in the
1220 current buffer. The width is measured by how many columns it
1221 occupies on the screen. */
859f2b3c 1222
4ed46869
KH
1223int
1224strwidth (str, len)
1225 unsigned char *str;
1226 int len;
1227{
1228 unsigned char *endp = str + len;
1229 int width = 0;
c4a4e28f 1230 struct Lisp_Char_Table *dp = buffer_display_table ();
4ed46869 1231
859f2b3c
RS
1232 while (str < endp)
1233 {
99529c2c
KH
1234 Lisp_Object disp;
1235 int thislen;
1236 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
859f2b3c 1237
99529c2c
KH
1238 /* Get the way the display table would display it. */
1239 if (dp)
1240 disp = DISP_CHAR_VECTOR (dp, c);
859f2b3c 1241 else
99529c2c
KH
1242 disp = Qnil;
1243
1244 if (VECTORP (disp))
1245 width += XVECTOR (disp)->size;
1246 else
1247 width += ONE_BYTE_CHAR_WIDTH (*str);
1248
1249 str += thislen;
859f2b3c 1250 }
4ed46869
KH
1251 return width;
1252}
1253
1254DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1255 "Return width of STRING when displayed in the current buffer.\n\
1256Width is measured by how many columns it occupies on the screen.\n\
046b1f03
RS
1257When calculating width of a multibyte character in STRING,\n\
1258only the base leading-code is considered; the validity of\n\
1259the following bytes is not checked.")
4ed46869
KH
1260 (str)
1261 Lisp_Object str;
1262{
1263 Lisp_Object val;
1264
1265 CHECK_STRING (str, 0);
fc932ac6
RS
1266 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1267 STRING_BYTES (XSTRING (str))));
4ed46869
KH
1268 return val;
1269}
1270
1271DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1272 "Return the direction of CHAR.\n\
1273The returned value is 0 for left-to-right and 1 for right-to-left.")
1274 (ch)
1275 Lisp_Object ch;
1276{
1277 int charset;
1278
1279 CHECK_NUMBER (ch, 0);
1280 charset = CHAR_CHARSET (XFASTINT (ch));
1281 if (!CHARSET_DEFINED_P (charset))
93bcb785 1282 invalid_character (XINT (ch));
4ed46869
KH
1283 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1284}
1285
af4fecb4 1286DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
6ae1f27e 1287 "Return number of characters between BEG and END.")
046b1f03
RS
1288 (beg, end)
1289 Lisp_Object beg, end;
1290{
6ae1f27e 1291 int from, to;
046b1f03 1292
17e7ef1b
RS
1293 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1294 CHECK_NUMBER_COERCE_MARKER (end, 1);
1295
046b1f03 1296 from = min (XFASTINT (beg), XFASTINT (end));
a8a35e61 1297 to = max (XFASTINT (beg), XFASTINT (end));
046b1f03 1298
a8c21066 1299 return make_number (to - from);
6ae1f27e 1300}
9036eb45 1301
87b089ad
RS
1302/* Return the number of characters in the NBYTES bytes at PTR.
1303 This works by looking at the contents and checking for multibyte sequences.
1304 However, if the current buffer has enable-multibyte-characters = nil,
1305 we treat each byte as a character. */
1306
6ae1f27e
RS
1307int
1308chars_in_text (ptr, nbytes)
1309 unsigned char *ptr;
1310 int nbytes;
1311{
87b089ad
RS
1312 /* current_buffer is null at early stages of Emacs initialization. */
1313 if (current_buffer == 0
1314 || NILP (current_buffer->enable_multibyte_characters))
6ae1f27e 1315 return nbytes;
a8a35e61 1316
ac4137cc 1317 return multibyte_chars_in_text (ptr, nbytes);
046b1f03
RS
1318}
1319
87b089ad
RS
1320/* Return the number of characters in the NBYTES bytes at PTR.
1321 This works by looking at the contents and checking for multibyte sequences.
1322 It ignores enable-multibyte-characters. */
1323
1324int
1325multibyte_chars_in_text (ptr, nbytes)
1326 unsigned char *ptr;
1327 int nbytes;
1328{
ac4137cc
KH
1329 unsigned char *endp;
1330 int chars, bytes;
87b089ad
RS
1331
1332 endp = ptr + nbytes;
1333 chars = 0;
1334
1335 while (ptr < endp)
1336 {
ac4137cc
KH
1337 if (BASE_LEADING_CODE_P (*ptr))
1338 {
1339 PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1340 ptr += bytes;
1341 nbytes -= bytes;
1342 }
1343 else
1344 {
1345 ptr++;
1346 nbytes--;
1347 }
87b089ad
RS
1348 chars++;
1349 }
1350
1351 return chars;
1352}
1353
1354DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
4ed46869 1355 "Concatenate all the argument characters and make the result a string.")
53316e55
KH
1356 (n, args)
1357 int n;
4ed46869
KH
1358 Lisp_Object *args;
1359{
53316e55 1360 int i;
99529c2c 1361 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
4ed46869
KH
1362 unsigned char *p = buf;
1363 Lisp_Object val;
d0c037d8 1364 int c, multibyte_p = 0;
4ed46869
KH
1365
1366 for (i = 0; i < n; i++)
1367 {
d0c037d8 1368 CHECK_NUMBER (args[i], 0);
4ed46869 1369 c = XINT (args[i]);
99529c2c 1370 p += CHAR_STRING (c, p);
d0c037d8
GM
1371
1372 if (!SINGLE_BYTE_CHAR_P (c))
1373 multibyte_p = 1;
4ed46869
KH
1374 }
1375
f4db5c4b 1376 /* Here, we can't use make_string_from_bytes because of the byte
d0c037d8 1377 combining problem. Make a multibyte string if there is any
f4db5c4b 1378 multibyte character in ARGS to make sure that `(string 2276)'
d0c037d8
GM
1379 returns a multibyte string if running --unibyte. */
1380 if (multibyte_p)
1381 val = make_multibyte_string (buf, n, p - buf);
1382 else
1383 val = make_unibyte_string (buf, p - buf);
4ed46869
KH
1384 return val;
1385}
1386
1387#endif /* emacs */
1388\f
dfcf069d 1389int
4ed46869
KH
1390charset_id_internal (charset_name)
1391 char *charset_name;
1392{
76d7b829 1393 Lisp_Object val;
4ed46869 1394
76d7b829 1395 val= Fget (intern (charset_name), Qcharset);
4ed46869
KH
1396 if (!VECTORP (val))
1397 error ("Charset %s is not defined", charset_name);
1398
1399 return (XINT (XVECTOR (val)->contents[0]));
1400}
1401
1402DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1403 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1404 ()
1405{
1406 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1407 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1408 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1409 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1410 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1411 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1412 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1413 return Qnil;
1414}
1415
dfcf069d 1416void
4ed46869
KH
1417init_charset_once ()
1418{
1419 int i, j, k;
1420
1421 staticpro (&Vcharset_table);
1422 staticpro (&Vcharset_symbol_table);
8a73a704 1423 staticpro (&Vgeneric_character_list);
4ed46869
KH
1424
1425 /* This has to be done here, before we call Fmake_char_table. */
1426 Qcharset_table = intern ("charset-table");
1427 staticpro (&Qcharset_table);
1428
1429 /* Intern this now in case it isn't already done.
1430 Setting this variable twice is harmless.
1431 But don't staticpro it here--that is done in alloc.c. */
1432 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1433
1434 /* Now we are ready to set up this property, so we can
1435 create the charset table. */
1436 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1437 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1438
0282eb69
KH
1439 Qunknown = intern ("unknown");
1440 staticpro (&Qunknown);
1441 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1442 Qunknown);
4ed46869
KH
1443
1444 /* Setup tables. */
1445 for (i = 0; i < 2; i++)
1446 for (j = 0; j < 2; j++)
1447 for (k = 0; k < 128; k++)
1448 iso_charset_table [i][j][k] = -1;
1449
60383934 1450 for (i = 0; i < 256; i++)
4ed46869 1451 BYTES_BY_CHAR_HEAD (i) = 1;
6ef23ebb
KH
1452 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
1453 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
1454 BYTES_BY_CHAR_HEAD (i) = 2;
1455 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
1456 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1457 BYTES_BY_CHAR_HEAD (i) = 3;
4ed46869
KH
1458 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1459 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1460 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1461 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
6ef23ebb 1462 /* The followings don't reflect the actual bytes, but just to tell
4ed46869 1463 that it is a start of a multibyte character. */
99529c2c 1464 BYTES_BY_CHAR_HEAD (0x80) = 2;
6ef23ebb
KH
1465 BYTES_BY_CHAR_HEAD (0x9E) = 2;
1466 BYTES_BY_CHAR_HEAD (0x9F) = 2;
4ed46869
KH
1467
1468 for (i = 0; i < 128; i++)
1469 WIDTH_BY_CHAR_HEAD (i) = 1;
1470 for (; i < 256; i++)
1471 WIDTH_BY_CHAR_HEAD (i) = 4;
1472 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1473 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1474 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1475 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
8a73a704
KH
1476
1477 {
76d7b829 1478 Lisp_Object val;
8a73a704 1479
76d7b829 1480 val = Qnil;
8a73a704
KH
1481 for (i = 0x81; i < 0x90; i++)
1482 val = Fcons (make_number ((i - 0x70) << 7), val);
1483 for (; i < 0x9A; i++)
1484 val = Fcons (make_number ((i - 0x8F) << 14), val);
1485 for (i = 0xA0; i < 0xF0; i++)
1486 val = Fcons (make_number ((i - 0x70) << 7), val);
1487 for (; i < 0xFF; i++)
1488 val = Fcons (make_number ((i - 0xE0) << 14), val);
8a73a704
KH
1489 Vgeneric_character_list = Fnreverse (val);
1490 }
bbf12bb3
KH
1491
1492 nonascii_insert_offset = 0;
1493 Vnonascii_translation_table = Qnil;
4ed46869
KH
1494}
1495
1496#ifdef emacs
1497
dfcf069d 1498void
4ed46869
KH
1499syms_of_charset ()
1500{
1501 Qascii = intern ("ascii");
1502 staticpro (&Qascii);
1503
1504 Qcharset = intern ("charset");
1505 staticpro (&Qcharset);
1506
1507 /* Define ASCII charset now. */
1508 update_charset_table (make_number (CHARSET_ASCII),
1509 make_number (1), make_number (94),
1510 make_number (1),
1511 make_number (0),
1512 make_number ('B'),
1513 make_number (0),
1514 build_string ("ASCII"),
1515 build_string ("ASCII"),
1516 build_string ("ASCII (ISO646 IRV)"));
1517 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1518 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1519
c1a08b4c
KH
1520 Qauto_fill_chars = intern ("auto-fill-chars");
1521 staticpro (&Qauto_fill_chars);
1522 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1523
4ed46869 1524 defsubr (&Sdefine_charset);
8a73a704 1525 defsubr (&Sgeneric_character_list);
3fac5a51 1526 defsubr (&Sget_unused_iso_final_char);
4ed46869
KH
1527 defsubr (&Sdeclare_equiv_charset);
1528 defsubr (&Sfind_charset_region);
1529 defsubr (&Sfind_charset_string);
1530 defsubr (&Smake_char_internal);
1531 defsubr (&Ssplit_char);
1532 defsubr (&Schar_charset);
90d7b74e 1533 defsubr (&Scharset_after);
4ed46869 1534 defsubr (&Siso_charset);
9d3d8cba 1535 defsubr (&Schar_valid_p);
d2665018 1536 defsubr (&Sunibyte_char_to_multibyte);
1bcc1567 1537 defsubr (&Smultibyte_char_to_unibyte);
4ed46869
KH
1538 defsubr (&Schar_bytes);
1539 defsubr (&Schar_width);
1540 defsubr (&Sstring_width);
1541 defsubr (&Schar_direction);
af4fecb4 1542 defsubr (&Schars_in_region);
87b089ad 1543 defsubr (&Sstring);
4ed46869
KH
1544 defsubr (&Ssetup_special_charsets);
1545
1546 DEFVAR_LISP ("charset-list", &Vcharset_list,
1547 "List of charsets ever defined.");
1548 Vcharset_list = Fcons (Qascii, Qnil);
1549
537efd8d 1550 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
b4e9dd77
KH
1551 "Vector of cons cell of a symbol and translation table ever defined.\n\
1552An ID of a translation table is an index of this vector.");
537efd8d 1553 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
b0e3cf2b 1554
4ed46869
KH
1555 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1556 "Leading-code of private TYPE9N charset of column-width 1.");
1557 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1558
1559 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1560 "Leading-code of private TYPE9N charset of column-width 2.");
1561 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1562
1563 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1564 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1565 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1566
1567 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1568 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1569 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
35e623fb
RS
1570
1571 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
d2665018 1572 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
4cf9710d
RS
1573This is used for converting unibyte text to multibyte,\n\
1574and for inserting character codes specified by number.\n\n\
3e8ceaac
RS
1575This serves to convert a Latin-1 or similar 8-bit character code\n\
1576to the corresponding Emacs multibyte character code.\n\
1577Typically the value should be (- (make-char CHARSET 0) 128),\n\
1578for your choice of character set.\n\
537efd8d 1579If `nonascii-translation-table' is non-nil, it overrides this variable.");
35e623fb 1580 nonascii_insert_offset = 0;
b0e3cf2b 1581
b4e9dd77 1582 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
537efd8d 1583 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
4cf9710d
RS
1584This is used for converting unibyte text to multibyte,\n\
1585and for inserting character codes specified by number.\n\n\
1586Conversion is performed only when multibyte characters are enabled,\n\
1587and it serves to convert a Latin-1 or similar 8-bit character code\n\
1588to the corresponding Emacs character code.\n\n\
da4d65af 1589If this is nil, `nonascii-insert-offset' is used instead.\n\
b4e9dd77
KH
1590See also the docstring of `make-translation-table'.");
1591 Vnonascii_translation_table = Qnil;
4cf9710d 1592
c1a08b4c
KH
1593 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1594 "A char-table for characters which invoke auto-filling.\n\
1595Such characters has value t in this table.");
1596 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
60022cb7
AS
1597 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1598 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
4ed46869
KH
1599}
1600
1601#endif /* emacs */