(char_bytes): Use ((1 << CHARACTERBITS) - 1) instead
[bpt/emacs.git] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #ifdef emacs
26 #include <config.h>
27 #endif
28
29 #include <stdio.h>
30
31 #ifdef emacs
32
33 #include <sys/types.h>
34 #include "lisp.h"
35 #include "buffer.h"
36 #include "charset.h"
37 #include "coding.h"
38 #include "disptab.h"
39
40 #else /* not emacs */
41
42 #include "mulelib.h"
43
44 #endif /* emacs */
45
46 Lisp_Object Qcharset, Qascii;
47 Lisp_Object Qunknown;
48
49 /* Declaration of special leading-codes. */
50 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
51 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
52 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
53 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
54
55 /* Declaration of special charsets. */
56 int charset_ascii; /* ASCII */
57 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
58 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
59 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
60 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
61 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
62 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
63 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
64
65 Lisp_Object Qcharset_table;
66
67 /* A char-table containing information of each character set. */
68 Lisp_Object Vcharset_table;
69
70 /* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72 Lisp_Object Vcharset_symbol_table;
73
74 /* A list of charset symbols ever defined. */
75 Lisp_Object Vcharset_list;
76
77 /* Vector of translation table ever defined.
78 ID of a translation table is used to index this vector. */
79 Lisp_Object Vtranslation_table_vector;
80
81 /* A char-table for characters which may invoke auto-filling. */
82 Lisp_Object Vauto_fill_chars;
83
84 Lisp_Object Qauto_fill_chars;
85
86 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
87 int bytes_by_char_head[256];
88 int width_by_char_head[256];
89
90 /* Mapping table from ISO2022's charset (specified by DIMENSION,
91 CHARS, and FINAL-CHAR) to Emacs' charset. */
92 int iso_charset_table[2][2][128];
93
94 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
95 unsigned char *_fetch_multibyte_char_p;
96 int _fetch_multibyte_char_len;
97
98 /* Offset to add to a non-ASCII value when inserting it. */
99 int nonascii_insert_offset;
100
101 /* Translation table for converting non-ASCII unibyte characters
102 to multibyte codes, or nil. */
103 Lisp_Object Vnonascii_translation_table;
104
105 /* List of all possible generic characters. */
106 Lisp_Object Vgeneric_character_list;
107
108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
110 \f
111 void
112 invalid_character (c)
113 int c;
114 {
115 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
116 }
117
118 /* Parse a multibyte character string STR of length LENGTH (>= 2) set
119 BYTES to the length of actual multibyte sequence, CHARSET, C1, and
120 C2 to such values that MAKE_CHAR can make the multibyte character
121 from them.
122
123 It is assumed that *STR is one of base leading codes and the
124 following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
125
126 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */
127
128 #define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \
129 do { \
130 (bytes) = 1; \
131 (charset) = (str)[0]; \
132 if ((charset) >= LEADING_CODE_PRIVATE_11 \
133 && (charset) <= LEADING_CODE_PRIVATE_22) \
134 (charset) = (str)[(bytes)++]; \
135 if ((bytes) < (length)) \
136 { \
137 (c1) = (str)[(bytes)++] & 0x7F; \
138 if ((bytes) < (length)) \
139 (c2) = (str)[(bytes)++] & 0x7F; \
140 else \
141 (c2) = -1; \
142 } \
143 else \
144 (c1) = (c2) = -1; \
145 } while (0)
146
147 /* Parse string STR of length LENGTH and check if a multibyte
148 characters is at STR. Set BYTES to the actual length, CHARSET, C1,
149 C2 to proper values for that character. */
150
151 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
152 do { \
153 int i; \
154 if (ASCII_BYTE_P ((str)[0])) \
155 i = 1; \
156 else \
157 for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \
158 if (i == 1) \
159 (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \
160 else \
161 { \
162 if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \
163 i = BYTES_BY_CHAR_HEAD ((str)[0]); \
164 SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \
165 } \
166 } while (0)
167
168 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
169 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
170 (charset == CHARSET_ASCII \
171 ? ((c1) >= 0 && (c1) <= 0x7F) \
172 : (CHARSET_DIMENSION (charset) == 1 \
173 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
174 : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F)))
175
176 /* Store multi-byte form of the character C in STR. The caller should
177 allocate at least 4-byte area at STR in advance. Returns the
178 length of the multi-byte form. If C is an invalid character code,
179 signal an error.
180
181 Use macro `CHAR_STRING (C, STR)' instead of calling this function
182 directly if C can be an ASCII character. */
183
184 int
185 char_to_string (c, str)
186 int c;
187 unsigned char *str;
188 {
189 unsigned char *p = str;
190
191 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
192 {
193 /* Multibyte character can't have a modifier bit. */
194 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
195 invalid_character (c);
196
197 /* For Meta, Shift, and Control modifiers, we need special care. */
198 if (c & CHAR_META)
199 {
200 /* Move the meta bit to the right place for a string. */
201 c = (c & ~CHAR_META) | 0x80;
202 }
203 if (c & CHAR_SHIFT)
204 {
205 /* Shift modifier is valid only with [A-Za-z]. */
206 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
207 c &= ~CHAR_SHIFT;
208 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
209 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
210 }
211 if (c & CHAR_CTL)
212 {
213 /* Simulate the code in lread.c. */
214 /* Allow `\C- ' and `\C-?'. */
215 if (c == (CHAR_CTL | ' '))
216 c = 0;
217 else if (c == (CHAR_CTL | '?'))
218 c = 127;
219 /* ASCII control chars are made from letters (both cases),
220 as well as the non-letters within 0100...0137. */
221 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
222 c &= (037 | (~0177 & ~CHAR_CTL));
223 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
224 c &= (037 | (~0177 & ~CHAR_CTL));
225 }
226
227 /* If C still has any modifier bits, it is an invalid character. */
228 if (c & CHAR_MODIFIER_MASK)
229 invalid_character (c);
230
231 *p++ = c;
232 }
233 else if (c < MAX_CHAR)
234 {
235 int charset, c1, c2;
236
237 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
238
239 if (charset >= LEADING_CODE_EXT_11)
240 *p++ = (charset < LEADING_CODE_EXT_12
241 ? LEADING_CODE_PRIVATE_11
242 : (charset < LEADING_CODE_EXT_21
243 ? LEADING_CODE_PRIVATE_12
244 : (charset < LEADING_CODE_EXT_22
245 ? LEADING_CODE_PRIVATE_21
246 : LEADING_CODE_PRIVATE_22)));
247 *p++ = charset;
248 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
249 invalid_character (c);
250 if (c1)
251 {
252 *p++ = c1 | 0x80;
253 if (c2 > 0)
254 *p++ = c2 | 0x80;
255 }
256 }
257
258 return (p -str);
259 }
260
261 /* Return the non-ASCII character corresponding to multi-byte form at
262 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
263 length of the multibyte form in *ACTUAL_LEN.
264
265 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
266 this function directly if you want ot handle ASCII characters as
267 well. */
268
269 int
270 string_to_char (str, len, actual_len)
271 const unsigned char *str;
272 int len, *actual_len;
273 {
274 int c, bytes, charset, c1, c2;
275
276 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
277 c = MAKE_CHAR (charset, c1, c2);
278 if (actual_len)
279 *actual_len = bytes;
280 return c;
281 }
282
283 /* Return the length of the multi-byte form at string STR of length LEN.
284 Use the macro MULTIBYTE_FORM_LENGTH instead. */
285 int
286 multibyte_form_length (str, len)
287 const unsigned char *str;
288 int len;
289 {
290 int bytes;
291
292 PARSE_MULTIBYTE_SEQ (str, len, bytes);
293 return bytes;
294 }
295
296 /* Check multibyte form at string STR of length LEN and set variables
297 pointed by CHARSET, C1, and C2 to charset and position codes of the
298 character at STR, and return 0. If there's no multibyte character,
299 return -1. This should be used only in the macro SPLIT_STRING
300 which checks range of STR in advance. */
301
302 int
303 split_string (str, len, charset, c1, c2)
304 const unsigned char *str;
305 unsigned char *c1, *c2;
306 int len, *charset;
307 {
308 register int bytes, cs, code1, code2 = -1;
309
310 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
311 if (cs == CHARSET_ASCII)
312 return -1;
313 *charset = cs;
314 *c1 = code1;
315 *c2 = code2;
316 return 0;
317 }
318
319 /* Return 1 iff character C has valid printable glyph.
320 Use the macro CHAR_PRINTABLE_P instead. */
321 int
322 char_printable_p (c)
323 int c;
324 {
325 int charset, c1, c2, chars;
326
327 if (SINGLE_BYTE_CHAR_P (c))
328 return 1;
329 if (c >= MAX_CHAR)
330 return 0;
331
332 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
333 if (! CHARSET_DEFINED_P (charset))
334 return 0;
335 if (CHARSET_CHARS (charset) == 94
336 ? c1 <= 32 || c1 >= 127
337 : c1 < 32)
338 return 0;
339 if (CHARSET_DIMENSION (charset) == 2
340 && (CHARSET_CHARS (charset) == 94
341 ? c2 <= 32 || c2 >= 127
342 : c2 < 32))
343 return 0;
344 return 1;
345 }
346
347 /* Translate character C by translation table TABLE. If C
348 is negative, translate a character specified by CHARSET, C1, and C2
349 (C1 and C2 are code points of the character). If no translation is
350 found in TABLE, return C. */
351 int
352 translate_char (table, c, charset, c1, c2)
353 Lisp_Object table;
354 int c, charset, c1, c2;
355 {
356 Lisp_Object ch;
357 int alt_charset, alt_c1, alt_c2, dimension;
358
359 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
360 if (!CHAR_TABLE_P (table)
361 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
362 return c;
363
364 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
365 dimension = CHARSET_DIMENSION (alt_charset);
366 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
367 /* CH is not a generic character, just return it. */
368 return XFASTINT (ch);
369
370 /* Since CH is a generic character, we must return a specific
371 charater which has the same position codes as C from CH. */
372 if (charset < 0)
373 SPLIT_CHAR (c, charset, c1, c2);
374 if (dimension != CHARSET_DIMENSION (charset))
375 /* We can't make such a character because of dimension mismatch. */
376 return c;
377 return MAKE_CHAR (alt_charset, c1, c2);
378 }
379
380 /* Convert the unibyte character C to multibyte based on
381 Vnonascii_translation_table or nonascii_insert_offset. If they can't
382 convert C to a valid multibyte character, convert it based on
383 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
384
385 int
386 unibyte_char_to_multibyte (c)
387 int c;
388 {
389 if (c < 0400 && c >= 0200)
390 {
391 int c_save = c;
392
393 if (! NILP (Vnonascii_translation_table))
394 {
395 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
396 if (c >= 0400 && ! char_valid_p (c, 0))
397 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
398 }
399 else if (c >= 0240 && nonascii_insert_offset > 0)
400 {
401 c += nonascii_insert_offset;
402 if (c < 0400 || ! char_valid_p (c, 0))
403 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
404 }
405 else if (c >= 0240)
406 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
407 }
408 return c;
409 }
410
411
412 /* Convert the multibyte character C to unibyte 8-bit character based
413 on Vnonascii_translation_table or nonascii_insert_offset. If
414 REV_TBL is non-nil, it should be a reverse table of
415 Vnonascii_translation_table, i.e. what given by:
416 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
417
418 int
419 multibyte_char_to_unibyte (c, rev_tbl)
420 int c;
421 Lisp_Object rev_tbl;
422 {
423 if (!SINGLE_BYTE_CHAR_P (c))
424 {
425 int c_save = c;
426
427 if (! CHAR_TABLE_P (rev_tbl)
428 && CHAR_TABLE_P (Vnonascii_translation_table))
429 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
430 make_number (0));
431 if (CHAR_TABLE_P (rev_tbl))
432 {
433 Lisp_Object temp;
434 temp = Faref (rev_tbl, make_number (c));
435 if (INTEGERP (temp))
436 c = XINT (temp);
437 if (c >= 256)
438 c = (c_save & 0177) + 0200;
439 }
440 else
441 {
442 if (nonascii_insert_offset > 0)
443 c -= nonascii_insert_offset;
444 if (c < 128 || c >= 256)
445 c = (c_save & 0177) + 0200;
446 }
447 }
448
449 return c;
450 }
451
452 \f
453 /* Update the table Vcharset_table with the given arguments (see the
454 document of `define-charset' for the meaning of each argument).
455 Several other table contents are also updated. The caller should
456 check the validity of CHARSET-ID and the remaining arguments in
457 advance. */
458
459 void
460 update_charset_table (charset_id, dimension, chars, width, direction,
461 iso_final_char, iso_graphic_plane,
462 short_name, long_name, description)
463 Lisp_Object charset_id, dimension, chars, width, direction;
464 Lisp_Object iso_final_char, iso_graphic_plane;
465 Lisp_Object short_name, long_name, description;
466 {
467 int charset = XINT (charset_id);
468 int bytes;
469 unsigned char leading_code_base, leading_code_ext;
470
471 if (NILP (CHARSET_TABLE_ENTRY (charset)))
472 CHARSET_TABLE_ENTRY (charset)
473 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
474
475 /* Get byte length of multibyte form, base leading-code, and
476 extended leading-code of the charset. See the comment under the
477 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
478 bytes = XINT (dimension);
479 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
480 {
481 /* Official charset, it doesn't have an extended leading-code. */
482 if (charset != CHARSET_ASCII)
483 bytes += 1; /* For a base leading-code. */
484 leading_code_base = charset;
485 leading_code_ext = 0;
486 }
487 else
488 {
489 /* Private charset. */
490 bytes += 2; /* For base and extended leading-codes. */
491 leading_code_base
492 = (charset < LEADING_CODE_EXT_12
493 ? LEADING_CODE_PRIVATE_11
494 : (charset < LEADING_CODE_EXT_21
495 ? LEADING_CODE_PRIVATE_12
496 : (charset < LEADING_CODE_EXT_22
497 ? LEADING_CODE_PRIVATE_21
498 : LEADING_CODE_PRIVATE_22)));
499 leading_code_ext = charset;
500 }
501
502 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
503 error ("Invalid dimension for the charset-ID %d", charset);
504
505 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
506 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
507 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
508 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
509 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
510 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
511 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
512 = make_number (leading_code_base);
513 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
514 = make_number (leading_code_ext);
515 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
516 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
517 = iso_graphic_plane;
518 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
519 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
520 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
521 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
522
523 {
524 /* If we have already defined a charset which has the same
525 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
526 DIRECTION, we must update the entry REVERSE-CHARSET of both
527 charsets. If there's no such charset, the value of the entry
528 is set to nil. */
529 int i;
530
531 for (i = 0; i <= MAX_CHARSET; i++)
532 if (!NILP (CHARSET_TABLE_ENTRY (i)))
533 {
534 if (CHARSET_DIMENSION (i) == XINT (dimension)
535 && CHARSET_CHARS (i) == XINT (chars)
536 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
537 && CHARSET_DIRECTION (i) != XINT (direction))
538 {
539 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
540 = make_number (i);
541 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
542 break;
543 }
544 }
545 if (i > MAX_CHARSET)
546 /* No such a charset. */
547 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
548 = make_number (-1);
549 }
550
551 if (charset != CHARSET_ASCII
552 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
553 {
554 width_by_char_head[leading_code_base] = XINT (width);
555
556 /* Update table emacs_code_class. */
557 emacs_code_class[charset] = (bytes == 2
558 ? EMACS_leading_code_2
559 : (bytes == 3
560 ? EMACS_leading_code_3
561 : EMACS_leading_code_4));
562 }
563
564 /* Update table iso_charset_table. */
565 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
566 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
567 }
568
569 #ifdef emacs
570
571 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
572 is invalid. */
573 int
574 get_charset_id (charset_symbol)
575 Lisp_Object charset_symbol;
576 {
577 Lisp_Object val;
578 int charset;
579
580 return ((SYMBOLP (charset_symbol)
581 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
582 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
583 CHARSET_VALID_P (charset)))
584 ? charset : -1);
585 }
586
587 /* Return an identification number for a new private charset of
588 DIMENSION and WIDTH. If there's no more room for the new charset,
589 return 0. */
590 Lisp_Object
591 get_new_private_charset_id (dimension, width)
592 int dimension, width;
593 {
594 int charset, from, to;
595
596 if (dimension == 1)
597 {
598 if (width == 1)
599 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
600 else
601 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
602 }
603 else
604 {
605 if (width == 1)
606 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
607 else
608 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
609 }
610
611 for (charset = from; charset < to; charset++)
612 if (!CHARSET_DEFINED_P (charset)) break;
613
614 return make_number (charset < to ? charset : 0);
615 }
616
617 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
618 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
619 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
620 treated as a private charset.\n\
621 INFO-VECTOR is a vector of the format:\n\
622 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
623 SHORT-NAME LONG-NAME DESCRIPTION]\n\
624 The meanings of each elements is as follows:\n\
625 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
626 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
627 WIDTH (integer) is the number of columns a character in the charset\n\
628 occupies on the screen: one of 0, 1, and 2.\n\
629 \n\
630 DIRECTION (integer) is the rendering direction of characters in the\n\
631 charset when rendering. If 0, render from left to right, else\n\
632 render from right to left.\n\
633 \n\
634 ISO-FINAL-CHAR (character) is the final character of the\n\
635 corresponding ISO 2022 charset.\n\
636 \n\
637 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
638 while encoding to variants of ISO 2022 coding system, one of the\n\
639 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
640 \n\
641 SHORT-NAME (string) is the short name to refer to the charset.\n\
642 \n\
643 LONG-NAME (string) is the long name to refer to the charset.\n\
644 \n\
645 DESCRIPTION (string) is the description string of the charset.")
646 (charset_id, charset_symbol, info_vector)
647 Lisp_Object charset_id, charset_symbol, info_vector;
648 {
649 Lisp_Object *vec;
650
651 if (!NILP (charset_id))
652 CHECK_NUMBER (charset_id, 0);
653 CHECK_SYMBOL (charset_symbol, 1);
654 CHECK_VECTOR (info_vector, 2);
655
656 if (! NILP (charset_id))
657 {
658 if (! CHARSET_VALID_P (XINT (charset_id)))
659 error ("Invalid CHARSET: %d", XINT (charset_id));
660 else if (CHARSET_DEFINED_P (XINT (charset_id)))
661 error ("Already defined charset: %d", XINT (charset_id));
662 }
663
664 vec = XVECTOR (info_vector)->contents;
665 if (XVECTOR (info_vector)->size != 9
666 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
667 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
668 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
669 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
670 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
671 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
672 || !STRINGP (vec[6])
673 || !STRINGP (vec[7])
674 || !STRINGP (vec[8]))
675 error ("Invalid info-vector argument for defining charset %s",
676 XSYMBOL (charset_symbol)->name->data);
677
678 if (NILP (charset_id))
679 {
680 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
681 if (XINT (charset_id) == 0)
682 error ("There's no room for a new private charset %s",
683 XSYMBOL (charset_symbol)->name->data);
684 }
685
686 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
687 vec[4], vec[5], vec[6], vec[7], vec[8]);
688 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
689 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
690 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
691 return Qnil;
692 }
693
694 DEFUN ("generic-character-list", Fgeneric_character_list,
695 Sgeneric_character_list, 0, 0, 0,
696 "Return a list of all possible generic characters.\n\
697 It includes a generic character for a charset not yet defined.")
698 ()
699 {
700 return Vgeneric_character_list;
701 }
702
703 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
704 Sget_unused_iso_final_char, 2, 2, 0,
705 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
706 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
707 CHARS is the number of characters in a dimension: 94 or 96.\n\
708 \n\
709 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
710 If there's no unused final char for the specified kind of charset,\n\
711 return nil.")
712 (dimension, chars)
713 Lisp_Object dimension, chars;
714 {
715 int final_char;
716
717 CHECK_NUMBER (dimension, 0);
718 CHECK_NUMBER (chars, 1);
719 if (XINT (dimension) != 1 && XINT (dimension) != 2)
720 error ("Invalid charset dimension %d, it should be 1 or 2",
721 XINT (dimension));
722 if (XINT (chars) != 94 && XINT (chars) != 96)
723 error ("Invalid charset chars %d, it should be 94 or 96",
724 XINT (chars));
725 for (final_char = '0'; final_char <= '?'; final_char++)
726 {
727 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
728 break;
729 }
730 return (final_char <= '?' ? make_number (final_char) : Qnil);
731 }
732
733 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
734 4, 4, 0,
735 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
736 CHARSET should be defined by `defined-charset' in advance.")
737 (dimension, chars, final_char, charset_symbol)
738 Lisp_Object dimension, chars, final_char, charset_symbol;
739 {
740 int charset;
741
742 CHECK_NUMBER (dimension, 0);
743 CHECK_NUMBER (chars, 1);
744 CHECK_NUMBER (final_char, 2);
745 CHECK_SYMBOL (charset_symbol, 3);
746
747 if (XINT (dimension) != 1 && XINT (dimension) != 2)
748 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
749 if (XINT (chars) != 94 && XINT (chars) != 96)
750 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
751 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
752 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
753 if ((charset = get_charset_id (charset_symbol)) < 0)
754 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
755
756 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
757 return Qnil;
758 }
759
760 /* Return number of different charsets in STR of length LEN. In
761 addition, for each found charset N, CHARSETS[N] is set 1. The
762 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
763 It may lookup a translation table TABLE if supplied.
764
765 If MULTIBYTE is zero, do not check multibyte characters, i.e. if
766 any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
767 8-bit codes are found CHARSET[1] is set to 1. */
768
769 int
770 find_charset_in_str (str, len, charsets, table, multibyte)
771 unsigned char *str;
772 int len, *charsets;
773 Lisp_Object table;
774 int multibyte;
775 {
776 register int num = 0, c;
777
778 if (! multibyte)
779 {
780 unsigned char *endp = str + len;
781 int maskbits = 0;
782
783 while (str < endp && maskbits != 3)
784 maskbits |= (*str++ < 0x80 ? 1 : 2);
785 if (maskbits & 1)
786 {
787 charsets[0] = 1;
788 num++;
789 }
790 if (maskbits & 2)
791 {
792 charsets[1] = 1;
793 num++;
794 }
795 return num;
796 }
797
798 if (! CHAR_TABLE_P (table))
799 table = Qnil;
800
801 while (len > 0)
802 {
803 int bytes, charset, c1, c2;
804
805 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
806
807 if (! NILP (table))
808 {
809 int c1 = translate_char (table, -1, charset, c1, c2);
810 if (c1 >= 0)
811 charset = CHAR_CHARSET (c);
812 }
813
814 if (!charsets[charset])
815 {
816 charsets[charset] = 1;
817 num += 1;
818 }
819 str += bytes;
820 len -= bytes;
821 }
822 return num;
823 }
824
825 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
826 2, 3, 0,
827 "Return a list of charsets in the region between BEG and END.\n\
828 BEG and END are buffer positions.\n\
829 Optional arg TABLE if non-nil is a translation table to look up.\n\
830 \n\
831 If the region contains invalid multiybte characters,\n\
832 `unknown' is included in the returned list.\n\
833 \n\
834 If the current buffer is unibyte, the returned list contains\n\
835 `ascii' if any 7-bit characters are found,\n\
836 and `unknown' if any 8-bit characters are found.")
837 (beg, end, table)
838 Lisp_Object beg, end, table;
839 {
840 int charsets[MAX_CHARSET + 1];
841 int from, from_byte, to, stop, stop_byte, i;
842 Lisp_Object val;
843 int undefined;
844 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
845
846 validate_region (&beg, &end);
847 from = XFASTINT (beg);
848 stop = to = XFASTINT (end);
849
850 if (from < GPT && GPT < to)
851 {
852 stop = GPT;
853 stop_byte = GPT_BYTE;
854 }
855 else
856 stop_byte = CHAR_TO_BYTE (stop);
857
858 from_byte = CHAR_TO_BYTE (from);
859
860 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
861 while (1)
862 {
863 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
864 charsets, table, multibyte);
865 if (stop < to)
866 {
867 from = stop, from_byte = stop_byte;
868 stop = to, stop_byte = CHAR_TO_BYTE (stop);
869 }
870 else
871 break;
872 }
873
874 val = Qnil;
875 undefined = 0;
876 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
877 if (charsets[i])
878 {
879 if (CHARSET_DEFINED_P (i))
880 val = Fcons (CHARSET_SYMBOL (i), val);
881 else
882 undefined = 1;
883 }
884 if (undefined)
885 val = Fcons (Qunknown, val);
886 return val;
887 }
888
889 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
890 1, 2, 0,
891 "Return a list of charsets in STR.\n\
892 Optional arg TABLE if non-nil is a translation table to look up.\n\
893 \n\
894 If the region contains invalid multiybte characters,\n\
895 `unknown' is included in the returned list.\n\
896 \n\
897 If STR is unibyte, the returned list contains\n\
898 `ascii' if any 7-bit characters are found,\n\
899 and `unknown' if any 8-bit characters are found.")
900 (str, table)
901 Lisp_Object str, table;
902 {
903 int charsets[MAX_CHARSET + 1];
904 int i;
905 Lisp_Object val;
906 int undefined;
907 int multibyte;
908
909 CHECK_STRING (str, 0);
910 multibyte = STRING_MULTIBYTE (str);
911
912 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
913 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
914 charsets, table, multibyte);
915 val = Qnil;
916 undefined = 0;
917 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
918 if (charsets[i])
919 {
920 if (CHARSET_DEFINED_P (i))
921 val = Fcons (CHARSET_SYMBOL (i), val);
922 else
923 undefined = 1;
924 }
925 if (undefined)
926 val = Fcons (Qunknown, val);
927 return val;
928 }
929 \f
930 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
931 "")
932 (charset, code1, code2)
933 Lisp_Object charset, code1, code2;
934 {
935 int charset_id, c1, c2;
936
937 CHECK_NUMBER (charset, 0);
938 charset_id = XINT (charset);
939 if (!CHARSET_DEFINED_P (charset_id))
940 error ("Invalid charset ID: %d", XINT (charset));
941
942 if (NILP (code1))
943 c1 = 0;
944 else
945 {
946 CHECK_NUMBER (code1, 1);
947 c1 = XINT (code1);
948 }
949 if (NILP (code2))
950 c2 = 0;
951 else
952 {
953 CHECK_NUMBER (code2, 2);
954 c2 = XINT (code2);
955 }
956
957 if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
958 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
959 c1 &= 0x7F;
960 c2 &= 0x7F;
961 if (c1 == 0
962 ? c2 != 0
963 : (c2 == 0
964 ? !CHAR_COMPONENTS_VALID_P (charset, c1, 0x20)
965 : !CHAR_COMPONENTS_VALID_P (charset, c1, c2)))
966 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
967
968 return make_number (MAKE_CHAR (charset_id, c1, c2));
969 }
970
971 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
972 "Return list of charset and one or two position-codes of CHAR.\n\
973 If CHAR is invalid as a character code,\n\
974 return a list of symbol `unknown' and CHAR.")
975 (ch)
976 Lisp_Object ch;
977 {
978 Lisp_Object val;
979 int c, charset, c1, c2;
980
981 CHECK_NUMBER (ch, 0);
982 c = XFASTINT (ch);
983 if (!CHAR_VALID_P (c, 1))
984 return Fcons (Qunknown, Fcons (ch, Qnil));
985 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
986 return (c2 >= 0
987 ? Fcons (CHARSET_SYMBOL (charset),
988 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
989 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
990 }
991
992 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
993 "Return charset of CHAR.")
994 (ch)
995 Lisp_Object ch;
996 {
997 CHECK_NUMBER (ch, 0);
998
999 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1000 }
1001
1002 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1003 "Return charset of a character in the current buffer at position POS.\n\
1004 If POS is nil, it defauls to the current point.\n\
1005 If POS is out of range, the value is nil.")
1006 (pos)
1007 Lisp_Object pos;
1008 {
1009 register int pos_byte, bytes, charset, c1, c2;
1010 register unsigned char *p;
1011
1012 if (NILP (pos))
1013 pos_byte = PT_BYTE;
1014 else if (MARKERP (pos))
1015 {
1016 pos_byte = marker_byte_position (pos);
1017 if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1018 return Qnil;
1019 }
1020 else
1021 {
1022 CHECK_NUMBER (pos, 0);
1023 if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1024 return Qnil;
1025 pos_byte = CHAR_TO_BYTE (XINT (pos));
1026 }
1027 p = BYTE_POS_ADDR (pos_byte);
1028 if (BASE_LEADING_CODE_P (*p))
1029 {
1030 SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1031 if (charset < 0)
1032 charset = 1;
1033 }
1034 else
1035 charset = CHARSET_ASCII;
1036
1037 return CHARSET_SYMBOL (charset);
1038 }
1039
1040 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1041 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1042 \n\
1043 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1044 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1045 where as Emacs distinguishes them by charset symbol.\n\
1046 See the documentation of the function `charset-info' for the meanings of\n\
1047 DIMENSION, CHARS, and FINAL-CHAR.")
1048 (dimension, chars, final_char)
1049 Lisp_Object dimension, chars, final_char;
1050 {
1051 int charset;
1052
1053 CHECK_NUMBER (dimension, 0);
1054 CHECK_NUMBER (chars, 1);
1055 CHECK_NUMBER (final_char, 2);
1056
1057 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1058 return Qnil;
1059 return CHARSET_SYMBOL (charset);
1060 }
1061
1062 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1063 generic character. If GENERICP is zero, return nonzero iff C is a
1064 valid normal character. Do not call this function directly,
1065 instead use macro CHAR_VALID_P. */
1066 int
1067 char_valid_p (c, genericp)
1068 int c, genericp;
1069 {
1070 int charset, c1, c2;
1071
1072 if (c < 0)
1073 return 0;
1074 if (SINGLE_BYTE_CHAR_P (c))
1075 return 1;
1076 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
1077 if (genericp)
1078 {
1079 if (c1)
1080 {
1081 if (c2 <= 0) c2 = 0x20;
1082 }
1083 else
1084 {
1085 if (c2 <= 0) c1 = c2 = 0x20;
1086 }
1087 }
1088 return (CHARSET_DEFINED_P (charset)
1089 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1090 }
1091
1092 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1093 "Return t if OBJECT is a valid normal character.\n\
1094 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1095 a valid generic character.")
1096 (object, genericp)
1097 Lisp_Object object, genericp;
1098 {
1099 if (! NATNUMP (object))
1100 return Qnil;
1101 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1102 }
1103
1104 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1105 Sunibyte_char_to_multibyte, 1, 1, 0,
1106 "Convert the unibyte character CH to multibyte character.\n\
1107 The conversion is done based on `nonascii-translation-table' (which see)\n\
1108 or `nonascii-insert-offset' (which see).")
1109 (ch)
1110 Lisp_Object ch;
1111 {
1112 int c;
1113
1114 CHECK_NUMBER (ch, 0);
1115 c = XINT (ch);
1116 if (c < 0 || c >= 0400)
1117 error ("Invalid unibyte character: %d", c);
1118 c = unibyte_char_to_multibyte (c);
1119 if (c < 0)
1120 error ("Can't convert to multibyte character: %d", XINT (ch));
1121 return make_number (c);
1122 }
1123
1124 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1125 Smultibyte_char_to_unibyte, 1, 1, 0,
1126 "Convert the multibyte character CH to unibyte character.\n\
1127 The conversion is done based on `nonascii-translation-table' (which see)\n\
1128 or `nonascii-insert-offset' (which see).")
1129 (ch)
1130 Lisp_Object ch;
1131 {
1132 int c;
1133
1134 CHECK_NUMBER (ch, 0);
1135 c = XINT (ch);
1136 if (! CHAR_VALID_P (c, 0))
1137 error ("Invalid multibyte character: %d", c);
1138 c = multibyte_char_to_unibyte (c, Qnil);
1139 if (c < 0)
1140 error ("Can't convert to unibyte character: %d", XINT (ch));
1141 return make_number (c);
1142 }
1143
1144 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1145 "Return 1 regardless of the argument CHAR.\n\
1146 This is now an obsolete function. We keep it just for backward compatibility.")
1147 (ch)
1148 Lisp_Object ch;
1149 {
1150 Lisp_Object val;
1151
1152 CHECK_NUMBER (ch, 0);
1153 return make_number (1);
1154 }
1155
1156 /* Return how many bytes C will occupy in a multibyte buffer.
1157 Don't call this function directly, instead use macro CHAR_BYTES. */
1158 int
1159 char_bytes (c)
1160 int c;
1161 {
1162 int charset;
1163
1164 if (SINGLE_BYTE_CHAR_P (c) || (c & ~((1 << CHARACTERBITS) - 1)))
1165 return 1;
1166
1167 charset = CHAR_CHARSET (c);
1168 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1169 }
1170
1171 /* Return the width of character of which multi-byte form starts with
1172 C. The width is measured by how many columns occupied on the
1173 screen when displayed in the current buffer. */
1174
1175 #define ONE_BYTE_CHAR_WIDTH(c) \
1176 (c < 0x20 \
1177 ? (c == '\t' \
1178 ? XFASTINT (current_buffer->tab_width) \
1179 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1180 : (c < 0x7f \
1181 ? 1 \
1182 : (c == 0x7F \
1183 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1184 : ((! NILP (current_buffer->enable_multibyte_characters) \
1185 && BASE_LEADING_CODE_P (c)) \
1186 ? WIDTH_BY_CHAR_HEAD (c) \
1187 : 4))))
1188
1189 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1190 "Return width of CHAR when displayed in the current buffer.\n\
1191 The width is measured by how many columns it occupies on the screen.")
1192 (ch)
1193 Lisp_Object ch;
1194 {
1195 Lisp_Object val, disp;
1196 int c;
1197 struct Lisp_Char_Table *dp = buffer_display_table ();
1198
1199 CHECK_NUMBER (ch, 0);
1200
1201 c = XINT (ch);
1202
1203 /* Get the way the display table would display it. */
1204 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1205
1206 if (VECTORP (disp))
1207 XSETINT (val, XVECTOR (disp)->size);
1208 else if (SINGLE_BYTE_CHAR_P (c))
1209 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1210 else
1211 {
1212 int charset = CHAR_CHARSET (c);
1213
1214 XSETFASTINT (val, CHARSET_WIDTH (charset));
1215 }
1216 return val;
1217 }
1218
1219 /* Return width of string STR of length LEN when displayed in the
1220 current buffer. The width is measured by how many columns it
1221 occupies on the screen. */
1222
1223 int
1224 strwidth (str, len)
1225 unsigned char *str;
1226 int len;
1227 {
1228 unsigned char *endp = str + len;
1229 int width = 0;
1230 struct Lisp_Char_Table *dp = buffer_display_table ();
1231
1232 while (str < endp)
1233 {
1234 Lisp_Object disp;
1235 int thislen;
1236 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1237
1238 /* Get the way the display table would display it. */
1239 if (dp)
1240 disp = DISP_CHAR_VECTOR (dp, c);
1241 else
1242 disp = Qnil;
1243
1244 if (VECTORP (disp))
1245 width += XVECTOR (disp)->size;
1246 else
1247 width += ONE_BYTE_CHAR_WIDTH (*str);
1248
1249 str += thislen;
1250 }
1251 return width;
1252 }
1253
1254 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1255 "Return width of STRING when displayed in the current buffer.\n\
1256 Width is measured by how many columns it occupies on the screen.\n\
1257 When calculating width of a multibyte character in STRING,\n\
1258 only the base leading-code is considered; the validity of\n\
1259 the following bytes is not checked.")
1260 (str)
1261 Lisp_Object str;
1262 {
1263 Lisp_Object val;
1264
1265 CHECK_STRING (str, 0);
1266 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1267 STRING_BYTES (XSTRING (str))));
1268 return val;
1269 }
1270
1271 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1272 "Return the direction of CHAR.\n\
1273 The returned value is 0 for left-to-right and 1 for right-to-left.")
1274 (ch)
1275 Lisp_Object ch;
1276 {
1277 int charset;
1278
1279 CHECK_NUMBER (ch, 0);
1280 charset = CHAR_CHARSET (XFASTINT (ch));
1281 if (!CHARSET_DEFINED_P (charset))
1282 invalid_character (XINT (ch));
1283 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1284 }
1285
1286 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1287 "Return number of characters between BEG and END.")
1288 (beg, end)
1289 Lisp_Object beg, end;
1290 {
1291 int from, to;
1292
1293 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1294 CHECK_NUMBER_COERCE_MARKER (end, 1);
1295
1296 from = min (XFASTINT (beg), XFASTINT (end));
1297 to = max (XFASTINT (beg), XFASTINT (end));
1298
1299 return make_number (to - from);
1300 }
1301
1302 /* Return the number of characters in the NBYTES bytes at PTR.
1303 This works by looking at the contents and checking for multibyte sequences.
1304 However, if the current buffer has enable-multibyte-characters = nil,
1305 we treat each byte as a character. */
1306
1307 int
1308 chars_in_text (ptr, nbytes)
1309 unsigned char *ptr;
1310 int nbytes;
1311 {
1312 /* current_buffer is null at early stages of Emacs initialization. */
1313 if (current_buffer == 0
1314 || NILP (current_buffer->enable_multibyte_characters))
1315 return nbytes;
1316
1317 return multibyte_chars_in_text (ptr, nbytes);
1318 }
1319
1320 /* Return the number of characters in the NBYTES bytes at PTR.
1321 This works by looking at the contents and checking for multibyte sequences.
1322 It ignores enable-multibyte-characters. */
1323
1324 int
1325 multibyte_chars_in_text (ptr, nbytes)
1326 unsigned char *ptr;
1327 int nbytes;
1328 {
1329 unsigned char *endp;
1330 int chars, bytes;
1331
1332 endp = ptr + nbytes;
1333 chars = 0;
1334
1335 while (ptr < endp)
1336 {
1337 if (BASE_LEADING_CODE_P (*ptr))
1338 {
1339 PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1340 ptr += bytes;
1341 nbytes -= bytes;
1342 }
1343 else
1344 {
1345 ptr++;
1346 nbytes--;
1347 }
1348 chars++;
1349 }
1350
1351 return chars;
1352 }
1353
1354 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1355 "Concatenate all the argument characters and make the result a string.")
1356 (n, args)
1357 int n;
1358 Lisp_Object *args;
1359 {
1360 int i;
1361 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1362 unsigned char *p = buf;
1363 Lisp_Object val;
1364 int c;
1365
1366 for (i = 0; i < n; i++)
1367 {
1368 if (!INTEGERP (args[i]))
1369 CHECK_NUMBER (args[i], 0);
1370 c = XINT (args[i]);
1371 p += CHAR_STRING (c, p);
1372 }
1373
1374 /* Here, we can't use make_string_from_bytes because of byte
1375 combining problem. */
1376 val = make_string (buf, p - buf);
1377 return val;
1378 }
1379
1380 #endif /* emacs */
1381 \f
1382 int
1383 charset_id_internal (charset_name)
1384 char *charset_name;
1385 {
1386 Lisp_Object val;
1387
1388 val= Fget (intern (charset_name), Qcharset);
1389 if (!VECTORP (val))
1390 error ("Charset %s is not defined", charset_name);
1391
1392 return (XINT (XVECTOR (val)->contents[0]));
1393 }
1394
1395 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1396 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1397 ()
1398 {
1399 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1400 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1401 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1402 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1403 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1404 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1405 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1406 return Qnil;
1407 }
1408
1409 void
1410 init_charset_once ()
1411 {
1412 int i, j, k;
1413
1414 staticpro (&Vcharset_table);
1415 staticpro (&Vcharset_symbol_table);
1416 staticpro (&Vgeneric_character_list);
1417
1418 /* This has to be done here, before we call Fmake_char_table. */
1419 Qcharset_table = intern ("charset-table");
1420 staticpro (&Qcharset_table);
1421
1422 /* Intern this now in case it isn't already done.
1423 Setting this variable twice is harmless.
1424 But don't staticpro it here--that is done in alloc.c. */
1425 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1426
1427 /* Now we are ready to set up this property, so we can
1428 create the charset table. */
1429 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1430 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1431
1432 Qunknown = intern ("unknown");
1433 staticpro (&Qunknown);
1434 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1435 Qunknown);
1436
1437 /* Setup tables. */
1438 for (i = 0; i < 2; i++)
1439 for (j = 0; j < 2; j++)
1440 for (k = 0; k < 128; k++)
1441 iso_charset_table [i][j][k] = -1;
1442
1443 for (i = 0; i < 256; i++)
1444 BYTES_BY_CHAR_HEAD (i) = 1;
1445 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
1446 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
1447 BYTES_BY_CHAR_HEAD (i) = 2;
1448 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
1449 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1450 BYTES_BY_CHAR_HEAD (i) = 3;
1451 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1452 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1453 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1454 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1455 /* The followings don't reflect the actual bytes, but just to tell
1456 that it is a start of a multibyte character. */
1457 BYTES_BY_CHAR_HEAD (0x80) = 2;
1458 BYTES_BY_CHAR_HEAD (0x9E) = 2;
1459 BYTES_BY_CHAR_HEAD (0x9F) = 2;
1460
1461 for (i = 0; i < 128; i++)
1462 WIDTH_BY_CHAR_HEAD (i) = 1;
1463 for (; i < 256; i++)
1464 WIDTH_BY_CHAR_HEAD (i) = 4;
1465 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1466 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1467 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1468 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1469
1470 {
1471 Lisp_Object val;
1472
1473 val = Qnil;
1474 for (i = 0x81; i < 0x90; i++)
1475 val = Fcons (make_number ((i - 0x70) << 7), val);
1476 for (; i < 0x9A; i++)
1477 val = Fcons (make_number ((i - 0x8F) << 14), val);
1478 for (i = 0xA0; i < 0xF0; i++)
1479 val = Fcons (make_number ((i - 0x70) << 7), val);
1480 for (; i < 0xFF; i++)
1481 val = Fcons (make_number ((i - 0xE0) << 14), val);
1482 Vgeneric_character_list = Fnreverse (val);
1483 }
1484
1485 nonascii_insert_offset = 0;
1486 Vnonascii_translation_table = Qnil;
1487 }
1488
1489 #ifdef emacs
1490
1491 void
1492 syms_of_charset ()
1493 {
1494 Qascii = intern ("ascii");
1495 staticpro (&Qascii);
1496
1497 Qcharset = intern ("charset");
1498 staticpro (&Qcharset);
1499
1500 /* Define ASCII charset now. */
1501 update_charset_table (make_number (CHARSET_ASCII),
1502 make_number (1), make_number (94),
1503 make_number (1),
1504 make_number (0),
1505 make_number ('B'),
1506 make_number (0),
1507 build_string ("ASCII"),
1508 build_string ("ASCII"),
1509 build_string ("ASCII (ISO646 IRV)"));
1510 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1511 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1512
1513 Qauto_fill_chars = intern ("auto-fill-chars");
1514 staticpro (&Qauto_fill_chars);
1515 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1516
1517 defsubr (&Sdefine_charset);
1518 defsubr (&Sgeneric_character_list);
1519 defsubr (&Sget_unused_iso_final_char);
1520 defsubr (&Sdeclare_equiv_charset);
1521 defsubr (&Sfind_charset_region);
1522 defsubr (&Sfind_charset_string);
1523 defsubr (&Smake_char_internal);
1524 defsubr (&Ssplit_char);
1525 defsubr (&Schar_charset);
1526 defsubr (&Scharset_after);
1527 defsubr (&Siso_charset);
1528 defsubr (&Schar_valid_p);
1529 defsubr (&Sunibyte_char_to_multibyte);
1530 defsubr (&Smultibyte_char_to_unibyte);
1531 defsubr (&Schar_bytes);
1532 defsubr (&Schar_width);
1533 defsubr (&Sstring_width);
1534 defsubr (&Schar_direction);
1535 defsubr (&Schars_in_region);
1536 defsubr (&Sstring);
1537 defsubr (&Ssetup_special_charsets);
1538
1539 DEFVAR_LISP ("charset-list", &Vcharset_list,
1540 "List of charsets ever defined.");
1541 Vcharset_list = Fcons (Qascii, Qnil);
1542
1543 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1544 "Vector of cons cell of a symbol and translation table ever defined.\n\
1545 An ID of a translation table is an index of this vector.");
1546 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1547
1548 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1549 "Leading-code of private TYPE9N charset of column-width 1.");
1550 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1551
1552 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1553 "Leading-code of private TYPE9N charset of column-width 2.");
1554 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1555
1556 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1557 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1558 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1559
1560 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1561 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1562 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1563
1564 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1565 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1566 This is used for converting unibyte text to multibyte,\n\
1567 and for inserting character codes specified by number.\n\n\
1568 This serves to convert a Latin-1 or similar 8-bit character code\n\
1569 to the corresponding Emacs multibyte character code.\n\
1570 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1571 for your choice of character set.\n\
1572 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1573 nonascii_insert_offset = 0;
1574
1575 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1576 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1577 This is used for converting unibyte text to multibyte,\n\
1578 and for inserting character codes specified by number.\n\n\
1579 Conversion is performed only when multibyte characters are enabled,\n\
1580 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1581 to the corresponding Emacs character code.\n\n\
1582 If this is nil, `nonascii-insert-offset' is used instead.\n\
1583 See also the docstring of `make-translation-table'.");
1584 Vnonascii_translation_table = Qnil;
1585
1586 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1587 "A char-table for characters which invoke auto-filling.\n\
1588 Such characters has value t in this table.");
1589 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1590 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1591 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1592 }
1593
1594 #endif /* emacs */