(syms_of_charset) [auto-fill-chars]: Doc fix.
[bpt/emacs.git] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #ifdef emacs
26 #include <config.h>
27 #endif
28
29 #include <stdio.h>
30
31 #ifdef emacs
32
33 #include <sys/types.h>
34 #include "lisp.h"
35 #include "buffer.h"
36 #include "charset.h"
37 #include "coding.h"
38 #include "disptab.h"
39
40 #else /* not emacs */
41
42 #include "mulelib.h"
43
44 #endif /* emacs */
45
46 Lisp_Object Qcharset, Qascii;
47 Lisp_Object Qunknown;
48
49 /* Declaration of special leading-codes. */
50 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
51 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
52 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
53 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
54
55 /* Declaration of special charsets. */
56 int charset_ascii; /* ASCII */
57 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
58 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
59 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
60 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
61 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
62 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
63 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
64
65 Lisp_Object Qcharset_table;
66
67 /* A char-table containing information of each character set. */
68 Lisp_Object Vcharset_table;
69
70 /* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72 Lisp_Object Vcharset_symbol_table;
73
74 /* A list of charset symbols ever defined. */
75 Lisp_Object Vcharset_list;
76
77 /* Vector of translation table ever defined.
78 ID of a translation table is used to index this vector. */
79 Lisp_Object Vtranslation_table_vector;
80
81 /* A char-table for characters which may invoke auto-filling. */
82 Lisp_Object Vauto_fill_chars;
83
84 Lisp_Object Qauto_fill_chars;
85
86 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
87 int bytes_by_char_head[256];
88 int width_by_char_head[256];
89
90 /* Mapping table from ISO2022's charset (specified by DIMENSION,
91 CHARS, and FINAL-CHAR) to Emacs' charset. */
92 int iso_charset_table[2][2][128];
93
94 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
95 unsigned char *_fetch_multibyte_char_p;
96 int _fetch_multibyte_char_len;
97
98 /* Offset to add to a non-ASCII value when inserting it. */
99 int nonascii_insert_offset;
100
101 /* Translation table for converting non-ASCII unibyte characters
102 to multibyte codes, or nil. */
103 Lisp_Object Vnonascii_translation_table;
104
105 /* List of all possible generic characters. */
106 Lisp_Object Vgeneric_character_list;
107
108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
110 \f
111 void
112 invalid_character (c)
113 int c;
114 {
115 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
116 }
117
118 /* Parse a multibyte character string STR of length LENGTH (>= 2) set
119 BYTES to the length of actual multibyte sequence, CHARSET, C1, and
120 C2 to such values that MAKE_CHAR can make the multibyte character
121 from them.
122
123 It is assumed that *STR is one of base leading codes and the
124 following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
125
126 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */
127
128 #define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \
129 do { \
130 (bytes) = 1; \
131 (charset) = (str)[0]; \
132 if ((charset) >= LEADING_CODE_PRIVATE_11 \
133 && (charset) <= LEADING_CODE_PRIVATE_22) \
134 (charset) = (str)[(bytes)++]; \
135 if ((bytes) < (length)) \
136 { \
137 (c1) = (str)[(bytes)++] & 0x7F; \
138 if ((bytes) < (length)) \
139 (c2) = (str)[(bytes)++] & 0x7F; \
140 else \
141 (c2) = -1; \
142 } \
143 else \
144 (c1) = (c2) = -1; \
145 } while (0)
146
147 /* Parse string STR of length LENGTH and check if a multibyte
148 characters is at STR. Set BYTES to the actual length, CHARSET, C1,
149 C2 to proper values for that character. */
150
151 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
152 do { \
153 int i; \
154 if (ASCII_BYTE_P ((str)[0])) \
155 i = 1; \
156 else \
157 for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \
158 if (i == 1) \
159 (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \
160 else \
161 { \
162 if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \
163 i = BYTES_BY_CHAR_HEAD ((str)[0]); \
164 SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \
165 } \
166 } while (0)
167
168 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
169 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
170 (charset == CHARSET_ASCII \
171 ? ((c1) >= 0 && (c1) <= 0x7F) \
172 : (CHARSET_DIMENSION (charset) == 1 \
173 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
174 : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F)))
175
176 /* Store multi-byte form of the character C in STR. The caller should
177 allocate at least 4-byte area at STR in advance. Returns the
178 length of the multi-byte form. If C is an invalid character code,
179 signal an error.
180
181 Use macro `CHAR_STRING (C, STR)' instead of calling this function
182 directly if C can be an ASCII character. */
183
184 int
185 char_to_string (c, str)
186 int c;
187 unsigned char *str;
188 {
189 unsigned char *p = str;
190
191 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
192 {
193 /* Multibyte character can't have a modifier bit. */
194 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
195 invalid_character (c);
196
197 /* For Meta, Shift, and Control modifiers, we need special care. */
198 if (c & CHAR_META)
199 {
200 /* Move the meta bit to the right place for a string. */
201 c = (c & ~CHAR_META) | 0x80;
202 }
203 if (c & CHAR_SHIFT)
204 {
205 /* Shift modifier is valid only with [A-Za-z]. */
206 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
207 c &= ~CHAR_SHIFT;
208 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
209 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
210 }
211 if (c & CHAR_CTL)
212 {
213 /* Simulate the code in lread.c. */
214 /* Allow `\C- ' and `\C-?'. */
215 if (c == (CHAR_CTL | ' '))
216 c = 0;
217 else if (c == (CHAR_CTL | '?'))
218 c = 127;
219 /* ASCII control chars are made from letters (both cases),
220 as well as the non-letters within 0100...0137. */
221 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
222 c &= (037 | (~0177 & ~CHAR_CTL));
223 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
224 c &= (037 | (~0177 & ~CHAR_CTL));
225 }
226
227 /* If C still has any modifier bits, it is an invalid character. */
228 if (c & CHAR_MODIFIER_MASK)
229 invalid_character (c);
230
231 *p++ = c;
232 }
233 else if (c < MAX_CHAR)
234 {
235 int charset, c1, c2;
236
237 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
238
239 if (charset >= LEADING_CODE_EXT_11)
240 *p++ = (charset < LEADING_CODE_EXT_12
241 ? LEADING_CODE_PRIVATE_11
242 : (charset < LEADING_CODE_EXT_21
243 ? LEADING_CODE_PRIVATE_12
244 : (charset < LEADING_CODE_EXT_22
245 ? LEADING_CODE_PRIVATE_21
246 : LEADING_CODE_PRIVATE_22)));
247 *p++ = charset;
248 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
249 invalid_character (c);
250 if (c1)
251 {
252 *p++ = c1 | 0x80;
253 if (c2 > 0)
254 *p++ = c2 | 0x80;
255 }
256 }
257
258 return (p -str);
259 }
260
261 /* Return the non-ASCII character corresponding to multi-byte form at
262 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
263 length of the multibyte form in *ACTUAL_LEN.
264
265 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
266 this function directly if you want ot handle ASCII characters as
267 well. */
268
269 int
270 string_to_char (str, len, actual_len)
271 const unsigned char *str;
272 int len, *actual_len;
273 {
274 int c, bytes, charset, c1, c2;
275
276 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
277 c = MAKE_CHAR (charset, c1, c2);
278 if (actual_len)
279 *actual_len = bytes;
280 return c;
281 }
282
283 /* Return the length of the multi-byte form at string STR of length LEN.
284 Use the macro MULTIBYTE_FORM_LENGTH instead. */
285 int
286 multibyte_form_length (str, len)
287 const unsigned char *str;
288 int len;
289 {
290 int bytes;
291
292 PARSE_MULTIBYTE_SEQ (str, len, bytes);
293 return bytes;
294 }
295
296 /* Check multibyte form at string STR of length LEN and set variables
297 pointed by CHARSET, C1, and C2 to charset and position codes of the
298 character at STR, and return 0. If there's no multibyte character,
299 return -1. This should be used only in the macro SPLIT_STRING
300 which checks range of STR in advance. */
301
302 int
303 split_string (str, len, charset, c1, c2)
304 const unsigned char *str;
305 unsigned char *c1, *c2;
306 int len, *charset;
307 {
308 register int bytes, cs, code1, code2 = -1;
309
310 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
311 if (cs == CHARSET_ASCII)
312 return -1;
313 *charset = cs;
314 *c1 = code1;
315 *c2 = code2;
316 return 0;
317 }
318
319 /* Return 1 iff character C has valid printable glyph.
320 Use the macro CHAR_PRINTABLE_P instead. */
321 int
322 char_printable_p (c)
323 int c;
324 {
325 int charset, c1, c2, chars;
326
327 if (SINGLE_BYTE_CHAR_P (c))
328 return 1;
329 if (c >= MAX_CHAR)
330 return 0;
331
332 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
333 if (! CHARSET_DEFINED_P (charset))
334 return 0;
335 if (CHARSET_CHARS (charset) == 94
336 ? c1 <= 32 || c1 >= 127
337 : c1 < 32)
338 return 0;
339 if (CHARSET_DIMENSION (charset) == 2
340 && (CHARSET_CHARS (charset) == 94
341 ? c2 <= 32 || c2 >= 127
342 : c2 < 32))
343 return 0;
344 return 1;
345 }
346
347 /* Translate character C by translation table TABLE. If C
348 is negative, translate a character specified by CHARSET, C1, and C2
349 (C1 and C2 are code points of the character). If no translation is
350 found in TABLE, return C. */
351 int
352 translate_char (table, c, charset, c1, c2)
353 Lisp_Object table;
354 int c, charset, c1, c2;
355 {
356 Lisp_Object ch;
357 int alt_charset, alt_c1, alt_c2, dimension;
358
359 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
360 if (!CHAR_TABLE_P (table)
361 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
362 return c;
363
364 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
365 dimension = CHARSET_DIMENSION (alt_charset);
366 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
367 /* CH is not a generic character, just return it. */
368 return XFASTINT (ch);
369
370 /* Since CH is a generic character, we must return a specific
371 charater which has the same position codes as C from CH. */
372 if (charset < 0)
373 SPLIT_CHAR (c, charset, c1, c2);
374 if (dimension != CHARSET_DIMENSION (charset))
375 /* We can't make such a character because of dimension mismatch. */
376 return c;
377 return MAKE_CHAR (alt_charset, c1, c2);
378 }
379
380 /* Convert the unibyte character C to multibyte based on
381 Vnonascii_translation_table or nonascii_insert_offset. If they can't
382 convert C to a valid multibyte character, convert it based on
383 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
384
385 int
386 unibyte_char_to_multibyte (c)
387 int c;
388 {
389 if (c < 0400 && c >= 0200)
390 {
391 int c_save = c;
392
393 if (! NILP (Vnonascii_translation_table))
394 {
395 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
396 if (c >= 0400 && ! char_valid_p (c, 0))
397 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
398 }
399 else if (c >= 0240 && nonascii_insert_offset > 0)
400 {
401 c += nonascii_insert_offset;
402 if (c < 0400 || ! char_valid_p (c, 0))
403 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
404 }
405 else if (c >= 0240)
406 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
407 }
408 return c;
409 }
410
411
412 /* Convert the multibyte character C to unibyte 8-bit character based
413 on Vnonascii_translation_table or nonascii_insert_offset. If
414 REV_TBL is non-nil, it should be a reverse table of
415 Vnonascii_translation_table, i.e. what given by:
416 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
417
418 int
419 multibyte_char_to_unibyte (c, rev_tbl)
420 int c;
421 Lisp_Object rev_tbl;
422 {
423 if (!SINGLE_BYTE_CHAR_P (c))
424 {
425 int c_save = c;
426
427 if (! CHAR_TABLE_P (rev_tbl)
428 && CHAR_TABLE_P (Vnonascii_translation_table))
429 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
430 make_number (0));
431 if (CHAR_TABLE_P (rev_tbl))
432 {
433 Lisp_Object temp;
434 temp = Faref (rev_tbl, make_number (c));
435 if (INTEGERP (temp))
436 c = XINT (temp);
437 if (c >= 256)
438 c = (c_save & 0177) + 0200;
439 }
440 else
441 {
442 if (nonascii_insert_offset > 0)
443 c -= nonascii_insert_offset;
444 if (c < 128 || c >= 256)
445 c = (c_save & 0177) + 0200;
446 }
447 }
448
449 return c;
450 }
451
452 \f
453 /* Update the table Vcharset_table with the given arguments (see the
454 document of `define-charset' for the meaning of each argument).
455 Several other table contents are also updated. The caller should
456 check the validity of CHARSET-ID and the remaining arguments in
457 advance. */
458
459 void
460 update_charset_table (charset_id, dimension, chars, width, direction,
461 iso_final_char, iso_graphic_plane,
462 short_name, long_name, description)
463 Lisp_Object charset_id, dimension, chars, width, direction;
464 Lisp_Object iso_final_char, iso_graphic_plane;
465 Lisp_Object short_name, long_name, description;
466 {
467 int charset = XINT (charset_id);
468 int bytes;
469 unsigned char leading_code_base, leading_code_ext;
470
471 if (NILP (CHARSET_TABLE_ENTRY (charset)))
472 CHARSET_TABLE_ENTRY (charset)
473 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
474
475 /* Get byte length of multibyte form, base leading-code, and
476 extended leading-code of the charset. See the comment under the
477 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
478 bytes = XINT (dimension);
479 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
480 {
481 /* Official charset, it doesn't have an extended leading-code. */
482 if (charset != CHARSET_ASCII)
483 bytes += 1; /* For a base leading-code. */
484 leading_code_base = charset;
485 leading_code_ext = 0;
486 }
487 else
488 {
489 /* Private charset. */
490 bytes += 2; /* For base and extended leading-codes. */
491 leading_code_base
492 = (charset < LEADING_CODE_EXT_12
493 ? LEADING_CODE_PRIVATE_11
494 : (charset < LEADING_CODE_EXT_21
495 ? LEADING_CODE_PRIVATE_12
496 : (charset < LEADING_CODE_EXT_22
497 ? LEADING_CODE_PRIVATE_21
498 : LEADING_CODE_PRIVATE_22)));
499 leading_code_ext = charset;
500 }
501
502 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
503 error ("Invalid dimension for the charset-ID %d", charset);
504
505 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
506 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
507 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
508 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
509 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
510 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
511 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
512 = make_number (leading_code_base);
513 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
514 = make_number (leading_code_ext);
515 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
516 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
517 = iso_graphic_plane;
518 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
519 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
520 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
521 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
522
523 {
524 /* If we have already defined a charset which has the same
525 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
526 DIRECTION, we must update the entry REVERSE-CHARSET of both
527 charsets. If there's no such charset, the value of the entry
528 is set to nil. */
529 int i;
530
531 for (i = 0; i <= MAX_CHARSET; i++)
532 if (!NILP (CHARSET_TABLE_ENTRY (i)))
533 {
534 if (CHARSET_DIMENSION (i) == XINT (dimension)
535 && CHARSET_CHARS (i) == XINT (chars)
536 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
537 && CHARSET_DIRECTION (i) != XINT (direction))
538 {
539 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
540 = make_number (i);
541 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
542 break;
543 }
544 }
545 if (i > MAX_CHARSET)
546 /* No such a charset. */
547 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
548 = make_number (-1);
549 }
550
551 if (charset != CHARSET_ASCII
552 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
553 {
554 width_by_char_head[leading_code_base] = XINT (width);
555
556 /* Update table emacs_code_class. */
557 emacs_code_class[charset] = (bytes == 2
558 ? EMACS_leading_code_2
559 : (bytes == 3
560 ? EMACS_leading_code_3
561 : EMACS_leading_code_4));
562 }
563
564 /* Update table iso_charset_table. */
565 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
566 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
567 }
568
569 #ifdef emacs
570
571 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
572 is invalid. */
573 int
574 get_charset_id (charset_symbol)
575 Lisp_Object charset_symbol;
576 {
577 Lisp_Object val;
578 int charset;
579
580 return ((SYMBOLP (charset_symbol)
581 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
582 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
583 CHARSET_VALID_P (charset)))
584 ? charset : -1);
585 }
586
587 /* Return an identification number for a new private charset of
588 DIMENSION and WIDTH. If there's no more room for the new charset,
589 return 0. */
590 Lisp_Object
591 get_new_private_charset_id (dimension, width)
592 int dimension, width;
593 {
594 int charset, from, to;
595
596 if (dimension == 1)
597 {
598 if (width == 1)
599 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
600 else
601 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
602 }
603 else
604 {
605 if (width == 1)
606 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
607 else
608 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
609 }
610
611 for (charset = from; charset < to; charset++)
612 if (!CHARSET_DEFINED_P (charset)) break;
613
614 return make_number (charset < to ? charset : 0);
615 }
616
617 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
618 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
619 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
620 treated as a private charset.\n\
621 INFO-VECTOR is a vector of the format:\n\
622 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
623 SHORT-NAME LONG-NAME DESCRIPTION]\n\
624 The meanings of each elements is as follows:\n\
625 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
626 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
627 WIDTH (integer) is the number of columns a character in the charset\n\
628 occupies on the screen: one of 0, 1, and 2.\n\
629 \n\
630 DIRECTION (integer) is the rendering direction of characters in the\n\
631 charset when rendering. If 0, render from left to right, else\n\
632 render from right to left.\n\
633 \n\
634 ISO-FINAL-CHAR (character) is the final character of the\n\
635 corresponding ISO 2022 charset.\n\
636 \n\
637 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
638 while encoding to variants of ISO 2022 coding system, one of the\n\
639 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
640 \n\
641 SHORT-NAME (string) is the short name to refer to the charset.\n\
642 \n\
643 LONG-NAME (string) is the long name to refer to the charset.\n\
644 \n\
645 DESCRIPTION (string) is the description string of the charset.")
646 (charset_id, charset_symbol, info_vector)
647 Lisp_Object charset_id, charset_symbol, info_vector;
648 {
649 Lisp_Object *vec;
650
651 if (!NILP (charset_id))
652 CHECK_NUMBER (charset_id, 0);
653 CHECK_SYMBOL (charset_symbol, 1);
654 CHECK_VECTOR (info_vector, 2);
655
656 if (! NILP (charset_id))
657 {
658 if (! CHARSET_VALID_P (XINT (charset_id)))
659 error ("Invalid CHARSET: %d", XINT (charset_id));
660 else if (CHARSET_DEFINED_P (XINT (charset_id)))
661 error ("Already defined charset: %d", XINT (charset_id));
662 }
663
664 vec = XVECTOR (info_vector)->contents;
665 if (XVECTOR (info_vector)->size != 9
666 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
667 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
668 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
669 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
670 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
671 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
672 || !STRINGP (vec[6])
673 || !STRINGP (vec[7])
674 || !STRINGP (vec[8]))
675 error ("Invalid info-vector argument for defining charset %s",
676 XSYMBOL (charset_symbol)->name->data);
677
678 if (NILP (charset_id))
679 {
680 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
681 if (XINT (charset_id) == 0)
682 error ("There's no room for a new private charset %s",
683 XSYMBOL (charset_symbol)->name->data);
684 }
685
686 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
687 vec[4], vec[5], vec[6], vec[7], vec[8]);
688 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
689 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
690 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
691 return Qnil;
692 }
693
694 DEFUN ("generic-character-list", Fgeneric_character_list,
695 Sgeneric_character_list, 0, 0, 0,
696 "Return a list of all possible generic characters.\n\
697 It includes a generic character for a charset not yet defined.")
698 ()
699 {
700 return Vgeneric_character_list;
701 }
702
703 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
704 Sget_unused_iso_final_char, 2, 2, 0,
705 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
706 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
707 CHARS is the number of characters in a dimension: 94 or 96.\n\
708 \n\
709 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
710 If there's no unused final char for the specified kind of charset,\n\
711 return nil.")
712 (dimension, chars)
713 Lisp_Object dimension, chars;
714 {
715 int final_char;
716
717 CHECK_NUMBER (dimension, 0);
718 CHECK_NUMBER (chars, 1);
719 if (XINT (dimension) != 1 && XINT (dimension) != 2)
720 error ("Invalid charset dimension %d, it should be 1 or 2",
721 XINT (dimension));
722 if (XINT (chars) != 94 && XINT (chars) != 96)
723 error ("Invalid charset chars %d, it should be 94 or 96",
724 XINT (chars));
725 for (final_char = '0'; final_char <= '?'; final_char++)
726 {
727 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
728 break;
729 }
730 return (final_char <= '?' ? make_number (final_char) : Qnil);
731 }
732
733 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
734 4, 4, 0,
735 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
736 CHARSET should be defined by `defined-charset' in advance.")
737 (dimension, chars, final_char, charset_symbol)
738 Lisp_Object dimension, chars, final_char, charset_symbol;
739 {
740 int charset;
741
742 CHECK_NUMBER (dimension, 0);
743 CHECK_NUMBER (chars, 1);
744 CHECK_NUMBER (final_char, 2);
745 CHECK_SYMBOL (charset_symbol, 3);
746
747 if (XINT (dimension) != 1 && XINT (dimension) != 2)
748 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
749 if (XINT (chars) != 94 && XINT (chars) != 96)
750 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
751 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
752 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
753 if ((charset = get_charset_id (charset_symbol)) < 0)
754 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
755
756 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
757 return Qnil;
758 }
759
760 /* Return number of different charsets in STR of length LEN. In
761 addition, for each found charset N, CHARSETS[N] is set 1. The
762 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
763 It may lookup a translation table TABLE if supplied.
764
765 If MULTIBYTE is zero, do not check multibyte characters, i.e. if
766 any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
767 8-bit codes are found CHARSET[1] is set to 1. */
768
769 int
770 find_charset_in_str (str, len, charsets, table, multibyte)
771 unsigned char *str;
772 int len, *charsets;
773 Lisp_Object table;
774 int multibyte;
775 {
776 register int num = 0;
777
778 if (! multibyte)
779 {
780 unsigned char *endp = str + len;
781 int maskbits = 0;
782
783 while (str < endp && maskbits != 3)
784 maskbits |= (*str++ < 0x80 ? 1 : 2);
785 if (maskbits & 1)
786 {
787 charsets[0] = 1;
788 num++;
789 }
790 if (maskbits & 2)
791 {
792 charsets[1] = 1;
793 num++;
794 }
795 return num;
796 }
797
798 if (! CHAR_TABLE_P (table))
799 table = Qnil;
800
801 while (len > 0)
802 {
803 int bytes, charset, c1, c2;
804
805 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
806
807 if (! NILP (table))
808 {
809 int c1 = translate_char (table, -1, charset, c1, c2);
810 if (c1 >= 0)
811 charset = CHAR_CHARSET (c1);
812 }
813
814 if (!charsets[charset])
815 {
816 charsets[charset] = 1;
817 num += 1;
818 }
819 str += bytes;
820 len -= bytes;
821 }
822 return num;
823 }
824
825 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
826 2, 3, 0,
827 "Return a list of charsets in the region between BEG and END.\n\
828 BEG and END are buffer positions.\n\
829 Optional arg TABLE if non-nil is a translation table to look up.\n\
830 \n\
831 If the region contains invalid multiybte characters,\n\
832 `unknown' is included in the returned list.\n\
833 \n\
834 If the current buffer is unibyte, the returned list contains\n\
835 `ascii' if any 7-bit characters are found,\n\
836 and `unknown' if any 8-bit characters are found.")
837 (beg, end, table)
838 Lisp_Object beg, end, table;
839 {
840 int charsets[MAX_CHARSET + 1];
841 int from, from_byte, to, stop, stop_byte, i;
842 Lisp_Object val;
843 int undefined;
844 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
845
846 validate_region (&beg, &end);
847 from = XFASTINT (beg);
848 stop = to = XFASTINT (end);
849
850 if (from < GPT && GPT < to)
851 {
852 stop = GPT;
853 stop_byte = GPT_BYTE;
854 }
855 else
856 stop_byte = CHAR_TO_BYTE (stop);
857
858 from_byte = CHAR_TO_BYTE (from);
859
860 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
861 while (1)
862 {
863 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
864 charsets, table, multibyte);
865 if (stop < to)
866 {
867 from = stop, from_byte = stop_byte;
868 stop = to, stop_byte = CHAR_TO_BYTE (stop);
869 }
870 else
871 break;
872 }
873
874 val = Qnil;
875 undefined = 0;
876 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
877 if (charsets[i])
878 {
879 if (CHARSET_DEFINED_P (i))
880 val = Fcons (CHARSET_SYMBOL (i), val);
881 else
882 undefined = 1;
883 }
884 if (undefined)
885 val = Fcons (Qunknown, val);
886 return val;
887 }
888
889 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
890 1, 2, 0,
891 "Return a list of charsets in STR.\n\
892 Optional arg TABLE if non-nil is a translation table to look up.\n\
893 \n\
894 If the region contains invalid multiybte characters,\n\
895 `unknown' is included in the returned list.\n\
896 \n\
897 If STR is unibyte, the returned list contains\n\
898 `ascii' if any 7-bit characters are found,\n\
899 and `unknown' if any 8-bit characters are found.")
900 (str, table)
901 Lisp_Object str, table;
902 {
903 int charsets[MAX_CHARSET + 1];
904 int i;
905 Lisp_Object val;
906 int undefined;
907 int multibyte;
908
909 CHECK_STRING (str, 0);
910 multibyte = STRING_MULTIBYTE (str);
911
912 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
913 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
914 charsets, table, multibyte);
915 val = Qnil;
916 undefined = 0;
917 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
918 if (charsets[i])
919 {
920 if (CHARSET_DEFINED_P (i))
921 val = Fcons (CHARSET_SYMBOL (i), val);
922 else
923 undefined = 1;
924 }
925 if (undefined)
926 val = Fcons (Qunknown, val);
927 return val;
928 }
929 \f
930 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
931 "")
932 (charset, code1, code2)
933 Lisp_Object charset, code1, code2;
934 {
935 int charset_id, c1, c2;
936
937 CHECK_NUMBER (charset, 0);
938 charset_id = XINT (charset);
939 if (!CHARSET_DEFINED_P (charset_id))
940 error ("Invalid charset ID: %d", XINT (charset));
941
942 if (NILP (code1))
943 c1 = 0;
944 else
945 {
946 CHECK_NUMBER (code1, 1);
947 c1 = XINT (code1);
948 }
949 if (NILP (code2))
950 c2 = 0;
951 else
952 {
953 CHECK_NUMBER (code2, 2);
954 c2 = XINT (code2);
955 }
956
957 if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
958 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
959 c1 &= 0x7F;
960 c2 &= 0x7F;
961 if (c1 == 0
962 ? c2 != 0
963 : (c2 == 0
964 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
965 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
966 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
967
968 return make_number (MAKE_CHAR (charset_id, c1, c2));
969 }
970
971 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
972 "Return list of charset and one or two position-codes of CHAR.\n\
973 If CHAR is invalid as a character code,\n\
974 return a list of symbol `unknown' and CHAR.")
975 (ch)
976 Lisp_Object ch;
977 {
978 Lisp_Object val;
979 int c, charset, c1, c2;
980
981 CHECK_NUMBER (ch, 0);
982 c = XFASTINT (ch);
983 if (!CHAR_VALID_P (c, 1))
984 return Fcons (Qunknown, Fcons (ch, Qnil));
985 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
986 return (c2 >= 0
987 ? Fcons (CHARSET_SYMBOL (charset),
988 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
989 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
990 }
991
992 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
993 "Return charset of CHAR.")
994 (ch)
995 Lisp_Object ch;
996 {
997 CHECK_NUMBER (ch, 0);
998
999 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1000 }
1001
1002 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1003 "Return charset of a character in the current buffer at position POS.\n\
1004 If POS is nil, it defauls to the current point.\n\
1005 If POS is out of range, the value is nil.")
1006 (pos)
1007 Lisp_Object pos;
1008 {
1009 register int pos_byte, bytes, charset, c1, c2;
1010 register unsigned char *p;
1011
1012 if (NILP (pos))
1013 pos_byte = PT_BYTE;
1014 else if (MARKERP (pos))
1015 {
1016 pos_byte = marker_byte_position (pos);
1017 if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1018 return Qnil;
1019 }
1020 else
1021 {
1022 CHECK_NUMBER (pos, 0);
1023 if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1024 return Qnil;
1025 pos_byte = CHAR_TO_BYTE (XINT (pos));
1026 }
1027 p = BYTE_POS_ADDR (pos_byte);
1028 if (BASE_LEADING_CODE_P (*p))
1029 {
1030 SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1031 if (charset < 0)
1032 charset = 1;
1033 }
1034 else
1035 charset = CHARSET_ASCII;
1036
1037 return CHARSET_SYMBOL (charset);
1038 }
1039
1040 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1041 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1042 \n\
1043 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1044 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1045 where as Emacs distinguishes them by charset symbol.\n\
1046 See the documentation of the function `charset-info' for the meanings of\n\
1047 DIMENSION, CHARS, and FINAL-CHAR.")
1048 (dimension, chars, final_char)
1049 Lisp_Object dimension, chars, final_char;
1050 {
1051 int charset;
1052
1053 CHECK_NUMBER (dimension, 0);
1054 CHECK_NUMBER (chars, 1);
1055 CHECK_NUMBER (final_char, 2);
1056
1057 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1058 return Qnil;
1059 return CHARSET_SYMBOL (charset);
1060 }
1061
1062 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1063 generic character. If GENERICP is zero, return nonzero iff C is a
1064 valid normal character. Do not call this function directly,
1065 instead use macro CHAR_VALID_P. */
1066 int
1067 char_valid_p (c, genericp)
1068 int c, genericp;
1069 {
1070 int charset, c1, c2;
1071
1072 if (c < 0)
1073 return 0;
1074 if (SINGLE_BYTE_CHAR_P (c))
1075 return 1;
1076 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
1077 if (genericp)
1078 {
1079 if (c1)
1080 {
1081 if (c2 <= 0) c2 = 0x20;
1082 }
1083 else
1084 {
1085 if (c2 <= 0) c1 = c2 = 0x20;
1086 }
1087 }
1088 return (CHARSET_DEFINED_P (charset)
1089 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1090 }
1091
1092 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1093 "Return t if OBJECT is a valid normal character.\n\
1094 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1095 a valid generic character.")
1096 (object, genericp)
1097 Lisp_Object object, genericp;
1098 {
1099 if (! NATNUMP (object))
1100 return Qnil;
1101 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1102 }
1103
1104 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1105 Sunibyte_char_to_multibyte, 1, 1, 0,
1106 "Convert the unibyte character CH to multibyte character.\n\
1107 The conversion is done based on `nonascii-translation-table' (which see)\n\
1108 or `nonascii-insert-offset' (which see).")
1109 (ch)
1110 Lisp_Object ch;
1111 {
1112 int c;
1113
1114 CHECK_NUMBER (ch, 0);
1115 c = XINT (ch);
1116 if (c < 0 || c >= 0400)
1117 error ("Invalid unibyte character: %d", c);
1118 c = unibyte_char_to_multibyte (c);
1119 if (c < 0)
1120 error ("Can't convert to multibyte character: %d", XINT (ch));
1121 return make_number (c);
1122 }
1123
1124 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1125 Smultibyte_char_to_unibyte, 1, 1, 0,
1126 "Convert the multibyte character CH to unibyte character.\n\
1127 The conversion is done based on `nonascii-translation-table' (which see)\n\
1128 or `nonascii-insert-offset' (which see).")
1129 (ch)
1130 Lisp_Object ch;
1131 {
1132 int c;
1133
1134 CHECK_NUMBER (ch, 0);
1135 c = XINT (ch);
1136 if (! CHAR_VALID_P (c, 0))
1137 error ("Invalid multibyte character: %d", c);
1138 c = multibyte_char_to_unibyte (c, Qnil);
1139 if (c < 0)
1140 error ("Can't convert to unibyte character: %d", XINT (ch));
1141 return make_number (c);
1142 }
1143
1144 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1145 "Return 1 regardless of the argument CHAR.\n\
1146 This is now an obsolete function. We keep it just for backward compatibility.")
1147 (ch)
1148 Lisp_Object ch;
1149 {
1150 Lisp_Object val;
1151
1152 CHECK_NUMBER (ch, 0);
1153 return make_number (1);
1154 }
1155
1156 /* Return how many bytes C will occupy in a multibyte buffer.
1157 Don't call this function directly, instead use macro CHAR_BYTES. */
1158 int
1159 char_bytes (c)
1160 int c;
1161 {
1162 int charset;
1163
1164 if (SINGLE_BYTE_CHAR_P (c) || (c & ~((1 << CHARACTERBITS) - 1)))
1165 return 1;
1166
1167 charset = CHAR_CHARSET (c);
1168 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1169 }
1170
1171 /* Return the width of character of which multi-byte form starts with
1172 C. The width is measured by how many columns occupied on the
1173 screen when displayed in the current buffer. */
1174
1175 #define ONE_BYTE_CHAR_WIDTH(c) \
1176 (c < 0x20 \
1177 ? (c == '\t' \
1178 ? XFASTINT (current_buffer->tab_width) \
1179 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1180 : (c < 0x7f \
1181 ? 1 \
1182 : (c == 0x7F \
1183 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1184 : ((! NILP (current_buffer->enable_multibyte_characters) \
1185 && BASE_LEADING_CODE_P (c)) \
1186 ? WIDTH_BY_CHAR_HEAD (c) \
1187 : 4))))
1188
1189 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1190 "Return width of CHAR when displayed in the current buffer.\n\
1191 The width is measured by how many columns it occupies on the screen.")
1192 (ch)
1193 Lisp_Object ch;
1194 {
1195 Lisp_Object val, disp;
1196 int c;
1197 struct Lisp_Char_Table *dp = buffer_display_table ();
1198
1199 CHECK_NUMBER (ch, 0);
1200
1201 c = XINT (ch);
1202
1203 /* Get the way the display table would display it. */
1204 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1205
1206 if (VECTORP (disp))
1207 XSETINT (val, XVECTOR (disp)->size);
1208 else if (SINGLE_BYTE_CHAR_P (c))
1209 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1210 else
1211 {
1212 int charset = CHAR_CHARSET (c);
1213
1214 XSETFASTINT (val, CHARSET_WIDTH (charset));
1215 }
1216 return val;
1217 }
1218
1219 /* Return width of string STR of length LEN when displayed in the
1220 current buffer. The width is measured by how many columns it
1221 occupies on the screen. */
1222
1223 int
1224 strwidth (str, len)
1225 unsigned char *str;
1226 int len;
1227 {
1228 unsigned char *endp = str + len;
1229 int width = 0;
1230 struct Lisp_Char_Table *dp = buffer_display_table ();
1231
1232 while (str < endp)
1233 {
1234 Lisp_Object disp;
1235 int thislen;
1236 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1237
1238 /* Get the way the display table would display it. */
1239 if (dp)
1240 disp = DISP_CHAR_VECTOR (dp, c);
1241 else
1242 disp = Qnil;
1243
1244 if (VECTORP (disp))
1245 width += XVECTOR (disp)->size;
1246 else
1247 width += ONE_BYTE_CHAR_WIDTH (*str);
1248
1249 str += thislen;
1250 }
1251 return width;
1252 }
1253
1254 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1255 "Return width of STRING when displayed in the current buffer.\n\
1256 Width is measured by how many columns it occupies on the screen.\n\
1257 When calculating width of a multibyte character in STRING,\n\
1258 only the base leading-code is considered; the validity of\n\
1259 the following bytes is not checked.")
1260 (str)
1261 Lisp_Object str;
1262 {
1263 Lisp_Object val;
1264
1265 CHECK_STRING (str, 0);
1266 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1267 STRING_BYTES (XSTRING (str))));
1268 return val;
1269 }
1270
1271 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1272 "Return the direction of CHAR.\n\
1273 The returned value is 0 for left-to-right and 1 for right-to-left.")
1274 (ch)
1275 Lisp_Object ch;
1276 {
1277 int charset;
1278
1279 CHECK_NUMBER (ch, 0);
1280 charset = CHAR_CHARSET (XFASTINT (ch));
1281 if (!CHARSET_DEFINED_P (charset))
1282 invalid_character (XINT (ch));
1283 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1284 }
1285
1286 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1287 "Return number of characters between BEG and END.")
1288 (beg, end)
1289 Lisp_Object beg, end;
1290 {
1291 int from, to;
1292
1293 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1294 CHECK_NUMBER_COERCE_MARKER (end, 1);
1295
1296 from = min (XFASTINT (beg), XFASTINT (end));
1297 to = max (XFASTINT (beg), XFASTINT (end));
1298
1299 return make_number (to - from);
1300 }
1301
1302 /* Return the number of characters in the NBYTES bytes at PTR.
1303 This works by looking at the contents and checking for multibyte sequences.
1304 However, if the current buffer has enable-multibyte-characters = nil,
1305 we treat each byte as a character. */
1306
1307 int
1308 chars_in_text (ptr, nbytes)
1309 unsigned char *ptr;
1310 int nbytes;
1311 {
1312 /* current_buffer is null at early stages of Emacs initialization. */
1313 if (current_buffer == 0
1314 || NILP (current_buffer->enable_multibyte_characters))
1315 return nbytes;
1316
1317 return multibyte_chars_in_text (ptr, nbytes);
1318 }
1319
1320 /* Return the number of characters in the NBYTES bytes at PTR.
1321 This works by looking at the contents and checking for multibyte sequences.
1322 It ignores enable-multibyte-characters. */
1323
1324 int
1325 multibyte_chars_in_text (ptr, nbytes)
1326 unsigned char *ptr;
1327 int nbytes;
1328 {
1329 unsigned char *endp;
1330 int chars, bytes;
1331
1332 endp = ptr + nbytes;
1333 chars = 0;
1334
1335 while (ptr < endp)
1336 {
1337 if (BASE_LEADING_CODE_P (*ptr))
1338 {
1339 PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1340 ptr += bytes;
1341 nbytes -= bytes;
1342 }
1343 else
1344 {
1345 ptr++;
1346 nbytes--;
1347 }
1348 chars++;
1349 }
1350
1351 return chars;
1352 }
1353
1354 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1355 "Concatenate all the argument characters and make the result a string.")
1356 (n, args)
1357 int n;
1358 Lisp_Object *args;
1359 {
1360 int i;
1361 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1362 unsigned char *p = buf;
1363 Lisp_Object val;
1364 int c, multibyte_p = 0;
1365
1366 for (i = 0; i < n; i++)
1367 {
1368 CHECK_NUMBER (args[i], 0);
1369 c = XINT (args[i]);
1370 p += CHAR_STRING (c, p);
1371
1372 if (!SINGLE_BYTE_CHAR_P (c))
1373 multibyte_p = 1;
1374 }
1375
1376 /* Here, we can't use make_string_from_bytes because of the byte
1377 combining problem. Make a multibyte string if there is any
1378 multibyte character in ARGS to make sure that `(string 2276)'
1379 returns a multibyte string if running --unibyte. */
1380 if (multibyte_p)
1381 val = make_multibyte_string (buf, n, p - buf);
1382 else
1383 val = make_unibyte_string (buf, p - buf);
1384 return val;
1385 }
1386
1387 #endif /* emacs */
1388 \f
1389 int
1390 charset_id_internal (charset_name)
1391 char *charset_name;
1392 {
1393 Lisp_Object val;
1394
1395 val= Fget (intern (charset_name), Qcharset);
1396 if (!VECTORP (val))
1397 error ("Charset %s is not defined", charset_name);
1398
1399 return (XINT (XVECTOR (val)->contents[0]));
1400 }
1401
1402 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1403 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1404 ()
1405 {
1406 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1407 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1408 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1409 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1410 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1411 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1412 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1413 return Qnil;
1414 }
1415
1416 void
1417 init_charset_once ()
1418 {
1419 int i, j, k;
1420
1421 staticpro (&Vcharset_table);
1422 staticpro (&Vcharset_symbol_table);
1423 staticpro (&Vgeneric_character_list);
1424
1425 /* This has to be done here, before we call Fmake_char_table. */
1426 Qcharset_table = intern ("charset-table");
1427 staticpro (&Qcharset_table);
1428
1429 /* Intern this now in case it isn't already done.
1430 Setting this variable twice is harmless.
1431 But don't staticpro it here--that is done in alloc.c. */
1432 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1433
1434 /* Now we are ready to set up this property, so we can
1435 create the charset table. */
1436 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1437 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1438
1439 Qunknown = intern ("unknown");
1440 staticpro (&Qunknown);
1441 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1442 Qunknown);
1443
1444 /* Setup tables. */
1445 for (i = 0; i < 2; i++)
1446 for (j = 0; j < 2; j++)
1447 for (k = 0; k < 128; k++)
1448 iso_charset_table [i][j][k] = -1;
1449
1450 for (i = 0; i < 256; i++)
1451 BYTES_BY_CHAR_HEAD (i) = 1;
1452 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
1453 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
1454 BYTES_BY_CHAR_HEAD (i) = 2;
1455 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
1456 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1457 BYTES_BY_CHAR_HEAD (i) = 3;
1458 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1459 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1460 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1461 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1462 /* The followings don't reflect the actual bytes, but just to tell
1463 that it is a start of a multibyte character. */
1464 BYTES_BY_CHAR_HEAD (0x80) = 2;
1465 BYTES_BY_CHAR_HEAD (0x9E) = 2;
1466 BYTES_BY_CHAR_HEAD (0x9F) = 2;
1467
1468 for (i = 0; i < 128; i++)
1469 WIDTH_BY_CHAR_HEAD (i) = 1;
1470 for (; i < 256; i++)
1471 WIDTH_BY_CHAR_HEAD (i) = 4;
1472 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1473 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1474 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1475 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1476
1477 {
1478 Lisp_Object val;
1479
1480 val = Qnil;
1481 for (i = 0x81; i < 0x90; i++)
1482 val = Fcons (make_number ((i - 0x70) << 7), val);
1483 for (; i < 0x9A; i++)
1484 val = Fcons (make_number ((i - 0x8F) << 14), val);
1485 for (i = 0xA0; i < 0xF0; i++)
1486 val = Fcons (make_number ((i - 0x70) << 7), val);
1487 for (; i < 0xFF; i++)
1488 val = Fcons (make_number ((i - 0xE0) << 14), val);
1489 Vgeneric_character_list = Fnreverse (val);
1490 }
1491
1492 nonascii_insert_offset = 0;
1493 Vnonascii_translation_table = Qnil;
1494 }
1495
1496 #ifdef emacs
1497
1498 void
1499 syms_of_charset ()
1500 {
1501 Qascii = intern ("ascii");
1502 staticpro (&Qascii);
1503
1504 Qcharset = intern ("charset");
1505 staticpro (&Qcharset);
1506
1507 /* Define ASCII charset now. */
1508 update_charset_table (make_number (CHARSET_ASCII),
1509 make_number (1), make_number (94),
1510 make_number (1),
1511 make_number (0),
1512 make_number ('B'),
1513 make_number (0),
1514 build_string ("ASCII"),
1515 build_string ("ASCII"),
1516 build_string ("ASCII (ISO646 IRV)"));
1517 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1518 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1519
1520 Qauto_fill_chars = intern ("auto-fill-chars");
1521 staticpro (&Qauto_fill_chars);
1522 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1523
1524 defsubr (&Sdefine_charset);
1525 defsubr (&Sgeneric_character_list);
1526 defsubr (&Sget_unused_iso_final_char);
1527 defsubr (&Sdeclare_equiv_charset);
1528 defsubr (&Sfind_charset_region);
1529 defsubr (&Sfind_charset_string);
1530 defsubr (&Smake_char_internal);
1531 defsubr (&Ssplit_char);
1532 defsubr (&Schar_charset);
1533 defsubr (&Scharset_after);
1534 defsubr (&Siso_charset);
1535 defsubr (&Schar_valid_p);
1536 defsubr (&Sunibyte_char_to_multibyte);
1537 defsubr (&Smultibyte_char_to_unibyte);
1538 defsubr (&Schar_bytes);
1539 defsubr (&Schar_width);
1540 defsubr (&Sstring_width);
1541 defsubr (&Schar_direction);
1542 defsubr (&Schars_in_region);
1543 defsubr (&Sstring);
1544 defsubr (&Ssetup_special_charsets);
1545
1546 DEFVAR_LISP ("charset-list", &Vcharset_list,
1547 "List of charsets ever defined.");
1548 Vcharset_list = Fcons (Qascii, Qnil);
1549
1550 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1551 "Vector of cons cell of a symbol and translation table ever defined.\n\
1552 An ID of a translation table is an index of this vector.");
1553 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1554
1555 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1556 "Leading-code of private TYPE9N charset of column-width 1.");
1557 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1558
1559 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1560 "Leading-code of private TYPE9N charset of column-width 2.");
1561 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1562
1563 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1564 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1565 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1566
1567 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1568 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1569 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1570
1571 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1572 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1573 This is used for converting unibyte text to multibyte,\n\
1574 and for inserting character codes specified by number.\n\n\
1575 This serves to convert a Latin-1 or similar 8-bit character code\n\
1576 to the corresponding Emacs multibyte character code.\n\
1577 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1578 for your choice of character set.\n\
1579 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1580 nonascii_insert_offset = 0;
1581
1582 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1583 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1584 This is used for converting unibyte text to multibyte,\n\
1585 and for inserting character codes specified by number.\n\n\
1586 Conversion is performed only when multibyte characters are enabled,\n\
1587 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1588 to the corresponding Emacs character code.\n\n\
1589 If this is nil, `nonascii-insert-offset' is used instead.\n\
1590 See also the docstring of `make-translation-table'.");
1591 Vnonascii_translation_table = Qnil;
1592
1593 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1594 "A char-table for characters which invoke auto-filling.\n\
1595 Such characters have value t in this table.");
1596 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1597 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1598 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1599 }
1600
1601 #endif /* emacs */