(char_to_string): Check the character validity.
[bpt/emacs.git] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #ifdef emacs
26 #include <config.h>
27 #endif
28
29 #include <stdio.h>
30
31 #ifdef emacs
32
33 #include <sys/types.h>
34 #include "lisp.h"
35 #include "buffer.h"
36 #include "charset.h"
37 #include "coding.h"
38 #include "disptab.h"
39
40 #else /* not emacs */
41
42 #include "mulelib.h"
43
44 #endif /* emacs */
45
46 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
47 Lisp_Object Qunknown;
48
49 /* Declaration of special leading-codes. */
50 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
51 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
52 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
53 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
54
55 /* Declaration of special charsets. The values are set by
56 Fsetup_special_charsets. */
57 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
58 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
59 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
60 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
61 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
62 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
63 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
64
65 Lisp_Object Qcharset_table;
66
67 /* A char-table containing information of each character set. */
68 Lisp_Object Vcharset_table;
69
70 /* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72 Lisp_Object Vcharset_symbol_table;
73
74 /* A list of charset symbols ever defined. */
75 Lisp_Object Vcharset_list;
76
77 /* Vector of translation table ever defined.
78 ID of a translation table is used to index this vector. */
79 Lisp_Object Vtranslation_table_vector;
80
81 /* A char-table for characters which may invoke auto-filling. */
82 Lisp_Object Vauto_fill_chars;
83
84 Lisp_Object Qauto_fill_chars;
85
86 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
87 int bytes_by_char_head[256];
88 int width_by_char_head[256];
89
90 /* Mapping table from ISO2022's charset (specified by DIMENSION,
91 CHARS, and FINAL-CHAR) to Emacs' charset. */
92 int iso_charset_table[2][2][128];
93
94 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
95 unsigned char *_fetch_multibyte_char_p;
96 int _fetch_multibyte_char_len;
97
98 /* Offset to add to a non-ASCII value when inserting it. */
99 int nonascii_insert_offset;
100
101 /* Translation table for converting non-ASCII unibyte characters
102 to multibyte codes, or nil. */
103 Lisp_Object Vnonascii_translation_table;
104
105 /* List of all possible generic characters. */
106 Lisp_Object Vgeneric_character_list;
107
108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
110 \f
111 void
112 invalid_character (c)
113 int c;
114 {
115 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
116 }
117
118 /* Parse string STR of length LENGTH and fetch information of a
119 character at STR. Set BYTES to the byte length the character
120 occupies, CHARSET, C1, C2 to proper values of the character. */
121
122 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
123 do { \
124 (c1) = *(str); \
125 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
126 if ((bytes) == 1) \
127 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
128 else if ((bytes) == 2) \
129 { \
130 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
131 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
132 else \
133 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
134 } \
135 else if ((bytes) == 3) \
136 { \
137 if ((c1) < LEADING_CODE_PRIVATE_11) \
138 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
139 else \
140 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
141 } \
142 else \
143 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
144 } while (0)
145
146 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
147 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
148 ((charset) == CHARSET_ASCII \
149 ? ((c1) >= 0 && (c1) <= 0x7F) \
150 : ((charset) == CHARSET_8_BIT_CONTROL \
151 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
152 : ((charset) == CHARSET_8_BIT_GRAPHIC \
153 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
154 : (CHARSET_DIMENSION (charset) == 1 \
155 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
156 : ((c1) >= 0x20 && (c1) <= 0x7F \
157 && (c2) >= 0x20 && (c2) <= 0x7F)))))
158
159 /* Store multi-byte form of the character C in STR. The caller should
160 allocate at least 4-byte area at STR in advance. Returns the
161 length of the multi-byte form. If C is an invalid character code,
162 signal an error.
163
164 Use macro `CHAR_STRING (C, STR)' instead of calling this function
165 directly if C can be an ASCII character. */
166
167 int
168 char_to_string (c, str)
169 int c;
170 unsigned char *str;
171 {
172 unsigned char *p = str;
173
174 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
175 {
176 /* Multibyte character can't have a modifier bit. */
177 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
178 invalid_character (c);
179
180 /* For Meta, Shift, and Control modifiers, we need special care. */
181 if (c & CHAR_META)
182 {
183 /* Move the meta bit to the right place for a string. */
184 c = (c & ~CHAR_META) | 0x80;
185 }
186 if (c & CHAR_SHIFT)
187 {
188 /* Shift modifier is valid only with [A-Za-z]. */
189 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
190 c &= ~CHAR_SHIFT;
191 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
192 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
193 }
194 if (c & CHAR_CTL)
195 {
196 /* Simulate the code in lread.c. */
197 /* Allow `\C- ' and `\C-?'. */
198 if (c == (CHAR_CTL | ' '))
199 c = 0;
200 else if (c == (CHAR_CTL | '?'))
201 c = 127;
202 /* ASCII control chars are made from letters (both cases),
203 as well as the non-letters within 0100...0137. */
204 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
205 c &= (037 | (~0177 & ~CHAR_CTL));
206 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
207 c &= (037 | (~0177 & ~CHAR_CTL));
208 }
209
210 /* If C still has any modifier bits, it is an invalid character. */
211 if (c & CHAR_MODIFIER_MASK)
212 invalid_character (c);
213 }
214 if (SINGLE_BYTE_CHAR_P (c))
215 {
216 if (ASCII_BYTE_P (c) || c >= 0xA0)
217 *p++ = c;
218 else
219 {
220 *p++ = LEADING_CODE_8_BIT_CONTROL;
221 *p++ = c + 0x20;
222 }
223 }
224 else if (CHAR_VALID_P (c))
225 {
226 int charset, c1, c2;
227
228 SPLIT_CHAR (c, charset, c1, c2);
229
230 if (charset >= LEADING_CODE_EXT_11)
231 *p++ = (charset < LEADING_CODE_EXT_12
232 ? LEADING_CODE_PRIVATE_11
233 : (charset < LEADING_CODE_EXT_21
234 ? LEADING_CODE_PRIVATE_12
235 : (charset < LEADING_CODE_EXT_22
236 ? LEADING_CODE_PRIVATE_21
237 : LEADING_CODE_PRIVATE_22)));
238 *p++ = charset;
239 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
240 invalid_character (c);
241 if (c1)
242 {
243 *p++ = c1 | 0x80;
244 if (c2 > 0)
245 *p++ = c2 | 0x80;
246 }
247 }
248 else
249 invalid_character (c);
250
251 return (p - str);
252 }
253
254 /* Return the non-ASCII character corresponding to multi-byte form at
255 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
256 length of the multibyte form in *ACTUAL_LEN.
257
258 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
259 this function directly if you want ot handle ASCII characters as
260 well. */
261
262 int
263 string_to_char (str, len, actual_len)
264 const unsigned char *str;
265 int len, *actual_len;
266 {
267 int c, bytes, charset, c1, c2;
268
269 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
270 c = MAKE_CHAR (charset, c1, c2);
271 if (actual_len)
272 *actual_len = bytes;
273 return c;
274 }
275
276 /* Return the length of the multi-byte form at string STR of length LEN.
277 Use the macro MULTIBYTE_FORM_LENGTH instead. */
278 int
279 multibyte_form_length (str, len)
280 const unsigned char *str;
281 int len;
282 {
283 int bytes;
284
285 PARSE_MULTIBYTE_SEQ (str, len, bytes);
286 return bytes;
287 }
288
289 /* Check multibyte form at string STR of length LEN and set variables
290 pointed by CHARSET, C1, and C2 to charset and position codes of the
291 character at STR, and return 0. If there's no multibyte character,
292 return -1. This should be used only in the macro SPLIT_STRING
293 which checks range of STR in advance. */
294
295 int
296 split_string (str, len, charset, c1, c2)
297 const unsigned char *str;
298 unsigned char *c1, *c2;
299 int len, *charset;
300 {
301 register int bytes, cs, code1, code2 = -1;
302
303 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
304 if (cs == CHARSET_ASCII)
305 return -1;
306 *charset = cs;
307 *c1 = code1;
308 *c2 = code2;
309 return 0;
310 }
311
312 /* Return 1 iff character C has valid printable glyph.
313 Use the macro CHAR_PRINTABLE_P instead. */
314 int
315 char_printable_p (c)
316 int c;
317 {
318 int charset, c1, c2, chars;
319
320 if (ASCII_BYTE_P (c))
321 return 1;
322 else if (SINGLE_BYTE_CHAR_P (c))
323 return 0;
324 else if (c >= MAX_CHAR)
325 return 0;
326
327 SPLIT_CHAR (c, charset, c1, c2);
328 if (! CHARSET_DEFINED_P (charset))
329 return 0;
330 if (CHARSET_CHARS (charset) == 94
331 ? c1 <= 32 || c1 >= 127
332 : c1 < 32)
333 return 0;
334 if (CHARSET_DIMENSION (charset) == 2
335 && (CHARSET_CHARS (charset) == 94
336 ? c2 <= 32 || c2 >= 127
337 : c2 < 32))
338 return 0;
339 return 1;
340 }
341
342 /* Translate character C by translation table TABLE. If C
343 is negative, translate a character specified by CHARSET, C1, and C2
344 (C1 and C2 are code points of the character). If no translation is
345 found in TABLE, return C. */
346 int
347 translate_char (table, c, charset, c1, c2)
348 Lisp_Object table;
349 int c, charset, c1, c2;
350 {
351 Lisp_Object ch;
352 int alt_charset, alt_c1, alt_c2, dimension;
353
354 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
355 if (!CHAR_TABLE_P (table)
356 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
357 return c;
358
359 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
360 dimension = CHARSET_DIMENSION (alt_charset);
361 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
362 /* CH is not a generic character, just return it. */
363 return XFASTINT (ch);
364
365 /* Since CH is a generic character, we must return a specific
366 charater which has the same position codes as C from CH. */
367 if (charset < 0)
368 SPLIT_CHAR (c, charset, c1, c2);
369 if (dimension != CHARSET_DIMENSION (charset))
370 /* We can't make such a character because of dimension mismatch. */
371 return c;
372 return MAKE_CHAR (alt_charset, c1, c2);
373 }
374
375 /* Convert the unibyte character C to multibyte based on
376 Vnonascii_translation_table or nonascii_insert_offset. If they can't
377 convert C to a valid multibyte character, convert it based on
378 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
379
380 int
381 unibyte_char_to_multibyte (c)
382 int c;
383 {
384 if (c < 0400 && c >= 0200)
385 {
386 int c_save = c;
387
388 if (! NILP (Vnonascii_translation_table))
389 {
390 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
391 if (c >= 0400 && ! char_valid_p (c, 0))
392 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
393 }
394 else if (c >= 0240 && nonascii_insert_offset > 0)
395 {
396 c += nonascii_insert_offset;
397 if (c < 0400 || ! char_valid_p (c, 0))
398 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
399 }
400 else if (c >= 0240)
401 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
402 }
403 return c;
404 }
405
406
407 /* Convert the multibyte character C to unibyte 8-bit character based
408 on Vnonascii_translation_table or nonascii_insert_offset. If
409 REV_TBL is non-nil, it should be a reverse table of
410 Vnonascii_translation_table, i.e. what given by:
411 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
412
413 int
414 multibyte_char_to_unibyte (c, rev_tbl)
415 int c;
416 Lisp_Object rev_tbl;
417 {
418 if (!SINGLE_BYTE_CHAR_P (c))
419 {
420 int c_save = c;
421
422 if (! CHAR_TABLE_P (rev_tbl)
423 && CHAR_TABLE_P (Vnonascii_translation_table))
424 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
425 make_number (0));
426 if (CHAR_TABLE_P (rev_tbl))
427 {
428 Lisp_Object temp;
429 temp = Faref (rev_tbl, make_number (c));
430 if (INTEGERP (temp))
431 c = XINT (temp);
432 if (c >= 256)
433 c = (c_save & 0177) + 0200;
434 }
435 else
436 {
437 if (nonascii_insert_offset > 0)
438 c -= nonascii_insert_offset;
439 if (c < 128 || c >= 256)
440 c = (c_save & 0177) + 0200;
441 }
442 }
443
444 return c;
445 }
446
447 \f
448 /* Update the table Vcharset_table with the given arguments (see the
449 document of `define-charset' for the meaning of each argument).
450 Several other table contents are also updated. The caller should
451 check the validity of CHARSET-ID and the remaining arguments in
452 advance. */
453
454 void
455 update_charset_table (charset_id, dimension, chars, width, direction,
456 iso_final_char, iso_graphic_plane,
457 short_name, long_name, description)
458 Lisp_Object charset_id, dimension, chars, width, direction;
459 Lisp_Object iso_final_char, iso_graphic_plane;
460 Lisp_Object short_name, long_name, description;
461 {
462 int charset = XINT (charset_id);
463 int bytes;
464 unsigned char leading_code_base, leading_code_ext;
465
466 if (NILP (CHARSET_TABLE_ENTRY (charset)))
467 CHARSET_TABLE_ENTRY (charset)
468 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
469
470 if (NILP (long_name))
471 long_name = short_name;
472 if (NILP (description))
473 description = long_name;
474
475 /* Get byte length of multibyte form, base leading-code, and
476 extended leading-code of the charset. See the comment under the
477 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
478 bytes = XINT (dimension);
479 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
480 {
481 /* Official charset, it doesn't have an extended leading-code. */
482 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
483 bytes += 1; /* For a base leading-code. */
484 leading_code_base = charset;
485 leading_code_ext = 0;
486 }
487 else
488 {
489 /* Private charset. */
490 bytes += 2; /* For base and extended leading-codes. */
491 leading_code_base
492 = (charset < LEADING_CODE_EXT_12
493 ? LEADING_CODE_PRIVATE_11
494 : (charset < LEADING_CODE_EXT_21
495 ? LEADING_CODE_PRIVATE_12
496 : (charset < LEADING_CODE_EXT_22
497 ? LEADING_CODE_PRIVATE_21
498 : LEADING_CODE_PRIVATE_22)));
499 leading_code_ext = charset;
500 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
501 error ("Invalid dimension for the charset-ID %d", charset);
502 }
503
504 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
505 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
506 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
507 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
508 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
509 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
510 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
511 = make_number (leading_code_base);
512 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
513 = make_number (leading_code_ext);
514 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
515 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
516 = iso_graphic_plane;
517 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
518 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
519 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
520 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
521
522 {
523 /* If we have already defined a charset which has the same
524 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
525 DIRECTION, we must update the entry REVERSE-CHARSET of both
526 charsets. If there's no such charset, the value of the entry
527 is set to nil. */
528 int i;
529
530 for (i = 0; i <= MAX_CHARSET; i++)
531 if (!NILP (CHARSET_TABLE_ENTRY (i)))
532 {
533 if (CHARSET_DIMENSION (i) == XINT (dimension)
534 && CHARSET_CHARS (i) == XINT (chars)
535 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
536 && CHARSET_DIRECTION (i) != XINT (direction))
537 {
538 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
539 = make_number (i);
540 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
541 break;
542 }
543 }
544 if (i > MAX_CHARSET)
545 /* No such a charset. */
546 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
547 = make_number (-1);
548 }
549
550 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
551 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
552 {
553 bytes_by_char_head[leading_code_base] = bytes;
554 width_by_char_head[leading_code_base] = XINT (width);
555
556 /* Update table emacs_code_class. */
557 emacs_code_class[charset] = (bytes == 2
558 ? EMACS_leading_code_2
559 : (bytes == 3
560 ? EMACS_leading_code_3
561 : EMACS_leading_code_4));
562 }
563
564 /* Update table iso_charset_table. */
565 if (XINT (iso_final_char) >= 0
566 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
567 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
568 }
569
570 #ifdef emacs
571
572 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
573 is invalid. */
574 int
575 get_charset_id (charset_symbol)
576 Lisp_Object charset_symbol;
577 {
578 Lisp_Object val;
579 int charset;
580
581 return ((SYMBOLP (charset_symbol)
582 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
583 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
584 CHARSET_VALID_P (charset)))
585 ? charset : -1);
586 }
587
588 /* Return an identification number for a new private charset of
589 DIMENSION and WIDTH. If there's no more room for the new charset,
590 return 0. */
591 Lisp_Object
592 get_new_private_charset_id (dimension, width)
593 int dimension, width;
594 {
595 int charset, from, to;
596
597 if (dimension == 1)
598 {
599 if (width == 1)
600 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
601 else
602 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
603 }
604 else
605 {
606 if (width == 1)
607 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
608 else
609 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
610 }
611
612 for (charset = from; charset < to; charset++)
613 if (!CHARSET_DEFINED_P (charset)) break;
614
615 return make_number (charset < to ? charset : 0);
616 }
617
618 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
619 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
620 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
621 treated as a private charset.\n\
622 INFO-VECTOR is a vector of the format:\n\
623 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
624 SHORT-NAME LONG-NAME DESCRIPTION]\n\
625 The meanings of each elements is as follows:\n\
626 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
627 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
628 WIDTH (integer) is the number of columns a character in the charset\n\
629 occupies on the screen: one of 0, 1, and 2.\n\
630 \n\
631 DIRECTION (integer) is the rendering direction of characters in the\n\
632 charset when rendering. If 0, render from left to right, else\n\
633 render from right to left.\n\
634 \n\
635 ISO-FINAL-CHAR (character) is the final character of the\n\
636 corresponding ISO 2022 charset.\n\
637 It may be -1 if the charset is internal use only.\n\
638 \n\
639 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
640 while encoding to variants of ISO 2022 coding system, one of the\n\
641 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
642 It may be -1 if the charset is internal use only.\n\
643 \n\
644 SHORT-NAME (string) is the short name to refer to the charset.\n\
645 \n\
646 LONG-NAME (string) is the long name to refer to the charset.\n\
647 \n\
648 DESCRIPTION (string) is the description string of the charset.")
649 (charset_id, charset_symbol, info_vector)
650 Lisp_Object charset_id, charset_symbol, info_vector;
651 {
652 Lisp_Object *vec;
653
654 if (!NILP (charset_id))
655 CHECK_NUMBER (charset_id, 0);
656 CHECK_SYMBOL (charset_symbol, 1);
657 CHECK_VECTOR (info_vector, 2);
658
659 if (! NILP (charset_id))
660 {
661 if (! CHARSET_VALID_P (XINT (charset_id)))
662 error ("Invalid CHARSET: %d", XINT (charset_id));
663 else if (CHARSET_DEFINED_P (XINT (charset_id)))
664 error ("Already defined charset: %d", XINT (charset_id));
665 }
666
667 vec = XVECTOR (info_vector)->contents;
668 if (XVECTOR (info_vector)->size != 9
669 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
670 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
671 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
672 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
673 || !INTEGERP (vec[4])
674 || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
675 || !INTEGERP (vec[5])
676 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
677 || !STRINGP (vec[6])
678 || !STRINGP (vec[7])
679 || !STRINGP (vec[8]))
680 error ("Invalid info-vector argument for defining charset %s",
681 XSYMBOL (charset_symbol)->name->data);
682
683 if (NILP (charset_id))
684 {
685 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
686 if (XINT (charset_id) == 0)
687 error ("There's no room for a new private charset %s",
688 XSYMBOL (charset_symbol)->name->data);
689 }
690
691 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
692 vec[4], vec[5], vec[6], vec[7], vec[8]);
693 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
694 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
695 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
696 return Qnil;
697 }
698
699 DEFUN ("generic-character-list", Fgeneric_character_list,
700 Sgeneric_character_list, 0, 0, 0,
701 "Return a list of all possible generic characters.\n\
702 It includes a generic character for a charset not yet defined.")
703 ()
704 {
705 return Vgeneric_character_list;
706 }
707
708 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
709 Sget_unused_iso_final_char, 2, 2, 0,
710 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
711 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
712 CHARS is the number of characters in a dimension: 94 or 96.\n\
713 \n\
714 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
715 If there's no unused final char for the specified kind of charset,\n\
716 return nil.")
717 (dimension, chars)
718 Lisp_Object dimension, chars;
719 {
720 int final_char;
721
722 CHECK_NUMBER (dimension, 0);
723 CHECK_NUMBER (chars, 1);
724 if (XINT (dimension) != 1 && XINT (dimension) != 2)
725 error ("Invalid charset dimension %d, it should be 1 or 2",
726 XINT (dimension));
727 if (XINT (chars) != 94 && XINT (chars) != 96)
728 error ("Invalid charset chars %d, it should be 94 or 96",
729 XINT (chars));
730 for (final_char = '0'; final_char <= '?'; final_char++)
731 {
732 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
733 break;
734 }
735 return (final_char <= '?' ? make_number (final_char) : Qnil);
736 }
737
738 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
739 4, 4, 0,
740 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
741 CHARSET should be defined by `defined-charset' in advance.")
742 (dimension, chars, final_char, charset_symbol)
743 Lisp_Object dimension, chars, final_char, charset_symbol;
744 {
745 int charset;
746
747 CHECK_NUMBER (dimension, 0);
748 CHECK_NUMBER (chars, 1);
749 CHECK_NUMBER (final_char, 2);
750 CHECK_SYMBOL (charset_symbol, 3);
751
752 if (XINT (dimension) != 1 && XINT (dimension) != 2)
753 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
754 if (XINT (chars) != 94 && XINT (chars) != 96)
755 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
756 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
757 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
758 if ((charset = get_charset_id (charset_symbol)) < 0)
759 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
760
761 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
762 return Qnil;
763 }
764
765 /* Return information about charsets in the text at PTR of NBYTES
766 bytes, which are NCHARS characters. The value is:
767
768 0: Each character is represented by one byte. This is always
769 true for unibyte text.
770 1: No charsets other than ascii eight-bit-control,
771 eight-bit-graphic, and latin-1 are found.
772 2: Otherwise.
773
774 In addition, if CHARSETS is nonzero, for each found charset N, set
775 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
776 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
777 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
778 1 (note that there's no charset whose ID is 1). */
779
780 int
781 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
782 unsigned char *ptr;
783 int nchars, nbytes, *charsets;
784 Lisp_Object table;
785 {
786 if (nchars == nbytes)
787 {
788 if (charsets && nbytes > 0)
789 {
790 unsigned char *endp = ptr + nbytes;
791 int maskbits = 0;
792
793 while (ptr < endp && maskbits != 7)
794 {
795 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
796 ptr++;
797 }
798
799 if (maskbits & 1)
800 charsets[CHARSET_ASCII] = 1;
801 if (maskbits & 2)
802 charsets[CHARSET_8_BIT_CONTROL] = 1;
803 if (maskbits & 4)
804 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
805 }
806 return 0;
807 }
808 else
809 {
810 int return_val = 1;
811 int bytes, charset, c1, c2;
812
813 if (! CHAR_TABLE_P (table))
814 table = Qnil;
815
816 while (nchars-- > 0)
817 {
818 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
819 ptr += bytes;
820
821 if (!CHARSET_DEFINED_P (charset))
822 charset = 1;
823 else if (! NILP (table))
824 {
825 int c = translate_char (table, -1, charset, c1, c2);
826 if (c >= 0)
827 charset = CHAR_CHARSET (c);
828 }
829
830 if (return_val == 1
831 && charset != CHARSET_ASCII
832 && charset != CHARSET_8_BIT_CONTROL
833 && charset != CHARSET_8_BIT_GRAPHIC
834 && charset != charset_latin_iso8859_1)
835 return_val = 2;
836
837 if (charsets)
838 charsets[charset] = 1;
839 else if (return_val == 2)
840 break;
841 }
842 return return_val;
843 }
844 }
845
846 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
847 2, 3, 0,
848 "Return a list of charsets in the region between BEG and END.\n\
849 BEG and END are buffer positions.\n\
850 Optional arg TABLE if non-nil is a translation table to look up.\n\
851 \n\
852 If the region contains invalid multiybte characters,\n\
853 `unknown' is included in the returned list.\n\
854 \n\
855 If the current buffer is unibyte, the returned list may contain\n\
856 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
857 (beg, end, table)
858 Lisp_Object beg, end, table;
859 {
860 int charsets[MAX_CHARSET + 1];
861 int from, from_byte, to, stop, stop_byte, i;
862 Lisp_Object val;
863
864 validate_region (&beg, &end);
865 from = XFASTINT (beg);
866 stop = to = XFASTINT (end);
867
868 if (from < GPT && GPT < to)
869 {
870 stop = GPT;
871 stop_byte = GPT_BYTE;
872 }
873 else
874 stop_byte = CHAR_TO_BYTE (stop);
875
876 from_byte = CHAR_TO_BYTE (from);
877
878 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
879 while (1)
880 {
881 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
882 stop_byte - from_byte, charsets, table);
883 if (stop < to)
884 {
885 from = stop, from_byte = stop_byte;
886 stop = to, stop_byte = CHAR_TO_BYTE (stop);
887 }
888 else
889 break;
890 }
891
892 val = Qnil;
893 if (charsets[1])
894 val = Fcons (Qunknown, val);
895 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
896 if (charsets[i])
897 val = Fcons (CHARSET_SYMBOL (i), val);
898 if (charsets[0])
899 val = Fcons (Qascii, val);
900 return val;
901 }
902
903 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
904 1, 2, 0,
905 "Return a list of charsets in STR.\n\
906 Optional arg TABLE if non-nil is a translation table to look up.\n\
907 \n\
908 If the region contains invalid multiybte characters,\n\
909 `unknown' is included in the returned list.\n\
910 \n\
911 If STR is unibyte, the returned list may contain\n\
912 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
913 (str, table)
914 Lisp_Object str, table;
915 {
916 int charsets[MAX_CHARSET + 1];
917 int i;
918 Lisp_Object val;
919
920 CHECK_STRING (str, 0);
921
922 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
923 find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
924 STRING_BYTES (XSTRING (str)), charsets, table);
925
926 val = Qnil;
927 if (charsets[1])
928 val = Fcons (Qunknown, val);
929 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
930 if (charsets[i])
931 val = Fcons (CHARSET_SYMBOL (i), val);
932 if (charsets[0])
933 val = Fcons (Qascii, val);
934 return val;
935 }
936
937 \f
938 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
939 "")
940 (charset, code1, code2)
941 Lisp_Object charset, code1, code2;
942 {
943 int charset_id, c1, c2;
944
945 CHECK_NUMBER (charset, 0);
946 charset_id = XINT (charset);
947 if (!CHARSET_DEFINED_P (charset_id))
948 error ("Invalid charset ID: %d", XINT (charset));
949
950 if (NILP (code1))
951 c1 = 0;
952 else
953 {
954 CHECK_NUMBER (code1, 1);
955 c1 = XINT (code1);
956 }
957 if (NILP (code2))
958 c2 = 0;
959 else
960 {
961 CHECK_NUMBER (code2, 2);
962 c2 = XINT (code2);
963 }
964
965 if (charset_id == CHARSET_ASCII)
966 {
967 if (c1 < 0 || c1 > 0x7F)
968 goto invalid_code_posints;
969 return make_number (c1);
970 }
971 else if (charset_id == CHARSET_8_BIT_CONTROL)
972 {
973 if (c1 < 0x80 || c1 > 0x9F)
974 goto invalid_code_posints;
975 return make_number (c1);
976 }
977 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
978 {
979 if (c1 < 0xA0 || c1 > 0xFF)
980 goto invalid_code_posints;
981 return make_number (c1);
982 }
983 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
984 goto invalid_code_posints;
985 c1 &= 0x7F;
986 c2 &= 0x7F;
987 if (c1 == 0
988 ? c2 != 0
989 : (c2 == 0
990 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
991 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
992 goto invalid_code_posints;
993 return make_number (MAKE_CHAR (charset_id, c1, c2));
994
995 invalid_code_posints:
996 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
997 }
998
999 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1000 "Return list of charset and one or two position-codes of CHAR.\n\
1001 If CHAR is invalid as a character code,\n\
1002 return a list of symbol `unknown' and CHAR.")
1003 (ch)
1004 Lisp_Object ch;
1005 {
1006 Lisp_Object val;
1007 int c, charset, c1, c2;
1008
1009 CHECK_NUMBER (ch, 0);
1010 c = XFASTINT (ch);
1011 if (!CHAR_VALID_P (c, 1))
1012 return Fcons (Qunknown, Fcons (ch, Qnil));
1013 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1014 return (c2 >= 0
1015 ? Fcons (CHARSET_SYMBOL (charset),
1016 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1017 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1018 }
1019
1020 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1021 "Return charset of CHAR.")
1022 (ch)
1023 Lisp_Object ch;
1024 {
1025 CHECK_NUMBER (ch, 0);
1026
1027 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1028 }
1029
1030 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1031 "Return charset of a character in the current buffer at position POS.\n\
1032 If POS is nil, it defauls to the current point.\n\
1033 If POS is out of range, the value is nil.")
1034 (pos)
1035 Lisp_Object pos;
1036 {
1037 Lisp_Object ch;
1038 int charset;
1039
1040 ch = Fchar_after (pos);
1041 if (! INTEGERP (ch))
1042 return ch;
1043 charset = CHAR_CHARSET (XINT (ch));
1044 return CHARSET_SYMBOL (charset);
1045 }
1046
1047 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1048 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1049 \n\
1050 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1051 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1052 where as Emacs distinguishes them by charset symbol.\n\
1053 See the documentation of the function `charset-info' for the meanings of\n\
1054 DIMENSION, CHARS, and FINAL-CHAR.")
1055 (dimension, chars, final_char)
1056 Lisp_Object dimension, chars, final_char;
1057 {
1058 int charset;
1059
1060 CHECK_NUMBER (dimension, 0);
1061 CHECK_NUMBER (chars, 1);
1062 CHECK_NUMBER (final_char, 2);
1063
1064 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1065 return Qnil;
1066 return CHARSET_SYMBOL (charset);
1067 }
1068
1069 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1070 generic character. If GENERICP is zero, return nonzero iff C is a
1071 valid normal character. Do not call this function directly,
1072 instead use macro CHAR_VALID_P. */
1073 int
1074 char_valid_p (c, genericp)
1075 int c, genericp;
1076 {
1077 int charset, c1, c2;
1078
1079 if (c < 0 || c >= MAX_CHAR)
1080 return 0;
1081 if (SINGLE_BYTE_CHAR_P (c))
1082 return 1;
1083 SPLIT_CHAR (c, charset, c1, c2);
1084 if (genericp)
1085 {
1086 if (c1)
1087 {
1088 if (c2 <= 0) c2 = 0x20;
1089 }
1090 else
1091 {
1092 if (c2 <= 0) c1 = c2 = 0x20;
1093 }
1094 }
1095 return (CHARSET_DEFINED_P (charset)
1096 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1097 }
1098
1099 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1100 "Return t if OBJECT is a valid normal character.\n\
1101 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1102 a valid generic character.")
1103 (object, genericp)
1104 Lisp_Object object, genericp;
1105 {
1106 if (! NATNUMP (object))
1107 return Qnil;
1108 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1109 }
1110
1111 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1112 Sunibyte_char_to_multibyte, 1, 1, 0,
1113 "Convert the unibyte character CH to multibyte character.\n\
1114 The conversion is done based on `nonascii-translation-table' (which see)\n\
1115 or `nonascii-insert-offset' (which see).")
1116 (ch)
1117 Lisp_Object ch;
1118 {
1119 int c;
1120
1121 CHECK_NUMBER (ch, 0);
1122 c = XINT (ch);
1123 if (c < 0 || c >= 0400)
1124 error ("Invalid unibyte character: %d", c);
1125 c = unibyte_char_to_multibyte (c);
1126 if (c < 0)
1127 error ("Can't convert to multibyte character: %d", XINT (ch));
1128 return make_number (c);
1129 }
1130
1131 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1132 Smultibyte_char_to_unibyte, 1, 1, 0,
1133 "Convert the multibyte character CH to unibyte character.\n\
1134 The conversion is done based on `nonascii-translation-table' (which see)\n\
1135 or `nonascii-insert-offset' (which see).")
1136 (ch)
1137 Lisp_Object ch;
1138 {
1139 int c;
1140
1141 CHECK_NUMBER (ch, 0);
1142 c = XINT (ch);
1143 if (! CHAR_VALID_P (c, 0))
1144 error ("Invalid multibyte character: %d", c);
1145 c = multibyte_char_to_unibyte (c, Qnil);
1146 if (c < 0)
1147 error ("Can't convert to unibyte character: %d", XINT (ch));
1148 return make_number (c);
1149 }
1150
1151 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1152 "Return 1 regardless of the argument CHAR.\n\
1153 This is now an obsolete function. We keep it just for backward compatibility.")
1154 (ch)
1155 Lisp_Object ch;
1156 {
1157 Lisp_Object val;
1158
1159 CHECK_NUMBER (ch, 0);
1160 return make_number (1);
1161 }
1162
1163 /* Return how many bytes C will occupy in a multibyte buffer.
1164 Don't call this function directly, instead use macro CHAR_BYTES. */
1165 int
1166 char_bytes (c)
1167 int c;
1168 {
1169 int charset;
1170
1171 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1172 return 1;
1173 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1174 return 1;
1175
1176 charset = CHAR_CHARSET (c);
1177 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1178 }
1179
1180 /* Return the width of character of which multi-byte form starts with
1181 C. The width is measured by how many columns occupied on the
1182 screen when displayed in the current buffer. */
1183
1184 #define ONE_BYTE_CHAR_WIDTH(c) \
1185 (c < 0x20 \
1186 ? (c == '\t' \
1187 ? XFASTINT (current_buffer->tab_width) \
1188 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1189 : (c < 0x7f \
1190 ? 1 \
1191 : (c == 0x7F \
1192 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1193 : ((! NILP (current_buffer->enable_multibyte_characters) \
1194 && BASE_LEADING_CODE_P (c)) \
1195 ? WIDTH_BY_CHAR_HEAD (c) \
1196 : 4))))
1197
1198 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1199 "Return width of CHAR when displayed in the current buffer.\n\
1200 The width is measured by how many columns it occupies on the screen.\n\
1201 Tab is taken to occupy `tab-width' columns.")
1202 (ch)
1203 Lisp_Object ch;
1204 {
1205 Lisp_Object val, disp;
1206 int c;
1207 struct Lisp_Char_Table *dp = buffer_display_table ();
1208
1209 CHECK_NUMBER (ch, 0);
1210
1211 c = XINT (ch);
1212
1213 /* Get the way the display table would display it. */
1214 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1215
1216 if (VECTORP (disp))
1217 XSETINT (val, XVECTOR (disp)->size);
1218 else if (SINGLE_BYTE_CHAR_P (c))
1219 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1220 else
1221 {
1222 int charset = CHAR_CHARSET (c);
1223
1224 XSETFASTINT (val, CHARSET_WIDTH (charset));
1225 }
1226 return val;
1227 }
1228
1229 /* Return width of string STR of length LEN when displayed in the
1230 current buffer. The width is measured by how many columns it
1231 occupies on the screen. */
1232
1233 int
1234 strwidth (str, len)
1235 unsigned char *str;
1236 int len;
1237 {
1238 unsigned char *endp = str + len;
1239 int width = 0;
1240 struct Lisp_Char_Table *dp = buffer_display_table ();
1241
1242 while (str < endp)
1243 {
1244 Lisp_Object disp;
1245 int thislen;
1246 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1247
1248 /* Get the way the display table would display it. */
1249 if (dp)
1250 disp = DISP_CHAR_VECTOR (dp, c);
1251 else
1252 disp = Qnil;
1253
1254 if (VECTORP (disp))
1255 width += XVECTOR (disp)->size;
1256 else
1257 width += ONE_BYTE_CHAR_WIDTH (*str);
1258
1259 str += thislen;
1260 }
1261 return width;
1262 }
1263
1264 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1265 "Return width of STRING when displayed in the current buffer.\n\
1266 Width is measured by how many columns it occupies on the screen.\n\
1267 When calculating width of a multibyte character in STRING,\n\
1268 only the base leading-code is considered; the validity of\n\
1269 the following bytes is not checked. Tabs in STRING are always\n\
1270 taken to occupy `tab-width' columns.")
1271 (str)
1272 Lisp_Object str;
1273 {
1274 Lisp_Object val;
1275
1276 CHECK_STRING (str, 0);
1277 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1278 STRING_BYTES (XSTRING (str))));
1279 return val;
1280 }
1281
1282 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1283 "Return the direction of CHAR.\n\
1284 The returned value is 0 for left-to-right and 1 for right-to-left.")
1285 (ch)
1286 Lisp_Object ch;
1287 {
1288 int charset;
1289
1290 CHECK_NUMBER (ch, 0);
1291 charset = CHAR_CHARSET (XFASTINT (ch));
1292 if (!CHARSET_DEFINED_P (charset))
1293 invalid_character (XINT (ch));
1294 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1295 }
1296
1297 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1298 "Return number of characters between BEG and END.")
1299 (beg, end)
1300 Lisp_Object beg, end;
1301 {
1302 int from, to;
1303
1304 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1305 CHECK_NUMBER_COERCE_MARKER (end, 1);
1306
1307 from = min (XFASTINT (beg), XFASTINT (end));
1308 to = max (XFASTINT (beg), XFASTINT (end));
1309
1310 return make_number (to - from);
1311 }
1312
1313 /* Return the number of characters in the NBYTES bytes at PTR.
1314 This works by looking at the contents and checking for multibyte sequences.
1315 However, if the current buffer has enable-multibyte-characters = nil,
1316 we treat each byte as a character. */
1317
1318 int
1319 chars_in_text (ptr, nbytes)
1320 unsigned char *ptr;
1321 int nbytes;
1322 {
1323 /* current_buffer is null at early stages of Emacs initialization. */
1324 if (current_buffer == 0
1325 || NILP (current_buffer->enable_multibyte_characters))
1326 return nbytes;
1327
1328 return multibyte_chars_in_text (ptr, nbytes);
1329 }
1330
1331 /* Return the number of characters in the NBYTES bytes at PTR.
1332 This works by looking at the contents and checking for multibyte sequences.
1333 It ignores enable-multibyte-characters. */
1334
1335 int
1336 multibyte_chars_in_text (ptr, nbytes)
1337 unsigned char *ptr;
1338 int nbytes;
1339 {
1340 unsigned char *endp;
1341 int chars, bytes;
1342
1343 endp = ptr + nbytes;
1344 chars = 0;
1345
1346 while (ptr < endp)
1347 {
1348 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1349 ptr += bytes;
1350 chars++;
1351 }
1352
1353 return chars;
1354 }
1355
1356 /* Parse unibyte text at STR of LEN bytes as a multibyte text, and
1357 count the numbers of characters and bytes in it. On counting
1358 bytes, pay attention to that 8-bit characters in the range
1359 0x80..0x9F are represented by 2-byte in a multibyte text. */
1360 void
1361 parse_str_as_multibyte (str, len, nchars, nbytes)
1362 unsigned char *str;
1363 int len, *nchars, *nbytes;
1364 {
1365 unsigned char *endp = str + len;
1366 int n, chars = 0, bytes = 0;
1367
1368 while (str < endp)
1369 {
1370 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1371 str += n, bytes += n;
1372 else
1373 str++, bytes += 2;
1374 chars++;
1375 }
1376 *nchars = chars;
1377 *nbytes = bytes;
1378 return;
1379 }
1380
1381 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
1382 It actually converts only 8-bit characters in the range 0x80..0x9F
1383 that don't contruct multibyte characters to multibyte forms. If
1384 NCHARS is nonzero, set *NCHARS to the number of characters in the
1385 text. It is assured that we can use LEN bytes at STR as a work
1386 area and that is enough. Return the number of bytes of the
1387 resulting text. */
1388
1389 int
1390 str_as_multibyte (str, len, nbytes, nchars)
1391 unsigned char *str;
1392 int len, nbytes, *nchars;
1393 {
1394 unsigned char *p = str, *endp = str + nbytes;
1395 unsigned char *to;
1396 int chars = 0;
1397 int n;
1398
1399 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1400 p += n, chars++;
1401 if (nchars)
1402 *nchars = chars;
1403 if (p == endp)
1404 return nbytes;
1405
1406 to = p;
1407 nbytes = endp - p;
1408 endp = str + len;
1409 safe_bcopy (p, endp - nbytes, nbytes);
1410 p = endp - nbytes;
1411 while (p < endp)
1412 {
1413 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1414 {
1415 while (n--)
1416 *to++ = *p++;
1417 }
1418 else
1419 {
1420 *to++ = LEADING_CODE_8_BIT_CONTROL;
1421 *to++ = *p++ + 0x20;
1422 }
1423 chars++;
1424 }
1425 if (nchars)
1426 *nchars = chars;
1427 return (to - str);
1428 }
1429
1430 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
1431 that contains the same single-byte characters. It actually
1432 converts all 8-bit characters to multibyte forms. It is assured
1433 that we can use LEN bytes at STR as a work area and that is
1434 enough. */
1435
1436 int
1437 str_to_multibyte (str, len, bytes)
1438 unsigned char *str;
1439 int len, bytes;
1440 {
1441 unsigned char *p = str, *endp = str + bytes;
1442 unsigned char *to;
1443 int c;
1444
1445 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1446 if (p == endp)
1447 return bytes;
1448 to = p;
1449 bytes = endp - p;
1450 endp = str + len;
1451 safe_bcopy (p, endp - bytes, bytes);
1452 p = endp - bytes;
1453 while (p < endp)
1454 {
1455 if (*p < 0x80 || *p >= 0xA0)
1456 *to++ = *p++;
1457 else
1458 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1459 }
1460 return (to - str);
1461 }
1462
1463 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1464 actually converts only 8-bit characters in the range 0x80..0x9F to
1465 unibyte forms. */
1466
1467 int
1468 str_as_unibyte (str, bytes)
1469 unsigned char *str;
1470 int bytes;
1471 {
1472 unsigned char *p = str, *endp = str + bytes;
1473 unsigned char *to = str;
1474
1475 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1476 to = p;
1477 while (p < endp)
1478 {
1479 if (*p == LEADING_CODE_8_BIT_CONTROL)
1480 *to++ = *(p + 1) - 0x20, p += 2;
1481 else
1482 *to++ = *p++;
1483 }
1484 return (to - str);
1485 }
1486
1487 \f
1488 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1489 "Concatenate all the argument characters and make the result a string.")
1490 (n, args)
1491 int n;
1492 Lisp_Object *args;
1493 {
1494 int i;
1495 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1496 unsigned char *p = buf;
1497 int c;
1498
1499 for (i = 0; i < n; i++)
1500 {
1501 CHECK_NUMBER (args[i], 0);
1502 c = XINT (args[i]);
1503 p += CHAR_STRING (c, p);
1504 }
1505
1506 return make_string_from_bytes (buf, n, p - buf);
1507 }
1508
1509 #endif /* emacs */
1510 \f
1511 int
1512 charset_id_internal (charset_name)
1513 char *charset_name;
1514 {
1515 Lisp_Object val;
1516
1517 val= Fget (intern (charset_name), Qcharset);
1518 if (!VECTORP (val))
1519 error ("Charset %s is not defined", charset_name);
1520
1521 return (XINT (XVECTOR (val)->contents[0]));
1522 }
1523
1524 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1525 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1526 ()
1527 {
1528 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1529 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1530 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1531 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1532 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1533 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1534 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1535 return Qnil;
1536 }
1537
1538 void
1539 init_charset_once ()
1540 {
1541 int i, j, k;
1542
1543 staticpro (&Vcharset_table);
1544 staticpro (&Vcharset_symbol_table);
1545 staticpro (&Vgeneric_character_list);
1546
1547 /* This has to be done here, before we call Fmake_char_table. */
1548 Qcharset_table = intern ("charset-table");
1549 staticpro (&Qcharset_table);
1550
1551 /* Intern this now in case it isn't already done.
1552 Setting this variable twice is harmless.
1553 But don't staticpro it here--that is done in alloc.c. */
1554 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1555
1556 /* Now we are ready to set up this property, so we can
1557 create the charset table. */
1558 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1559 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1560
1561 Qunknown = intern ("unknown");
1562 staticpro (&Qunknown);
1563 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1564 Qunknown);
1565
1566 /* Setup tables. */
1567 for (i = 0; i < 2; i++)
1568 for (j = 0; j < 2; j++)
1569 for (k = 0; k < 128; k++)
1570 iso_charset_table [i][j][k] = -1;
1571
1572 for (i = 0; i < 256; i++)
1573 bytes_by_char_head[i] = 1;
1574 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1575 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1576 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1577 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1578
1579 for (i = 0; i < 128; i++)
1580 width_by_char_head[i] = 1;
1581 for (; i < 256; i++)
1582 width_by_char_head[i] = 4;
1583 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1584 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1585 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1586 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1587
1588 {
1589 Lisp_Object val;
1590
1591 val = Qnil;
1592 for (i = 0x81; i < 0x90; i++)
1593 val = Fcons (make_number ((i - 0x70) << 7), val);
1594 for (; i < 0x9A; i++)
1595 val = Fcons (make_number ((i - 0x8F) << 14), val);
1596 for (i = 0xA0; i < 0xF0; i++)
1597 val = Fcons (make_number ((i - 0x70) << 7), val);
1598 for (; i < 0xFF; i++)
1599 val = Fcons (make_number ((i - 0xE0) << 14), val);
1600 Vgeneric_character_list = Fnreverse (val);
1601 }
1602
1603 nonascii_insert_offset = 0;
1604 Vnonascii_translation_table = Qnil;
1605 }
1606
1607 #ifdef emacs
1608
1609 void
1610 syms_of_charset ()
1611 {
1612 Qcharset = intern ("charset");
1613 staticpro (&Qcharset);
1614
1615 Qascii = intern ("ascii");
1616 staticpro (&Qascii);
1617
1618 Qeight_bit_control = intern ("eight-bit-control");
1619 staticpro (&Qeight_bit_control);
1620
1621 Qeight_bit_graphic = intern ("eight-bit-graphic");
1622 staticpro (&Qeight_bit_graphic);
1623
1624 /* Define special charsets ascii, eight-bit-control, and
1625 eight-bit-graphic. */
1626 update_charset_table (make_number (CHARSET_ASCII),
1627 make_number (1), make_number (94),
1628 make_number (1),
1629 make_number (0),
1630 make_number ('B'),
1631 make_number (0),
1632 build_string ("ASCII"),
1633 Qnil, /* same as above */
1634 build_string ("ASCII (ISO646 IRV)"));
1635 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1636 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1637
1638 update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1639 make_number (1), make_number (96),
1640 make_number (4),
1641 make_number (0),
1642 make_number (-1),
1643 make_number (-1),
1644 build_string ("8-bit control code (0x80..0x9F)"),
1645 Qnil, /* same as above */
1646 Qnil); /* same as above */
1647 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1648 Fput (Qeight_bit_control, Qcharset,
1649 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1650
1651 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1652 make_number (1), make_number (96),
1653 make_number (4),
1654 make_number (0),
1655 make_number (-1),
1656 make_number (-1),
1657 build_string ("8-bit graphic char (0xA0..0xFF)"),
1658 Qnil, /* same as above */
1659 Qnil); /* same as above */
1660 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1661 Fput (Qeight_bit_graphic, Qcharset,
1662 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1663
1664 Qauto_fill_chars = intern ("auto-fill-chars");
1665 staticpro (&Qauto_fill_chars);
1666 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1667
1668 defsubr (&Sdefine_charset);
1669 defsubr (&Sgeneric_character_list);
1670 defsubr (&Sget_unused_iso_final_char);
1671 defsubr (&Sdeclare_equiv_charset);
1672 defsubr (&Sfind_charset_region);
1673 defsubr (&Sfind_charset_string);
1674 defsubr (&Smake_char_internal);
1675 defsubr (&Ssplit_char);
1676 defsubr (&Schar_charset);
1677 defsubr (&Scharset_after);
1678 defsubr (&Siso_charset);
1679 defsubr (&Schar_valid_p);
1680 defsubr (&Sunibyte_char_to_multibyte);
1681 defsubr (&Smultibyte_char_to_unibyte);
1682 defsubr (&Schar_bytes);
1683 defsubr (&Schar_width);
1684 defsubr (&Sstring_width);
1685 defsubr (&Schar_direction);
1686 defsubr (&Schars_in_region);
1687 defsubr (&Sstring);
1688 defsubr (&Ssetup_special_charsets);
1689
1690 DEFVAR_LISP ("charset-list", &Vcharset_list,
1691 "List of charsets ever defined.");
1692 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1693 Fcons (Qeight_bit_graphic, Qnil)));
1694
1695 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1696 "Vector of cons cell of a symbol and translation table ever defined.\n\
1697 An ID of a translation table is an index of this vector.");
1698 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1699
1700 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1701 "Leading-code of private TYPE9N charset of column-width 1.");
1702 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1703
1704 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1705 "Leading-code of private TYPE9N charset of column-width 2.");
1706 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1707
1708 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1709 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1710 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1711
1712 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1713 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1714 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1715
1716 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1717 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1718 This is used for converting unibyte text to multibyte,\n\
1719 and for inserting character codes specified by number.\n\n\
1720 This serves to convert a Latin-1 or similar 8-bit character code\n\
1721 to the corresponding Emacs multibyte character code.\n\
1722 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1723 for your choice of character set.\n\
1724 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1725 nonascii_insert_offset = 0;
1726
1727 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1728 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1729 This is used for converting unibyte text to multibyte,\n\
1730 and for inserting character codes specified by number.\n\n\
1731 Conversion is performed only when multibyte characters are enabled,\n\
1732 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1733 to the corresponding Emacs character code.\n\n\
1734 If this is nil, `nonascii-insert-offset' is used instead.\n\
1735 See also the docstring of `make-translation-table'.");
1736 Vnonascii_translation_table = Qnil;
1737
1738 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1739 "A char-table for characters which invoke auto-filling.\n\
1740 Such characters have value t in this table.");
1741 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1742 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1743 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1744 }
1745
1746 #endif /* emacs */