(Qeight_bit_control, Qeight_bit_graphic): New
[bpt/emacs.git] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #ifdef emacs
26 #include <config.h>
27 #endif
28
29 #include <stdio.h>
30
31 #ifdef emacs
32
33 #include <sys/types.h>
34 #include "lisp.h"
35 #include "buffer.h"
36 #include "charset.h"
37 #include "coding.h"
38 #include "disptab.h"
39
40 #else /* not emacs */
41
42 #include "mulelib.h"
43
44 #endif /* emacs */
45
46 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
47 Lisp_Object Qunknown;
48
49 /* Declaration of special leading-codes. */
50 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
51 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
52 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
53 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
54
55 /* Declaration of special charsets. The values are set by
56 Fsetup_special_charsets. */
57 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
58 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
59 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
60 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
61 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
62 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
63 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
64
65 Lisp_Object Qcharset_table;
66
67 /* A char-table containing information of each character set. */
68 Lisp_Object Vcharset_table;
69
70 /* A vector of charset symbol indexed by charset-id. This is used
71 only for returning charset symbol from C functions. */
72 Lisp_Object Vcharset_symbol_table;
73
74 /* A list of charset symbols ever defined. */
75 Lisp_Object Vcharset_list;
76
77 /* Vector of translation table ever defined.
78 ID of a translation table is used to index this vector. */
79 Lisp_Object Vtranslation_table_vector;
80
81 /* A char-table for characters which may invoke auto-filling. */
82 Lisp_Object Vauto_fill_chars;
83
84 Lisp_Object Qauto_fill_chars;
85
86 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
87 int bytes_by_char_head[256];
88 int width_by_char_head[256];
89
90 /* Mapping table from ISO2022's charset (specified by DIMENSION,
91 CHARS, and FINAL-CHAR) to Emacs' charset. */
92 int iso_charset_table[2][2][128];
93
94 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
95 unsigned char *_fetch_multibyte_char_p;
96 int _fetch_multibyte_char_len;
97
98 /* Offset to add to a non-ASCII value when inserting it. */
99 int nonascii_insert_offset;
100
101 /* Translation table for converting non-ASCII unibyte characters
102 to multibyte codes, or nil. */
103 Lisp_Object Vnonascii_translation_table;
104
105 /* List of all possible generic characters. */
106 Lisp_Object Vgeneric_character_list;
107
108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
110 \f
111 void
112 invalid_character (c)
113 int c;
114 {
115 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
116 }
117
118 /* Parse string STR of length LENGTH and fetch information of a
119 character at STR. Set BYTES to the byte length the character
120 occupies, CHARSET, C1, C2 to proper values of the character. */
121
122 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
123 do { \
124 (c1) = *(str); \
125 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
126 if ((bytes) == 1) \
127 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
128 else if ((bytes) == 2) \
129 { \
130 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
131 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
132 else \
133 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
134 } \
135 else if ((bytes) == 3) \
136 { \
137 if ((c1) < LEADING_CODE_PRIVATE_11) \
138 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
139 else \
140 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
141 } \
142 else \
143 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
144 } while (0)
145
146 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
147 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
148 ((charset) == CHARSET_ASCII \
149 ? ((c1) >= 0 && (c1) <= 0x7F) \
150 : ((charset) == CHARSET_8_BIT_CONTROL \
151 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
152 : ((charset) == CHARSET_8_BIT_GRAPHIC \
153 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
154 : (CHARSET_DIMENSION (charset) == 1 \
155 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
156 : ((c1) >= 0x20 && (c1) <= 0x7F \
157 && (c2) >= 0x20 && (c2) <= 0x7F)))))
158
159 /* Store multi-byte form of the character C in STR. The caller should
160 allocate at least 4-byte area at STR in advance. Returns the
161 length of the multi-byte form. If C is an invalid character code,
162 signal an error.
163
164 Use macro `CHAR_STRING (C, STR)' instead of calling this function
165 directly if C can be an ASCII character. */
166
167 int
168 char_to_string (c, str)
169 int c;
170 unsigned char *str;
171 {
172 unsigned char *p = str;
173
174 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
175 {
176 /* Multibyte character can't have a modifier bit. */
177 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
178 invalid_character (c);
179
180 /* For Meta, Shift, and Control modifiers, we need special care. */
181 if (c & CHAR_META)
182 {
183 /* Move the meta bit to the right place for a string. */
184 c = (c & ~CHAR_META) | 0x80;
185 }
186 if (c & CHAR_SHIFT)
187 {
188 /* Shift modifier is valid only with [A-Za-z]. */
189 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
190 c &= ~CHAR_SHIFT;
191 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
192 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
193 }
194 if (c & CHAR_CTL)
195 {
196 /* Simulate the code in lread.c. */
197 /* Allow `\C- ' and `\C-?'. */
198 if (c == (CHAR_CTL | ' '))
199 c = 0;
200 else if (c == (CHAR_CTL | '?'))
201 c = 127;
202 /* ASCII control chars are made from letters (both cases),
203 as well as the non-letters within 0100...0137. */
204 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
205 c &= (037 | (~0177 & ~CHAR_CTL));
206 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
207 c &= (037 | (~0177 & ~CHAR_CTL));
208 }
209
210 /* If C still has any modifier bits, it is an invalid character. */
211 if (c & CHAR_MODIFIER_MASK)
212 invalid_character (c);
213 }
214 if (SINGLE_BYTE_CHAR_P (c))
215 {
216 if (ASCII_BYTE_P (c) || c >= 0xA0)
217 *p++ = c;
218 else
219 {
220 *p++ = LEADING_CODE_8_BIT_CONTROL;
221 *p++ = c + 0x20;
222 }
223 }
224 else if (c < MAX_CHAR)
225 {
226 int charset, c1, c2;
227
228 SPLIT_CHAR (c, charset, c1, c2);
229
230 if (charset >= LEADING_CODE_EXT_11)
231 *p++ = (charset < LEADING_CODE_EXT_12
232 ? LEADING_CODE_PRIVATE_11
233 : (charset < LEADING_CODE_EXT_21
234 ? LEADING_CODE_PRIVATE_12
235 : (charset < LEADING_CODE_EXT_22
236 ? LEADING_CODE_PRIVATE_21
237 : LEADING_CODE_PRIVATE_22)));
238 *p++ = charset;
239 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
240 invalid_character (c);
241 if (c1)
242 {
243 *p++ = c1 | 0x80;
244 if (c2 > 0)
245 *p++ = c2 | 0x80;
246 }
247 }
248 else
249 invalid_character (c);
250
251 return (p - str);
252 }
253
254 /* Return the non-ASCII character corresponding to multi-byte form at
255 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
256 length of the multibyte form in *ACTUAL_LEN.
257
258 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
259 this function directly if you want ot handle ASCII characters as
260 well. */
261
262 int
263 string_to_char (str, len, actual_len)
264 const unsigned char *str;
265 int len, *actual_len;
266 {
267 int c, bytes, charset, c1, c2;
268
269 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
270 c = MAKE_CHAR (charset, c1, c2);
271 if (actual_len)
272 *actual_len = bytes;
273 return c;
274 }
275
276 /* Return the length of the multi-byte form at string STR of length LEN.
277 Use the macro MULTIBYTE_FORM_LENGTH instead. */
278 int
279 multibyte_form_length (str, len)
280 const unsigned char *str;
281 int len;
282 {
283 int bytes;
284
285 PARSE_MULTIBYTE_SEQ (str, len, bytes);
286 return bytes;
287 }
288
289 /* Check multibyte form at string STR of length LEN and set variables
290 pointed by CHARSET, C1, and C2 to charset and position codes of the
291 character at STR, and return 0. If there's no multibyte character,
292 return -1. This should be used only in the macro SPLIT_STRING
293 which checks range of STR in advance. */
294
295 int
296 split_string (str, len, charset, c1, c2)
297 const unsigned char *str;
298 unsigned char *c1, *c2;
299 int len, *charset;
300 {
301 register int bytes, cs, code1, code2 = -1;
302
303 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
304 if (cs == CHARSET_ASCII)
305 return -1;
306 *charset = cs;
307 *c1 = code1;
308 *c2 = code2;
309 return 0;
310 }
311
312 /* Return 1 iff character C has valid printable glyph.
313 Use the macro CHAR_PRINTABLE_P instead. */
314 int
315 char_printable_p (c)
316 int c;
317 {
318 int charset, c1, c2, chars;
319
320 if (ASCII_BYTE_P (c))
321 return 1;
322 else if (SINGLE_BYTE_CHAR_P (c))
323 return 0;
324 else if (c >= MAX_CHAR)
325 return 0;
326
327 SPLIT_CHAR (c, charset, c1, c2);
328 if (! CHARSET_DEFINED_P (charset))
329 return 0;
330 if (CHARSET_CHARS (charset) == 94
331 ? c1 <= 32 || c1 >= 127
332 : c1 < 32)
333 return 0;
334 if (CHARSET_DIMENSION (charset) == 2
335 && (CHARSET_CHARS (charset) == 94
336 ? c2 <= 32 || c2 >= 127
337 : c2 < 32))
338 return 0;
339 return 1;
340 }
341
342 /* Translate character C by translation table TABLE. If C
343 is negative, translate a character specified by CHARSET, C1, and C2
344 (C1 and C2 are code points of the character). If no translation is
345 found in TABLE, return C. */
346 int
347 translate_char (table, c, charset, c1, c2)
348 Lisp_Object table;
349 int c, charset, c1, c2;
350 {
351 Lisp_Object ch;
352 int alt_charset, alt_c1, alt_c2, dimension;
353
354 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
355 if (!CHAR_TABLE_P (table)
356 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
357 return c;
358
359 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
360 dimension = CHARSET_DIMENSION (alt_charset);
361 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
362 /* CH is not a generic character, just return it. */
363 return XFASTINT (ch);
364
365 /* Since CH is a generic character, we must return a specific
366 charater which has the same position codes as C from CH. */
367 if (charset < 0)
368 SPLIT_CHAR (c, charset, c1, c2);
369 if (dimension != CHARSET_DIMENSION (charset))
370 /* We can't make such a character because of dimension mismatch. */
371 return c;
372 return MAKE_CHAR (alt_charset, c1, c2);
373 }
374
375 /* Convert the unibyte character C to multibyte based on
376 Vnonascii_translation_table or nonascii_insert_offset. If they can't
377 convert C to a valid multibyte character, convert it based on
378 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
379
380 int
381 unibyte_char_to_multibyte (c)
382 int c;
383 {
384 if (c < 0400 && c >= 0200)
385 {
386 int c_save = c;
387
388 if (! NILP (Vnonascii_translation_table))
389 {
390 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
391 if (c >= 0400 && ! char_valid_p (c, 0))
392 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
393 }
394 else if (c >= 0240 && nonascii_insert_offset > 0)
395 {
396 c += nonascii_insert_offset;
397 if (c < 0400 || ! char_valid_p (c, 0))
398 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
399 }
400 else if (c >= 0240)
401 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
402 }
403 return c;
404 }
405
406
407 /* Convert the multibyte character C to unibyte 8-bit character based
408 on Vnonascii_translation_table or nonascii_insert_offset. If
409 REV_TBL is non-nil, it should be a reverse table of
410 Vnonascii_translation_table, i.e. what given by:
411 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
412
413 int
414 multibyte_char_to_unibyte (c, rev_tbl)
415 int c;
416 Lisp_Object rev_tbl;
417 {
418 if (!SINGLE_BYTE_CHAR_P (c))
419 {
420 int c_save = c;
421
422 if (! CHAR_TABLE_P (rev_tbl)
423 && CHAR_TABLE_P (Vnonascii_translation_table))
424 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
425 make_number (0));
426 if (CHAR_TABLE_P (rev_tbl))
427 {
428 Lisp_Object temp;
429 temp = Faref (rev_tbl, make_number (c));
430 if (INTEGERP (temp))
431 c = XINT (temp);
432 if (c >= 256)
433 c = (c_save & 0177) + 0200;
434 }
435 else
436 {
437 if (nonascii_insert_offset > 0)
438 c -= nonascii_insert_offset;
439 if (c < 128 || c >= 256)
440 c = (c_save & 0177) + 0200;
441 }
442 }
443
444 return c;
445 }
446
447 \f
448 /* Update the table Vcharset_table with the given arguments (see the
449 document of `define-charset' for the meaning of each argument).
450 Several other table contents are also updated. The caller should
451 check the validity of CHARSET-ID and the remaining arguments in
452 advance. */
453
454 void
455 update_charset_table (charset_id, dimension, chars, width, direction,
456 iso_final_char, iso_graphic_plane,
457 short_name, long_name, description)
458 Lisp_Object charset_id, dimension, chars, width, direction;
459 Lisp_Object iso_final_char, iso_graphic_plane;
460 Lisp_Object short_name, long_name, description;
461 {
462 int charset = XINT (charset_id);
463 int bytes;
464 unsigned char leading_code_base, leading_code_ext;
465
466 if (NILP (CHARSET_TABLE_ENTRY (charset)))
467 CHARSET_TABLE_ENTRY (charset)
468 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
469
470 /* Get byte length of multibyte form, base leading-code, and
471 extended leading-code of the charset. See the comment under the
472 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
473 bytes = XINT (dimension);
474 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
475 {
476 /* Official charset, it doesn't have an extended leading-code. */
477 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
478 bytes += 1; /* For a base leading-code. */
479 leading_code_base = charset;
480 leading_code_ext = 0;
481 }
482 else
483 {
484 /* Private charset. */
485 bytes += 2; /* For base and extended leading-codes. */
486 leading_code_base
487 = (charset < LEADING_CODE_EXT_12
488 ? LEADING_CODE_PRIVATE_11
489 : (charset < LEADING_CODE_EXT_21
490 ? LEADING_CODE_PRIVATE_12
491 : (charset < LEADING_CODE_EXT_22
492 ? LEADING_CODE_PRIVATE_21
493 : LEADING_CODE_PRIVATE_22)));
494 leading_code_ext = charset;
495 }
496
497 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
498 &&BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
499 error ("Invalid dimension for the charset-ID %d", charset);
500
501 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
502 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
503 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
504 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
505 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
506 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
507 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
508 = make_number (leading_code_base);
509 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
510 = make_number (leading_code_ext);
511 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
512 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
513 = iso_graphic_plane;
514 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
515 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
516 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
517 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
518
519 {
520 /* If we have already defined a charset which has the same
521 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
522 DIRECTION, we must update the entry REVERSE-CHARSET of both
523 charsets. If there's no such charset, the value of the entry
524 is set to nil. */
525 int i;
526
527 for (i = 0; i <= MAX_CHARSET; i++)
528 if (!NILP (CHARSET_TABLE_ENTRY (i)))
529 {
530 if (CHARSET_DIMENSION (i) == XINT (dimension)
531 && CHARSET_CHARS (i) == XINT (chars)
532 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
533 && CHARSET_DIRECTION (i) != XINT (direction))
534 {
535 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
536 = make_number (i);
537 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
538 break;
539 }
540 }
541 if (i > MAX_CHARSET)
542 /* No such a charset. */
543 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
544 = make_number (-1);
545 }
546
547 if (charset != CHARSET_ASCII
548 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
549 {
550 width_by_char_head[leading_code_base] = XINT (width);
551
552 /* Update table emacs_code_class. */
553 emacs_code_class[charset] = (bytes == 2
554 ? EMACS_leading_code_2
555 : (bytes == 3
556 ? EMACS_leading_code_3
557 : EMACS_leading_code_4));
558 }
559
560 /* Update table iso_charset_table. */
561 if (iso_final_char >= 0
562 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
563 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
564 }
565
566 #ifdef emacs
567
568 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
569 is invalid. */
570 int
571 get_charset_id (charset_symbol)
572 Lisp_Object charset_symbol;
573 {
574 Lisp_Object val;
575 int charset;
576
577 return ((SYMBOLP (charset_symbol)
578 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
579 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
580 CHARSET_VALID_P (charset)))
581 ? charset : -1);
582 }
583
584 /* Return an identification number for a new private charset of
585 DIMENSION and WIDTH. If there's no more room for the new charset,
586 return 0. */
587 Lisp_Object
588 get_new_private_charset_id (dimension, width)
589 int dimension, width;
590 {
591 int charset, from, to;
592
593 if (dimension == 1)
594 {
595 if (width == 1)
596 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
597 else
598 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
599 }
600 else
601 {
602 if (width == 1)
603 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
604 else
605 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
606 }
607
608 for (charset = from; charset < to; charset++)
609 if (!CHARSET_DEFINED_P (charset)) break;
610
611 return make_number (charset < to ? charset : 0);
612 }
613
614 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
615 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
616 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
617 treated as a private charset.\n\
618 INFO-VECTOR is a vector of the format:\n\
619 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
620 SHORT-NAME LONG-NAME DESCRIPTION]\n\
621 The meanings of each elements is as follows:\n\
622 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
623 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
624 WIDTH (integer) is the number of columns a character in the charset\n\
625 occupies on the screen: one of 0, 1, and 2.\n\
626 \n\
627 DIRECTION (integer) is the rendering direction of characters in the\n\
628 charset when rendering. If 0, render from left to right, else\n\
629 render from right to left.\n\
630 \n\
631 ISO-FINAL-CHAR (character) is the final character of the\n\
632 corresponding ISO 2022 charset.\n\
633 It may be -1 if the charset is internal use only.\n\
634 \n\
635 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
636 while encoding to variants of ISO 2022 coding system, one of the\n\
637 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
638 It may be -1 if the charset is internal use only.\n\
639 \n\
640 SHORT-NAME (string) is the short name to refer to the charset.\n\
641 \n\
642 LONG-NAME (string) is the long name to refer to the charset.\n\
643 \n\
644 DESCRIPTION (string) is the description string of the charset.")
645 (charset_id, charset_symbol, info_vector)
646 Lisp_Object charset_id, charset_symbol, info_vector;
647 {
648 Lisp_Object *vec;
649
650 if (!NILP (charset_id))
651 CHECK_NUMBER (charset_id, 0);
652 CHECK_SYMBOL (charset_symbol, 1);
653 CHECK_VECTOR (info_vector, 2);
654
655 if (! NILP (charset_id))
656 {
657 if (! CHARSET_VALID_P (XINT (charset_id)))
658 error ("Invalid CHARSET: %d", XINT (charset_id));
659 else if (CHARSET_DEFINED_P (XINT (charset_id)))
660 error ("Already defined charset: %d", XINT (charset_id));
661 }
662
663 vec = XVECTOR (info_vector)->contents;
664 if (XVECTOR (info_vector)->size != 9
665 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
666 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
667 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
668 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
669 || !INTEGERP (vec[4])
670 || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
671 || !INTEGERP (vec[5])
672 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
673 || !STRINGP (vec[6])
674 || !STRINGP (vec[7])
675 || !STRINGP (vec[8]))
676 error ("Invalid info-vector argument for defining charset %s",
677 XSYMBOL (charset_symbol)->name->data);
678
679 if (NILP (charset_id))
680 {
681 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
682 if (XINT (charset_id) == 0)
683 error ("There's no room for a new private charset %s",
684 XSYMBOL (charset_symbol)->name->data);
685 }
686
687 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
688 vec[4], vec[5], vec[6], vec[7], vec[8]);
689 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
690 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
691 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
692 return Qnil;
693 }
694
695 DEFUN ("generic-character-list", Fgeneric_character_list,
696 Sgeneric_character_list, 0, 0, 0,
697 "Return a list of all possible generic characters.\n\
698 It includes a generic character for a charset not yet defined.")
699 ()
700 {
701 return Vgeneric_character_list;
702 }
703
704 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
705 Sget_unused_iso_final_char, 2, 2, 0,
706 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
707 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
708 CHARS is the number of characters in a dimension: 94 or 96.\n\
709 \n\
710 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
711 If there's no unused final char for the specified kind of charset,\n\
712 return nil.")
713 (dimension, chars)
714 Lisp_Object dimension, chars;
715 {
716 int final_char;
717
718 CHECK_NUMBER (dimension, 0);
719 CHECK_NUMBER (chars, 1);
720 if (XINT (dimension) != 1 && XINT (dimension) != 2)
721 error ("Invalid charset dimension %d, it should be 1 or 2",
722 XINT (dimension));
723 if (XINT (chars) != 94 && XINT (chars) != 96)
724 error ("Invalid charset chars %d, it should be 94 or 96",
725 XINT (chars));
726 for (final_char = '0'; final_char <= '?'; final_char++)
727 {
728 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
729 break;
730 }
731 return (final_char <= '?' ? make_number (final_char) : Qnil);
732 }
733
734 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
735 4, 4, 0,
736 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
737 CHARSET should be defined by `defined-charset' in advance.")
738 (dimension, chars, final_char, charset_symbol)
739 Lisp_Object dimension, chars, final_char, charset_symbol;
740 {
741 int charset;
742
743 CHECK_NUMBER (dimension, 0);
744 CHECK_NUMBER (chars, 1);
745 CHECK_NUMBER (final_char, 2);
746 CHECK_SYMBOL (charset_symbol, 3);
747
748 if (XINT (dimension) != 1 && XINT (dimension) != 2)
749 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
750 if (XINT (chars) != 94 && XINT (chars) != 96)
751 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
752 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
753 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
754 if ((charset = get_charset_id (charset_symbol)) < 0)
755 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
756
757 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
758 return Qnil;
759 }
760
761 /* Return information about charsets in the text at PTR of NBYTES
762 bytes, which are NCHARS characters. The value is:
763 0: No multibyte characters (including 8-bit code of range 0x80..0x9F)
764 are found.
765 1: No charsets other than ascii eight-bit-control,
766 eight-bit-graphic, and latin-1 are found.
767 2: Otherwise.
768
769 In addition, if CHARSETS is nonzero, for each found charset N, set
770 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
771 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
772 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
773 1 (note that there's no charset whose ID is 1). */
774
775 int
776 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
777 unsigned char *ptr;
778 int nchars, nbytes, *charsets;
779 Lisp_Object table;
780 {
781 if (nchars == nbytes)
782 {
783 if (charsets && nbytes > 0)
784 {
785 unsigned char *endp = ptr + nbytes;
786 int maskbits = 0;
787
788 while (ptr < endp && maskbits != 7)
789 {
790 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
791 ptr++;
792 }
793
794 if (maskbits & 1)
795 charsets[CHARSET_ASCII] = 1;
796 if (maskbits & 2)
797 charsets[CHARSET_8_BIT_CONTROL] = 1;
798 if (maskbits & 4)
799 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
800 }
801 return 0;
802 }
803 else
804 {
805 int return_val = 1;
806 int bytes, charset, c1, c2;
807
808 if (! CHAR_TABLE_P (table))
809 table = Qnil;
810
811 while (nchars-- > 0)
812 {
813 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
814 ptr += bytes;
815
816 if (!CHARSET_DEFINED_P (charset))
817 charset = 1;
818 else if (! NILP (table))
819 {
820 int c = translate_char (table, -1, charset, c1, c2);
821 if (c >= 0)
822 charset = CHAR_CHARSET (c);
823 }
824
825 if (return_val == 1
826 && charset != CHARSET_ASCII
827 && charset != CHARSET_8_BIT_CONTROL
828 && charset != CHARSET_8_BIT_GRAPHIC
829 && charset != charset_latin_iso8859_1)
830 return_val = 2;
831
832 if (charsets)
833 charsets[charset] = 1;
834 else if (return_val == 2)
835 break;
836 }
837 return return_val;
838 }
839 }
840
841 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
842 2, 3, 0,
843 "Return a list of charsets in the region between BEG and END.\n\
844 BEG and END are buffer positions.\n\
845 Optional arg TABLE if non-nil is a translation table to look up.\n\
846 \n\
847 If the region contains invalid multiybte characters,\n\
848 `unknown' is included in the returned list.\n\
849 \n\
850 If the current buffer is unibyte, the returned list may contain\n\
851 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
852 (beg, end, table)
853 Lisp_Object beg, end, table;
854 {
855 int charsets[MAX_CHARSET + 1];
856 int from, from_byte, to, stop, stop_byte, i;
857 Lisp_Object val;
858
859 validate_region (&beg, &end);
860 from = XFASTINT (beg);
861 stop = to = XFASTINT (end);
862
863 if (from < GPT && GPT < to)
864 {
865 stop = GPT;
866 stop_byte = GPT_BYTE;
867 }
868 else
869 stop_byte = CHAR_TO_BYTE (stop);
870
871 from_byte = CHAR_TO_BYTE (from);
872
873 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
874 while (1)
875 {
876 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
877 stop_byte - from_byte, charsets, table);
878 if (stop < to)
879 {
880 from = stop, from_byte = stop_byte;
881 stop = to, stop_byte = CHAR_TO_BYTE (stop);
882 }
883 else
884 break;
885 }
886
887 val = Qnil;
888 if (charsets[1])
889 val = Fcons (Qunknown, val);
890 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
891 if (charsets[i])
892 val = Fcons (CHARSET_SYMBOL (i), val);
893 if (charsets[0])
894 val = Fcons (Qascii, val);
895 return val;
896 }
897
898 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
899 1, 2, 0,
900 "Return a list of charsets in STR.\n\
901 Optional arg TABLE if non-nil is a translation table to look up.\n\
902 \n\
903 If the region contains invalid multiybte characters,\n\
904 `unknown' is included in the returned list.\n\
905 \n\
906 If STR is unibyte, the returned list may contain\n\
907 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
908 (str, table)
909 Lisp_Object str, table;
910 {
911 int charsets[MAX_CHARSET + 1];
912 int i;
913 Lisp_Object val;
914
915 CHECK_STRING (str, 0);
916
917 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
918 find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
919 STRING_BYTES (XSTRING (str)), charsets, table);
920
921 val = Qnil;
922 if (charsets[1])
923 val = Fcons (Qunknown, val);
924 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
925 if (charsets[i])
926 val = Fcons (CHARSET_SYMBOL (i), val);
927 if (charsets[0])
928 val = Fcons (Qascii, val);
929 return val;
930 }
931
932 \f
933 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
934 "")
935 (charset, code1, code2)
936 Lisp_Object charset, code1, code2;
937 {
938 int charset_id, c1, c2;
939
940 CHECK_NUMBER (charset, 0);
941 charset_id = XINT (charset);
942 if (!CHARSET_DEFINED_P (charset_id))
943 error ("Invalid charset ID: %d", XINT (charset));
944
945 if (NILP (code1))
946 c1 = 0;
947 else
948 {
949 CHECK_NUMBER (code1, 1);
950 c1 = XINT (code1);
951 }
952 if (NILP (code2))
953 c2 = 0;
954 else
955 {
956 CHECK_NUMBER (code2, 2);
957 c2 = XINT (code2);
958 }
959
960 if (charset_id == CHARSET_ASCII)
961 {
962 if (c1 < 0 || c1 > 0x7F)
963 goto invalid_code_posints;
964 return make_number (c1);
965 }
966 else if (charset_id == CHARSET_8_BIT_CONTROL)
967 {
968 if (c1 < 0x80 || c1 > 0x9F)
969 goto invalid_code_posints;
970 return make_number (c1);
971 }
972 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
973 {
974 if (c1 < 0xA0 || c1 > 0xFF)
975 goto invalid_code_posints;
976 return make_number (c1);
977 }
978 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
979 goto invalid_code_posints;
980 c1 &= 0x7F;
981 c2 &= 0x7F;
982 if (c1 == 0
983 ? c2 != 0
984 : (c2 == 0
985 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
986 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
987 goto invalid_code_posints;
988 return make_number (MAKE_CHAR (charset_id, c1, c2));
989
990 invalid_code_posints:
991 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
992 }
993
994 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
995 "Return list of charset and one or two position-codes of CHAR.\n\
996 If CHAR is invalid as a character code,\n\
997 return a list of symbol `unknown' and CHAR.")
998 (ch)
999 Lisp_Object ch;
1000 {
1001 Lisp_Object val;
1002 int c, charset, c1, c2;
1003
1004 CHECK_NUMBER (ch, 0);
1005 c = XFASTINT (ch);
1006 if (!CHAR_VALID_P (c, 1))
1007 return Fcons (Qunknown, Fcons (ch, Qnil));
1008 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1009 return (c2 >= 0
1010 ? Fcons (CHARSET_SYMBOL (charset),
1011 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1012 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1013 }
1014
1015 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1016 "Return charset of CHAR.")
1017 (ch)
1018 Lisp_Object ch;
1019 {
1020 CHECK_NUMBER (ch, 0);
1021
1022 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1023 }
1024
1025 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1026 "Return charset of a character in the current buffer at position POS.\n\
1027 If POS is nil, it defauls to the current point.\n\
1028 If POS is out of range, the value is nil.")
1029 (pos)
1030 Lisp_Object pos;
1031 {
1032 Lisp_Object ch;
1033 int charset;
1034
1035 ch = Fchar_after (pos);
1036 if (! INTEGERP (ch))
1037 return ch;
1038 charset = CHAR_CHARSET (XINT (ch));
1039 return CHARSET_SYMBOL (charset);
1040 }
1041
1042 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1043 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1044 \n\
1045 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1046 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1047 where as Emacs distinguishes them by charset symbol.\n\
1048 See the documentation of the function `charset-info' for the meanings of\n\
1049 DIMENSION, CHARS, and FINAL-CHAR.")
1050 (dimension, chars, final_char)
1051 Lisp_Object dimension, chars, final_char;
1052 {
1053 int charset;
1054
1055 CHECK_NUMBER (dimension, 0);
1056 CHECK_NUMBER (chars, 1);
1057 CHECK_NUMBER (final_char, 2);
1058
1059 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1060 return Qnil;
1061 return CHARSET_SYMBOL (charset);
1062 }
1063
1064 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1065 generic character. If GENERICP is zero, return nonzero iff C is a
1066 valid normal character. Do not call this function directly,
1067 instead use macro CHAR_VALID_P. */
1068 int
1069 char_valid_p (c, genericp)
1070 int c, genericp;
1071 {
1072 int charset, c1, c2;
1073
1074 if (c < 0)
1075 return 0;
1076 if (SINGLE_BYTE_CHAR_P (c))
1077 return 1;
1078 SPLIT_CHAR (c, charset, c1, c2);
1079 if (genericp)
1080 {
1081 if (c1)
1082 {
1083 if (c2 <= 0) c2 = 0x20;
1084 }
1085 else
1086 {
1087 if (c2 <= 0) c1 = c2 = 0x20;
1088 }
1089 }
1090 return (CHARSET_DEFINED_P (charset)
1091 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1092 }
1093
1094 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1095 "Return t if OBJECT is a valid normal character.\n\
1096 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1097 a valid generic character.")
1098 (object, genericp)
1099 Lisp_Object object, genericp;
1100 {
1101 if (! NATNUMP (object))
1102 return Qnil;
1103 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1104 }
1105
1106 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1107 Sunibyte_char_to_multibyte, 1, 1, 0,
1108 "Convert the unibyte character CH to multibyte character.\n\
1109 The conversion is done based on `nonascii-translation-table' (which see)\n\
1110 or `nonascii-insert-offset' (which see).")
1111 (ch)
1112 Lisp_Object ch;
1113 {
1114 int c;
1115
1116 CHECK_NUMBER (ch, 0);
1117 c = XINT (ch);
1118 if (c < 0 || c >= 0400)
1119 error ("Invalid unibyte character: %d", c);
1120 c = unibyte_char_to_multibyte (c);
1121 if (c < 0)
1122 error ("Can't convert to multibyte character: %d", XINT (ch));
1123 return make_number (c);
1124 }
1125
1126 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1127 Smultibyte_char_to_unibyte, 1, 1, 0,
1128 "Convert the multibyte character CH to unibyte character.\n\
1129 The conversion is done based on `nonascii-translation-table' (which see)\n\
1130 or `nonascii-insert-offset' (which see).")
1131 (ch)
1132 Lisp_Object ch;
1133 {
1134 int c;
1135
1136 CHECK_NUMBER (ch, 0);
1137 c = XINT (ch);
1138 if (! CHAR_VALID_P (c, 0))
1139 error ("Invalid multibyte character: %d", c);
1140 c = multibyte_char_to_unibyte (c, Qnil);
1141 if (c < 0)
1142 error ("Can't convert to unibyte character: %d", XINT (ch));
1143 return make_number (c);
1144 }
1145
1146 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1147 "Return 1 regardless of the argument CHAR.\n\
1148 This is now an obsolete function. We keep it just for backward compatibility.")
1149 (ch)
1150 Lisp_Object ch;
1151 {
1152 Lisp_Object val;
1153
1154 CHECK_NUMBER (ch, 0);
1155 return make_number (1);
1156 }
1157
1158 /* Return how many bytes C will occupy in a multibyte buffer.
1159 Don't call this function directly, instead use macro CHAR_BYTES. */
1160 int
1161 char_bytes (c)
1162 int c;
1163 {
1164 int charset;
1165
1166 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1167 return 1;
1168 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1169 return 1;
1170
1171 charset = CHAR_CHARSET (c);
1172 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1173 }
1174
1175 /* Return the width of character of which multi-byte form starts with
1176 C. The width is measured by how many columns occupied on the
1177 screen when displayed in the current buffer. */
1178
1179 #define ONE_BYTE_CHAR_WIDTH(c) \
1180 (c < 0x20 \
1181 ? (c == '\t' \
1182 ? XFASTINT (current_buffer->tab_width) \
1183 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1184 : (c < 0x7f \
1185 ? 1 \
1186 : (c == 0x7F \
1187 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1188 : ((! NILP (current_buffer->enable_multibyte_characters) \
1189 && BASE_LEADING_CODE_P (c)) \
1190 ? WIDTH_BY_CHAR_HEAD (c) \
1191 : 4))))
1192
1193 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1194 "Return width of CHAR when displayed in the current buffer.\n\
1195 The width is measured by how many columns it occupies on the screen.")
1196 (ch)
1197 Lisp_Object ch;
1198 {
1199 Lisp_Object val, disp;
1200 int c;
1201 struct Lisp_Char_Table *dp = buffer_display_table ();
1202
1203 CHECK_NUMBER (ch, 0);
1204
1205 c = XINT (ch);
1206
1207 /* Get the way the display table would display it. */
1208 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1209
1210 if (VECTORP (disp))
1211 XSETINT (val, XVECTOR (disp)->size);
1212 else if (SINGLE_BYTE_CHAR_P (c))
1213 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1214 else
1215 {
1216 int charset = CHAR_CHARSET (c);
1217
1218 XSETFASTINT (val, CHARSET_WIDTH (charset));
1219 }
1220 return val;
1221 }
1222
1223 /* Return width of string STR of length LEN when displayed in the
1224 current buffer. The width is measured by how many columns it
1225 occupies on the screen. */
1226
1227 int
1228 strwidth (str, len)
1229 unsigned char *str;
1230 int len;
1231 {
1232 unsigned char *endp = str + len;
1233 int width = 0;
1234 struct Lisp_Char_Table *dp = buffer_display_table ();
1235
1236 while (str < endp)
1237 {
1238 Lisp_Object disp;
1239 int thislen;
1240 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1241
1242 /* Get the way the display table would display it. */
1243 if (dp)
1244 disp = DISP_CHAR_VECTOR (dp, c);
1245 else
1246 disp = Qnil;
1247
1248 if (VECTORP (disp))
1249 width += XVECTOR (disp)->size;
1250 else
1251 width += ONE_BYTE_CHAR_WIDTH (*str);
1252
1253 str += thislen;
1254 }
1255 return width;
1256 }
1257
1258 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1259 "Return width of STRING when displayed in the current buffer.\n\
1260 Width is measured by how many columns it occupies on the screen.\n\
1261 When calculating width of a multibyte character in STRING,\n\
1262 only the base leading-code is considered; the validity of\n\
1263 the following bytes is not checked.")
1264 (str)
1265 Lisp_Object str;
1266 {
1267 Lisp_Object val;
1268
1269 CHECK_STRING (str, 0);
1270 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1271 STRING_BYTES (XSTRING (str))));
1272 return val;
1273 }
1274
1275 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1276 "Return the direction of CHAR.\n\
1277 The returned value is 0 for left-to-right and 1 for right-to-left.")
1278 (ch)
1279 Lisp_Object ch;
1280 {
1281 int charset;
1282
1283 CHECK_NUMBER (ch, 0);
1284 charset = CHAR_CHARSET (XFASTINT (ch));
1285 if (!CHARSET_DEFINED_P (charset))
1286 invalid_character (XINT (ch));
1287 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1288 }
1289
1290 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1291 "Return number of characters between BEG and END.")
1292 (beg, end)
1293 Lisp_Object beg, end;
1294 {
1295 int from, to;
1296
1297 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1298 CHECK_NUMBER_COERCE_MARKER (end, 1);
1299
1300 from = min (XFASTINT (beg), XFASTINT (end));
1301 to = max (XFASTINT (beg), XFASTINT (end));
1302
1303 return make_number (to - from);
1304 }
1305
1306 /* Return the number of characters in the NBYTES bytes at PTR.
1307 This works by looking at the contents and checking for multibyte sequences.
1308 However, if the current buffer has enable-multibyte-characters = nil,
1309 we treat each byte as a character. */
1310
1311 int
1312 chars_in_text (ptr, nbytes)
1313 unsigned char *ptr;
1314 int nbytes;
1315 {
1316 /* current_buffer is null at early stages of Emacs initialization. */
1317 if (current_buffer == 0
1318 || NILP (current_buffer->enable_multibyte_characters))
1319 return nbytes;
1320
1321 return multibyte_chars_in_text (ptr, nbytes);
1322 }
1323
1324 /* Return the number of characters in the NBYTES bytes at PTR.
1325 This works by looking at the contents and checking for multibyte sequences.
1326 It ignores enable-multibyte-characters. */
1327
1328 int
1329 multibyte_chars_in_text (ptr, nbytes)
1330 unsigned char *ptr;
1331 int nbytes;
1332 {
1333 unsigned char *endp;
1334 int chars, bytes;
1335
1336 endp = ptr + nbytes;
1337 chars = 0;
1338
1339 while (ptr < endp)
1340 {
1341 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1342 ptr += bytes;
1343 chars++;
1344 }
1345
1346 return chars;
1347 }
1348
1349 /* Parse unibyte text at STR of LEN bytes as a multibyte text, and
1350 count the numbers of characters and bytes in it. On counting
1351 bytes, pay attention to that 8-bit characters in the range
1352 0x80..0x9F are represented by 2-byte in a multibyte text. */
1353 void
1354 parse_str_as_multibyte (str, len, nchars, nbytes)
1355 unsigned char *str;
1356 int len, *nchars, *nbytes;
1357 {
1358 unsigned char *endp = str + len;
1359 int n, chars = 0, bytes = 0;
1360
1361 while (str < endp)
1362 {
1363 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1364 str += n, bytes += n;
1365 else
1366 str++, bytes += 2;
1367 chars++;
1368 }
1369 *nchars = chars;
1370 *nbytes = bytes;
1371 return;
1372 }
1373
1374 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
1375 It actually converts only 8-bit characters in the range 0x80..0x9F
1376 that don't contruct multibyte characters to multibyte forms. If
1377 NCHARS is nonzero, set *NCHARS to the number of characters in the
1378 text. It is assured that we can use LEN bytes at STR as a work
1379 area and that is enough. Return the number of bytes of the
1380 resulting text. */
1381
1382 int
1383 str_as_multibyte (str, len, nbytes, nchars)
1384 unsigned char *str;
1385 int len, nbytes, *nchars;
1386 {
1387 unsigned char *p = str, *endp = str + nbytes;
1388 unsigned char *to;
1389 int chars = 0;
1390 int n;
1391
1392 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1393 p += n, chars++;
1394 if (nchars)
1395 *nchars = chars;
1396 if (p == endp)
1397 return nbytes;
1398
1399 to = p;
1400 nbytes = endp - p;
1401 endp = str + len;
1402 safe_bcopy (p, endp - nbytes, nbytes);
1403 p = endp - nbytes;
1404 while (p < endp)
1405 {
1406 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1407 {
1408 while (n--)
1409 *to++ = *p++;
1410 }
1411 else
1412 {
1413 *to++ = LEADING_CODE_8_BIT_CONTROL;
1414 *to++ = *p++ + 0x20;
1415 }
1416 chars++;
1417 }
1418 if (nchars)
1419 *nchars = chars;
1420 return (to - str);
1421 }
1422
1423 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
1424 that contains the same single-byte characters. It actually
1425 converts all 8-bit characters to multibyte forms. It is assured
1426 that we can use LEN bytes at STR as a work area and that is
1427 enough. */
1428
1429 int
1430 str_to_multibyte (str, len, bytes)
1431 unsigned char *str;
1432 int len, bytes;
1433 {
1434 unsigned char *p = str, *endp = str + bytes;
1435 unsigned char *to;
1436 int c;
1437
1438 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1439 if (p == endp)
1440 return bytes;
1441 to = p;
1442 bytes = endp - p;
1443 endp = str + len;
1444 safe_bcopy (p, endp - bytes, bytes);
1445 p = endp - bytes;
1446 while (p < endp)
1447 {
1448 if (*p < 0x80 || *p >= 0xA0)
1449 *to++ = *p++;
1450 else
1451 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1452 }
1453 return (to - str);
1454 }
1455
1456 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1457 actually converts only 8-bit characters in the range 0x80..0x9F to
1458 unibyte forms. */
1459
1460 int
1461 str_as_unibyte (str, bytes)
1462 unsigned char *str;
1463 int bytes;
1464 {
1465 unsigned char *p = str, *endp = str + bytes;
1466 unsigned char *to = str;
1467
1468 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1469 to = p;
1470 while (p < endp)
1471 {
1472 if (*p == LEADING_CODE_8_BIT_CONTROL)
1473 *to++ = *(p + 1) - 0x20, p += 2;
1474 else
1475 *to++ = *p++;
1476 }
1477 return (to - str);
1478 }
1479
1480 \f
1481 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1482 "Concatenate all the argument characters and make the result a string.")
1483 (n, args)
1484 int n;
1485 Lisp_Object *args;
1486 {
1487 int i;
1488 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1489 unsigned char *p = buf;
1490 int c;
1491
1492 for (i = 0; i < n; i++)
1493 {
1494 CHECK_NUMBER (args[i], 0);
1495 c = XINT (args[i]);
1496 p += CHAR_STRING (c, p);
1497 }
1498
1499 return make_string_from_bytes (buf, n, p - buf);
1500 }
1501
1502 #endif /* emacs */
1503 \f
1504 int
1505 charset_id_internal (charset_name)
1506 char *charset_name;
1507 {
1508 Lisp_Object val;
1509
1510 val= Fget (intern (charset_name), Qcharset);
1511 if (!VECTORP (val))
1512 error ("Charset %s is not defined", charset_name);
1513
1514 return (XINT (XVECTOR (val)->contents[0]));
1515 }
1516
1517 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1518 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1519 ()
1520 {
1521 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1522 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1523 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1524 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1525 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1526 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1527 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1528 return Qnil;
1529 }
1530
1531 void
1532 init_charset_once ()
1533 {
1534 int i, j, k;
1535
1536 staticpro (&Vcharset_table);
1537 staticpro (&Vcharset_symbol_table);
1538 staticpro (&Vgeneric_character_list);
1539
1540 /* This has to be done here, before we call Fmake_char_table. */
1541 Qcharset_table = intern ("charset-table");
1542 staticpro (&Qcharset_table);
1543
1544 /* Intern this now in case it isn't already done.
1545 Setting this variable twice is harmless.
1546 But don't staticpro it here--that is done in alloc.c. */
1547 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1548
1549 /* Now we are ready to set up this property, so we can
1550 create the charset table. */
1551 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1552 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1553
1554 Qunknown = intern ("unknown");
1555 staticpro (&Qunknown);
1556 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1557 Qunknown);
1558
1559 /* Setup tables. */
1560 for (i = 0; i < 2; i++)
1561 for (j = 0; j < 2; j++)
1562 for (k = 0; k < 128; k++)
1563 iso_charset_table [i][j][k] = -1;
1564
1565 for (i = 0; i < 256; i++)
1566 bytes_by_char_head[i] = 1;
1567 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
1568 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
1569 bytes_by_char_head[i] = 2;
1570 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
1571 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1572 bytes_by_char_head[i] = 3;
1573 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1574 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1575 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1576 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1577 bytes_by_char_head[LEADING_CODE_8_BIT_CONTROL] = 2;
1578
1579 for (i = 0; i < 128; i++)
1580 width_by_char_head[i] = 1;
1581 for (; i < 256; i++)
1582 width_by_char_head[i] = 4;
1583 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1584 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1585 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1586 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1587
1588 {
1589 Lisp_Object val;
1590
1591 val = Qnil;
1592 for (i = 0x81; i < 0x90; i++)
1593 val = Fcons (make_number ((i - 0x70) << 7), val);
1594 for (; i < 0x9A; i++)
1595 val = Fcons (make_number ((i - 0x8F) << 14), val);
1596 for (i = 0xA0; i < 0xF0; i++)
1597 val = Fcons (make_number ((i - 0x70) << 7), val);
1598 for (; i < 0xFF; i++)
1599 val = Fcons (make_number ((i - 0xE0) << 14), val);
1600 Vgeneric_character_list = Fnreverse (val);
1601 }
1602
1603 nonascii_insert_offset = 0;
1604 Vnonascii_translation_table = Qnil;
1605 }
1606
1607 #ifdef emacs
1608
1609 void
1610 syms_of_charset ()
1611 {
1612 Qcharset = intern ("charset");
1613 staticpro (&Qcharset);
1614
1615 Qascii = intern ("ascii");
1616 staticpro (&Qascii);
1617
1618 Qeight_bit_control = intern ("eight-bit-control");
1619 staticpro (&Qeight_bit_control);
1620
1621 Qeight_bit_graphic = intern ("eight-bit-graphic");
1622 staticpro (&Qeight_bit_graphic);
1623
1624 /* Define special charsets ascii, eight-bit-control, and
1625 eight-bit-graphic. */
1626 update_charset_table (make_number (CHARSET_ASCII),
1627 make_number (1), make_number (94),
1628 make_number (1),
1629 make_number (0),
1630 make_number ('B'),
1631 make_number (0),
1632 build_string ("ASCII"),
1633 build_string ("ASCII"),
1634 build_string ("ASCII (ISO646 IRV)"));
1635 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1636 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1637
1638 update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1639 make_number (1), make_number (96),
1640 make_number (4),
1641 make_number (0),
1642 make_number (-1),
1643 make_number (-1),
1644 build_string ("8-bit control code (0x80..0x9F)"),
1645 build_string ("8-bit control code (0x80..0x9F)"),
1646 build_string ("8-bit control code (0x80..0x9F)"));
1647 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1648 Fput (Qeight_bit_control, Qcharset,
1649 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1650
1651 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1652 make_number (1), make_number (96),
1653 make_number (4),
1654 make_number (0),
1655 make_number (-1),
1656 make_number (-1),
1657 build_string ("8-bit graphic char"),
1658 build_string ("8-bit graphic char (0xA0..0xFF)"),
1659 build_string ("8-bit graphic char (0xA0..0xFF)"));
1660 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1661 Fput (Qeight_bit_graphic, Qcharset,
1662 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1663
1664 Qauto_fill_chars = intern ("auto-fill-chars");
1665 staticpro (&Qauto_fill_chars);
1666 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1667
1668 defsubr (&Sdefine_charset);
1669 defsubr (&Sgeneric_character_list);
1670 defsubr (&Sget_unused_iso_final_char);
1671 defsubr (&Sdeclare_equiv_charset);
1672 defsubr (&Sfind_charset_region);
1673 defsubr (&Sfind_charset_string);
1674 defsubr (&Smake_char_internal);
1675 defsubr (&Ssplit_char);
1676 defsubr (&Schar_charset);
1677 defsubr (&Scharset_after);
1678 defsubr (&Siso_charset);
1679 defsubr (&Schar_valid_p);
1680 defsubr (&Sunibyte_char_to_multibyte);
1681 defsubr (&Smultibyte_char_to_unibyte);
1682 defsubr (&Schar_bytes);
1683 defsubr (&Schar_width);
1684 defsubr (&Sstring_width);
1685 defsubr (&Schar_direction);
1686 defsubr (&Schars_in_region);
1687 defsubr (&Sstring);
1688 defsubr (&Ssetup_special_charsets);
1689
1690 DEFVAR_LISP ("charset-list", &Vcharset_list,
1691 "List of charsets ever defined.");
1692 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1693 Fcons (Qeight_bit_graphic, Qnil)));
1694
1695 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1696 "Vector of cons cell of a symbol and translation table ever defined.\n\
1697 An ID of a translation table is an index of this vector.");
1698 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1699
1700 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1701 "Leading-code of private TYPE9N charset of column-width 1.");
1702 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1703
1704 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1705 "Leading-code of private TYPE9N charset of column-width 2.");
1706 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1707
1708 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1709 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1710 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1711
1712 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1713 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1714 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1715
1716 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1717 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1718 This is used for converting unibyte text to multibyte,\n\
1719 and for inserting character codes specified by number.\n\n\
1720 This serves to convert a Latin-1 or similar 8-bit character code\n\
1721 to the corresponding Emacs multibyte character code.\n\
1722 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1723 for your choice of character set.\n\
1724 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1725 nonascii_insert_offset = 0;
1726
1727 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1728 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1729 This is used for converting unibyte text to multibyte,\n\
1730 and for inserting character codes specified by number.\n\n\
1731 Conversion is performed only when multibyte characters are enabled,\n\
1732 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1733 to the corresponding Emacs character code.\n\n\
1734 If this is nil, `nonascii-insert-offset' is used instead.\n\
1735 See also the docstring of `make-translation-table'.");
1736 Vnonascii_translation_table = Qnil;
1737
1738 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1739 "A char-table for characters which invoke auto-filling.\n\
1740 Such characters have value t in this table.");
1741 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1742 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1743 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1744 }
1745
1746 #endif /* emacs */