(char_printable_p): Remove unused variable `chars'.
[bpt/emacs.git] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #ifdef emacs
26 #include <config.h>
27 #endif
28
29 #include <stdio.h>
30
31 #ifdef emacs
32
33 #include <sys/types.h>
34 #include "lisp.h"
35 #include "buffer.h"
36 #include "charset.h"
37 #include "composite.h"
38 #include "coding.h"
39 #include "disptab.h"
40
41 #else /* not emacs */
42
43 #include "mulelib.h"
44
45 #endif /* emacs */
46
47 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
48 Lisp_Object Qunknown;
49
50 /* Declaration of special leading-codes. */
51 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
52 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
53 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
54 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
55
56 /* Declaration of special charsets. The values are set by
57 Fsetup_special_charsets. */
58 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
59 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
60 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
61 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
62 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
63 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
64 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
65
66 Lisp_Object Qcharset_table;
67
68 /* A char-table containing information of each character set. */
69 Lisp_Object Vcharset_table;
70
71 /* A vector of charset symbol indexed by charset-id. This is used
72 only for returning charset symbol from C functions. */
73 Lisp_Object Vcharset_symbol_table;
74
75 /* A list of charset symbols ever defined. */
76 Lisp_Object Vcharset_list;
77
78 /* Vector of translation table ever defined.
79 ID of a translation table is used to index this vector. */
80 Lisp_Object Vtranslation_table_vector;
81
82 /* A char-table for characters which may invoke auto-filling. */
83 Lisp_Object Vauto_fill_chars;
84
85 Lisp_Object Qauto_fill_chars;
86
87 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
88 int bytes_by_char_head[256];
89 int width_by_char_head[256];
90
91 /* Mapping table from ISO2022's charset (specified by DIMENSION,
92 CHARS, and FINAL-CHAR) to Emacs' charset. */
93 int iso_charset_table[2][2][128];
94
95 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
96 unsigned char *_fetch_multibyte_char_p;
97 int _fetch_multibyte_char_len;
98
99 /* Offset to add to a non-ASCII value when inserting it. */
100 int nonascii_insert_offset;
101
102 /* Translation table for converting non-ASCII unibyte characters
103 to multibyte codes, or nil. */
104 Lisp_Object Vnonascii_translation_table;
105
106 /* List of all possible generic characters. */
107 Lisp_Object Vgeneric_character_list;
108
109 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
110 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
111 \f
112 void
113 invalid_character (c)
114 int c;
115 {
116 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
117 }
118
119 /* Parse string STR of length LENGTH and fetch information of a
120 character at STR. Set BYTES to the byte length the character
121 occupies, CHARSET, C1, C2 to proper values of the character. */
122
123 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
124 do { \
125 (c1) = *(str); \
126 (bytes) = BYTES_BY_CHAR_HEAD (c1); \
127 if ((bytes) == 1) \
128 (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
129 else if ((bytes) == 2) \
130 { \
131 if ((c1) == LEADING_CODE_8_BIT_CONTROL) \
132 (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20; \
133 else \
134 (charset) = (c1), (c1) = (str)[1] & 0x7F; \
135 } \
136 else if ((bytes) == 3) \
137 { \
138 if ((c1) < LEADING_CODE_PRIVATE_11) \
139 (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F; \
140 else \
141 (charset) = (str)[1], (c1) = (str)[2] & 0x7F; \
142 } \
143 else \
144 (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F; \
145 } while (0)
146
147 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
148 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
149 ((charset) == CHARSET_ASCII \
150 ? ((c1) >= 0 && (c1) <= 0x7F) \
151 : ((charset) == CHARSET_8_BIT_CONTROL \
152 ? ((c1) >= 0x80 && (c1) <= 0x9F) \
153 : ((charset) == CHARSET_8_BIT_GRAPHIC \
154 ? ((c1) >= 0x80 && (c1) <= 0xFF) \
155 : (CHARSET_DIMENSION (charset) == 1 \
156 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
157 : ((c1) >= 0x20 && (c1) <= 0x7F \
158 && (c2) >= 0x20 && (c2) <= 0x7F)))))
159
160 /* Store multi-byte form of the character C in STR. The caller should
161 allocate at least 4-byte area at STR in advance. Returns the
162 length of the multi-byte form. If C is an invalid character code,
163 signal an error.
164
165 Use macro `CHAR_STRING (C, STR)' instead of calling this function
166 directly if C can be an ASCII character. */
167
168 int
169 char_to_string (c, str)
170 int c;
171 unsigned char *str;
172 {
173 unsigned char *p = str;
174
175 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
176 {
177 /* Multibyte character can't have a modifier bit. */
178 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
179 invalid_character (c);
180
181 /* For Meta, Shift, and Control modifiers, we need special care. */
182 if (c & CHAR_META)
183 {
184 /* Move the meta bit to the right place for a string. */
185 c = (c & ~CHAR_META) | 0x80;
186 }
187 if (c & CHAR_SHIFT)
188 {
189 /* Shift modifier is valid only with [A-Za-z]. */
190 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
191 c &= ~CHAR_SHIFT;
192 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
193 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
194 }
195 if (c & CHAR_CTL)
196 {
197 /* Simulate the code in lread.c. */
198 /* Allow `\C- ' and `\C-?'. */
199 if (c == (CHAR_CTL | ' '))
200 c = 0;
201 else if (c == (CHAR_CTL | '?'))
202 c = 127;
203 /* ASCII control chars are made from letters (both cases),
204 as well as the non-letters within 0100...0137. */
205 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
206 c &= (037 | (~0177 & ~CHAR_CTL));
207 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
208 c &= (037 | (~0177 & ~CHAR_CTL));
209 }
210
211 /* If C still has any modifier bits, it is an invalid character. */
212 if (c & CHAR_MODIFIER_MASK)
213 invalid_character (c);
214 }
215 if (SINGLE_BYTE_CHAR_P (c))
216 {
217 if (ASCII_BYTE_P (c) || c >= 0xA0)
218 *p++ = c;
219 else
220 {
221 *p++ = LEADING_CODE_8_BIT_CONTROL;
222 *p++ = c + 0x20;
223 }
224 }
225 else if (CHAR_VALID_P (c, 0))
226 {
227 int charset, c1, c2;
228
229 SPLIT_CHAR (c, charset, c1, c2);
230
231 if (charset >= LEADING_CODE_EXT_11)
232 *p++ = (charset < LEADING_CODE_EXT_12
233 ? LEADING_CODE_PRIVATE_11
234 : (charset < LEADING_CODE_EXT_21
235 ? LEADING_CODE_PRIVATE_12
236 : (charset < LEADING_CODE_EXT_22
237 ? LEADING_CODE_PRIVATE_21
238 : LEADING_CODE_PRIVATE_22)));
239 *p++ = charset;
240 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
241 invalid_character (c);
242 if (c1)
243 {
244 *p++ = c1 | 0x80;
245 if (c2 > 0)
246 *p++ = c2 | 0x80;
247 }
248 }
249 else
250 invalid_character (c);
251
252 return (p - str);
253 }
254
255 /* Return the non-ASCII character corresponding to multi-byte form at
256 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
257 length of the multibyte form in *ACTUAL_LEN.
258
259 Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
260 this function directly if you want ot handle ASCII characters as
261 well. */
262
263 int
264 string_to_char (str, len, actual_len)
265 const unsigned char *str;
266 int len, *actual_len;
267 {
268 int c, bytes, charset, c1, c2;
269
270 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
271 c = MAKE_CHAR (charset, c1, c2);
272 if (actual_len)
273 *actual_len = bytes;
274 return c;
275 }
276
277 /* Return the length of the multi-byte form at string STR of length LEN.
278 Use the macro MULTIBYTE_FORM_LENGTH instead. */
279 int
280 multibyte_form_length (str, len)
281 const unsigned char *str;
282 int len;
283 {
284 int bytes;
285
286 PARSE_MULTIBYTE_SEQ (str, len, bytes);
287 return bytes;
288 }
289
290 /* Check multibyte form at string STR of length LEN and set variables
291 pointed by CHARSET, C1, and C2 to charset and position codes of the
292 character at STR, and return 0. If there's no multibyte character,
293 return -1. This should be used only in the macro SPLIT_STRING
294 which checks range of STR in advance. */
295
296 int
297 split_string (str, len, charset, c1, c2)
298 const unsigned char *str;
299 unsigned char *c1, *c2;
300 int len, *charset;
301 {
302 register int bytes, cs, code1, code2 = -1;
303
304 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
305 if (cs == CHARSET_ASCII)
306 return -1;
307 *charset = cs;
308 *c1 = code1;
309 *c2 = code2;
310 return 0;
311 }
312
313 /* Return 1 iff character C has valid printable glyph.
314 Use the macro CHAR_PRINTABLE_P instead. */
315 int
316 char_printable_p (c)
317 int c;
318 {
319 int charset, c1, c2;
320
321 if (ASCII_BYTE_P (c))
322 return 1;
323 else if (SINGLE_BYTE_CHAR_P (c))
324 return 0;
325 else if (c >= MAX_CHAR)
326 return 0;
327
328 SPLIT_CHAR (c, charset, c1, c2);
329 if (! CHARSET_DEFINED_P (charset))
330 return 0;
331 if (CHARSET_CHARS (charset) == 94
332 ? c1 <= 32 || c1 >= 127
333 : c1 < 32)
334 return 0;
335 if (CHARSET_DIMENSION (charset) == 2
336 && (CHARSET_CHARS (charset) == 94
337 ? c2 <= 32 || c2 >= 127
338 : c2 < 32))
339 return 0;
340 return 1;
341 }
342
343 /* Translate character C by translation table TABLE. If C
344 is negative, translate a character specified by CHARSET, C1, and C2
345 (C1 and C2 are code points of the character). If no translation is
346 found in TABLE, return C. */
347 int
348 translate_char (table, c, charset, c1, c2)
349 Lisp_Object table;
350 int c, charset, c1, c2;
351 {
352 Lisp_Object ch;
353 int alt_charset, alt_c1, alt_c2, dimension;
354
355 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
356 if (!CHAR_TABLE_P (table)
357 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
358 return c;
359
360 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
361 dimension = CHARSET_DIMENSION (alt_charset);
362 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
363 /* CH is not a generic character, just return it. */
364 return XFASTINT (ch);
365
366 /* Since CH is a generic character, we must return a specific
367 charater which has the same position codes as C from CH. */
368 if (charset < 0)
369 SPLIT_CHAR (c, charset, c1, c2);
370 if (dimension != CHARSET_DIMENSION (charset))
371 /* We can't make such a character because of dimension mismatch. */
372 return c;
373 return MAKE_CHAR (alt_charset, c1, c2);
374 }
375
376 /* Convert the unibyte character C to multibyte based on
377 Vnonascii_translation_table or nonascii_insert_offset. If they can't
378 convert C to a valid multibyte character, convert it based on
379 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
380
381 int
382 unibyte_char_to_multibyte (c)
383 int c;
384 {
385 if (c < 0400 && c >= 0200)
386 {
387 int c_save = c;
388
389 if (! NILP (Vnonascii_translation_table))
390 {
391 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
392 if (c >= 0400 && ! char_valid_p (c, 0))
393 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
394 }
395 else if (c >= 0240 && nonascii_insert_offset > 0)
396 {
397 c += nonascii_insert_offset;
398 if (c < 0400 || ! char_valid_p (c, 0))
399 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
400 }
401 else if (c >= 0240)
402 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
403 }
404 return c;
405 }
406
407
408 /* Convert the multibyte character C to unibyte 8-bit character based
409 on Vnonascii_translation_table or nonascii_insert_offset. If
410 REV_TBL is non-nil, it should be a reverse table of
411 Vnonascii_translation_table, i.e. what given by:
412 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
413
414 int
415 multibyte_char_to_unibyte (c, rev_tbl)
416 int c;
417 Lisp_Object rev_tbl;
418 {
419 if (!SINGLE_BYTE_CHAR_P (c))
420 {
421 int c_save = c;
422
423 if (! CHAR_TABLE_P (rev_tbl)
424 && CHAR_TABLE_P (Vnonascii_translation_table))
425 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
426 make_number (0));
427 if (CHAR_TABLE_P (rev_tbl))
428 {
429 Lisp_Object temp;
430 temp = Faref (rev_tbl, make_number (c));
431 if (INTEGERP (temp))
432 c = XINT (temp);
433 if (c >= 256)
434 c = (c_save & 0177) + 0200;
435 }
436 else
437 {
438 if (nonascii_insert_offset > 0)
439 c -= nonascii_insert_offset;
440 if (c < 128 || c >= 256)
441 c = (c_save & 0177) + 0200;
442 }
443 }
444
445 return c;
446 }
447
448 \f
449 /* Update the table Vcharset_table with the given arguments (see the
450 document of `define-charset' for the meaning of each argument).
451 Several other table contents are also updated. The caller should
452 check the validity of CHARSET-ID and the remaining arguments in
453 advance. */
454
455 void
456 update_charset_table (charset_id, dimension, chars, width, direction,
457 iso_final_char, iso_graphic_plane,
458 short_name, long_name, description)
459 Lisp_Object charset_id, dimension, chars, width, direction;
460 Lisp_Object iso_final_char, iso_graphic_plane;
461 Lisp_Object short_name, long_name, description;
462 {
463 int charset = XINT (charset_id);
464 int bytes;
465 unsigned char leading_code_base, leading_code_ext;
466
467 if (NILP (CHARSET_TABLE_ENTRY (charset)))
468 CHARSET_TABLE_ENTRY (charset)
469 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
470
471 if (NILP (long_name))
472 long_name = short_name;
473 if (NILP (description))
474 description = long_name;
475
476 /* Get byte length of multibyte form, base leading-code, and
477 extended leading-code of the charset. See the comment under the
478 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
479 bytes = XINT (dimension);
480 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
481 {
482 /* Official charset, it doesn't have an extended leading-code. */
483 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
484 bytes += 1; /* For a base leading-code. */
485 leading_code_base = charset;
486 leading_code_ext = 0;
487 }
488 else
489 {
490 /* Private charset. */
491 bytes += 2; /* For base and extended leading-codes. */
492 leading_code_base
493 = (charset < LEADING_CODE_EXT_12
494 ? LEADING_CODE_PRIVATE_11
495 : (charset < LEADING_CODE_EXT_21
496 ? LEADING_CODE_PRIVATE_12
497 : (charset < LEADING_CODE_EXT_22
498 ? LEADING_CODE_PRIVATE_21
499 : LEADING_CODE_PRIVATE_22)));
500 leading_code_ext = charset;
501 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
502 error ("Invalid dimension for the charset-ID %d", charset);
503 }
504
505 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
506 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
507 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
508 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
509 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
510 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
511 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
512 = make_number (leading_code_base);
513 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
514 = make_number (leading_code_ext);
515 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
516 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
517 = iso_graphic_plane;
518 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
519 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
520 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
521 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
522
523 {
524 /* If we have already defined a charset which has the same
525 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
526 DIRECTION, we must update the entry REVERSE-CHARSET of both
527 charsets. If there's no such charset, the value of the entry
528 is set to nil. */
529 int i;
530
531 for (i = 0; i <= MAX_CHARSET; i++)
532 if (!NILP (CHARSET_TABLE_ENTRY (i)))
533 {
534 if (CHARSET_DIMENSION (i) == XINT (dimension)
535 && CHARSET_CHARS (i) == XINT (chars)
536 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
537 && CHARSET_DIRECTION (i) != XINT (direction))
538 {
539 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
540 = make_number (i);
541 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
542 break;
543 }
544 }
545 if (i > MAX_CHARSET)
546 /* No such a charset. */
547 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
548 = make_number (-1);
549 }
550
551 if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
552 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
553 {
554 bytes_by_char_head[leading_code_base] = bytes;
555 width_by_char_head[leading_code_base] = XINT (width);
556
557 /* Update table emacs_code_class. */
558 emacs_code_class[charset] = (bytes == 2
559 ? EMACS_leading_code_2
560 : (bytes == 3
561 ? EMACS_leading_code_3
562 : EMACS_leading_code_4));
563 }
564
565 /* Update table iso_charset_table. */
566 if (XINT (iso_final_char) >= 0
567 && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
568 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
569 }
570
571 #ifdef emacs
572
573 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
574 is invalid. */
575 int
576 get_charset_id (charset_symbol)
577 Lisp_Object charset_symbol;
578 {
579 Lisp_Object val;
580 int charset;
581
582 return ((SYMBOLP (charset_symbol)
583 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
584 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
585 CHARSET_VALID_P (charset)))
586 ? charset : -1);
587 }
588
589 /* Return an identification number for a new private charset of
590 DIMENSION and WIDTH. If there's no more room for the new charset,
591 return 0. */
592 Lisp_Object
593 get_new_private_charset_id (dimension, width)
594 int dimension, width;
595 {
596 int charset, from, to;
597
598 if (dimension == 1)
599 {
600 from = LEADING_CODE_EXT_11;
601 to = LEADING_CODE_EXT_21;
602 }
603 else
604 {
605 from = LEADING_CODE_EXT_21;
606 to = LEADING_CODE_EXT_MAX + 1;
607 }
608
609 for (charset = from; charset < to; charset++)
610 if (!CHARSET_DEFINED_P (charset)) break;
611
612 return make_number (charset < to ? charset : 0);
613 }
614
615 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
616 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
617 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
618 treated as a private charset.\n\
619 INFO-VECTOR is a vector of the format:\n\
620 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
621 SHORT-NAME LONG-NAME DESCRIPTION]\n\
622 The meanings of each elements is as follows:\n\
623 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
624 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
625 WIDTH (integer) is the number of columns a character in the charset\n\
626 occupies on the screen: one of 0, 1, and 2.\n\
627 \n\
628 DIRECTION (integer) is the rendering direction of characters in the\n\
629 charset when rendering. If 0, render from left to right, else\n\
630 render from right to left.\n\
631 \n\
632 ISO-FINAL-CHAR (character) is the final character of the\n\
633 corresponding ISO 2022 charset.\n\
634 It may be -1 if the charset is internal use only.\n\
635 \n\
636 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
637 while encoding to variants of ISO 2022 coding system, one of the\n\
638 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
639 It may be -1 if the charset is internal use only.\n\
640 \n\
641 SHORT-NAME (string) is the short name to refer to the charset.\n\
642 \n\
643 LONG-NAME (string) is the long name to refer to the charset.\n\
644 \n\
645 DESCRIPTION (string) is the description string of the charset.")
646 (charset_id, charset_symbol, info_vector)
647 Lisp_Object charset_id, charset_symbol, info_vector;
648 {
649 Lisp_Object *vec;
650
651 if (!NILP (charset_id))
652 CHECK_NUMBER (charset_id, 0);
653 CHECK_SYMBOL (charset_symbol, 1);
654 CHECK_VECTOR (info_vector, 2);
655
656 if (! NILP (charset_id))
657 {
658 if (! CHARSET_VALID_P (XINT (charset_id)))
659 error ("Invalid CHARSET: %d", XINT (charset_id));
660 else if (CHARSET_DEFINED_P (XINT (charset_id)))
661 error ("Already defined charset: %d", XINT (charset_id));
662 }
663
664 vec = XVECTOR (info_vector)->contents;
665 if (XVECTOR (info_vector)->size != 9
666 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
667 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
668 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
669 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
670 || !INTEGERP (vec[4])
671 || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
672 || !INTEGERP (vec[5])
673 || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
674 || !STRINGP (vec[6])
675 || !STRINGP (vec[7])
676 || !STRINGP (vec[8]))
677 error ("Invalid info-vector argument for defining charset %s",
678 XSYMBOL (charset_symbol)->name->data);
679
680 if (NILP (charset_id))
681 {
682 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
683 if (XINT (charset_id) == 0)
684 error ("There's no room for a new private charset %s",
685 XSYMBOL (charset_symbol)->name->data);
686 }
687
688 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
689 vec[4], vec[5], vec[6], vec[7], vec[8]);
690 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
691 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
692 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
693 return Qnil;
694 }
695
696 DEFUN ("generic-character-list", Fgeneric_character_list,
697 Sgeneric_character_list, 0, 0, 0,
698 "Return a list of all possible generic characters.\n\
699 It includes a generic character for a charset not yet defined.")
700 ()
701 {
702 return Vgeneric_character_list;
703 }
704
705 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
706 Sget_unused_iso_final_char, 2, 2, 0,
707 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
708 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
709 CHARS is the number of characters in a dimension: 94 or 96.\n\
710 \n\
711 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
712 If there's no unused final char for the specified kind of charset,\n\
713 return nil.")
714 (dimension, chars)
715 Lisp_Object dimension, chars;
716 {
717 int final_char;
718
719 CHECK_NUMBER (dimension, 0);
720 CHECK_NUMBER (chars, 1);
721 if (XINT (dimension) != 1 && XINT (dimension) != 2)
722 error ("Invalid charset dimension %d, it should be 1 or 2",
723 XINT (dimension));
724 if (XINT (chars) != 94 && XINT (chars) != 96)
725 error ("Invalid charset chars %d, it should be 94 or 96",
726 XINT (chars));
727 for (final_char = '0'; final_char <= '?'; final_char++)
728 {
729 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
730 break;
731 }
732 return (final_char <= '?' ? make_number (final_char) : Qnil);
733 }
734
735 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
736 4, 4, 0,
737 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
738 CHARSET should be defined by `defined-charset' in advance.")
739 (dimension, chars, final_char, charset_symbol)
740 Lisp_Object dimension, chars, final_char, charset_symbol;
741 {
742 int charset;
743
744 CHECK_NUMBER (dimension, 0);
745 CHECK_NUMBER (chars, 1);
746 CHECK_NUMBER (final_char, 2);
747 CHECK_SYMBOL (charset_symbol, 3);
748
749 if (XINT (dimension) != 1 && XINT (dimension) != 2)
750 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
751 if (XINT (chars) != 94 && XINT (chars) != 96)
752 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
753 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
754 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
755 if ((charset = get_charset_id (charset_symbol)) < 0)
756 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
757
758 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
759 return Qnil;
760 }
761
762 /* Return information about charsets in the text at PTR of NBYTES
763 bytes, which are NCHARS characters. The value is:
764
765 0: Each character is represented by one byte. This is always
766 true for unibyte text.
767 1: No charsets other than ascii eight-bit-control,
768 eight-bit-graphic, and latin-1 are found.
769 2: Otherwise.
770
771 In addition, if CHARSETS is nonzero, for each found charset N, set
772 CHARSETS[N] to 1. For that, callers should allocate CHARSETS
773 (MAX_CHARSET + 1 elements) in advance. It may lookup a translation
774 table TABLE if supplied. For invalid charsets, set CHARSETS[1] to
775 1 (note that there's no charset whose ID is 1). */
776
777 int
778 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
779 unsigned char *ptr;
780 int nchars, nbytes, *charsets;
781 Lisp_Object table;
782 {
783 if (nchars == nbytes)
784 {
785 if (charsets && nbytes > 0)
786 {
787 unsigned char *endp = ptr + nbytes;
788 int maskbits = 0;
789
790 while (ptr < endp && maskbits != 7)
791 {
792 maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
793 ptr++;
794 }
795
796 if (maskbits & 1)
797 charsets[CHARSET_ASCII] = 1;
798 if (maskbits & 2)
799 charsets[CHARSET_8_BIT_CONTROL] = 1;
800 if (maskbits & 4)
801 charsets[CHARSET_8_BIT_GRAPHIC] = 1;
802 }
803 return 0;
804 }
805 else
806 {
807 int return_val = 1;
808 int bytes, charset, c1, c2;
809
810 if (! CHAR_TABLE_P (table))
811 table = Qnil;
812
813 while (nchars-- > 0)
814 {
815 SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
816 ptr += bytes;
817
818 if (!CHARSET_DEFINED_P (charset))
819 charset = 1;
820 else if (! NILP (table))
821 {
822 int c = translate_char (table, -1, charset, c1, c2);
823 if (c >= 0)
824 charset = CHAR_CHARSET (c);
825 }
826
827 if (return_val == 1
828 && charset != CHARSET_ASCII
829 && charset != CHARSET_8_BIT_CONTROL
830 && charset != CHARSET_8_BIT_GRAPHIC
831 && charset != charset_latin_iso8859_1)
832 return_val = 2;
833
834 if (charsets)
835 charsets[charset] = 1;
836 else if (return_val == 2)
837 break;
838 }
839 return return_val;
840 }
841 }
842
843 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
844 2, 3, 0,
845 "Return a list of charsets in the region between BEG and END.\n\
846 BEG and END are buffer positions.\n\
847 Optional arg TABLE if non-nil is a translation table to look up.\n\
848 \n\
849 If the region contains invalid multibyte characters,\n\
850 `unknown' is included in the returned list.\n\
851 \n\
852 If the current buffer is unibyte, the returned list may contain\n\
853 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
854 (beg, end, table)
855 Lisp_Object beg, end, table;
856 {
857 int charsets[MAX_CHARSET + 1];
858 int from, from_byte, to, stop, stop_byte, i;
859 Lisp_Object val;
860
861 validate_region (&beg, &end);
862 from = XFASTINT (beg);
863 stop = to = XFASTINT (end);
864
865 if (from < GPT && GPT < to)
866 {
867 stop = GPT;
868 stop_byte = GPT_BYTE;
869 }
870 else
871 stop_byte = CHAR_TO_BYTE (stop);
872
873 from_byte = CHAR_TO_BYTE (from);
874
875 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
876 while (1)
877 {
878 find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
879 stop_byte - from_byte, charsets, table);
880 if (stop < to)
881 {
882 from = stop, from_byte = stop_byte;
883 stop = to, stop_byte = CHAR_TO_BYTE (stop);
884 }
885 else
886 break;
887 }
888
889 val = Qnil;
890 if (charsets[1])
891 val = Fcons (Qunknown, val);
892 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
893 if (charsets[i])
894 val = Fcons (CHARSET_SYMBOL (i), val);
895 if (charsets[0])
896 val = Fcons (Qascii, val);
897 return val;
898 }
899
900 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
901 1, 2, 0,
902 "Return a list of charsets in STR.\n\
903 Optional arg TABLE if non-nil is a translation table to look up.\n\
904 \n\
905 If the string contains invalid multibyte characters,\n\
906 `unknown' is included in the returned list.\n\
907 \n\
908 If STR is unibyte, the returned list may contain\n\
909 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
910 (str, table)
911 Lisp_Object str, table;
912 {
913 int charsets[MAX_CHARSET + 1];
914 int i;
915 Lisp_Object val;
916
917 CHECK_STRING (str, 0);
918
919 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
920 find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
921 STRING_BYTES (XSTRING (str)), charsets, table);
922
923 val = Qnil;
924 if (charsets[1])
925 val = Fcons (Qunknown, val);
926 for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
927 if (charsets[i])
928 val = Fcons (CHARSET_SYMBOL (i), val);
929 if (charsets[0])
930 val = Fcons (Qascii, val);
931 return val;
932 }
933
934 \f
935 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
936 "")
937 (charset, code1, code2)
938 Lisp_Object charset, code1, code2;
939 {
940 int charset_id, c1, c2;
941
942 CHECK_NUMBER (charset, 0);
943 charset_id = XINT (charset);
944 if (!CHARSET_DEFINED_P (charset_id))
945 error ("Invalid charset ID: %d", XINT (charset));
946
947 if (NILP (code1))
948 c1 = 0;
949 else
950 {
951 CHECK_NUMBER (code1, 1);
952 c1 = XINT (code1);
953 }
954 if (NILP (code2))
955 c2 = 0;
956 else
957 {
958 CHECK_NUMBER (code2, 2);
959 c2 = XINT (code2);
960 }
961
962 if (charset_id == CHARSET_ASCII)
963 {
964 if (c1 < 0 || c1 > 0x7F)
965 goto invalid_code_posints;
966 return make_number (c1);
967 }
968 else if (charset_id == CHARSET_8_BIT_CONTROL)
969 {
970 if (NILP (code1))
971 c1 = 0x80;
972 else if (c1 < 0x80 || c1 > 0x9F)
973 goto invalid_code_posints;
974 return make_number (c1);
975 }
976 else if (charset_id == CHARSET_8_BIT_GRAPHIC)
977 {
978 if (NILP (code1))
979 c1 = 0xA0;
980 else if (c1 < 0xA0 || c1 > 0xFF)
981 goto invalid_code_posints;
982 return make_number (c1);
983 }
984 else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
985 goto invalid_code_posints;
986 c1 &= 0x7F;
987 c2 &= 0x7F;
988 if (c1 == 0
989 ? c2 != 0
990 : (c2 == 0
991 ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
992 : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
993 goto invalid_code_posints;
994 return make_number (MAKE_CHAR (charset_id, c1, c2));
995
996 invalid_code_posints:
997 error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
998 }
999
1000 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1001 "Return list of charset and one or two position-codes of CHAR.\n\
1002 If CHAR is invalid as a character code,\n\
1003 return a list of symbol `unknown' and CHAR.")
1004 (ch)
1005 Lisp_Object ch;
1006 {
1007 int c, charset, c1, c2;
1008
1009 CHECK_NUMBER (ch, 0);
1010 c = XFASTINT (ch);
1011 if (!CHAR_VALID_P (c, 1))
1012 return Fcons (Qunknown, Fcons (ch, Qnil));
1013 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1014 return (c2 >= 0
1015 ? Fcons (CHARSET_SYMBOL (charset),
1016 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1017 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1018 }
1019
1020 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1021 "Return charset of CHAR.")
1022 (ch)
1023 Lisp_Object ch;
1024 {
1025 CHECK_NUMBER (ch, 0);
1026
1027 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1028 }
1029
1030 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1031 "Return charset of a character in the current buffer at position POS.\n\
1032 If POS is nil, it defauls to the current point.\n\
1033 If POS is out of range, the value is nil.")
1034 (pos)
1035 Lisp_Object pos;
1036 {
1037 Lisp_Object ch;
1038 int charset;
1039
1040 ch = Fchar_after (pos);
1041 if (! INTEGERP (ch))
1042 return ch;
1043 charset = CHAR_CHARSET (XINT (ch));
1044 return CHARSET_SYMBOL (charset);
1045 }
1046
1047 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1048 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1049 \n\
1050 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1051 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1052 where as Emacs distinguishes them by charset symbol.\n\
1053 See the documentation of the function `charset-info' for the meanings of\n\
1054 DIMENSION, CHARS, and FINAL-CHAR.")
1055 (dimension, chars, final_char)
1056 Lisp_Object dimension, chars, final_char;
1057 {
1058 int charset;
1059
1060 CHECK_NUMBER (dimension, 0);
1061 CHECK_NUMBER (chars, 1);
1062 CHECK_NUMBER (final_char, 2);
1063
1064 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1065 return Qnil;
1066 return CHARSET_SYMBOL (charset);
1067 }
1068
1069 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1070 generic character. If GENERICP is zero, return nonzero iff C is a
1071 valid normal character. Do not call this function directly,
1072 instead use macro CHAR_VALID_P. */
1073 int
1074 char_valid_p (c, genericp)
1075 int c, genericp;
1076 {
1077 int charset, c1, c2;
1078
1079 if (c < 0 || c >= MAX_CHAR)
1080 return 0;
1081 if (SINGLE_BYTE_CHAR_P (c))
1082 return 1;
1083 SPLIT_CHAR (c, charset, c1, c2);
1084 if (genericp)
1085 {
1086 if (c1)
1087 {
1088 if (c2 <= 0) c2 = 0x20;
1089 }
1090 else
1091 {
1092 if (c2 <= 0) c1 = c2 = 0x20;
1093 }
1094 }
1095 return (CHARSET_DEFINED_P (charset)
1096 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1097 }
1098
1099 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1100 "Return t if OBJECT is a valid normal character.\n\
1101 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1102 a valid generic character.")
1103 (object, genericp)
1104 Lisp_Object object, genericp;
1105 {
1106 if (! NATNUMP (object))
1107 return Qnil;
1108 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1109 }
1110
1111 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1112 Sunibyte_char_to_multibyte, 1, 1, 0,
1113 "Convert the unibyte character CH to multibyte character.\n\
1114 The conversion is done based on `nonascii-translation-table' (which see)\n\
1115 or `nonascii-insert-offset' (which see).")
1116 (ch)
1117 Lisp_Object ch;
1118 {
1119 int c;
1120
1121 CHECK_NUMBER (ch, 0);
1122 c = XINT (ch);
1123 if (c < 0 || c >= 0400)
1124 error ("Invalid unibyte character: %d", c);
1125 c = unibyte_char_to_multibyte (c);
1126 if (c < 0)
1127 error ("Can't convert to multibyte character: %d", XINT (ch));
1128 return make_number (c);
1129 }
1130
1131 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1132 Smultibyte_char_to_unibyte, 1, 1, 0,
1133 "Convert the multibyte character CH to unibyte character.\n\
1134 The conversion is done based on `nonascii-translation-table' (which see)\n\
1135 or `nonascii-insert-offset' (which see).")
1136 (ch)
1137 Lisp_Object ch;
1138 {
1139 int c;
1140
1141 CHECK_NUMBER (ch, 0);
1142 c = XINT (ch);
1143 if (! CHAR_VALID_P (c, 0))
1144 error ("Invalid multibyte character: %d", c);
1145 c = multibyte_char_to_unibyte (c, Qnil);
1146 if (c < 0)
1147 error ("Can't convert to unibyte character: %d", XINT (ch));
1148 return make_number (c);
1149 }
1150
1151 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1152 "Return 1 regardless of the argument CHAR.\n\
1153 This is now an obsolete function. We keep it just for backward compatibility.")
1154 (ch)
1155 Lisp_Object ch;
1156 {
1157 CHECK_NUMBER (ch, 0);
1158 return make_number (1);
1159 }
1160
1161 /* Return how many bytes C will occupy in a multibyte buffer.
1162 Don't call this function directly, instead use macro CHAR_BYTES. */
1163 int
1164 char_bytes (c)
1165 int c;
1166 {
1167 int charset;
1168
1169 if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1170 return 1;
1171 if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1172 return 1;
1173
1174 charset = CHAR_CHARSET (c);
1175 return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1176 }
1177
1178 /* Return the width of character of which multi-byte form starts with
1179 C. The width is measured by how many columns occupied on the
1180 screen when displayed in the current buffer. */
1181
1182 #define ONE_BYTE_CHAR_WIDTH(c) \
1183 (c < 0x20 \
1184 ? (c == '\t' \
1185 ? XFASTINT (current_buffer->tab_width) \
1186 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1187 : (c < 0x7f \
1188 ? 1 \
1189 : (c == 0x7F \
1190 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1191 : ((! NILP (current_buffer->enable_multibyte_characters) \
1192 && BASE_LEADING_CODE_P (c)) \
1193 ? WIDTH_BY_CHAR_HEAD (c) \
1194 : 4))))
1195
1196 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1197 "Return width of CHAR when displayed in the current buffer.\n\
1198 The width is measured by how many columns it occupies on the screen.\n\
1199 Tab is taken to occupy `tab-width' columns.")
1200 (ch)
1201 Lisp_Object ch;
1202 {
1203 Lisp_Object val, disp;
1204 int c;
1205 struct Lisp_Char_Table *dp = buffer_display_table ();
1206
1207 CHECK_NUMBER (ch, 0);
1208
1209 c = XINT (ch);
1210
1211 /* Get the way the display table would display it. */
1212 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1213
1214 if (VECTORP (disp))
1215 XSETINT (val, XVECTOR (disp)->size);
1216 else if (SINGLE_BYTE_CHAR_P (c))
1217 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1218 else
1219 {
1220 int charset = CHAR_CHARSET (c);
1221
1222 XSETFASTINT (val, CHARSET_WIDTH (charset));
1223 }
1224 return val;
1225 }
1226
1227 /* Return width of string STR of length LEN when displayed in the
1228 current buffer. The width is measured by how many columns it
1229 occupies on the screen. */
1230
1231 int
1232 strwidth (str, len)
1233 unsigned char *str;
1234 int len;
1235 {
1236 unsigned char *endp = str + len;
1237 int width = 0;
1238 struct Lisp_Char_Table *dp = buffer_display_table ();
1239
1240 while (str < endp)
1241 {
1242 Lisp_Object disp;
1243 int thislen;
1244 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1245
1246 /* Get the way the display table would display it. */
1247 if (dp)
1248 disp = DISP_CHAR_VECTOR (dp, c);
1249 else
1250 disp = Qnil;
1251
1252 if (VECTORP (disp))
1253 width += XVECTOR (disp)->size;
1254 else
1255 width += ONE_BYTE_CHAR_WIDTH (*str);
1256
1257 str += thislen;
1258 }
1259 return width;
1260 }
1261
1262 int
1263 lisp_string_width (str)
1264 Lisp_Object str;
1265 {
1266 int len = XSTRING (str)->size, len_byte = STRING_BYTES (XSTRING (str));
1267 int i = 0, i_byte;
1268 int width = 0;
1269 int start, end, start_byte;
1270 Lisp_Object prop;
1271 int cmp_id;
1272
1273 while (i < len)
1274 {
1275 if (find_composition (i, len, &start, &end, &prop, str))
1276 {
1277 start_byte = string_char_to_byte (str, start);
1278 if (i < start)
1279 {
1280 i_byte = string_char_to_byte (str, i);
1281 width += strwidth (XSTRING (str)->data + i_byte,
1282 start_byte - i_byte);
1283 }
1284 cmp_id
1285 = get_composition_id (start, start_byte, end - start, prop, str);
1286 if (cmp_id >= 0)
1287 width += composition_table[cmp_id]->width;
1288 i = end;
1289 }
1290 else
1291 {
1292 i_byte = string_char_to_byte (str, i);
1293 width += strwidth (XSTRING (str)->data + i_byte, len_byte - i_byte);
1294 i = len;
1295 }
1296 }
1297 return width;
1298 }
1299
1300 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1301 "Return width of STRING when displayed in the current buffer.\n\
1302 Width is measured by how many columns it occupies on the screen.\n\
1303 When calculating width of a multibyte character in STRING,\n\
1304 only the base leading-code is considered; the validity of\n\
1305 the following bytes is not checked. Tabs in STRING are always\n\
1306 taken to occupy `tab-width' columns.")
1307 (str)
1308 Lisp_Object str;
1309 {
1310 Lisp_Object val;
1311
1312 CHECK_STRING (str, 0);
1313 XSETFASTINT (val, lisp_string_width (str));
1314 return val;
1315 }
1316
1317 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1318 "Return the direction of CHAR.\n\
1319 The returned value is 0 for left-to-right and 1 for right-to-left.")
1320 (ch)
1321 Lisp_Object ch;
1322 {
1323 int charset;
1324
1325 CHECK_NUMBER (ch, 0);
1326 charset = CHAR_CHARSET (XFASTINT (ch));
1327 if (!CHARSET_DEFINED_P (charset))
1328 invalid_character (XINT (ch));
1329 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1330 }
1331
1332 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1333 "Return number of characters between BEG and END.")
1334 (beg, end)
1335 Lisp_Object beg, end;
1336 {
1337 int from, to;
1338
1339 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1340 CHECK_NUMBER_COERCE_MARKER (end, 1);
1341
1342 from = min (XFASTINT (beg), XFASTINT (end));
1343 to = max (XFASTINT (beg), XFASTINT (end));
1344
1345 return make_number (to - from);
1346 }
1347
1348 /* Return the number of characters in the NBYTES bytes at PTR.
1349 This works by looking at the contents and checking for multibyte sequences.
1350 However, if the current buffer has enable-multibyte-characters = nil,
1351 we treat each byte as a character. */
1352
1353 int
1354 chars_in_text (ptr, nbytes)
1355 unsigned char *ptr;
1356 int nbytes;
1357 {
1358 /* current_buffer is null at early stages of Emacs initialization. */
1359 if (current_buffer == 0
1360 || NILP (current_buffer->enable_multibyte_characters))
1361 return nbytes;
1362
1363 return multibyte_chars_in_text (ptr, nbytes);
1364 }
1365
1366 /* Return the number of characters in the NBYTES bytes at PTR.
1367 This works by looking at the contents and checking for multibyte sequences.
1368 It ignores enable-multibyte-characters. */
1369
1370 int
1371 multibyte_chars_in_text (ptr, nbytes)
1372 unsigned char *ptr;
1373 int nbytes;
1374 {
1375 unsigned char *endp;
1376 int chars, bytes;
1377
1378 endp = ptr + nbytes;
1379 chars = 0;
1380
1381 while (ptr < endp)
1382 {
1383 PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1384 ptr += bytes;
1385 chars++;
1386 }
1387
1388 return chars;
1389 }
1390
1391 /* Parse unibyte text at STR of LEN bytes as a multibyte text, and
1392 count the numbers of characters and bytes in it. On counting
1393 bytes, pay attention to that 8-bit characters in the range
1394 0x80..0x9F are represented by 2-byte in a multibyte text. */
1395 void
1396 parse_str_as_multibyte (str, len, nchars, nbytes)
1397 unsigned char *str;
1398 int len, *nchars, *nbytes;
1399 {
1400 unsigned char *endp = str + len;
1401 int n, chars = 0, bytes = 0;
1402
1403 while (str < endp)
1404 {
1405 if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1406 str += n, bytes += n;
1407 else
1408 str++, bytes += 2;
1409 chars++;
1410 }
1411 *nchars = chars;
1412 *nbytes = bytes;
1413 return;
1414 }
1415
1416 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
1417 It actually converts only 8-bit characters in the range 0x80..0x9F
1418 that don't contruct multibyte characters to multibyte forms. If
1419 NCHARS is nonzero, set *NCHARS to the number of characters in the
1420 text. It is assured that we can use LEN bytes at STR as a work
1421 area and that is enough. Return the number of bytes of the
1422 resulting text. */
1423
1424 int
1425 str_as_multibyte (str, len, nbytes, nchars)
1426 unsigned char *str;
1427 int len, nbytes, *nchars;
1428 {
1429 unsigned char *p = str, *endp = str + nbytes;
1430 unsigned char *to;
1431 int chars = 0;
1432 int n;
1433
1434 while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1435 p += n, chars++;
1436 if (nchars)
1437 *nchars = chars;
1438 if (p == endp)
1439 return nbytes;
1440
1441 to = p;
1442 nbytes = endp - p;
1443 endp = str + len;
1444 safe_bcopy (p, endp - nbytes, nbytes);
1445 p = endp - nbytes;
1446 while (p < endp)
1447 {
1448 if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1449 {
1450 while (n--)
1451 *to++ = *p++;
1452 }
1453 else
1454 {
1455 *to++ = LEADING_CODE_8_BIT_CONTROL;
1456 *to++ = *p++ + 0x20;
1457 }
1458 chars++;
1459 }
1460 if (nchars)
1461 *nchars = chars;
1462 return (to - str);
1463 }
1464
1465 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
1466 that contains the same single-byte characters. It actually
1467 converts all 8-bit characters to multibyte forms. It is assured
1468 that we can use LEN bytes at STR as a work area and that is
1469 enough. */
1470
1471 int
1472 str_to_multibyte (str, len, bytes)
1473 unsigned char *str;
1474 int len, bytes;
1475 {
1476 unsigned char *p = str, *endp = str + bytes;
1477 unsigned char *to;
1478
1479 while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1480 if (p == endp)
1481 return bytes;
1482 to = p;
1483 bytes = endp - p;
1484 endp = str + len;
1485 safe_bcopy (p, endp - bytes, bytes);
1486 p = endp - bytes;
1487 while (p < endp)
1488 {
1489 if (*p < 0x80 || *p >= 0xA0)
1490 *to++ = *p++;
1491 else
1492 *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1493 }
1494 return (to - str);
1495 }
1496
1497 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
1498 actually converts only 8-bit characters in the range 0x80..0x9F to
1499 unibyte forms. */
1500
1501 int
1502 str_as_unibyte (str, bytes)
1503 unsigned char *str;
1504 int bytes;
1505 {
1506 unsigned char *p = str, *endp = str + bytes;
1507 unsigned char *to = str;
1508
1509 while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1510 to = p;
1511 while (p < endp)
1512 {
1513 if (*p == LEADING_CODE_8_BIT_CONTROL)
1514 *to++ = *(p + 1) - 0x20, p += 2;
1515 else
1516 *to++ = *p++;
1517 }
1518 return (to - str);
1519 }
1520
1521 \f
1522 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1523 "Concatenate all the argument characters and make the result a string.")
1524 (n, args)
1525 int n;
1526 Lisp_Object *args;
1527 {
1528 int i;
1529 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1530 unsigned char *p = buf;
1531 int c;
1532
1533 for (i = 0; i < n; i++)
1534 {
1535 CHECK_NUMBER (args[i], 0);
1536 c = XINT (args[i]);
1537 p += CHAR_STRING (c, p);
1538 }
1539
1540 return make_string_from_bytes (buf, n, p - buf);
1541 }
1542
1543 #endif /* emacs */
1544 \f
1545 int
1546 charset_id_internal (charset_name)
1547 char *charset_name;
1548 {
1549 Lisp_Object val;
1550
1551 val= Fget (intern (charset_name), Qcharset);
1552 if (!VECTORP (val))
1553 error ("Charset %s is not defined", charset_name);
1554
1555 return (XINT (XVECTOR (val)->contents[0]));
1556 }
1557
1558 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1559 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1560 ()
1561 {
1562 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1563 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1564 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1565 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1566 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1567 charset_big5_1 = charset_id_internal ("chinese-big5-1");
1568 charset_big5_2 = charset_id_internal ("chinese-big5-2");
1569 return Qnil;
1570 }
1571
1572 void
1573 init_charset_once ()
1574 {
1575 int i, j, k;
1576
1577 staticpro (&Vcharset_table);
1578 staticpro (&Vcharset_symbol_table);
1579 staticpro (&Vgeneric_character_list);
1580
1581 /* This has to be done here, before we call Fmake_char_table. */
1582 Qcharset_table = intern ("charset-table");
1583 staticpro (&Qcharset_table);
1584
1585 /* Intern this now in case it isn't already done.
1586 Setting this variable twice is harmless.
1587 But don't staticpro it here--that is done in alloc.c. */
1588 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1589
1590 /* Now we are ready to set up this property, so we can
1591 create the charset table. */
1592 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1593 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1594
1595 Qunknown = intern ("unknown");
1596 staticpro (&Qunknown);
1597 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1598 Qunknown);
1599
1600 /* Setup tables. */
1601 for (i = 0; i < 2; i++)
1602 for (j = 0; j < 2; j++)
1603 for (k = 0; k < 128; k++)
1604 iso_charset_table [i][j][k] = -1;
1605
1606 for (i = 0; i < 256; i++)
1607 bytes_by_char_head[i] = 1;
1608 bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1609 bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1610 bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1611 bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1612
1613 for (i = 0; i < 128; i++)
1614 width_by_char_head[i] = 1;
1615 for (; i < 256; i++)
1616 width_by_char_head[i] = 4;
1617 width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1618 width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1619 width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1620 width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1621
1622 {
1623 Lisp_Object val;
1624
1625 val = Qnil;
1626 for (i = 0x81; i < 0x90; i++)
1627 val = Fcons (make_number ((i - 0x70) << 7), val);
1628 for (; i < 0x9A; i++)
1629 val = Fcons (make_number ((i - 0x8F) << 14), val);
1630 for (i = 0xA0; i < 0xF0; i++)
1631 val = Fcons (make_number ((i - 0x70) << 7), val);
1632 for (; i < 0xFF; i++)
1633 val = Fcons (make_number ((i - 0xE0) << 14), val);
1634 Vgeneric_character_list = Fnreverse (val);
1635 }
1636
1637 nonascii_insert_offset = 0;
1638 Vnonascii_translation_table = Qnil;
1639 }
1640
1641 #ifdef emacs
1642
1643 void
1644 syms_of_charset ()
1645 {
1646 Qcharset = intern ("charset");
1647 staticpro (&Qcharset);
1648
1649 Qascii = intern ("ascii");
1650 staticpro (&Qascii);
1651
1652 Qeight_bit_control = intern ("eight-bit-control");
1653 staticpro (&Qeight_bit_control);
1654
1655 Qeight_bit_graphic = intern ("eight-bit-graphic");
1656 staticpro (&Qeight_bit_graphic);
1657
1658 /* Define special charsets ascii, eight-bit-control, and
1659 eight-bit-graphic. */
1660 update_charset_table (make_number (CHARSET_ASCII),
1661 make_number (1), make_number (94),
1662 make_number (1),
1663 make_number (0),
1664 make_number ('B'),
1665 make_number (0),
1666 build_string ("ASCII"),
1667 Qnil, /* same as above */
1668 build_string ("ASCII (ISO646 IRV)"));
1669 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1670 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1671
1672 update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1673 make_number (1), make_number (96),
1674 make_number (4),
1675 make_number (0),
1676 make_number (-1),
1677 make_number (-1),
1678 build_string ("8-bit control code (0x80..0x9F)"),
1679 Qnil, /* same as above */
1680 Qnil); /* same as above */
1681 CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1682 Fput (Qeight_bit_control, Qcharset,
1683 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1684
1685 update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1686 make_number (1), make_number (96),
1687 make_number (4),
1688 make_number (0),
1689 make_number (-1),
1690 make_number (-1),
1691 build_string ("8-bit graphic char (0xA0..0xFF)"),
1692 Qnil, /* same as above */
1693 Qnil); /* same as above */
1694 CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1695 Fput (Qeight_bit_graphic, Qcharset,
1696 CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1697
1698 Qauto_fill_chars = intern ("auto-fill-chars");
1699 staticpro (&Qauto_fill_chars);
1700 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1701
1702 defsubr (&Sdefine_charset);
1703 defsubr (&Sgeneric_character_list);
1704 defsubr (&Sget_unused_iso_final_char);
1705 defsubr (&Sdeclare_equiv_charset);
1706 defsubr (&Sfind_charset_region);
1707 defsubr (&Sfind_charset_string);
1708 defsubr (&Smake_char_internal);
1709 defsubr (&Ssplit_char);
1710 defsubr (&Schar_charset);
1711 defsubr (&Scharset_after);
1712 defsubr (&Siso_charset);
1713 defsubr (&Schar_valid_p);
1714 defsubr (&Sunibyte_char_to_multibyte);
1715 defsubr (&Smultibyte_char_to_unibyte);
1716 defsubr (&Schar_bytes);
1717 defsubr (&Schar_width);
1718 defsubr (&Sstring_width);
1719 defsubr (&Schar_direction);
1720 defsubr (&Schars_in_region);
1721 defsubr (&Sstring);
1722 defsubr (&Ssetup_special_charsets);
1723
1724 DEFVAR_LISP ("charset-list", &Vcharset_list,
1725 "List of charsets ever defined.");
1726 Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1727 Fcons (Qeight_bit_graphic, Qnil)));
1728
1729 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1730 "Vector of cons cell of a symbol and translation table ever defined.\n\
1731 An ID of a translation table is an index of this vector.");
1732 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1733
1734 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1735 "Leading-code of private TYPE9N charset of column-width 1.");
1736 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1737
1738 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1739 "Leading-code of private TYPE9N charset of column-width 2.");
1740 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1741
1742 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1743 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1744 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1745
1746 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1747 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1748 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1749
1750 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1751 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1752 This is used for converting unibyte text to multibyte,\n\
1753 and for inserting character codes specified by number.\n\n\
1754 This serves to convert a Latin-1 or similar 8-bit character code\n\
1755 to the corresponding Emacs multibyte character code.\n\
1756 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1757 for your choice of character set.\n\
1758 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1759 nonascii_insert_offset = 0;
1760
1761 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1762 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1763 This is used for converting unibyte text to multibyte,\n\
1764 and for inserting character codes specified by number.\n\n\
1765 Conversion is performed only when multibyte characters are enabled,\n\
1766 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1767 to the corresponding Emacs character code.\n\n\
1768 If this is nil, `nonascii-insert-offset' is used instead.\n\
1769 See also the docstring of `make-translation-table'.");
1770 Vnonascii_translation_table = Qnil;
1771
1772 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1773 "A char-table for characters which invoke auto-filling.\n\
1774 Such characters have value t in this table.");
1775 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1776 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1777 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1778 }
1779
1780 #endif /* emacs */