src/charset.c

   1 /* Basic multilingual character support.
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* At first, see the document in `charset.h' to understand the code in
  23    this file.  */
  24
  25 #ifdef emacs
  26 #include <config.h>
  27 #endif
  28
  29 #include <stdio.h>
  30
  31 #ifdef emacs
  32
  33 #include <sys/types.h>
  34 #include "lisp.h"
  35 #include "buffer.h"
  36 #include "charset.h"
  37 #include "coding.h"
  38 #include "disptab.h"
  39
  40 #else  /* not emacs */
  41
  42 #include "mulelib.h"
  43
  44 #endif /* emacs */
  45
  46 Lisp_Object Qcharset, Qascii;
  47 Lisp_Object Qunknown;
  48
  49 /* Declaration of special leading-codes.  */
  50 int leading_code_private_11;    /* for private DIMENSION1 of 1-column */
  51 int leading_code_private_12;    /* for private DIMENSION1 of 2-column */
  52 int leading_code_private_21;    /* for private DIMENSION2 of 1-column */
  53 int leading_code_private_22;    /* for private DIMENSION2 of 2-column */
  54
  55 /* Declaration of special charsets.  */
  56 int charset_ascii;              /* ASCII */
  57 int charset_latin_iso8859_1;    /* ISO8859-1 (Latin-1) */
  58 int charset_jisx0208_1978;      /* JISX0208.1978 (Japanese Kanji old set) */
  59 int charset_jisx0208;           /* JISX0208.1983 (Japanese Kanji) */
  60 int charset_katakana_jisx0201;  /* JISX0201.Kana (Japanese Katakana) */
  61 int charset_latin_jisx0201;     /* JISX0201.Roman (Japanese Roman) */
  62 int charset_big5_1;             /* Big5 Level 1 (Chinese Traditional) */
  63 int charset_big5_2;             /* Big5 Level 2 (Chinese Traditional) */
  64
  65 Lisp_Object Qcharset_table;
  66
  67 /* A char-table containing information of each character set.  */
  68 Lisp_Object Vcharset_table;
  69
  70 /* A vector of charset symbol indexed by charset-id.  This is used
  71    only for returning charset symbol from C functions.  */
  72 Lisp_Object Vcharset_symbol_table;
  73
  74 /* A list of charset symbols ever defined.  */
  75 Lisp_Object Vcharset_list;
  76
  77 /* Vector of translation table ever defined.
  78    ID of a translation table is used to index this vector.  */
  79 Lisp_Object Vtranslation_table_vector;
  80
  81 /* A char-table for characters which may invoke auto-filling.  */
  82 Lisp_Object Vauto_fill_chars;
  83
  84 Lisp_Object Qauto_fill_chars;
  85
  86 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD.  */
  87 int bytes_by_char_head[256];
  88 int width_by_char_head[256];
  89
  90 /* Mapping table from ISO2022's charset (specified by DIMENSION,
  91    CHARS, and FINAL-CHAR) to Emacs' charset.  */
  92 int iso_charset_table[2][2][128];
  93
  94 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  95 unsigned char *_fetch_multibyte_char_p;
  96 int _fetch_multibyte_char_len;
  97
  98 /* Offset to add to a non-ASCII value when inserting it.  */
  99 int nonascii_insert_offset;
 100
 101 /* Translation table for converting non-ASCII unibyte characters
 102    to multibyte codes, or nil.  */
 103 Lisp_Object Vnonascii_translation_table;
 104
 105 /* List of all possible generic characters.  */
 106 Lisp_Object Vgeneric_character_list;
 107
 108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
 109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
 110 \f
 111 void
 112 invalid_character (c)
 113      int c;
 114 {
 115   error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
 116 }
 117
 118 /* Parse a multibyte character string STR of length LENGTH (>= 2) set
 119    BYTES to the length of actual multibyte sequence, CHARSET, C1, and
 120    C2 to such values that MAKE_CHAR can make the multibyte character
 121    from them.
 122
 123    It is assumed that *STR is one of base leading codes and the
 124    following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
 125
 126    This macro should be called only from SPLIT_MULTIBYTE_SEQ.  */
 127
 128 #define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2)        \
 129   do {                                                                  \
 130     (bytes) = 1;                                                        \
 131     (charset) = (str)[0];                                               \
 132     if ((charset) >= LEADING_CODE_PRIVATE_11                            \
 133         && (charset) <= LEADING_CODE_PRIVATE_22)                        \
 134       (charset) = (str)[(bytes)++];                                     \
 135     if ((bytes) < (length))                                             \
 136       {                                                                 \
 137         (c1) = (str)[(bytes)++] & 0x7F;                                 \
 138         if ((bytes) < (length))                                         \
 139           (c2) = (str)[(bytes)++] & 0x7F;                               \
 140         else                                                            \
 141           (c2) = -1;                                                    \
 142       }                                                                 \
 143     else                                                                \
 144       (c1) = (c2) = -1;                                                 \
 145   } while (0)
 146
 147 /* Parse string STR of length LENGTH and check if a multibyte
 148    characters is at STR.  Set BYTES to the actual length, CHARSET, C1,
 149    C2 to proper values for that character.  */
 150
 151 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)        \
 152   do {                                                                  \
 153     int i;                                                              \
 154     if (ASCII_BYTE_P ((str)[0]))                                        \
 155       i = 1;                                                            \
 156     else                                                                \
 157       for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++);       \
 158     if (i == 1)                                                         \
 159       (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ;         \
 160     else                                                                \
 161       {                                                                 \
 162         if (i > BYTES_BY_CHAR_HEAD ((str)[0]))                          \
 163           i = BYTES_BY_CHAR_HEAD ((str)[0]);                            \
 164         SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2);           \
 165       }                                                                 \
 166   } while (0)
 167
 168 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.  */
 169 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2)        \
 170   (charset == CHARSET_ASCII                             \
 171    ? ((c1) >= 0 && (c1) <= 0x7F)                        \
 172    : (CHARSET_DIMENSION (charset) == 1                  \
 173       ? ((c1) >= 0x20 && (c1) <= 0x7F)                  \
 174       : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F)))
 175
 176 /* Store multi-byte form of the character C in STR.  The caller should
 177    allocate at least 4-byte area at STR in advance.  Returns the
 178    length of the multi-byte form.  If C is an invalid character code,
 179    signal an error.
 180
 181    Use macro `CHAR_STRING (C, STR)' instead of calling this function
 182    directly if C can be an ASCII character.  */
 183
 184 int
 185 char_to_string (c, str)
 186      int c;
 187      unsigned char *str;
 188 {
 189   unsigned char *p = str;
 190
 191   if (c & CHAR_MODIFIER_MASK)   /* This includes the case C is negative.  */
 192     {
 193       /* Multibyte character can't have a modifier bit.  */
 194       if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 195         invalid_character (c);
 196
 197       /* For Meta, Shift, and Control modifiers, we need special care.  */
 198       if (c & CHAR_META)
 199         {
 200           /* Move the meta bit to the right place for a string.  */
 201           c = (c & ~CHAR_META) | 0x80;
 202         }
 203       if (c & CHAR_SHIFT)
 204         {
 205           /* Shift modifier is valid only with [A-Za-z].  */
 206           if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 207             c &= ~CHAR_SHIFT;
 208           else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 209             c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 210         }
 211       if (c & CHAR_CTL)
 212         {
 213           /* Simulate the code in lread.c.  */
 214           /* Allow `\C- ' and `\C-?'.  */
 215           if (c == (CHAR_CTL | ' '))
 216             c = 0;
 217           else if (c == (CHAR_CTL | '?'))
 218             c = 127;
 219           /* ASCII control chars are made from letters (both cases),
 220              as well as the non-letters within 0100...0137.  */
 221           else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 222             c &= (037 | (~0177 & ~CHAR_CTL));
 223           else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 224             c &= (037 | (~0177 & ~CHAR_CTL));
 225         }
 226
 227       /* If C still has any modifier bits, it is an invalid character.  */
 228       if (c & CHAR_MODIFIER_MASK)
 229         invalid_character (c);
 230
 231       *p++ = c;
 232     }
 233   else if (c < MAX_CHAR)
 234     {
 235       int charset, c1, c2;
 236
 237       SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
 238
 239       if (charset >= LEADING_CODE_EXT_11)
 240         *p++ = (charset < LEADING_CODE_EXT_12
 241                 ? LEADING_CODE_PRIVATE_11
 242                 : (charset < LEADING_CODE_EXT_21
 243                    ? LEADING_CODE_PRIVATE_12
 244                    : (charset < LEADING_CODE_EXT_22
 245                       ? LEADING_CODE_PRIVATE_21
 246                       : LEADING_CODE_PRIVATE_22)));
 247       *p++ = charset;
 248       if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
 249         invalid_character (c);
 250       if (c1)
 251         {
 252           *p++ = c1 | 0x80;
 253           if (c2 > 0)
 254             *p++ = c2 | 0x80;
 255         }
 256     }
 257
 258   return (p -str);
 259 }
 260
 261 /* Return the non-ASCII character corresponding to multi-byte form at
 262    STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
 263    length of the multibyte form in *ACTUAL_LEN.
 264
 265    Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
 266    this function directly if you want ot handle ASCII characters as
 267    well.  */
 268
 269 int
 270 string_to_char (str, len, actual_len)
 271      const unsigned char *str;
 272      int len, *actual_len;
 273 {
 274   int c, bytes, charset, c1, c2;
 275
 276   SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 277   c = MAKE_CHAR (charset, c1, c2);
 278   if (actual_len)
 279     *actual_len = bytes;
 280   return c;
 281 }
 282
 283 /* Return the length of the multi-byte form at string STR of length LEN.
 284    Use the macro MULTIBYTE_FORM_LENGTH instead.  */
 285 int
 286 multibyte_form_length (str, len)
 287      const unsigned char *str;
 288      int len;
 289 {
 290   int bytes;
 291
 292   PARSE_MULTIBYTE_SEQ (str, len, bytes);
 293   return bytes;
 294 }
 295
 296 /* Check multibyte form at string STR of length LEN and set variables
 297    pointed by CHARSET, C1, and C2 to charset and position codes of the
 298    character at STR, and return 0.  If there's no multibyte character,
 299    return -1.  This should be used only in the macro SPLIT_STRING
 300    which checks range of STR in advance.  */
 301
 302 int
 303 split_string (str, len, charset, c1, c2)
 304      const unsigned char *str;
 305      unsigned char *c1, *c2;
 306      int len, *charset;
 307 {
 308   register int bytes, cs, code1, code2 = -1;
 309
 310   SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
 311   if (cs == CHARSET_ASCII)
 312     return -1;
 313   *charset = cs;
 314   *c1 = code1;
 315   *c2 = code2;
 316   return 0;
 317 }
 318
 319 /* Return 1 iff character C has valid printable glyph.
 320    Use the macro CHAR_PRINTABLE_P instead.  */
 321 int
 322 char_printable_p (c)
 323      int c;
 324 {
 325   int charset, c1, c2, chars;
 326
 327   if (SINGLE_BYTE_CHAR_P (c))
 328     return 1;
 329   if (c >= MAX_CHAR)
 330     return 0;
 331
 332   SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
 333   if (! CHARSET_DEFINED_P (charset))
 334     return 0;
 335   if (CHARSET_CHARS (charset) == 94
 336       ? c1 <= 32 || c1 >= 127
 337       : c1 < 32)
 338     return 0;
 339   if (CHARSET_DIMENSION (charset) == 2
 340       && (CHARSET_CHARS (charset) == 94
 341           ? c2 <= 32 || c2 >= 127
 342           : c2 < 32))
 343     return 0;
 344   return 1;
 345 }
 346
 347 /* Translate character C by translation table TABLE.  If C
 348    is negative, translate a character specified by CHARSET, C1, and C2
 349    (C1 and C2 are code points of the character).  If no translation is
 350    found in TABLE, return C.  */
 351 int
 352 translate_char (table, c, charset, c1, c2)
 353      Lisp_Object table;
 354      int c, charset, c1, c2;
 355 {
 356   Lisp_Object ch;
 357   int alt_charset, alt_c1, alt_c2, dimension;
 358
 359   if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
 360   if (!CHAR_TABLE_P (table)
 361       || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
 362     return c;
 363
 364   SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
 365   dimension = CHARSET_DIMENSION (alt_charset);
 366   if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
 367     /* CH is not a generic character, just return it.  */
 368     return XFASTINT (ch);
 369
 370   /* Since CH is a generic character, we must return a specific
 371      charater which has the same position codes as C from CH.  */
 372   if (charset < 0)
 373     SPLIT_CHAR (c, charset, c1, c2);
 374   if (dimension != CHARSET_DIMENSION (charset))
 375     /* We can't make such a character because of dimension mismatch.  */
 376     return c;
 377   return MAKE_CHAR (alt_charset, c1, c2);
 378 }
 379
 380 /* Convert the unibyte character C to multibyte based on
 381    Vnonascii_translation_table or nonascii_insert_offset.  If they can't
 382    convert C to a valid multibyte character, convert it based on
 383    DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
 384
 385 int
 386 unibyte_char_to_multibyte (c)
 387      int c;
 388 {
 389   if (c < 0400 && c >= 0200)
 390     {
 391       int c_save = c;
 392
 393       if (! NILP (Vnonascii_translation_table))
 394         {
 395           c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
 396           if (c >= 0400 && ! char_valid_p (c, 0))
 397             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 398         }
 399       else if (c >= 0240 && nonascii_insert_offset > 0)
 400         {
 401           c += nonascii_insert_offset;
 402           if (c < 0400 || ! char_valid_p (c, 0))
 403             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 404         }
 405       else if (c >= 0240)
 406         c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 407     }
 408   return c;
 409 }
 410
 411
 412 /* Convert the multibyte character C to unibyte 8-bit character based
 413    on Vnonascii_translation_table or nonascii_insert_offset.  If
 414    REV_TBL is non-nil, it should be a reverse table of
 415    Vnonascii_translation_table, i.e. what given by:
 416      Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */
 417
 418 int
 419 multibyte_char_to_unibyte (c, rev_tbl)
 420      int c;
 421      Lisp_Object rev_tbl;
 422 {
 423   if (!SINGLE_BYTE_CHAR_P (c))
 424     {
 425       int c_save = c;
 426
 427       if (! CHAR_TABLE_P (rev_tbl)
 428           && CHAR_TABLE_P (Vnonascii_translation_table))
 429         rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
 430                                           make_number (0));
 431       if (CHAR_TABLE_P (rev_tbl))
 432         {
 433           Lisp_Object temp;
 434           temp = Faref (rev_tbl, make_number (c));
 435           if (INTEGERP (temp))
 436             c = XINT (temp);
 437           if (c >= 256)
 438             c = (c_save & 0177) + 0200;
 439         }
 440       else
 441         {
 442           if (nonascii_insert_offset > 0)
 443             c -= nonascii_insert_offset;
 444           if (c < 128 || c >= 256)
 445             c = (c_save & 0177) + 0200;
 446         }
 447     }
 448
 449   return c;
 450 }
 451
 452 \f
 453 /* Update the table Vcharset_table with the given arguments (see the
 454    document of `define-charset' for the meaning of each argument).
 455    Several other table contents are also updated.  The caller should
 456    check the validity of CHARSET-ID and the remaining arguments in
 457    advance.  */
 458
 459 void
 460 update_charset_table (charset_id, dimension, chars, width, direction,
 461                       iso_final_char, iso_graphic_plane,
 462                       short_name, long_name, description)
 463      Lisp_Object charset_id, dimension, chars, width, direction;
 464      Lisp_Object iso_final_char, iso_graphic_plane;
 465      Lisp_Object short_name, long_name, description;
 466 {
 467   int charset = XINT (charset_id);
 468   int bytes;
 469   unsigned char leading_code_base, leading_code_ext;
 470
 471   if (NILP (CHARSET_TABLE_ENTRY (charset)))
 472     CHARSET_TABLE_ENTRY (charset)
 473       = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
 474
 475   /* Get byte length of multibyte form, base leading-code, and
 476      extended leading-code of the charset.  See the comment under the
 477      title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
 478   bytes = XINT (dimension);
 479   if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 480     {
 481       /* Official charset, it doesn't have an extended leading-code.  */
 482       if (charset != CHARSET_ASCII)
 483         bytes += 1; /* For a base leading-code.  */
 484       leading_code_base = charset;
 485       leading_code_ext = 0;
 486     }
 487   else
 488     {
 489       /* Private charset.  */
 490       bytes += 2; /* For base and extended leading-codes.  */
 491       leading_code_base
 492         = (charset < LEADING_CODE_EXT_12
 493            ? LEADING_CODE_PRIVATE_11
 494            : (charset < LEADING_CODE_EXT_21
 495               ? LEADING_CODE_PRIVATE_12
 496               : (charset < LEADING_CODE_EXT_22
 497                  ? LEADING_CODE_PRIVATE_21
 498                  : LEADING_CODE_PRIVATE_22)));
 499       leading_code_ext = charset;
 500     }
 501
 502   if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
 503     error ("Invalid dimension for the charset-ID %d", charset);
 504
 505   CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
 506   CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
 507   CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
 508   CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
 509   CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
 510   CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
 511   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
 512     = make_number (leading_code_base);
 513   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
 514     = make_number (leading_code_ext);
 515   CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
 516   CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
 517     = iso_graphic_plane;
 518   CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
 519   CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
 520   CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
 521   CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
 522
 523   {
 524     /* If we have already defined a charset which has the same
 525        DIMENSION, CHARS and ISO-FINAL-CHAR but the different
 526        DIRECTION, we must update the entry REVERSE-CHARSET of both
 527        charsets.  If there's no such charset, the value of the entry
 528        is set to nil.  */
 529     int i;
 530
 531     for (i = 0; i <= MAX_CHARSET; i++)
 532       if (!NILP (CHARSET_TABLE_ENTRY (i)))
 533         {
 534           if (CHARSET_DIMENSION (i) == XINT (dimension)
 535               && CHARSET_CHARS (i) == XINT (chars)
 536               && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
 537               && CHARSET_DIRECTION (i) != XINT (direction))
 538             {
 539               CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 540                 = make_number (i);
 541               CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
 542               break;
 543             }
 544         }
 545     if (i > MAX_CHARSET)
 546       /* No such a charset.  */
 547       CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 548         = make_number (-1);
 549   }
 550
 551   if (charset != CHARSET_ASCII
 552       && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 553     {
 554       width_by_char_head[leading_code_base] = XINT (width);
 555
 556       /* Update table emacs_code_class.  */
 557       emacs_code_class[charset] = (bytes == 2
 558                                    ? EMACS_leading_code_2
 559                                    : (bytes == 3
 560                                       ? EMACS_leading_code_3
 561                                       : EMACS_leading_code_4));
 562     }
 563
 564   /* Update table iso_charset_table.  */
 565   if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
 566     ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
 567 }
 568
 569 #ifdef emacs
 570
 571 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
 572    is invalid.  */
 573 int
 574 get_charset_id (charset_symbol)
 575      Lisp_Object charset_symbol;
 576 {
 577   Lisp_Object val;
 578   int charset;
 579
 580   return ((SYMBOLP (charset_symbol)
 581            && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
 582            && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
 583                CHARSET_VALID_P (charset)))
 584           ? charset : -1);
 585 }
 586
 587 /* Return an identification number for a new private charset of
 588    DIMENSION and WIDTH.  If there's no more room for the new charset,
 589    return 0.  */
 590 Lisp_Object
 591 get_new_private_charset_id (dimension, width)
 592      int dimension, width;
 593 {
 594   int charset, from, to;
 595
 596   if (dimension == 1)
 597     {
 598       if (width == 1)
 599         from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
 600       else
 601         from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
 602     }
 603   else
 604     {
 605       if (width == 1)
 606         from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
 607       else
 608         from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
 609     }
 610
 611   for (charset = from; charset < to; charset++)
 612     if (!CHARSET_DEFINED_P (charset)) break;
 613
 614   return make_number (charset < to ? charset : 0);
 615 }
 616
 617 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
 618   "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
 619 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
 620  treated as a private charset.\n\
 621 INFO-VECTOR is a vector of the format:\n\
 622    [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
 623     SHORT-NAME LONG-NAME DESCRIPTION]\n\
 624 The meanings of each elements is as follows:\n\
 625 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
 626 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
 627 WIDTH (integer) is the number of columns a character in the charset\n\
 628 occupies on the screen: one of 0, 1, and 2.\n\
 629 \n\
 630 DIRECTION (integer) is the rendering direction of characters in the\n\
 631 charset when rendering.  If 0, render from left to right, else\n\
 632 render from right to left.\n\
 633 \n\
 634 ISO-FINAL-CHAR (character) is the final character of the\n\
 635 corresponding ISO 2022 charset.\n\
 636 \n\
 637 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
 638 while encoding to variants of ISO 2022 coding system, one of the\n\
 639 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
 640 \n\
 641 SHORT-NAME (string) is the short name to refer to the charset.\n\
 642 \n\
 643 LONG-NAME (string) is the long name to refer to the charset.\n\
 644 \n\
 645 DESCRIPTION (string) is the description string of the charset.")
 646   (charset_id, charset_symbol, info_vector)
 647      Lisp_Object charset_id, charset_symbol, info_vector;
 648 {
 649   Lisp_Object *vec;
 650
 651   if (!NILP (charset_id))
 652     CHECK_NUMBER (charset_id, 0);
 653   CHECK_SYMBOL (charset_symbol, 1);
 654   CHECK_VECTOR (info_vector, 2);
 655
 656   if (! NILP (charset_id))
 657     {
 658       if (! CHARSET_VALID_P (XINT (charset_id)))
 659         error ("Invalid CHARSET: %d", XINT (charset_id));
 660       else if (CHARSET_DEFINED_P (XINT (charset_id)))
 661         error ("Already defined charset: %d", XINT (charset_id));
 662     }
 663
 664   vec = XVECTOR (info_vector)->contents;
 665   if (XVECTOR (info_vector)->size != 9
 666       || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
 667       || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
 668       || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
 669       || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
 670       || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
 671       || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
 672       || !STRINGP (vec[6])
 673       || !STRINGP (vec[7])
 674       || !STRINGP (vec[8]))
 675     error ("Invalid info-vector argument for defining charset %s",
 676            XSYMBOL (charset_symbol)->name->data);
 677
 678   if (NILP (charset_id))
 679     {
 680       charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
 681       if (XINT (charset_id) == 0)
 682         error ("There's no room for a new private charset %s",
 683                XSYMBOL (charset_symbol)->name->data);
 684     }
 685
 686   update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
 687                         vec[4], vec[5], vec[6], vec[7], vec[8]);
 688   Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
 689   CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
 690   Vcharset_list = Fcons (charset_symbol, Vcharset_list);
 691   return Qnil;
 692 }
 693
 694 DEFUN ("generic-character-list", Fgeneric_character_list,
 695        Sgeneric_character_list, 0, 0, 0,
 696   "Return a list of all possible generic characters.\n\
 697 It includes a generic character for a charset not yet defined.")
 698   ()
 699 {
 700   return Vgeneric_character_list;
 701 }
 702
 703 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
 704        Sget_unused_iso_final_char, 2, 2, 0,
 705   "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
 706 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
 707 CHARS is the number of characters in a dimension: 94 or 96.\n\
 708 \n\
 709 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
 710 If there's no unused final char for the specified kind of charset,\n\
 711 return nil.")
 712   (dimension, chars)
 713      Lisp_Object dimension, chars;
 714 {
 715   int final_char;
 716
 717   CHECK_NUMBER (dimension, 0);
 718   CHECK_NUMBER (chars, 1);
 719   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 720     error ("Invalid charset dimension %d, it should be 1 or 2",
 721            XINT (dimension));
 722   if (XINT (chars) != 94 && XINT (chars) != 96)
 723     error ("Invalid charset chars %d, it should be 94 or 96",
 724            XINT (chars));
 725   for (final_char = '0'; final_char <= '?'; final_char++)
 726     {
 727       if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
 728         break;
 729     }
 730   return (final_char <= '?' ? make_number (final_char) : Qnil);
 731 }
 732
 733 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
 734        4, 4, 0,
 735   "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
 736 CHARSET should be defined by `defined-charset' in advance.")
 737   (dimension, chars, final_char, charset_symbol)
 738      Lisp_Object dimension, chars, final_char, charset_symbol;
 739 {
 740   int charset;
 741
 742   CHECK_NUMBER (dimension, 0);
 743   CHECK_NUMBER (chars, 1);
 744   CHECK_NUMBER (final_char, 2);
 745   CHECK_SYMBOL (charset_symbol, 3);
 746
 747   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 748     error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
 749   if (XINT (chars) != 94 && XINT (chars) != 96)
 750     error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
 751   if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
 752     error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
 753   if ((charset = get_charset_id (charset_symbol)) < 0)
 754     error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
 755
 756   ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
 757   return Qnil;
 758 }
 759
 760 /* Return number of different charsets in STR of length LEN.  In
 761    addition, for each found charset N, CHARSETS[N] is set 1.  The
 762    caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
 763    It may lookup a translation table TABLE if supplied.
 764
 765    If MULTIBYTE is zero, do not check multibyte characters, i.e. if
 766    any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
 767    8-bit codes are found CHARSET[1] is set to 1.  */
 768
 769 int
 770 find_charset_in_str (str, len, charsets, table, multibyte)
 771      unsigned char *str;
 772      int len, *charsets;
 773      Lisp_Object table;
 774      int multibyte;
 775 {
 776   register int num = 0;
 777
 778   if (! multibyte)
 779     {
 780       unsigned char *endp = str + len;
 781       int maskbits = 0;
 782
 783       while (str < endp && maskbits != 3)
 784         maskbits |=  (*str++ < 0x80 ? 1 : 2);
 785       if (maskbits & 1)
 786         {
 787           charsets[0] = 1;
 788           num++;
 789         }
 790       if (maskbits & 2)
 791         {
 792           charsets[1] = 1;
 793           num++;
 794         }
 795       return num;
 796     }
 797
 798   if (! CHAR_TABLE_P (table))
 799     table = Qnil;
 800
 801   while (len > 0)
 802     {
 803       int bytes, charset, c1, c2;
 804
 805       SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 806
 807       if (! NILP (table))
 808         {
 809           int c1 = translate_char (table, -1, charset, c1, c2);
 810           if (c1 >= 0)
 811             charset = CHAR_CHARSET (c1);
 812         }
 813
 814       if (!charsets[charset])
 815         {
 816           charsets[charset] = 1;
 817           num += 1;
 818         }
 819       str += bytes;
 820       len -= bytes;
 821     }
 822   return num;
 823 }
 824
 825 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
 826        2, 3, 0,
 827   "Return a list of charsets in the region between BEG and END.\n\
 828 BEG and END are buffer positions.\n\
 829 Optional arg TABLE if non-nil is a translation table to look up.\n\
 830 \n\
 831 If the region contains invalid multiybte characters,\n\
 832 `unknown' is included in the returned list.\n\
 833 \n\
 834 If the current buffer is unibyte, the returned list contains\n\
 835 `ascii' if any 7-bit characters are found,\n\
 836 and `unknown' if any 8-bit characters are found.")
 837   (beg, end, table)
 838      Lisp_Object beg, end, table;
 839 {
 840   int charsets[MAX_CHARSET + 1];
 841   int from, from_byte, to, stop, stop_byte, i;
 842   Lisp_Object val;
 843   int undefined;
 844   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
 845
 846   validate_region (&beg, &end);
 847   from = XFASTINT (beg);
 848   stop = to = XFASTINT (end);
 849
 850   if (from < GPT && GPT < to)
 851     {
 852       stop = GPT;
 853       stop_byte = GPT_BYTE;
 854     }
 855   else
 856     stop_byte = CHAR_TO_BYTE (stop);
 857
 858   from_byte = CHAR_TO_BYTE (from);
 859
 860   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 861   while (1)
 862     {
 863       find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
 864                            charsets, table, multibyte);
 865       if (stop < to)
 866         {
 867           from = stop, from_byte = stop_byte;
 868           stop = to, stop_byte = CHAR_TO_BYTE (stop);
 869         }
 870       else
 871         break;
 872     }
 873
 874   val = Qnil;
 875   undefined = 0;
 876   for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
 877     if (charsets[i])
 878       {
 879         if (CHARSET_DEFINED_P (i))
 880           val = Fcons (CHARSET_SYMBOL (i), val);
 881         else
 882           undefined = 1;
 883       }
 884   if (undefined)
 885     val = Fcons (Qunknown, val);
 886   return val;
 887 }
 888
 889 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
 890        1, 2, 0,
 891   "Return a list of charsets in STR.\n\
 892 Optional arg TABLE if non-nil is a translation table to look up.\n\
 893 \n\
 894 If the region contains invalid multiybte characters,\n\
 895 `unknown' is included in the returned list.\n\
 896 \n\
 897 If STR is unibyte, the returned list contains\n\
 898 `ascii' if any 7-bit characters are found,\n\
 899 and `unknown' if any 8-bit characters are found.")
 900   (str, table)
 901      Lisp_Object str, table;
 902 {
 903   int charsets[MAX_CHARSET + 1];
 904   int i;
 905   Lisp_Object val;
 906   int undefined;
 907   int multibyte;
 908
 909   CHECK_STRING (str, 0);
 910   multibyte = STRING_MULTIBYTE (str);
 911
 912   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 913   find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
 914                        charsets, table, multibyte);
 915   val = Qnil;
 916   undefined = 0;
 917   for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
 918     if (charsets[i])
 919       {
 920         if (CHARSET_DEFINED_P (i))
 921           val = Fcons (CHARSET_SYMBOL (i), val);
 922         else
 923           undefined = 1;
 924       }
 925   if (undefined)
 926     val = Fcons (Qunknown, val);
 927   return val;
 928 }
 929 \f
 930 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
 931   "")
 932   (charset, code1, code2)
 933      Lisp_Object charset, code1, code2;
 934 {
 935   int charset_id, c1, c2;
 936
 937   CHECK_NUMBER (charset, 0);
 938   charset_id = XINT (charset);
 939   if (!CHARSET_DEFINED_P (charset_id))
 940     error ("Invalid charset ID: %d", XINT (charset));
 941
 942   if (NILP (code1))
 943     c1 = 0;
 944   else
 945     {
 946       CHECK_NUMBER (code1, 1);
 947       c1 = XINT (code1);
 948     }
 949   if (NILP (code2))
 950     c2 = 0;
 951   else
 952     {
 953       CHECK_NUMBER (code2, 2);
 954       c2 = XINT (code2);
 955     }
 956
 957   if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
 958     error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
 959   c1 &= 0x7F;
 960   c2 &= 0x7F;
 961   if (c1 == 0
 962       ? c2 != 0
 963       : (c2 == 0
 964          ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
 965          : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
 966     error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
 967
 968   return make_number (MAKE_CHAR (charset_id, c1, c2));
 969 }
 970
 971 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
 972   "Return list of charset and one or two position-codes of CHAR.\n\
 973 If CHAR is invalid as a character code,\n\
 974 return a list of symbol `unknown' and CHAR.")
 975   (ch)
 976      Lisp_Object ch;
 977 {
 978   Lisp_Object val;
 979   int c, charset, c1, c2;
 980
 981   CHECK_NUMBER (ch, 0);
 982   c = XFASTINT (ch);
 983   if (!CHAR_VALID_P (c, 1))
 984     return Fcons (Qunknown, Fcons (ch, Qnil));
 985   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
 986   return (c2 >= 0
 987           ? Fcons (CHARSET_SYMBOL (charset),
 988                    Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
 989           : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
 990 }
 991
 992 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
 993   "Return charset of CHAR.")
 994   (ch)
 995      Lisp_Object ch;
 996 {
 997   CHECK_NUMBER (ch, 0);
 998
 999   return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1000 }
1001
1002 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1003   "Return charset of a character in the current buffer at position POS.\n\
1004 If POS is nil, it defauls to the current point.\n\
1005 If POS is out of range, the value is nil.")
1006   (pos)
1007      Lisp_Object pos;
1008 {
1009   register int pos_byte, bytes, charset, c1, c2;
1010   register unsigned char *p;
1011
1012   if (NILP (pos))
1013     pos_byte = PT_BYTE;
1014   else if (MARKERP (pos))
1015     {
1016       pos_byte = marker_byte_position (pos);
1017       if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1018         return Qnil;
1019     }
1020   else
1021     {
1022       CHECK_NUMBER (pos, 0);
1023       if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1024         return Qnil;
1025       pos_byte = CHAR_TO_BYTE (XINT (pos));
1026     }
1027   p = BYTE_POS_ADDR (pos_byte);
1028   if (BASE_LEADING_CODE_P (*p))
1029     {
1030       SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1031       if (charset < 0)
1032         charset = 1;
1033     }
1034   else
1035     charset = CHARSET_ASCII;
1036
1037   return CHARSET_SYMBOL (charset);
1038 }
1039
1040 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1041   "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1042 \n\
1043 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1044 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1045 where as Emacs distinguishes them by charset symbol.\n\
1046 See the documentation of the function `charset-info' for the meanings of\n\
1047 DIMENSION, CHARS, and FINAL-CHAR.")
1048   (dimension, chars, final_char)
1049      Lisp_Object dimension, chars, final_char;
1050 {
1051   int charset;
1052
1053   CHECK_NUMBER (dimension, 0);
1054   CHECK_NUMBER (chars, 1);
1055   CHECK_NUMBER (final_char, 2);
1056
1057   if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1058     return Qnil;
1059   return CHARSET_SYMBOL (charset);
1060 }
1061
1062 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1063    generic character.  If GENERICP is zero, return nonzero iff C is a
1064    valid normal character.  Do not call this function directly,
1065    instead use macro CHAR_VALID_P.  */
1066 int
1067 char_valid_p (c, genericp)
1068      int c, genericp;
1069 {
1070   int charset, c1, c2;
1071
1072   if (c < 0)
1073     return 0;
1074   if (SINGLE_BYTE_CHAR_P (c))
1075     return 1;
1076   SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
1077   if (genericp)
1078     {
1079       if (c1)
1080         {
1081           if (c2 <= 0) c2 = 0x20;
1082         }
1083       else
1084         {
1085           if (c2 <= 0) c1 = c2 = 0x20;
1086         }
1087     }
1088   return (CHARSET_DEFINED_P (charset)
1089           && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1090 }
1091
1092 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1093   "Return t if OBJECT is a valid normal character.\n\
1094 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1095 a valid generic character.")
1096   (object, genericp)
1097      Lisp_Object object, genericp;
1098 {
1099   if (! NATNUMP (object))
1100     return Qnil;
1101   return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1102 }
1103
1104 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1105        Sunibyte_char_to_multibyte, 1, 1, 0,
1106   "Convert the unibyte character CH to multibyte character.\n\
1107 The conversion is done based on `nonascii-translation-table' (which see)\n\
1108  or `nonascii-insert-offset' (which see).")
1109   (ch)
1110      Lisp_Object ch;
1111 {
1112   int c;
1113
1114   CHECK_NUMBER (ch, 0);
1115   c = XINT (ch);
1116   if (c < 0 || c >= 0400)
1117     error ("Invalid unibyte character: %d", c);
1118   c = unibyte_char_to_multibyte (c);
1119   if (c < 0)
1120     error ("Can't convert to multibyte character: %d", XINT (ch));
1121   return make_number (c);
1122 }
1123
1124 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1125        Smultibyte_char_to_unibyte, 1, 1, 0,
1126   "Convert the multibyte character CH to unibyte character.\n\
1127 The conversion is done based on `nonascii-translation-table' (which see)\n\
1128  or `nonascii-insert-offset' (which see).")
1129   (ch)
1130      Lisp_Object ch;
1131 {
1132   int c;
1133
1134   CHECK_NUMBER (ch, 0);
1135   c = XINT (ch);
1136   if (! CHAR_VALID_P (c, 0))
1137     error ("Invalid multibyte character: %d", c);
1138   c = multibyte_char_to_unibyte (c, Qnil);
1139   if (c < 0)
1140     error ("Can't convert to unibyte character: %d", XINT (ch));
1141   return make_number (c);
1142 }
1143
1144 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1145   "Return 1 regardless of the argument CHAR.\n\
1146 This is now an obsolete function.  We keep it just for backward compatibility.")
1147   (ch)
1148      Lisp_Object ch;
1149 {
1150   Lisp_Object val;
1151
1152   CHECK_NUMBER (ch, 0);
1153   return make_number (1);
1154 }
1155
1156 /* Return how many bytes C will occupy in a multibyte buffer.
1157    Don't call this function directly, instead use macro CHAR_BYTES.  */
1158 int
1159 char_bytes (c)
1160      int c;
1161 {
1162   int charset;
1163
1164   if (SINGLE_BYTE_CHAR_P (c) || (c & ~((1 << CHARACTERBITS) - 1)))
1165     return 1;
1166
1167   charset = CHAR_CHARSET (c);
1168   return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1169 }
1170
1171 /* Return the width of character of which multi-byte form starts with
1172    C.  The width is measured by how many columns occupied on the
1173    screen when displayed in the current buffer.  */
1174
1175 #define ONE_BYTE_CHAR_WIDTH(c)                                          \
1176   (c < 0x20                                                             \
1177    ? (c == '\t'                                                         \
1178       ? XFASTINT (current_buffer->tab_width)                            \
1179       : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2)))   \
1180    : (c < 0x7f                                                          \
1181       ? 1                                                               \
1182       : (c == 0x7F                                                      \
1183          ? (NILP (current_buffer->ctl_arrow) ? 4 : 2)                   \
1184          : ((! NILP (current_buffer->enable_multibyte_characters)       \
1185              && BASE_LEADING_CODE_P (c))                                \
1186             ? WIDTH_BY_CHAR_HEAD (c)                                    \
1187             : 4))))
1188
1189 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1190   "Return width of CHAR when displayed in the current buffer.\n\
1191 The width is measured by how many columns it occupies on the screen.")
1192   (ch)
1193        Lisp_Object ch;
1194 {
1195   Lisp_Object val, disp;
1196   int c;
1197   struct Lisp_Char_Table *dp = buffer_display_table ();
1198
1199   CHECK_NUMBER (ch, 0);
1200
1201   c = XINT (ch);
1202
1203   /* Get the way the display table would display it.  */
1204   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1205
1206   if (VECTORP (disp))
1207     XSETINT (val, XVECTOR (disp)->size);
1208   else if (SINGLE_BYTE_CHAR_P (c))
1209     XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1210   else
1211     {
1212       int charset = CHAR_CHARSET (c);
1213
1214       XSETFASTINT (val, CHARSET_WIDTH (charset));
1215     }
1216   return val;
1217 }
1218
1219 /* Return width of string STR of length LEN when displayed in the
1220    current buffer.  The width is measured by how many columns it
1221    occupies on the screen.  */
1222
1223 int
1224 strwidth (str, len)
1225      unsigned char *str;
1226      int len;
1227 {
1228   unsigned char *endp = str + len;
1229   int width = 0;
1230   struct Lisp_Char_Table *dp = buffer_display_table ();
1231
1232   while (str < endp)
1233     {
1234       Lisp_Object disp;
1235       int thislen;
1236       int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1237
1238       /* Get the way the display table would display it.  */
1239       if (dp)
1240         disp = DISP_CHAR_VECTOR (dp, c);
1241       else
1242         disp = Qnil;
1243
1244       if (VECTORP (disp))
1245         width += XVECTOR (disp)->size;
1246       else
1247         width += ONE_BYTE_CHAR_WIDTH (*str);
1248
1249       str += thislen;
1250     }
1251   return width;
1252 }
1253
1254 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1255   "Return width of STRING when displayed in the current buffer.\n\
1256 Width is measured by how many columns it occupies on the screen.\n\
1257 When calculating width of a multibyte character in STRING,\n\
1258 only the base leading-code is considered; the validity of\n\
1259 the following bytes is not checked.")
1260   (str)
1261      Lisp_Object str;
1262 {
1263   Lisp_Object val;
1264
1265   CHECK_STRING (str, 0);
1266   XSETFASTINT (val, strwidth (XSTRING (str)->data,
1267                               STRING_BYTES (XSTRING (str))));
1268   return val;
1269 }
1270
1271 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1272   "Return the direction of CHAR.\n\
1273 The returned value is 0 for left-to-right and 1 for right-to-left.")
1274   (ch)
1275      Lisp_Object ch;
1276 {
1277   int charset;
1278
1279   CHECK_NUMBER (ch, 0);
1280   charset = CHAR_CHARSET (XFASTINT (ch));
1281   if (!CHARSET_DEFINED_P (charset))
1282     invalid_character (XINT (ch));
1283   return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1284 }
1285
1286 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1287   "Return number of characters between BEG and END.")
1288   (beg, end)
1289      Lisp_Object beg, end;
1290 {
1291   int from, to;
1292
1293   CHECK_NUMBER_COERCE_MARKER (beg, 0);
1294   CHECK_NUMBER_COERCE_MARKER (end, 1);
1295
1296   from = min (XFASTINT (beg), XFASTINT (end));
1297   to = max (XFASTINT (beg), XFASTINT (end));
1298
1299   return make_number (to - from);
1300 }
1301
1302 /* Return the number of characters in the NBYTES bytes at PTR.
1303    This works by looking at the contents and checking for multibyte sequences.
1304    However, if the current buffer has enable-multibyte-characters = nil,
1305    we treat each byte as a character.  */
1306
1307 int
1308 chars_in_text (ptr, nbytes)
1309      unsigned char *ptr;
1310      int nbytes;
1311 {
1312   /* current_buffer is null at early stages of Emacs initialization.  */
1313   if (current_buffer == 0
1314       || NILP (current_buffer->enable_multibyte_characters))
1315     return nbytes;
1316
1317   return multibyte_chars_in_text (ptr, nbytes);
1318 }
1319
1320 /* Return the number of characters in the NBYTES bytes at PTR.
1321    This works by looking at the contents and checking for multibyte sequences.
1322    It ignores enable-multibyte-characters.  */
1323
1324 int
1325 multibyte_chars_in_text (ptr, nbytes)
1326      unsigned char *ptr;
1327      int nbytes;
1328 {
1329   unsigned char *endp;
1330   int chars, bytes;
1331
1332   endp = ptr + nbytes;
1333   chars = 0;
1334
1335   while (ptr < endp)
1336     {
1337       if (BASE_LEADING_CODE_P (*ptr))
1338         {
1339           PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1340           ptr += bytes;
1341           nbytes -= bytes;
1342         }
1343       else
1344         {
1345           ptr++;
1346           nbytes--;
1347         }
1348       chars++;
1349     }
1350
1351   return chars;
1352 }
1353
1354 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1355   "Concatenate all the argument characters and make the result a string.")
1356   (n, args)
1357      int n;
1358      Lisp_Object *args;
1359 {
1360   int i;
1361   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1362   unsigned char *p = buf;
1363   Lisp_Object val;
1364   int c, multibyte_p = 0;
1365
1366   for (i = 0; i < n; i++)
1367     {
1368       CHECK_NUMBER (args[i], 0);
1369       c = XINT (args[i]);
1370       p += CHAR_STRING (c, p);
1371
1372       if (!SINGLE_BYTE_CHAR_P (c))
1373         multibyte_p = 1;
1374     }
1375
1376   /* Here, we can't use make_string_from_bytes because of the byte
1377      combining problem.  Make a multibyte string if there is any
1378      multibyte character in ARGS to make sure that `(string 2276)'
1379      returns a multibyte string if running --unibyte.  */
1380   if (multibyte_p)
1381     val = make_multibyte_string (buf, n, p - buf);
1382   else
1383     val = make_unibyte_string (buf, p - buf);
1384   return val;
1385 }
1386
1387 #endif /* emacs */
1388 \f
1389 int
1390 charset_id_internal (charset_name)
1391      char *charset_name;
1392 {
1393   Lisp_Object val;
1394
1395   val= Fget (intern (charset_name), Qcharset);
1396   if (!VECTORP (val))
1397     error ("Charset %s is not defined", charset_name);
1398
1399   return (XINT (XVECTOR (val)->contents[0]));
1400 }
1401
1402 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1403        Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1404    ()
1405 {
1406   charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1407   charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1408   charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1409   charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1410   charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1411   charset_big5_1 = charset_id_internal ("chinese-big5-1");
1412   charset_big5_2 = charset_id_internal ("chinese-big5-2");
1413   return Qnil;
1414 }
1415
1416 void
1417 init_charset_once ()
1418 {
1419   int i, j, k;
1420
1421   staticpro (&Vcharset_table);
1422   staticpro (&Vcharset_symbol_table);
1423   staticpro (&Vgeneric_character_list);
1424
1425   /* This has to be done here, before we call Fmake_char_table.  */
1426   Qcharset_table = intern ("charset-table");
1427   staticpro (&Qcharset_table);
1428
1429   /* Intern this now in case it isn't already done.
1430      Setting this variable twice is harmless.
1431      But don't staticpro it here--that is done in alloc.c.  */
1432   Qchar_table_extra_slots = intern ("char-table-extra-slots");
1433
1434   /* Now we are ready to set up this property, so we can
1435      create the charset table.  */
1436   Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1437   Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1438
1439   Qunknown = intern ("unknown");
1440   staticpro (&Qunknown);
1441   Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1442                                         Qunknown);
1443
1444   /* Setup tables.  */
1445   for (i = 0; i < 2; i++)
1446     for (j = 0; j < 2; j++)
1447       for (k = 0; k < 128; k++)
1448         iso_charset_table [i][j][k] = -1;
1449
1450   for (i = 0; i < 256; i++)
1451     BYTES_BY_CHAR_HEAD (i) = 1;
1452   for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
1453        i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
1454     BYTES_BY_CHAR_HEAD (i) = 2;
1455   for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
1456        i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1457     BYTES_BY_CHAR_HEAD (i) = 3;
1458   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1459   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1460   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1461   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1462   /* The followings don't reflect the actual bytes, but just to tell
1463      that it is a start of a multibyte character.  */
1464   BYTES_BY_CHAR_HEAD (0x80) = 2;
1465   BYTES_BY_CHAR_HEAD (0x9E) = 2;
1466   BYTES_BY_CHAR_HEAD (0x9F) = 2;
1467
1468   for (i = 0; i < 128; i++)
1469     WIDTH_BY_CHAR_HEAD (i) = 1;
1470   for (; i < 256; i++)
1471     WIDTH_BY_CHAR_HEAD (i) = 4;
1472   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1473   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1474   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1475   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1476
1477   {
1478     Lisp_Object val;
1479
1480     val = Qnil;
1481     for (i = 0x81; i < 0x90; i++)
1482       val = Fcons (make_number ((i - 0x70) << 7), val);
1483     for (; i < 0x9A; i++)
1484       val = Fcons (make_number ((i - 0x8F) << 14), val);
1485     for (i = 0xA0; i < 0xF0; i++)
1486       val = Fcons (make_number ((i - 0x70) << 7), val);
1487     for (; i < 0xFF; i++)
1488       val = Fcons (make_number ((i - 0xE0) << 14), val);
1489     Vgeneric_character_list = Fnreverse (val);
1490   }
1491
1492   nonascii_insert_offset = 0;
1493   Vnonascii_translation_table = Qnil;
1494 }
1495
1496 #ifdef emacs
1497
1498 void
1499 syms_of_charset ()
1500 {
1501   Qascii = intern ("ascii");
1502   staticpro (&Qascii);
1503
1504   Qcharset = intern ("charset");
1505   staticpro (&Qcharset);
1506
1507   /* Define ASCII charset now.  */
1508   update_charset_table (make_number (CHARSET_ASCII),
1509                         make_number (1), make_number (94),
1510                         make_number (1),
1511                         make_number (0),
1512                         make_number ('B'),
1513                         make_number (0),
1514                         build_string ("ASCII"),
1515                         build_string ("ASCII"),
1516                         build_string ("ASCII (ISO646 IRV)"));
1517   CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1518   Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1519
1520   Qauto_fill_chars = intern ("auto-fill-chars");
1521   staticpro (&Qauto_fill_chars);
1522   Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1523
1524   defsubr (&Sdefine_charset);
1525   defsubr (&Sgeneric_character_list);
1526   defsubr (&Sget_unused_iso_final_char);
1527   defsubr (&Sdeclare_equiv_charset);
1528   defsubr (&Sfind_charset_region);
1529   defsubr (&Sfind_charset_string);
1530   defsubr (&Smake_char_internal);
1531   defsubr (&Ssplit_char);
1532   defsubr (&Schar_charset);
1533   defsubr (&Scharset_after);
1534   defsubr (&Siso_charset);
1535   defsubr (&Schar_valid_p);
1536   defsubr (&Sunibyte_char_to_multibyte);
1537   defsubr (&Smultibyte_char_to_unibyte);
1538   defsubr (&Schar_bytes);
1539   defsubr (&Schar_width);
1540   defsubr (&Sstring_width);
1541   defsubr (&Schar_direction);
1542   defsubr (&Schars_in_region);
1543   defsubr (&Sstring);
1544   defsubr (&Ssetup_special_charsets);
1545
1546   DEFVAR_LISP ("charset-list", &Vcharset_list,
1547     "List of charsets ever defined.");
1548   Vcharset_list = Fcons (Qascii, Qnil);
1549
1550   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
1551     "Vector of cons cell of a symbol and translation table ever defined.\n\
1552 An ID of a translation table is an index of this vector.");
1553   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1554
1555   DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1556     "Leading-code of private TYPE9N charset of column-width 1.");
1557   leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1558
1559   DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1560     "Leading-code of private TYPE9N charset of column-width 2.");
1561   leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1562
1563   DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1564     "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1565   leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1566
1567   DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1568     "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1569   leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1570
1571   DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1572     "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1573 This is used for converting unibyte text to multibyte,\n\
1574 and for inserting character codes specified by number.\n\n\
1575 This serves to convert a Latin-1 or similar 8-bit character code\n\
1576 to the corresponding Emacs multibyte character code.\n\
1577 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1578 for your choice of character set.\n\
1579 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1580   nonascii_insert_offset = 0;
1581
1582   DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1583     "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1584 This is used for converting unibyte text to multibyte,\n\
1585 and for inserting character codes specified by number.\n\n\
1586 Conversion is performed only when multibyte characters are enabled,\n\
1587 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1588 to the corresponding Emacs character code.\n\n\
1589 If this is nil, `nonascii-insert-offset' is used instead.\n\
1590 See also the docstring of `make-translation-table'.");
1591   Vnonascii_translation_table = Qnil;
1592
1593   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1594     "A char-table for characters which invoke auto-filling.\n\
1595 Such characters have value t in this table.");
1596   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1597   CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1598   CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1599 }
1600
1601 #endif /* emacs */