src/charset.c

   1 /* Basic multilingual character support.
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* At first, see the document in `charset.h' to understand the code in
  23    this file.  */
  24
  25 #ifdef emacs
  26 #include <config.h>
  27 #endif
  28
  29 #include <stdio.h>
  30
  31 #ifdef emacs
  32
  33 #include <sys/types.h>
  34 #include "lisp.h"
  35 #include "buffer.h"
  36 #include "charset.h"
  37 #include "coding.h"
  38 #include "disptab.h"
  39
  40 #else  /* not emacs */
  41
  42 #include "mulelib.h"
  43
  44 #endif /* emacs */
  45
  46 Lisp_Object Qcharset, Qascii, Qeight_bit_control, Qeight_bit_graphic;
  47 Lisp_Object Qunknown;
  48
  49 /* Declaration of special leading-codes.  */
  50 int leading_code_private_11;    /* for private DIMENSION1 of 1-column */
  51 int leading_code_private_12;    /* for private DIMENSION1 of 2-column */
  52 int leading_code_private_21;    /* for private DIMENSION2 of 1-column */
  53 int leading_code_private_22;    /* for private DIMENSION2 of 2-column */
  54
  55 /* Declaration of special charsets.  The values are set by
  56    Fsetup_special_charsets.  */
  57 int charset_latin_iso8859_1;    /* ISO8859-1 (Latin-1) */
  58 int charset_jisx0208_1978;      /* JISX0208.1978 (Japanese Kanji old set) */
  59 int charset_jisx0208;           /* JISX0208.1983 (Japanese Kanji) */
  60 int charset_katakana_jisx0201;  /* JISX0201.Kana (Japanese Katakana) */
  61 int charset_latin_jisx0201;     /* JISX0201.Roman (Japanese Roman) */
  62 int charset_big5_1;             /* Big5 Level 1 (Chinese Traditional) */
  63 int charset_big5_2;             /* Big5 Level 2 (Chinese Traditional) */
  64
  65 Lisp_Object Qcharset_table;
  66
  67 /* A char-table containing information of each character set.  */
  68 Lisp_Object Vcharset_table;
  69
  70 /* A vector of charset symbol indexed by charset-id.  This is used
  71    only for returning charset symbol from C functions.  */
  72 Lisp_Object Vcharset_symbol_table;
  73
  74 /* A list of charset symbols ever defined.  */
  75 Lisp_Object Vcharset_list;
  76
  77 /* Vector of translation table ever defined.
  78    ID of a translation table is used to index this vector.  */
  79 Lisp_Object Vtranslation_table_vector;
  80
  81 /* A char-table for characters which may invoke auto-filling.  */
  82 Lisp_Object Vauto_fill_chars;
  83
  84 Lisp_Object Qauto_fill_chars;
  85
  86 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD.  */
  87 int bytes_by_char_head[256];
  88 int width_by_char_head[256];
  89
  90 /* Mapping table from ISO2022's charset (specified by DIMENSION,
  91    CHARS, and FINAL-CHAR) to Emacs' charset.  */
  92 int iso_charset_table[2][2][128];
  93
  94 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  95 unsigned char *_fetch_multibyte_char_p;
  96 int _fetch_multibyte_char_len;
  97
  98 /* Offset to add to a non-ASCII value when inserting it.  */
  99 int nonascii_insert_offset;
 100
 101 /* Translation table for converting non-ASCII unibyte characters
 102    to multibyte codes, or nil.  */
 103 Lisp_Object Vnonascii_translation_table;
 104
 105 /* List of all possible generic characters.  */
 106 Lisp_Object Vgeneric_character_list;
 107
 108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
 109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
 110 \f
 111 void
 112 invalid_character (c)
 113      int c;
 114 {
 115   error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
 116 }
 117
 118 /* Parse string STR of length LENGTH and fetch information of a
 119    character at STR.  Set BYTES to the byte length the character
 120    occupies, CHARSET, C1, C2 to proper values of the character. */
 121
 122 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)             \
 123   do {                                                                       \
 124     (c1) = *(str);                                                           \
 125     (bytes) = BYTES_BY_CHAR_HEAD (c1);                                       \
 126     if ((bytes) == 1)                                                        \
 127       (charset) = ASCII_BYTE_P (c1) ? CHARSET_ASCII : CHARSET_8_BIT_GRAPHIC; \
 128     else if ((bytes) == 2)                                                   \
 129       {                                                                      \
 130         if ((c1) == LEADING_CODE_8_BIT_CONTROL)                              \
 131           (charset) = CHARSET_8_BIT_CONTROL, (c1) = (str)[1] - 0x20;         \
 132         else                                                                 \
 133           (charset) = (c1), (c1) = (str)[1] & 0x7F;                          \
 134       }                                                                      \
 135     else if ((bytes) == 3)                                                   \
 136       {                                                                      \
 137         if ((c1) < LEADING_CODE_PRIVATE_11)                                  \
 138           (charset) = (c1), (c1) = (str)[1] & 0x7F, (c2) = (str)[2] & 0x7F;  \
 139         else                                                                 \
 140           (charset) = (str)[1], (c1) = (str)[2] & 0x7F;                      \
 141       }                                                                      \
 142     else                                                                     \
 143       (charset) = (str)[1], (c1) = (str)[2] & 0x7F, (c2) = (str)[3] & 0x7F;  \
 144   } while (0)
 145
 146 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.  */
 147 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2)        \
 148   ((charset) == CHARSET_ASCII                           \
 149    ? ((c1) >= 0 && (c1) <= 0x7F)                        \
 150    : ((charset) == CHARSET_8_BIT_CONTROL                \
 151       ? ((c1) >= 0x80 && (c1) <= 0x9F)                  \
 152       : ((charset) == CHARSET_8_BIT_GRAPHIC             \
 153          ? ((c1) >= 0x80 && (c1) <= 0xFF)               \
 154          : (CHARSET_DIMENSION (charset) == 1            \
 155             ? ((c1) >= 0x20 && (c1) <= 0x7F)            \
 156             : ((c1) >= 0x20 && (c1) <= 0x7F             \
 157                && (c2) >= 0x20 && (c2) <= 0x7F)))))
 158
 159 /* Store multi-byte form of the character C in STR.  The caller should
 160    allocate at least 4-byte area at STR in advance.  Returns the
 161    length of the multi-byte form.  If C is an invalid character code,
 162    signal an error.
 163
 164    Use macro `CHAR_STRING (C, STR)' instead of calling this function
 165    directly if C can be an ASCII character.  */
 166
 167 int
 168 char_to_string (c, str)
 169      int c;
 170      unsigned char *str;
 171 {
 172   unsigned char *p = str;
 173
 174   if (c & CHAR_MODIFIER_MASK)   /* This includes the case C is negative.  */
 175     {
 176       /* Multibyte character can't have a modifier bit.  */
 177       if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 178         invalid_character (c);
 179
 180       /* For Meta, Shift, and Control modifiers, we need special care.  */
 181       if (c & CHAR_META)
 182         {
 183           /* Move the meta bit to the right place for a string.  */
 184           c = (c & ~CHAR_META) | 0x80;
 185         }
 186       if (c & CHAR_SHIFT)
 187         {
 188           /* Shift modifier is valid only with [A-Za-z].  */
 189           if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 190             c &= ~CHAR_SHIFT;
 191           else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 192             c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 193         }
 194       if (c & CHAR_CTL)
 195         {
 196           /* Simulate the code in lread.c.  */
 197           /* Allow `\C- ' and `\C-?'.  */
 198           if (c == (CHAR_CTL | ' '))
 199             c = 0;
 200           else if (c == (CHAR_CTL | '?'))
 201             c = 127;
 202           /* ASCII control chars are made from letters (both cases),
 203              as well as the non-letters within 0100...0137.  */
 204           else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 205             c &= (037 | (~0177 & ~CHAR_CTL));
 206           else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 207             c &= (037 | (~0177 & ~CHAR_CTL));
 208         }
 209
 210       /* If C still has any modifier bits, it is an invalid character.  */
 211       if (c & CHAR_MODIFIER_MASK)
 212         invalid_character (c);
 213     }
 214   if (SINGLE_BYTE_CHAR_P (c))
 215     {
 216       if (ASCII_BYTE_P (c) || c >= 0xA0)
 217         *p++ = c;
 218       else
 219         {
 220           *p++ = LEADING_CODE_8_BIT_CONTROL;
 221           *p++ = c + 0x20;
 222         }
 223     }
 224   else if (CHAR_VALID_P (c))
 225     {
 226       int charset, c1, c2;
 227
 228       SPLIT_CHAR (c, charset, c1, c2);
 229
 230       if (charset >= LEADING_CODE_EXT_11)
 231         *p++ = (charset < LEADING_CODE_EXT_12
 232                 ? LEADING_CODE_PRIVATE_11
 233                 : (charset < LEADING_CODE_EXT_21
 234                    ? LEADING_CODE_PRIVATE_12
 235                    : (charset < LEADING_CODE_EXT_22
 236                       ? LEADING_CODE_PRIVATE_21
 237                       : LEADING_CODE_PRIVATE_22)));
 238       *p++ = charset;
 239       if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
 240         invalid_character (c);
 241       if (c1)
 242         {
 243           *p++ = c1 | 0x80;
 244           if (c2 > 0)
 245             *p++ = c2 | 0x80;
 246         }
 247     }
 248   else
 249     invalid_character (c);
 250
 251   return (p - str);
 252 }
 253
 254 /* Return the non-ASCII character corresponding to multi-byte form at
 255    STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
 256    length of the multibyte form in *ACTUAL_LEN.
 257
 258    Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
 259    this function directly if you want ot handle ASCII characters as
 260    well.  */
 261
 262 int
 263 string_to_char (str, len, actual_len)
 264      const unsigned char *str;
 265      int len, *actual_len;
 266 {
 267   int c, bytes, charset, c1, c2;
 268
 269   SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 270   c = MAKE_CHAR (charset, c1, c2);
 271   if (actual_len)
 272     *actual_len = bytes;
 273   return c;
 274 }
 275
 276 /* Return the length of the multi-byte form at string STR of length LEN.
 277    Use the macro MULTIBYTE_FORM_LENGTH instead.  */
 278 int
 279 multibyte_form_length (str, len)
 280      const unsigned char *str;
 281      int len;
 282 {
 283   int bytes;
 284
 285   PARSE_MULTIBYTE_SEQ (str, len, bytes);
 286   return bytes;
 287 }
 288
 289 /* Check multibyte form at string STR of length LEN and set variables
 290    pointed by CHARSET, C1, and C2 to charset and position codes of the
 291    character at STR, and return 0.  If there's no multibyte character,
 292    return -1.  This should be used only in the macro SPLIT_STRING
 293    which checks range of STR in advance.  */
 294
 295 int
 296 split_string (str, len, charset, c1, c2)
 297      const unsigned char *str;
 298      unsigned char *c1, *c2;
 299      int len, *charset;
 300 {
 301   register int bytes, cs, code1, code2 = -1;
 302
 303   SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
 304   if (cs == CHARSET_ASCII)
 305     return -1;
 306   *charset = cs;
 307   *c1 = code1;
 308   *c2 = code2;
 309   return 0;
 310 }
 311
 312 /* Return 1 iff character C has valid printable glyph.
 313    Use the macro CHAR_PRINTABLE_P instead.  */
 314 int
 315 char_printable_p (c)
 316      int c;
 317 {
 318   int charset, c1, c2, chars;
 319
 320   if (ASCII_BYTE_P (c))
 321     return 1;
 322   else if (SINGLE_BYTE_CHAR_P (c))
 323     return 0;
 324   else if (c >= MAX_CHAR)
 325     return 0;
 326
 327   SPLIT_CHAR (c, charset, c1, c2);
 328   if (! CHARSET_DEFINED_P (charset))
 329     return 0;
 330   if (CHARSET_CHARS (charset) == 94
 331       ? c1 <= 32 || c1 >= 127
 332       : c1 < 32)
 333     return 0;
 334   if (CHARSET_DIMENSION (charset) == 2
 335       && (CHARSET_CHARS (charset) == 94
 336           ? c2 <= 32 || c2 >= 127
 337           : c2 < 32))
 338     return 0;
 339   return 1;
 340 }
 341
 342 /* Translate character C by translation table TABLE.  If C
 343    is negative, translate a character specified by CHARSET, C1, and C2
 344    (C1 and C2 are code points of the character).  If no translation is
 345    found in TABLE, return C.  */
 346 int
 347 translate_char (table, c, charset, c1, c2)
 348      Lisp_Object table;
 349      int c, charset, c1, c2;
 350 {
 351   Lisp_Object ch;
 352   int alt_charset, alt_c1, alt_c2, dimension;
 353
 354   if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
 355   if (!CHAR_TABLE_P (table)
 356       || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
 357     return c;
 358
 359   SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
 360   dimension = CHARSET_DIMENSION (alt_charset);
 361   if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
 362     /* CH is not a generic character, just return it.  */
 363     return XFASTINT (ch);
 364
 365   /* Since CH is a generic character, we must return a specific
 366      charater which has the same position codes as C from CH.  */
 367   if (charset < 0)
 368     SPLIT_CHAR (c, charset, c1, c2);
 369   if (dimension != CHARSET_DIMENSION (charset))
 370     /* We can't make such a character because of dimension mismatch.  */
 371     return c;
 372   return MAKE_CHAR (alt_charset, c1, c2);
 373 }
 374
 375 /* Convert the unibyte character C to multibyte based on
 376    Vnonascii_translation_table or nonascii_insert_offset.  If they can't
 377    convert C to a valid multibyte character, convert it based on
 378    DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
 379
 380 int
 381 unibyte_char_to_multibyte (c)
 382      int c;
 383 {
 384   if (c < 0400 && c >= 0200)
 385     {
 386       int c_save = c;
 387
 388       if (! NILP (Vnonascii_translation_table))
 389         {
 390           c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
 391           if (c >= 0400 && ! char_valid_p (c, 0))
 392             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 393         }
 394       else if (c >= 0240 && nonascii_insert_offset > 0)
 395         {
 396           c += nonascii_insert_offset;
 397           if (c < 0400 || ! char_valid_p (c, 0))
 398             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 399         }
 400       else if (c >= 0240)
 401         c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 402     }
 403   return c;
 404 }
 405
 406
 407 /* Convert the multibyte character C to unibyte 8-bit character based
 408    on Vnonascii_translation_table or nonascii_insert_offset.  If
 409    REV_TBL is non-nil, it should be a reverse table of
 410    Vnonascii_translation_table, i.e. what given by:
 411      Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */
 412
 413 int
 414 multibyte_char_to_unibyte (c, rev_tbl)
 415      int c;
 416      Lisp_Object rev_tbl;
 417 {
 418   if (!SINGLE_BYTE_CHAR_P (c))
 419     {
 420       int c_save = c;
 421
 422       if (! CHAR_TABLE_P (rev_tbl)
 423           && CHAR_TABLE_P (Vnonascii_translation_table))
 424         rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
 425                                           make_number (0));
 426       if (CHAR_TABLE_P (rev_tbl))
 427         {
 428           Lisp_Object temp;
 429           temp = Faref (rev_tbl, make_number (c));
 430           if (INTEGERP (temp))
 431             c = XINT (temp);
 432           if (c >= 256)
 433             c = (c_save & 0177) + 0200;
 434         }
 435       else
 436         {
 437           if (nonascii_insert_offset > 0)
 438             c -= nonascii_insert_offset;
 439           if (c < 128 || c >= 256)
 440             c = (c_save & 0177) + 0200;
 441         }
 442     }
 443
 444   return c;
 445 }
 446
 447 \f
 448 /* Update the table Vcharset_table with the given arguments (see the
 449    document of `define-charset' for the meaning of each argument).
 450    Several other table contents are also updated.  The caller should
 451    check the validity of CHARSET-ID and the remaining arguments in
 452    advance.  */
 453
 454 void
 455 update_charset_table (charset_id, dimension, chars, width, direction,
 456                       iso_final_char, iso_graphic_plane,
 457                       short_name, long_name, description)
 458      Lisp_Object charset_id, dimension, chars, width, direction;
 459      Lisp_Object iso_final_char, iso_graphic_plane;
 460      Lisp_Object short_name, long_name, description;
 461 {
 462   int charset = XINT (charset_id);
 463   int bytes;
 464   unsigned char leading_code_base, leading_code_ext;
 465
 466   if (NILP (CHARSET_TABLE_ENTRY (charset)))
 467     CHARSET_TABLE_ENTRY (charset)
 468       = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
 469
 470   if (NILP (long_name))
 471     long_name = short_name;
 472   if (NILP (description))
 473     description = long_name;
 474
 475   /* Get byte length of multibyte form, base leading-code, and
 476      extended leading-code of the charset.  See the comment under the
 477      title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
 478   bytes = XINT (dimension);
 479   if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 480     {
 481       /* Official charset, it doesn't have an extended leading-code.  */
 482       if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC)
 483         bytes += 1; /* For a base leading-code.  */
 484       leading_code_base = charset;
 485       leading_code_ext = 0;
 486     }
 487   else
 488     {
 489       /* Private charset.  */
 490       bytes += 2; /* For base and extended leading-codes.  */
 491       leading_code_base
 492         = (charset < LEADING_CODE_EXT_12
 493            ? LEADING_CODE_PRIVATE_11
 494            : (charset < LEADING_CODE_EXT_21
 495               ? LEADING_CODE_PRIVATE_12
 496               : (charset < LEADING_CODE_EXT_22
 497                  ? LEADING_CODE_PRIVATE_21
 498                  : LEADING_CODE_PRIVATE_22)));
 499       leading_code_ext = charset;
 500       if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
 501         error ("Invalid dimension for the charset-ID %d", charset);
 502     }
 503
 504   CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
 505   CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
 506   CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
 507   CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
 508   CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
 509   CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
 510   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
 511     = make_number (leading_code_base);
 512   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
 513     = make_number (leading_code_ext);
 514   CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
 515   CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
 516     = iso_graphic_plane;
 517   CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
 518   CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
 519   CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
 520   CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
 521
 522   {
 523     /* If we have already defined a charset which has the same
 524        DIMENSION, CHARS and ISO-FINAL-CHAR but the different
 525        DIRECTION, we must update the entry REVERSE-CHARSET of both
 526        charsets.  If there's no such charset, the value of the entry
 527        is set to nil.  */
 528     int i;
 529
 530     for (i = 0; i <= MAX_CHARSET; i++)
 531       if (!NILP (CHARSET_TABLE_ENTRY (i)))
 532         {
 533           if (CHARSET_DIMENSION (i) == XINT (dimension)
 534               && CHARSET_CHARS (i) == XINT (chars)
 535               && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
 536               && CHARSET_DIRECTION (i) != XINT (direction))
 537             {
 538               CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 539                 = make_number (i);
 540               CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
 541               break;
 542             }
 543         }
 544     if (i > MAX_CHARSET)
 545       /* No such a charset.  */
 546       CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 547         = make_number (-1);
 548   }
 549
 550   if (charset != CHARSET_ASCII && charset != CHARSET_8_BIT_GRAPHIC
 551       && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 552     {
 553       bytes_by_char_head[leading_code_base] = bytes;
 554       width_by_char_head[leading_code_base] = XINT (width);
 555
 556       /* Update table emacs_code_class.  */
 557       emacs_code_class[charset] = (bytes == 2
 558                                    ? EMACS_leading_code_2
 559                                    : (bytes == 3
 560                                       ? EMACS_leading_code_3
 561                                       : EMACS_leading_code_4));
 562     }
 563
 564   /* Update table iso_charset_table.  */
 565   if (XINT (iso_final_char) >= 0
 566       && ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
 567     ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
 568 }
 569
 570 #ifdef emacs
 571
 572 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
 573    is invalid.  */
 574 int
 575 get_charset_id (charset_symbol)
 576      Lisp_Object charset_symbol;
 577 {
 578   Lisp_Object val;
 579   int charset;
 580
 581   return ((SYMBOLP (charset_symbol)
 582            && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
 583            && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
 584                CHARSET_VALID_P (charset)))
 585           ? charset : -1);
 586 }
 587
 588 /* Return an identification number for a new private charset of
 589    DIMENSION and WIDTH.  If there's no more room for the new charset,
 590    return 0.  */
 591 Lisp_Object
 592 get_new_private_charset_id (dimension, width)
 593      int dimension, width;
 594 {
 595   int charset, from, to;
 596
 597   if (dimension == 1)
 598     {
 599       if (width == 1)
 600         from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
 601       else
 602         from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
 603     }
 604   else
 605     {
 606       if (width == 1)
 607         from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
 608       else
 609         from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
 610     }
 611
 612   for (charset = from; charset < to; charset++)
 613     if (!CHARSET_DEFINED_P (charset)) break;
 614
 615   return make_number (charset < to ? charset : 0);
 616 }
 617
 618 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
 619   "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
 620 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
 621  treated as a private charset.\n\
 622 INFO-VECTOR is a vector of the format:\n\
 623    [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
 624     SHORT-NAME LONG-NAME DESCRIPTION]\n\
 625 The meanings of each elements is as follows:\n\
 626 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
 627 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
 628 WIDTH (integer) is the number of columns a character in the charset\n\
 629 occupies on the screen: one of 0, 1, and 2.\n\
 630 \n\
 631 DIRECTION (integer) is the rendering direction of characters in the\n\
 632 charset when rendering.  If 0, render from left to right, else\n\
 633 render from right to left.\n\
 634 \n\
 635 ISO-FINAL-CHAR (character) is the final character of the\n\
 636 corresponding ISO 2022 charset.\n\
 637 It may be -1 if the charset is internal use only.\n\
 638 \n\
 639 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
 640 while encoding to variants of ISO 2022 coding system, one of the\n\
 641 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
 642 It may be -1 if the charset is internal use only.\n\
 643 \n\
 644 SHORT-NAME (string) is the short name to refer to the charset.\n\
 645 \n\
 646 LONG-NAME (string) is the long name to refer to the charset.\n\
 647 \n\
 648 DESCRIPTION (string) is the description string of the charset.")
 649   (charset_id, charset_symbol, info_vector)
 650      Lisp_Object charset_id, charset_symbol, info_vector;
 651 {
 652   Lisp_Object *vec;
 653
 654   if (!NILP (charset_id))
 655     CHECK_NUMBER (charset_id, 0);
 656   CHECK_SYMBOL (charset_symbol, 1);
 657   CHECK_VECTOR (info_vector, 2);
 658
 659   if (! NILP (charset_id))
 660     {
 661       if (! CHARSET_VALID_P (XINT (charset_id)))
 662         error ("Invalid CHARSET: %d", XINT (charset_id));
 663       else if (CHARSET_DEFINED_P (XINT (charset_id)))
 664         error ("Already defined charset: %d", XINT (charset_id));
 665     }
 666
 667   vec = XVECTOR (info_vector)->contents;
 668   if (XVECTOR (info_vector)->size != 9
 669       || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
 670       || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
 671       || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
 672       || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
 673       || !INTEGERP (vec[4])
 674       || !(XINT (vec[4]) == -1 || XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
 675       || !INTEGERP (vec[5])
 676       || !(XINT (vec[5]) == -1 || XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
 677       || !STRINGP (vec[6])
 678       || !STRINGP (vec[7])
 679       || !STRINGP (vec[8]))
 680     error ("Invalid info-vector argument for defining charset %s",
 681            XSYMBOL (charset_symbol)->name->data);
 682
 683   if (NILP (charset_id))
 684     {
 685       charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
 686       if (XINT (charset_id) == 0)
 687         error ("There's no room for a new private charset %s",
 688                XSYMBOL (charset_symbol)->name->data);
 689     }
 690
 691   update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
 692                         vec[4], vec[5], vec[6], vec[7], vec[8]);
 693   Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
 694   CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
 695   Vcharset_list = Fcons (charset_symbol, Vcharset_list);
 696   return Qnil;
 697 }
 698
 699 DEFUN ("generic-character-list", Fgeneric_character_list,
 700        Sgeneric_character_list, 0, 0, 0,
 701   "Return a list of all possible generic characters.\n\
 702 It includes a generic character for a charset not yet defined.")
 703   ()
 704 {
 705   return Vgeneric_character_list;
 706 }
 707
 708 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
 709        Sget_unused_iso_final_char, 2, 2, 0,
 710   "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
 711 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
 712 CHARS is the number of characters in a dimension: 94 or 96.\n\
 713 \n\
 714 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
 715 If there's no unused final char for the specified kind of charset,\n\
 716 return nil.")
 717   (dimension, chars)
 718      Lisp_Object dimension, chars;
 719 {
 720   int final_char;
 721
 722   CHECK_NUMBER (dimension, 0);
 723   CHECK_NUMBER (chars, 1);
 724   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 725     error ("Invalid charset dimension %d, it should be 1 or 2",
 726            XINT (dimension));
 727   if (XINT (chars) != 94 && XINT (chars) != 96)
 728     error ("Invalid charset chars %d, it should be 94 or 96",
 729            XINT (chars));
 730   for (final_char = '0'; final_char <= '?'; final_char++)
 731     {
 732       if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
 733         break;
 734     }
 735   return (final_char <= '?' ? make_number (final_char) : Qnil);
 736 }
 737
 738 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
 739        4, 4, 0,
 740   "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
 741 CHARSET should be defined by `defined-charset' in advance.")
 742   (dimension, chars, final_char, charset_symbol)
 743      Lisp_Object dimension, chars, final_char, charset_symbol;
 744 {
 745   int charset;
 746
 747   CHECK_NUMBER (dimension, 0);
 748   CHECK_NUMBER (chars, 1);
 749   CHECK_NUMBER (final_char, 2);
 750   CHECK_SYMBOL (charset_symbol, 3);
 751
 752   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 753     error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
 754   if (XINT (chars) != 94 && XINT (chars) != 96)
 755     error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
 756   if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
 757     error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
 758   if ((charset = get_charset_id (charset_symbol)) < 0)
 759     error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
 760
 761   ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
 762   return Qnil;
 763 }
 764
 765 /* Return information about charsets in the text at PTR of NBYTES
 766    bytes, which are NCHARS characters.  The value is:
 767
 768         0: Each character is represented by one byte.  This is always
 769            true for unibyte text.
 770         1: No charsets other than ascii eight-bit-control,
 771            eight-bit-graphic, and latin-1 are found.
 772         2: Otherwise.
 773
 774    In addition, if CHARSETS is nonzero, for each found charset N, set
 775    CHARSETS[N] to 1.  For that, callers should allocate CHARSETS
 776    (MAX_CHARSET + 1 elements) in advance.  It may lookup a translation
 777    table TABLE if supplied.  For invalid charsets, set CHARSETS[1] to
 778    1 (note that there's no charset whose ID is 1).  */
 779
 780 int
 781 find_charset_in_text (ptr, nchars, nbytes, charsets, table)
 782      unsigned char *ptr;
 783      int nchars, nbytes, *charsets;
 784      Lisp_Object table;
 785 {
 786   if (nchars == nbytes)
 787     {
 788       if (charsets && nbytes > 0)
 789         {
 790           unsigned char *endp = ptr + nbytes;
 791           int maskbits = 0;
 792
 793           while (ptr < endp && maskbits != 7)
 794             {
 795               maskbits |= (*ptr < 0x80 ? 1 : *ptr < 0xA0 ? 2 : 4);
 796               ptr++;
 797             }
 798
 799           if (maskbits & 1)
 800             charsets[CHARSET_ASCII] = 1;
 801           if (maskbits & 2)
 802             charsets[CHARSET_8_BIT_CONTROL] = 1;
 803           if (maskbits & 4)
 804             charsets[CHARSET_8_BIT_GRAPHIC] = 1;
 805         }
 806       return 0;
 807     }
 808   else
 809     {
 810       int return_val = 1;
 811       int bytes, charset, c1, c2;
 812
 813       if (! CHAR_TABLE_P (table))
 814         table = Qnil;
 815
 816       while (nchars-- > 0)
 817         {
 818           SPLIT_MULTIBYTE_SEQ (ptr, len, bytes, charset, c1, c2);
 819           ptr += bytes;
 820
 821           if (!CHARSET_DEFINED_P (charset))
 822             charset = 1;
 823           else if (! NILP (table))
 824             {
 825               int c = translate_char (table, -1, charset, c1, c2);
 826               if (c >= 0)
 827                 charset = CHAR_CHARSET (c);
 828             }
 829
 830           if (return_val == 1
 831               && charset != CHARSET_ASCII
 832               && charset != CHARSET_8_BIT_CONTROL
 833               && charset != CHARSET_8_BIT_GRAPHIC
 834               && charset != charset_latin_iso8859_1)
 835             return_val = 2;
 836
 837           if (charsets)
 838             charsets[charset] = 1;
 839           else if (return_val == 2)
 840             break;
 841         }
 842       return return_val;
 843     }
 844 }
 845
 846 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
 847        2, 3, 0,
 848   "Return a list of charsets in the region between BEG and END.\n\
 849 BEG and END are buffer positions.\n\
 850 Optional arg TABLE if non-nil is a translation table to look up.\n\
 851 \n\
 852 If the region contains invalid multiybte characters,\n\
 853 `unknown' is included in the returned list.\n\
 854 \n\
 855 If the current buffer is unibyte, the returned list may contain\n\
 856 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
 857   (beg, end, table)
 858      Lisp_Object beg, end, table;
 859 {
 860   int charsets[MAX_CHARSET + 1];
 861   int from, from_byte, to, stop, stop_byte, i;
 862   Lisp_Object val;
 863
 864   validate_region (&beg, &end);
 865   from = XFASTINT (beg);
 866   stop = to = XFASTINT (end);
 867
 868   if (from < GPT && GPT < to)
 869     {
 870       stop = GPT;
 871       stop_byte = GPT_BYTE;
 872     }
 873   else
 874     stop_byte = CHAR_TO_BYTE (stop);
 875
 876   from_byte = CHAR_TO_BYTE (from);
 877
 878   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 879   while (1)
 880     {
 881       find_charset_in_text (BYTE_POS_ADDR (from_byte), stop - from,
 882                             stop_byte - from_byte, charsets, table);
 883       if (stop < to)
 884         {
 885           from = stop, from_byte = stop_byte;
 886           stop = to, stop_byte = CHAR_TO_BYTE (stop);
 887         }
 888       else
 889         break;
 890     }
 891
 892   val = Qnil;
 893   if (charsets[1])
 894     val = Fcons (Qunknown, val);
 895   for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
 896     if (charsets[i])
 897       val = Fcons (CHARSET_SYMBOL (i), val);
 898   if (charsets[0])
 899     val = Fcons (Qascii, val);
 900   return val;
 901 }
 902
 903 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
 904        1, 2, 0,
 905   "Return a list of charsets in STR.\n\
 906 Optional arg TABLE if non-nil is a translation table to look up.\n\
 907 \n\
 908 If the region contains invalid multiybte characters,\n\
 909 `unknown' is included in the returned list.\n\
 910 \n\
 911 If STR is unibyte, the returned list may contain\n\
 912 only `ascii', `eight-bit-control', and `eight-bit-graphic'.")
 913   (str, table)
 914      Lisp_Object str, table;
 915 {
 916   int charsets[MAX_CHARSET + 1];
 917   int i;
 918   Lisp_Object val;
 919
 920   CHECK_STRING (str, 0);
 921
 922   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 923   find_charset_in_text (XSTRING (str)->data, XSTRING (str)->size,
 924                         STRING_BYTES (XSTRING (str)), charsets, table);
 925
 926   val = Qnil;
 927   if (charsets[1])
 928     val = Fcons (Qunknown, val);
 929   for (i = MAX_CHARSET; i >= MIN_CHARSET_OFFICIAL_DIMENSION1; i--)
 930     if (charsets[i])
 931       val = Fcons (CHARSET_SYMBOL (i), val);
 932   if (charsets[0])
 933     val = Fcons (Qascii, val);
 934   return val;
 935 }
 936
 937 \f
 938 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
 939   "")
 940   (charset, code1, code2)
 941      Lisp_Object charset, code1, code2;
 942 {
 943   int charset_id, c1, c2;
 944
 945   CHECK_NUMBER (charset, 0);
 946   charset_id = XINT (charset);
 947   if (!CHARSET_DEFINED_P (charset_id))
 948     error ("Invalid charset ID: %d", XINT (charset));
 949
 950   if (NILP (code1))
 951     c1 = 0;
 952   else
 953     {
 954       CHECK_NUMBER (code1, 1);
 955       c1 = XINT (code1);
 956     }
 957   if (NILP (code2))
 958     c2 = 0;
 959   else
 960     {
 961       CHECK_NUMBER (code2, 2);
 962       c2 = XINT (code2);
 963     }
 964
 965   if (charset_id == CHARSET_ASCII)
 966     {
 967       if (c1 < 0 || c1 > 0x7F)
 968         goto invalid_code_posints;
 969       return make_number (c1);
 970     }
 971   else if (charset_id == CHARSET_8_BIT_CONTROL)
 972     {
 973       if (c1 < 0x80 || c1 > 0x9F)
 974         goto invalid_code_posints;
 975       return make_number (c1);
 976     }
 977   else if (charset_id == CHARSET_8_BIT_GRAPHIC)
 978     {
 979       if (c1 < 0xA0 || c1 > 0xFF)
 980         goto invalid_code_posints;
 981       return make_number (c1);
 982     }
 983   else if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
 984     goto invalid_code_posints;
 985   c1 &= 0x7F;
 986   c2 &= 0x7F;
 987   if (c1 == 0
 988       ? c2 != 0
 989       : (c2 == 0
 990          ? !CHAR_COMPONENTS_VALID_P (charset_id, c1, 0x20)
 991          : !CHAR_COMPONENTS_VALID_P (charset_id, c1, c2)))
 992     goto invalid_code_posints;
 993   return make_number (MAKE_CHAR (charset_id, c1, c2));
 994
 995  invalid_code_posints:
 996   error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
 997 }
 998
 999 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1000   "Return list of charset and one or two position-codes of CHAR.\n\
1001 If CHAR is invalid as a character code,\n\
1002 return a list of symbol `unknown' and CHAR.")
1003   (ch)
1004      Lisp_Object ch;
1005 {
1006   Lisp_Object val;
1007   int c, charset, c1, c2;
1008
1009   CHECK_NUMBER (ch, 0);
1010   c = XFASTINT (ch);
1011   if (!CHAR_VALID_P (c, 1))
1012     return Fcons (Qunknown, Fcons (ch, Qnil));
1013   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1014   return (c2 >= 0
1015           ? Fcons (CHARSET_SYMBOL (charset),
1016                    Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1017           : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1018 }
1019
1020 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1021   "Return charset of CHAR.")
1022   (ch)
1023      Lisp_Object ch;
1024 {
1025   CHECK_NUMBER (ch, 0);
1026
1027   return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1028 }
1029
1030 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1031   "Return charset of a character in the current buffer at position POS.\n\
1032 If POS is nil, it defauls to the current point.\n\
1033 If POS is out of range, the value is nil.")
1034   (pos)
1035      Lisp_Object pos;
1036 {
1037   Lisp_Object ch;
1038   int charset;
1039
1040   ch = Fchar_after (pos);
1041   if (! INTEGERP (ch))
1042     return ch;
1043   charset = CHAR_CHARSET (XINT (ch));
1044   return CHARSET_SYMBOL (charset);
1045 }
1046
1047 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1048   "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1049 \n\
1050 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1051 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1052 where as Emacs distinguishes them by charset symbol.\n\
1053 See the documentation of the function `charset-info' for the meanings of\n\
1054 DIMENSION, CHARS, and FINAL-CHAR.")
1055   (dimension, chars, final_char)
1056      Lisp_Object dimension, chars, final_char;
1057 {
1058   int charset;
1059
1060   CHECK_NUMBER (dimension, 0);
1061   CHECK_NUMBER (chars, 1);
1062   CHECK_NUMBER (final_char, 2);
1063
1064   if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1065     return Qnil;
1066   return CHARSET_SYMBOL (charset);
1067 }
1068
1069 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1070    generic character.  If GENERICP is zero, return nonzero iff C is a
1071    valid normal character.  Do not call this function directly,
1072    instead use macro CHAR_VALID_P.  */
1073 int
1074 char_valid_p (c, genericp)
1075      int c, genericp;
1076 {
1077   int charset, c1, c2;
1078
1079   if (c < 0 || c >= MAX_CHAR)
1080     return 0;
1081   if (SINGLE_BYTE_CHAR_P (c))
1082     return 1;
1083   SPLIT_CHAR (c, charset, c1, c2);
1084   if (genericp)
1085     {
1086       if (c1)
1087         {
1088           if (c2 <= 0) c2 = 0x20;
1089         }
1090       else
1091         {
1092           if (c2 <= 0) c1 = c2 = 0x20;
1093         }
1094     }
1095   return (CHARSET_DEFINED_P (charset)
1096           && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1097 }
1098
1099 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1100   "Return t if OBJECT is a valid normal character.\n\
1101 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1102 a valid generic character.")
1103   (object, genericp)
1104      Lisp_Object object, genericp;
1105 {
1106   if (! NATNUMP (object))
1107     return Qnil;
1108   return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1109 }
1110
1111 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1112        Sunibyte_char_to_multibyte, 1, 1, 0,
1113   "Convert the unibyte character CH to multibyte character.\n\
1114 The conversion is done based on `nonascii-translation-table' (which see)\n\
1115  or `nonascii-insert-offset' (which see).")
1116   (ch)
1117      Lisp_Object ch;
1118 {
1119   int c;
1120
1121   CHECK_NUMBER (ch, 0);
1122   c = XINT (ch);
1123   if (c < 0 || c >= 0400)
1124     error ("Invalid unibyte character: %d", c);
1125   c = unibyte_char_to_multibyte (c);
1126   if (c < 0)
1127     error ("Can't convert to multibyte character: %d", XINT (ch));
1128   return make_number (c);
1129 }
1130
1131 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1132        Smultibyte_char_to_unibyte, 1, 1, 0,
1133   "Convert the multibyte character CH to unibyte character.\n\
1134 The conversion is done based on `nonascii-translation-table' (which see)\n\
1135  or `nonascii-insert-offset' (which see).")
1136   (ch)
1137      Lisp_Object ch;
1138 {
1139   int c;
1140
1141   CHECK_NUMBER (ch, 0);
1142   c = XINT (ch);
1143   if (! CHAR_VALID_P (c, 0))
1144     error ("Invalid multibyte character: %d", c);
1145   c = multibyte_char_to_unibyte (c, Qnil);
1146   if (c < 0)
1147     error ("Can't convert to unibyte character: %d", XINT (ch));
1148   return make_number (c);
1149 }
1150
1151 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1152   "Return 1 regardless of the argument CHAR.\n\
1153 This is now an obsolete function.  We keep it just for backward compatibility.")
1154   (ch)
1155      Lisp_Object ch;
1156 {
1157   Lisp_Object val;
1158
1159   CHECK_NUMBER (ch, 0);
1160   return make_number (1);
1161 }
1162
1163 /* Return how many bytes C will occupy in a multibyte buffer.
1164    Don't call this function directly, instead use macro CHAR_BYTES.  */
1165 int
1166 char_bytes (c)
1167      int c;
1168 {
1169   int charset;
1170
1171   if (ASCII_BYTE_P (c) || (c & ~((1 << CHARACTERBITS) -1)))
1172     return 1;
1173   if (SINGLE_BYTE_CHAR_P (c) && c >= 0xA0)
1174     return 1;
1175
1176   charset = CHAR_CHARSET (c);
1177   return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1178 }
1179
1180 /* Return the width of character of which multi-byte form starts with
1181    C.  The width is measured by how many columns occupied on the
1182    screen when displayed in the current buffer.  */
1183
1184 #define ONE_BYTE_CHAR_WIDTH(c)                                          \
1185   (c < 0x20                                                             \
1186    ? (c == '\t'                                                         \
1187       ? XFASTINT (current_buffer->tab_width)                            \
1188       : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2)))   \
1189    : (c < 0x7f                                                          \
1190       ? 1                                                               \
1191       : (c == 0x7F                                                      \
1192          ? (NILP (current_buffer->ctl_arrow) ? 4 : 2)                   \
1193          : ((! NILP (current_buffer->enable_multibyte_characters)       \
1194              && BASE_LEADING_CODE_P (c))                                \
1195             ? WIDTH_BY_CHAR_HEAD (c)                                    \
1196             : 4))))
1197
1198 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1199   "Return width of CHAR when displayed in the current buffer.\n\
1200 The width is measured by how many columns it occupies on the screen.\n\
1201 Tab is taken to occupy `tab-width' columns.")
1202   (ch)
1203        Lisp_Object ch;
1204 {
1205   Lisp_Object val, disp;
1206   int c;
1207   struct Lisp_Char_Table *dp = buffer_display_table ();
1208
1209   CHECK_NUMBER (ch, 0);
1210
1211   c = XINT (ch);
1212
1213   /* Get the way the display table would display it.  */
1214   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1215
1216   if (VECTORP (disp))
1217     XSETINT (val, XVECTOR (disp)->size);
1218   else if (SINGLE_BYTE_CHAR_P (c))
1219     XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1220   else
1221     {
1222       int charset = CHAR_CHARSET (c);
1223
1224       XSETFASTINT (val, CHARSET_WIDTH (charset));
1225     }
1226   return val;
1227 }
1228
1229 /* Return width of string STR of length LEN when displayed in the
1230    current buffer.  The width is measured by how many columns it
1231    occupies on the screen.  */
1232
1233 int
1234 strwidth (str, len)
1235      unsigned char *str;
1236      int len;
1237 {
1238   unsigned char *endp = str + len;
1239   int width = 0;
1240   struct Lisp_Char_Table *dp = buffer_display_table ();
1241
1242   while (str < endp)
1243     {
1244       Lisp_Object disp;
1245       int thislen;
1246       int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1247
1248       /* Get the way the display table would display it.  */
1249       if (dp)
1250         disp = DISP_CHAR_VECTOR (dp, c);
1251       else
1252         disp = Qnil;
1253
1254       if (VECTORP (disp))
1255         width += XVECTOR (disp)->size;
1256       else
1257         width += ONE_BYTE_CHAR_WIDTH (*str);
1258
1259       str += thislen;
1260     }
1261   return width;
1262 }
1263
1264 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1265   "Return width of STRING when displayed in the current buffer.\n\
1266 Width is measured by how many columns it occupies on the screen.\n\
1267 When calculating width of a multibyte character in STRING,\n\
1268 only the base leading-code is considered; the validity of\n\
1269 the following bytes is not checked.  Tabs in STRING are always\n\
1270 taken to occupy `tab-width' columns.")
1271   (str)
1272      Lisp_Object str;
1273 {
1274   Lisp_Object val;
1275
1276   CHECK_STRING (str, 0);
1277   XSETFASTINT (val, strwidth (XSTRING (str)->data,
1278                               STRING_BYTES (XSTRING (str))));
1279   return val;
1280 }
1281
1282 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1283   "Return the direction of CHAR.\n\
1284 The returned value is 0 for left-to-right and 1 for right-to-left.")
1285   (ch)
1286      Lisp_Object ch;
1287 {
1288   int charset;
1289
1290   CHECK_NUMBER (ch, 0);
1291   charset = CHAR_CHARSET (XFASTINT (ch));
1292   if (!CHARSET_DEFINED_P (charset))
1293     invalid_character (XINT (ch));
1294   return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1295 }
1296
1297 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1298   "Return number of characters between BEG and END.")
1299   (beg, end)
1300      Lisp_Object beg, end;
1301 {
1302   int from, to;
1303
1304   CHECK_NUMBER_COERCE_MARKER (beg, 0);
1305   CHECK_NUMBER_COERCE_MARKER (end, 1);
1306
1307   from = min (XFASTINT (beg), XFASTINT (end));
1308   to = max (XFASTINT (beg), XFASTINT (end));
1309
1310   return make_number (to - from);
1311 }
1312
1313 /* Return the number of characters in the NBYTES bytes at PTR.
1314    This works by looking at the contents and checking for multibyte sequences.
1315    However, if the current buffer has enable-multibyte-characters = nil,
1316    we treat each byte as a character.  */
1317
1318 int
1319 chars_in_text (ptr, nbytes)
1320      unsigned char *ptr;
1321      int nbytes;
1322 {
1323   /* current_buffer is null at early stages of Emacs initialization.  */
1324   if (current_buffer == 0
1325       || NILP (current_buffer->enable_multibyte_characters))
1326     return nbytes;
1327
1328   return multibyte_chars_in_text (ptr, nbytes);
1329 }
1330
1331 /* Return the number of characters in the NBYTES bytes at PTR.
1332    This works by looking at the contents and checking for multibyte sequences.
1333    It ignores enable-multibyte-characters.  */
1334
1335 int
1336 multibyte_chars_in_text (ptr, nbytes)
1337      unsigned char *ptr;
1338      int nbytes;
1339 {
1340   unsigned char *endp;
1341   int chars, bytes;
1342
1343   endp = ptr + nbytes;
1344   chars = 0;
1345
1346   while (ptr < endp)
1347     {
1348       PARSE_MULTIBYTE_SEQ (ptr, endp - ptr, bytes);
1349       ptr += bytes;
1350       chars++;
1351     }
1352
1353   return chars;
1354 }
1355
1356 /* Parse unibyte text at STR of LEN bytes as a multibyte text, and
1357    count the numbers of characters and bytes in it.  On counting
1358    bytes, pay attention to that 8-bit characters in the range
1359    0x80..0x9F are represented by 2-byte in a multibyte text.  */
1360 void
1361 parse_str_as_multibyte (str, len, nchars, nbytes)
1362      unsigned char *str;
1363      int len, *nchars, *nbytes;
1364 {
1365   unsigned char *endp = str + len;
1366   int n, chars = 0, bytes = 0;
1367
1368   while (str < endp)
1369     {
1370       if (UNIBYTE_STR_AS_MULTIBYTE_P (str, endp - str, n))
1371         str += n, bytes += n;
1372       else
1373         str++, bytes += 2;
1374       chars++;
1375     }
1376   *nchars = chars;
1377   *nbytes = bytes;
1378   return;
1379 }
1380
1381 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
1382    It actually converts only 8-bit characters in the range 0x80..0x9F
1383    that don't contruct multibyte characters to multibyte forms.  If
1384    NCHARS is nonzero, set *NCHARS to the number of characters in the
1385    text.  It is assured that we can use LEN bytes at STR as a work
1386    area and that is enough.  Return the number of bytes of the
1387    resulting text.  */
1388
1389 int
1390 str_as_multibyte (str, len, nbytes, nchars)
1391      unsigned char *str;
1392      int len, nbytes, *nchars;
1393 {
1394   unsigned char *p = str, *endp = str + nbytes;
1395   unsigned char *to;
1396   int chars = 0;
1397   int n;
1398
1399   while (p < endp && UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1400     p += n, chars++;
1401   if (nchars)
1402     *nchars = chars;
1403   if (p == endp)
1404     return nbytes;
1405
1406   to = p;
1407   nbytes = endp - p;
1408   endp = str + len;
1409   safe_bcopy (p, endp - nbytes, nbytes);
1410   p = endp - nbytes;
1411   while (p < endp)
1412     {
1413       if (UNIBYTE_STR_AS_MULTIBYTE_P (p, endp - p, n))
1414         {
1415           while (n--)
1416             *to++ = *p++;
1417         }
1418       else
1419         {
1420           *to++ = LEADING_CODE_8_BIT_CONTROL;
1421           *to++ = *p++ + 0x20;
1422         }
1423       chars++;
1424     }
1425   if (nchars)
1426     *nchars = chars;
1427   return (to - str);
1428 }
1429
1430 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
1431    that contains the same single-byte characters.  It actually
1432    converts all 8-bit characters to multibyte forms.  It is assured
1433    that we can use LEN bytes at STR as a work area and that is
1434    enough.  */
1435
1436 int
1437 str_to_multibyte (str, len, bytes)
1438      unsigned char *str;
1439      int len, bytes;
1440 {
1441   unsigned char *p = str, *endp = str + bytes;
1442   unsigned char *to;
1443   int c;
1444
1445   while (p < endp && (*p < 0x80 || *p >= 0xA0)) p++;
1446   if (p == endp)
1447     return bytes;
1448   to = p;
1449   bytes = endp - p;
1450   endp = str + len;
1451   safe_bcopy (p, endp - bytes, bytes);
1452   p = endp - bytes;
1453   while (p < endp)
1454     {
1455       if (*p < 0x80 || *p >= 0xA0)
1456         *to++ = *p++;
1457       else
1458         *to++ = LEADING_CODE_8_BIT_CONTROL, *to++ = *p++ + 0x20;
1459     }
1460   return (to - str);
1461 }
1462
1463 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
1464    actually converts only 8-bit characters in the range 0x80..0x9F to
1465    unibyte forms.  */
1466
1467 int
1468 str_as_unibyte (str, bytes)
1469      unsigned char *str;
1470      int bytes;
1471 {
1472   unsigned char *p = str, *endp = str + bytes;
1473   unsigned char *to = str;
1474
1475   while (p < endp && *p != LEADING_CODE_8_BIT_CONTROL) p++;
1476   to = p;
1477   while (p < endp)
1478     {
1479       if (*p == LEADING_CODE_8_BIT_CONTROL)
1480         *to++ = *(p + 1) - 0x20, p += 2;
1481       else
1482         *to++ = *p++;
1483     }
1484   return (to - str);
1485 }
1486
1487 \f
1488 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1489   "Concatenate all the argument characters and make the result a string.")
1490   (n, args)
1491      int n;
1492      Lisp_Object *args;
1493 {
1494   int i;
1495   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1496   unsigned char *p = buf;
1497   int c;
1498
1499   for (i = 0; i < n; i++)
1500     {
1501       CHECK_NUMBER (args[i], 0);
1502       c = XINT (args[i]);
1503       p += CHAR_STRING (c, p);
1504     }
1505
1506   return make_string_from_bytes (buf, n, p - buf);
1507 }
1508
1509 #endif /* emacs */
1510 \f
1511 int
1512 charset_id_internal (charset_name)
1513      char *charset_name;
1514 {
1515   Lisp_Object val;
1516
1517   val= Fget (intern (charset_name), Qcharset);
1518   if (!VECTORP (val))
1519     error ("Charset %s is not defined", charset_name);
1520
1521   return (XINT (XVECTOR (val)->contents[0]));
1522 }
1523
1524 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1525        Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1526    ()
1527 {
1528   charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1529   charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1530   charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1531   charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1532   charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1533   charset_big5_1 = charset_id_internal ("chinese-big5-1");
1534   charset_big5_2 = charset_id_internal ("chinese-big5-2");
1535   return Qnil;
1536 }
1537
1538 void
1539 init_charset_once ()
1540 {
1541   int i, j, k;
1542
1543   staticpro (&Vcharset_table);
1544   staticpro (&Vcharset_symbol_table);
1545   staticpro (&Vgeneric_character_list);
1546
1547   /* This has to be done here, before we call Fmake_char_table.  */
1548   Qcharset_table = intern ("charset-table");
1549   staticpro (&Qcharset_table);
1550
1551   /* Intern this now in case it isn't already done.
1552      Setting this variable twice is harmless.
1553      But don't staticpro it here--that is done in alloc.c.  */
1554   Qchar_table_extra_slots = intern ("char-table-extra-slots");
1555
1556   /* Now we are ready to set up this property, so we can
1557      create the charset table.  */
1558   Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1559   Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1560
1561   Qunknown = intern ("unknown");
1562   staticpro (&Qunknown);
1563   Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1564                                         Qunknown);
1565
1566   /* Setup tables.  */
1567   for (i = 0; i < 2; i++)
1568     for (j = 0; j < 2; j++)
1569       for (k = 0; k < 128; k++)
1570         iso_charset_table [i][j][k] = -1;
1571
1572   for (i = 0; i < 256; i++)
1573     bytes_by_char_head[i] = 1;
1574   bytes_by_char_head[LEADING_CODE_PRIVATE_11] = 3;
1575   bytes_by_char_head[LEADING_CODE_PRIVATE_12] = 3;
1576   bytes_by_char_head[LEADING_CODE_PRIVATE_21] = 4;
1577   bytes_by_char_head[LEADING_CODE_PRIVATE_22] = 4;
1578
1579   for (i = 0; i < 128; i++)
1580     width_by_char_head[i] = 1;
1581   for (; i < 256; i++)
1582     width_by_char_head[i] = 4;
1583   width_by_char_head[LEADING_CODE_PRIVATE_11] = 1;
1584   width_by_char_head[LEADING_CODE_PRIVATE_12] = 2;
1585   width_by_char_head[LEADING_CODE_PRIVATE_21] = 1;
1586   width_by_char_head[LEADING_CODE_PRIVATE_22] = 2;
1587
1588   {
1589     Lisp_Object val;
1590
1591     val = Qnil;
1592     for (i = 0x81; i < 0x90; i++)
1593       val = Fcons (make_number ((i - 0x70) << 7), val);
1594     for (; i < 0x9A; i++)
1595       val = Fcons (make_number ((i - 0x8F) << 14), val);
1596     for (i = 0xA0; i < 0xF0; i++)
1597       val = Fcons (make_number ((i - 0x70) << 7), val);
1598     for (; i < 0xFF; i++)
1599       val = Fcons (make_number ((i - 0xE0) << 14), val);
1600     Vgeneric_character_list = Fnreverse (val);
1601   }
1602
1603   nonascii_insert_offset = 0;
1604   Vnonascii_translation_table = Qnil;
1605 }
1606
1607 #ifdef emacs
1608
1609 void
1610 syms_of_charset ()
1611 {
1612   Qcharset = intern ("charset");
1613   staticpro (&Qcharset);
1614
1615   Qascii = intern ("ascii");
1616   staticpro (&Qascii);
1617
1618   Qeight_bit_control = intern ("eight-bit-control");
1619   staticpro (&Qeight_bit_control);
1620
1621   Qeight_bit_graphic = intern ("eight-bit-graphic");
1622   staticpro (&Qeight_bit_graphic);
1623
1624   /* Define special charsets ascii, eight-bit-control, and
1625      eight-bit-graphic.  */
1626   update_charset_table (make_number (CHARSET_ASCII),
1627                         make_number (1), make_number (94),
1628                         make_number (1),
1629                         make_number (0),
1630                         make_number ('B'),
1631                         make_number (0),
1632                         build_string ("ASCII"),
1633                         Qnil,   /* same as above */
1634                         build_string ("ASCII (ISO646 IRV)"));
1635   CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1636   Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1637
1638   update_charset_table (make_number (CHARSET_8_BIT_CONTROL),
1639                         make_number (1), make_number (96),
1640                         make_number (4),
1641                         make_number (0),
1642                         make_number (-1),
1643                         make_number (-1),
1644                         build_string ("8-bit control code (0x80..0x9F)"),
1645                         Qnil,   /* same as above */
1646                         Qnil);  /* same as above */
1647   CHARSET_SYMBOL (CHARSET_8_BIT_CONTROL) = Qeight_bit_control;
1648   Fput (Qeight_bit_control, Qcharset,
1649         CHARSET_TABLE_ENTRY (CHARSET_8_BIT_CONTROL));
1650
1651   update_charset_table (make_number (CHARSET_8_BIT_GRAPHIC),
1652                         make_number (1), make_number (96),
1653                         make_number (4),
1654                         make_number (0),
1655                         make_number (-1),
1656                         make_number (-1),
1657                         build_string ("8-bit graphic char (0xA0..0xFF)"),
1658                         Qnil,   /* same as above */
1659                         Qnil);  /* same as above */
1660   CHARSET_SYMBOL (CHARSET_8_BIT_GRAPHIC) = Qeight_bit_graphic;
1661   Fput (Qeight_bit_graphic, Qcharset,
1662         CHARSET_TABLE_ENTRY (CHARSET_8_BIT_GRAPHIC));
1663
1664   Qauto_fill_chars = intern ("auto-fill-chars");
1665   staticpro (&Qauto_fill_chars);
1666   Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1667
1668   defsubr (&Sdefine_charset);
1669   defsubr (&Sgeneric_character_list);
1670   defsubr (&Sget_unused_iso_final_char);
1671   defsubr (&Sdeclare_equiv_charset);
1672   defsubr (&Sfind_charset_region);
1673   defsubr (&Sfind_charset_string);
1674   defsubr (&Smake_char_internal);
1675   defsubr (&Ssplit_char);
1676   defsubr (&Schar_charset);
1677   defsubr (&Scharset_after);
1678   defsubr (&Siso_charset);
1679   defsubr (&Schar_valid_p);
1680   defsubr (&Sunibyte_char_to_multibyte);
1681   defsubr (&Smultibyte_char_to_unibyte);
1682   defsubr (&Schar_bytes);
1683   defsubr (&Schar_width);
1684   defsubr (&Sstring_width);
1685   defsubr (&Schar_direction);
1686   defsubr (&Schars_in_region);
1687   defsubr (&Sstring);
1688   defsubr (&Ssetup_special_charsets);
1689
1690   DEFVAR_LISP ("charset-list", &Vcharset_list,
1691     "List of charsets ever defined.");
1692   Vcharset_list = Fcons (Qascii, Fcons (Qeight_bit_control,
1693                                         Fcons (Qeight_bit_graphic, Qnil)));
1694
1695   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
1696     "Vector of cons cell of a symbol and translation table ever defined.\n\
1697 An ID of a translation table is an index of this vector.");
1698   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1699
1700   DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1701     "Leading-code of private TYPE9N charset of column-width 1.");
1702   leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1703
1704   DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1705     "Leading-code of private TYPE9N charset of column-width 2.");
1706   leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1707
1708   DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1709     "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1710   leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1711
1712   DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1713     "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1714   leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1715
1716   DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1717     "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1718 This is used for converting unibyte text to multibyte,\n\
1719 and for inserting character codes specified by number.\n\n\
1720 This serves to convert a Latin-1 or similar 8-bit character code\n\
1721 to the corresponding Emacs multibyte character code.\n\
1722 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1723 for your choice of character set.\n\
1724 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1725   nonascii_insert_offset = 0;
1726
1727   DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1728     "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1729 This is used for converting unibyte text to multibyte,\n\
1730 and for inserting character codes specified by number.\n\n\
1731 Conversion is performed only when multibyte characters are enabled,\n\
1732 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1733 to the corresponding Emacs character code.\n\n\
1734 If this is nil, `nonascii-insert-offset' is used instead.\n\
1735 See also the docstring of `make-translation-table'.");
1736   Vnonascii_translation_table = Qnil;
1737
1738   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1739     "A char-table for characters which invoke auto-filling.\n\
1740 Such characters have value t in this table.");
1741   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1742   CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1743   CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1744 }
1745
1746 #endif /* emacs */