src/charset.c

   1 /* Basic multilingual character support.
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /* At first, see the document in `charset.h' to understand the code in
  23    this file.  */
  24
  25 #ifdef emacs
  26 #include <config.h>
  27 #endif
  28
  29 #include <stdio.h>
  30
  31 #ifdef emacs
  32
  33 #include <sys/types.h>
  34 #include "lisp.h"
  35 #include "buffer.h"
  36 #include "charset.h"
  37 #include "coding.h"
  38 #include "disptab.h"
  39
  40 #else  /* not emacs */
  41
  42 #include "mulelib.h"
  43
  44 #endif /* emacs */
  45
  46 Lisp_Object Qcharset, Qascii;
  47 Lisp_Object Qunknown;
  48
  49 /* Declaration of special leading-codes.  */
  50 int leading_code_private_11;    /* for private DIMENSION1 of 1-column */
  51 int leading_code_private_12;    /* for private DIMENSION1 of 2-column */
  52 int leading_code_private_21;    /* for private DIMENSION2 of 1-column */
  53 int leading_code_private_22;    /* for private DIMENSION2 of 2-column */
  54
  55 /* Declaration of special charsets.  */
  56 int charset_ascii;              /* ASCII */
  57 int charset_latin_iso8859_1;    /* ISO8859-1 (Latin-1) */
  58 int charset_jisx0208_1978;      /* JISX0208.1978 (Japanese Kanji old set) */
  59 int charset_jisx0208;           /* JISX0208.1983 (Japanese Kanji) */
  60 int charset_katakana_jisx0201;  /* JISX0201.Kana (Japanese Katakana) */
  61 int charset_latin_jisx0201;     /* JISX0201.Roman (Japanese Roman) */
  62 int charset_big5_1;             /* Big5 Level 1 (Chinese Traditional) */
  63 int charset_big5_2;             /* Big5 Level 2 (Chinese Traditional) */
  64
  65 Lisp_Object Qcharset_table;
  66
  67 /* A char-table containing information of each character set.  */
  68 Lisp_Object Vcharset_table;
  69
  70 /* A vector of charset symbol indexed by charset-id.  This is used
  71    only for returning charset symbol from C functions.  */
  72 Lisp_Object Vcharset_symbol_table;
  73
  74 /* A list of charset symbols ever defined.  */
  75 Lisp_Object Vcharset_list;
  76
  77 /* Vector of translation table ever defined.
  78    ID of a translation table is used to index this vector.  */
  79 Lisp_Object Vtranslation_table_vector;
  80
  81 /* A char-table for characters which may invoke auto-filling.  */
  82 Lisp_Object Vauto_fill_chars;
  83
  84 Lisp_Object Qauto_fill_chars;
  85
  86 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD.  */
  87 int bytes_by_char_head[256];
  88 int width_by_char_head[256];
  89
  90 /* Mapping table from ISO2022's charset (specified by DIMENSION,
  91    CHARS, and FINAL-CHAR) to Emacs' charset.  */
  92 int iso_charset_table[2][2][128];
  93
  94 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  95 unsigned char *_fetch_multibyte_char_p;
  96 int _fetch_multibyte_char_len;
  97
  98 /* Offset to add to a non-ASCII value when inserting it.  */
  99 int nonascii_insert_offset;
 100
 101 /* Translation table for converting non-ASCII unibyte characters
 102    to multibyte codes, or nil.  */
 103 Lisp_Object Vnonascii_translation_table;
 104
 105 /* List of all possible generic characters.  */
 106 Lisp_Object Vgeneric_character_list;
 107
 108 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
 109 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
 110 \f
 111 void
 112 invalid_character (c)
 113      int c;
 114 {
 115   error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
 116 }
 117
 118 /* Parse a multibyte character string STR of length LENGTH (>= 2) set
 119    BYTES to the length of actual multibyte sequence, CHARSET, C1, and
 120    C2 to such values that MAKE_CHAR can make the multibyte character
 121    from them.
 122
 123    It is assumed that *STR is one of base leading codes and the
 124    following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
 125
 126    This macro should be called only from SPLIT_MULTIBYTE_SEQ.  */
 127
 128 #define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2)        \
 129   do {                                                                  \
 130     (bytes) = 1;                                                        \
 131     (charset) = (str)[0];                                               \
 132     if ((charset) >= LEADING_CODE_PRIVATE_11                            \
 133         && (charset) <= LEADING_CODE_PRIVATE_22)                        \
 134       (charset) = (str)[(bytes)++];                                     \
 135     if ((bytes) < (length))                                             \
 136       {                                                                 \
 137         (c1) = (str)[(bytes)++] & 0x7F;                                 \
 138         if ((bytes) < (length))                                         \
 139           (c2) = (str)[(bytes)++] & 0x7F;                               \
 140         else                                                            \
 141           (c2) = -1;                                                    \
 142       }                                                                 \
 143     else                                                                \
 144       (c1) = (c2) = -1;                                                 \
 145   } while (0)
 146
 147 /* Parse string STR of length LENGTH and check if a multibyte
 148    characters is at STR.  Set BYTES to the actual length, CHARSET, C1,
 149    C2 to proper values for that character.  */
 150
 151 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)        \
 152   do {                                                                  \
 153     int i;                                                              \
 154     if (ASCII_BYTE_P ((str)[0]))                                        \
 155       i = 1;                                                            \
 156     else                                                                \
 157       for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++);       \
 158     if (i == 1)                                                         \
 159       (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ;         \
 160     else                                                                \
 161       {                                                                 \
 162         if (i > BYTES_BY_CHAR_HEAD ((str)[0]))                          \
 163           i = BYTES_BY_CHAR_HEAD ((str)[0]);                            \
 164         SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2);           \
 165       }                                                                 \
 166   } while (0)
 167
 168 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0.  */
 169 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2)        \
 170   (charset == CHARSET_ASCII                             \
 171    ? ((c1) >= 0 && (c1) <= 0x7F)                        \
 172    : (CHARSET_DIMENSION (charset) == 1                  \
 173       ? ((c1) >= 0x20 && (c1) <= 0x7F)                  \
 174       : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F)))
 175
 176 /* Store multi-byte form of the character C in STR.  The caller should
 177    allocate at least 4-byte area at STR in advance.  Returns the
 178    length of the multi-byte form.  If C is an invalid character code,
 179    signal an error.
 180
 181    Use macro `CHAR_STRING (C, STR)' instead of calling this function
 182    directly if C can be an ASCII character.  */
 183
 184 int
 185 char_to_string (c, str)
 186      int c;
 187      unsigned char *str;
 188 {
 189   unsigned char *p = str;
 190
 191   if (c & CHAR_MODIFIER_MASK)   /* This includes the case C is negative.  */
 192     {
 193       /* Multibyte character can't have a modifier bit.  */
 194       if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 195         invalid_character (c);
 196
 197       /* For Meta, Shift, and Control modifiers, we need special care.  */
 198       if (c & CHAR_META)
 199         {
 200           /* Move the meta bit to the right place for a string.  */
 201           c = (c & ~CHAR_META) | 0x80;
 202         }
 203       if (c & CHAR_SHIFT)
 204         {
 205           /* Shift modifier is valid only with [A-Za-z].  */
 206           if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 207             c &= ~CHAR_SHIFT;
 208           else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 209             c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 210         }
 211       if (c & CHAR_CTL)
 212         {
 213           /* Simulate the code in lread.c.  */
 214           /* Allow `\C- ' and `\C-?'.  */
 215           if (c == (CHAR_CTL | ' '))
 216             c = 0;
 217           else if (c == (CHAR_CTL | '?'))
 218             c = 127;
 219           /* ASCII control chars are made from letters (both cases),
 220              as well as the non-letters within 0100...0137.  */
 221           else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 222             c &= (037 | (~0177 & ~CHAR_CTL));
 223           else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 224             c &= (037 | (~0177 & ~CHAR_CTL));
 225         }
 226
 227       /* If C still has any modifier bits, it is an invalid character.  */
 228       if (c & CHAR_MODIFIER_MASK)
 229         invalid_character (c);
 230
 231       *p++ = c;
 232     }
 233   else if (c < MAX_CHAR)
 234     {
 235       int charset, c1, c2;
 236
 237       SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
 238
 239       if (charset >= LEADING_CODE_EXT_11)
 240         *p++ = (charset < LEADING_CODE_EXT_12
 241                 ? LEADING_CODE_PRIVATE_11
 242                 : (charset < LEADING_CODE_EXT_21
 243                    ? LEADING_CODE_PRIVATE_12
 244                    : (charset < LEADING_CODE_EXT_22
 245                       ? LEADING_CODE_PRIVATE_21
 246                       : LEADING_CODE_PRIVATE_22)));
 247       *p++ = charset;
 248       if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
 249         invalid_character (c);
 250       if (c1)
 251         {
 252           *p++ = c1 | 0x80;
 253           if (c2 > 0)
 254             *p++ = c2 | 0x80;
 255         }
 256     }
 257
 258   return (p -str);
 259 }
 260
 261 /* Return the non-ASCII character corresponding to multi-byte form at
 262    STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
 263    length of the multibyte form in *ACTUAL_LEN.
 264
 265    Use macros STRING_CHAR or STRING_CHAR_AND_LENGTH instead of calling
 266    this function directly if you want ot handle ASCII characters as
 267    well.  */
 268
 269 int
 270 string_to_char (str, len, actual_len)
 271      const unsigned char *str;
 272      int len, *actual_len;
 273 {
 274   int c, bytes, charset, c1, c2;
 275
 276   SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 277   c = MAKE_CHAR (charset, c1, c2);
 278   if (actual_len)
 279     *actual_len = bytes;
 280   return c;
 281 }
 282
 283 /* Return the length of the multi-byte form at string STR of length LEN.
 284    Use the macro MULTIBYTE_FORM_LENGTH instead.  */
 285 int
 286 multibyte_form_length (str, len)
 287      const unsigned char *str;
 288      int len;
 289 {
 290   int bytes;
 291
 292   PARSE_MULTIBYTE_SEQ (str, len, bytes);
 293   return bytes;
 294 }
 295
 296 /* Check multibyte form at string STR of length LEN and set variables
 297    pointed by CHARSET, C1, and C2 to charset and position codes of the
 298    character at STR, and return 0.  If there's no multibyte character,
 299    return -1.  This should be used only in the macro SPLIT_STRING
 300    which checks range of STR in advance.  */
 301
 302 int
 303 split_string (str, len, charset, c1, c2)
 304      const unsigned char *str;
 305      unsigned char *c1, *c2;
 306      int len, *charset;
 307 {
 308   register int bytes, cs, code1, code2 = -1;
 309
 310   SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
 311   if (cs == CHARSET_ASCII)
 312     return -1;
 313   *charset = cs;
 314   *c1 = code1;
 315   *c2 = code2;
 316   return 0;
 317 }
 318
 319 /* Return 1 iff character C has valid printable glyph.
 320    Use the macro CHAR_PRINTABLE_P instead.  */
 321 int
 322 char_printable_p (c)
 323      int c;
 324 {
 325   int charset, c1, c2, chars;
 326
 327   if (SINGLE_BYTE_CHAR_P (c))
 328     return 1;
 329   if (c >= MAX_CHAR)
 330     return 0;
 331
 332   SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
 333   if (! CHARSET_DEFINED_P (charset))
 334     return 0;
 335   if (CHARSET_CHARS (charset) == 94
 336       ? c1 <= 32 || c1 >= 127
 337       : c1 < 32)
 338     return 0;
 339   if (CHARSET_DIMENSION (charset) == 2
 340       && (CHARSET_CHARS (charset) == 94
 341           ? c2 <= 32 || c2 >= 127
 342           : c2 < 32))
 343     return 0;
 344   return 1;
 345 }
 346
 347 /* Translate character C by translation table TABLE.  If C
 348    is negative, translate a character specified by CHARSET, C1, and C2
 349    (C1 and C2 are code points of the character).  If no translation is
 350    found in TABLE, return C.  */
 351 int
 352 translate_char (table, c, charset, c1, c2)
 353      Lisp_Object table;
 354      int c, charset, c1, c2;
 355 {
 356   Lisp_Object ch;
 357   int alt_charset, alt_c1, alt_c2, dimension;
 358
 359   if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
 360   if (!CHAR_TABLE_P (table)
 361       || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
 362     return c;
 363
 364   SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
 365   dimension = CHARSET_DIMENSION (alt_charset);
 366   if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
 367     /* CH is not a generic character, just return it.  */
 368     return XFASTINT (ch);
 369
 370   /* Since CH is a generic character, we must return a specific
 371      charater which has the same position codes as C from CH.  */
 372   if (charset < 0)
 373     SPLIT_CHAR (c, charset, c1, c2);
 374   if (dimension != CHARSET_DIMENSION (charset))
 375     /* We can't make such a character because of dimension mismatch.  */
 376     return c;
 377   return MAKE_CHAR (alt_charset, c1, c2);
 378 }
 379
 380 /* Convert the unibyte character C to multibyte based on
 381    Vnonascii_translation_table or nonascii_insert_offset.  If they can't
 382    convert C to a valid multibyte character, convert it based on
 383    DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
 384
 385 int
 386 unibyte_char_to_multibyte (c)
 387      int c;
 388 {
 389   if (c < 0400 && c >= 0200)
 390     {
 391       int c_save = c;
 392
 393       if (! NILP (Vnonascii_translation_table))
 394         {
 395           c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
 396           if (c >= 0400 && ! char_valid_p (c, 0))
 397             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 398         }
 399       else if (c >= 0240 && nonascii_insert_offset > 0)
 400         {
 401           c += nonascii_insert_offset;
 402           if (c < 0400 || ! char_valid_p (c, 0))
 403             c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 404         }
 405       else if (c >= 0240)
 406         c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
 407     }
 408   return c;
 409 }
 410
 411
 412 /* Convert the multibyte character C to unibyte 8-bit character based
 413    on Vnonascii_translation_table or nonascii_insert_offset.  If
 414    REV_TBL is non-nil, it should be a reverse table of
 415    Vnonascii_translation_table, i.e. what given by:
 416      Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */
 417
 418 int
 419 multibyte_char_to_unibyte (c, rev_tbl)
 420      int c;
 421      Lisp_Object rev_tbl;
 422 {
 423   if (!SINGLE_BYTE_CHAR_P (c))
 424     {
 425       int c_save = c;
 426
 427       if (! CHAR_TABLE_P (rev_tbl)
 428           && CHAR_TABLE_P (Vnonascii_translation_table))
 429         rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
 430                                           make_number (0));
 431       if (CHAR_TABLE_P (rev_tbl))
 432         {
 433           Lisp_Object temp;
 434           temp = Faref (rev_tbl, make_number (c));
 435           if (INTEGERP (temp))
 436             c = XINT (temp);
 437           if (c >= 256)
 438             c = (c_save & 0177) + 0200;
 439         }
 440       else
 441         {
 442           if (nonascii_insert_offset > 0)
 443             c -= nonascii_insert_offset;
 444           if (c < 128 || c >= 256)
 445             c = (c_save & 0177) + 0200;
 446         }
 447     }
 448
 449   return c;
 450 }
 451
 452 \f
 453 /* Update the table Vcharset_table with the given arguments (see the
 454    document of `define-charset' for the meaning of each argument).
 455    Several other table contents are also updated.  The caller should
 456    check the validity of CHARSET-ID and the remaining arguments in
 457    advance.  */
 458
 459 void
 460 update_charset_table (charset_id, dimension, chars, width, direction,
 461                       iso_final_char, iso_graphic_plane,
 462                       short_name, long_name, description)
 463      Lisp_Object charset_id, dimension, chars, width, direction;
 464      Lisp_Object iso_final_char, iso_graphic_plane;
 465      Lisp_Object short_name, long_name, description;
 466 {
 467   int charset = XINT (charset_id);
 468   int bytes;
 469   unsigned char leading_code_base, leading_code_ext;
 470
 471   if (NILP (CHARSET_TABLE_ENTRY (charset)))
 472     CHARSET_TABLE_ENTRY (charset)
 473       = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
 474
 475   /* Get byte length of multibyte form, base leading-code, and
 476      extended leading-code of the charset.  See the comment under the
 477      title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
 478   bytes = XINT (dimension);
 479   if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 480     {
 481       /* Official charset, it doesn't have an extended leading-code.  */
 482       if (charset != CHARSET_ASCII)
 483         bytes += 1; /* For a base leading-code.  */
 484       leading_code_base = charset;
 485       leading_code_ext = 0;
 486     }
 487   else
 488     {
 489       /* Private charset.  */
 490       bytes += 2; /* For base and extended leading-codes.  */
 491       leading_code_base
 492         = (charset < LEADING_CODE_EXT_12
 493            ? LEADING_CODE_PRIVATE_11
 494            : (charset < LEADING_CODE_EXT_21
 495               ? LEADING_CODE_PRIVATE_12
 496               : (charset < LEADING_CODE_EXT_22
 497                  ? LEADING_CODE_PRIVATE_21
 498                  : LEADING_CODE_PRIVATE_22)));
 499       leading_code_ext = charset;
 500     }
 501
 502   if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
 503     error ("Invalid dimension for the charset-ID %d", charset);
 504
 505   CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
 506   CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
 507   CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
 508   CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
 509   CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
 510   CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
 511   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
 512     = make_number (leading_code_base);
 513   CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
 514     = make_number (leading_code_ext);
 515   CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
 516   CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
 517     = iso_graphic_plane;
 518   CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
 519   CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
 520   CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
 521   CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
 522
 523   {
 524     /* If we have already defined a charset which has the same
 525        DIMENSION, CHARS and ISO-FINAL-CHAR but the different
 526        DIRECTION, we must update the entry REVERSE-CHARSET of both
 527        charsets.  If there's no such charset, the value of the entry
 528        is set to nil.  */
 529     int i;
 530
 531     for (i = 0; i <= MAX_CHARSET; i++)
 532       if (!NILP (CHARSET_TABLE_ENTRY (i)))
 533         {
 534           if (CHARSET_DIMENSION (i) == XINT (dimension)
 535               && CHARSET_CHARS (i) == XINT (chars)
 536               && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
 537               && CHARSET_DIRECTION (i) != XINT (direction))
 538             {
 539               CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 540                 = make_number (i);
 541               CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
 542               break;
 543             }
 544         }
 545     if (i > MAX_CHARSET)
 546       /* No such a charset.  */
 547       CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
 548         = make_number (-1);
 549   }
 550
 551   if (charset != CHARSET_ASCII
 552       && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
 553     {
 554       width_by_char_head[leading_code_base] = XINT (width);
 555
 556       /* Update table emacs_code_class.  */
 557       emacs_code_class[charset] = (bytes == 2
 558                                    ? EMACS_leading_code_2
 559                                    : (bytes == 3
 560                                       ? EMACS_leading_code_3
 561                                       : EMACS_leading_code_4));
 562     }
 563
 564   /* Update table iso_charset_table.  */
 565   if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
 566     ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
 567 }
 568
 569 #ifdef emacs
 570
 571 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
 572    is invalid.  */
 573 int
 574 get_charset_id (charset_symbol)
 575      Lisp_Object charset_symbol;
 576 {
 577   Lisp_Object val;
 578   int charset;
 579
 580   return ((SYMBOLP (charset_symbol)
 581            && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
 582            && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
 583                CHARSET_VALID_P (charset)))
 584           ? charset : -1);
 585 }
 586
 587 /* Return an identification number for a new private charset of
 588    DIMENSION and WIDTH.  If there's no more room for the new charset,
 589    return 0.  */
 590 Lisp_Object
 591 get_new_private_charset_id (dimension, width)
 592      int dimension, width;
 593 {
 594   int charset, from, to;
 595
 596   if (dimension == 1)
 597     {
 598       if (width == 1)
 599         from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
 600       else
 601         from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
 602     }
 603   else
 604     {
 605       if (width == 1)
 606         from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
 607       else
 608         from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
 609     }
 610
 611   for (charset = from; charset < to; charset++)
 612     if (!CHARSET_DEFINED_P (charset)) break;
 613
 614   return make_number (charset < to ? charset : 0);
 615 }
 616
 617 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
 618   "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
 619 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
 620  treated as a private charset.\n\
 621 INFO-VECTOR is a vector of the format:\n\
 622    [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
 623     SHORT-NAME LONG-NAME DESCRIPTION]\n\
 624 The meanings of each elements is as follows:\n\
 625 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
 626 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
 627 WIDTH (integer) is the number of columns a character in the charset\n\
 628 occupies on the screen: one of 0, 1, and 2.\n\
 629 \n\
 630 DIRECTION (integer) is the rendering direction of characters in the\n\
 631 charset when rendering.  If 0, render from left to right, else\n\
 632 render from right to left.\n\
 633 \n\
 634 ISO-FINAL-CHAR (character) is the final character of the\n\
 635 corresponding ISO 2022 charset.\n\
 636 \n\
 637 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
 638 while encoding to variants of ISO 2022 coding system, one of the\n\
 639 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
 640 \n\
 641 SHORT-NAME (string) is the short name to refer to the charset.\n\
 642 \n\
 643 LONG-NAME (string) is the long name to refer to the charset.\n\
 644 \n\
 645 DESCRIPTION (string) is the description string of the charset.")
 646   (charset_id, charset_symbol, info_vector)
 647      Lisp_Object charset_id, charset_symbol, info_vector;
 648 {
 649   Lisp_Object *vec;
 650
 651   if (!NILP (charset_id))
 652     CHECK_NUMBER (charset_id, 0);
 653   CHECK_SYMBOL (charset_symbol, 1);
 654   CHECK_VECTOR (info_vector, 2);
 655
 656   if (! NILP (charset_id))
 657     {
 658       if (! CHARSET_VALID_P (XINT (charset_id)))
 659         error ("Invalid CHARSET: %d", XINT (charset_id));
 660       else if (CHARSET_DEFINED_P (XINT (charset_id)))
 661         error ("Already defined charset: %d", XINT (charset_id));
 662     }
 663
 664   vec = XVECTOR (info_vector)->contents;
 665   if (XVECTOR (info_vector)->size != 9
 666       || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
 667       || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
 668       || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
 669       || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
 670       || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
 671       || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
 672       || !STRINGP (vec[6])
 673       || !STRINGP (vec[7])
 674       || !STRINGP (vec[8]))
 675     error ("Invalid info-vector argument for defining charset %s",
 676            XSYMBOL (charset_symbol)->name->data);
 677
 678   if (NILP (charset_id))
 679     {
 680       charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
 681       if (XINT (charset_id) == 0)
 682         error ("There's no room for a new private charset %s",
 683                XSYMBOL (charset_symbol)->name->data);
 684     }
 685
 686   update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
 687                         vec[4], vec[5], vec[6], vec[7], vec[8]);
 688   Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
 689   CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
 690   Vcharset_list = Fcons (charset_symbol, Vcharset_list);
 691   return Qnil;
 692 }
 693
 694 DEFUN ("generic-character-list", Fgeneric_character_list,
 695        Sgeneric_character_list, 0, 0, 0,
 696   "Return a list of all possible generic characters.\n\
 697 It includes a generic character for a charset not yet defined.")
 698   ()
 699 {
 700   return Vgeneric_character_list;
 701 }
 702
 703 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
 704        Sget_unused_iso_final_char, 2, 2, 0,
 705   "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
 706 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
 707 CHARS is the number of characters in a dimension: 94 or 96.\n\
 708 \n\
 709 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
 710 If there's no unused final char for the specified kind of charset,\n\
 711 return nil.")
 712   (dimension, chars)
 713      Lisp_Object dimension, chars;
 714 {
 715   int final_char;
 716
 717   CHECK_NUMBER (dimension, 0);
 718   CHECK_NUMBER (chars, 1);
 719   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 720     error ("Invalid charset dimension %d, it should be 1 or 2",
 721            XINT (dimension));
 722   if (XINT (chars) != 94 && XINT (chars) != 96)
 723     error ("Invalid charset chars %d, it should be 94 or 96",
 724            XINT (chars));
 725   for (final_char = '0'; final_char <= '?'; final_char++)
 726     {
 727       if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
 728         break;
 729     }
 730   return (final_char <= '?' ? make_number (final_char) : Qnil);
 731 }
 732
 733 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
 734        4, 4, 0,
 735   "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
 736 CHARSET should be defined by `defined-charset' in advance.")
 737   (dimension, chars, final_char, charset_symbol)
 738      Lisp_Object dimension, chars, final_char, charset_symbol;
 739 {
 740   int charset;
 741
 742   CHECK_NUMBER (dimension, 0);
 743   CHECK_NUMBER (chars, 1);
 744   CHECK_NUMBER (final_char, 2);
 745   CHECK_SYMBOL (charset_symbol, 3);
 746
 747   if (XINT (dimension) != 1 && XINT (dimension) != 2)
 748     error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
 749   if (XINT (chars) != 94 && XINT (chars) != 96)
 750     error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
 751   if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
 752     error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
 753   if ((charset = get_charset_id (charset_symbol)) < 0)
 754     error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
 755
 756   ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
 757   return Qnil;
 758 }
 759
 760 /* Return number of different charsets in STR of length LEN.  In
 761    addition, for each found charset N, CHARSETS[N] is set 1.  The
 762    caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
 763    It may lookup a translation table TABLE if supplied.
 764
 765    If MULTIBYTE is zero, do not check multibyte characters, i.e. if
 766    any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
 767    8-bit codes are found CHARSET[1] is set to 1.  */
 768
 769 int
 770 find_charset_in_str (str, len, charsets, table, multibyte)
 771      unsigned char *str;
 772      int len, *charsets;
 773      Lisp_Object table;
 774      int multibyte;
 775 {
 776   register int num = 0, c;
 777
 778   if (! multibyte)
 779     {
 780       unsigned char *endp = str + len;
 781       int maskbits = 0;
 782
 783       while (str < endp && maskbits != 3)
 784         maskbits |=  (*str++ < 0x80 ? 1 : 2);
 785       if (maskbits & 1)
 786         {
 787           charsets[0] = 1;
 788           num++;
 789         }
 790       if (maskbits & 2)
 791         {
 792           charsets[1] = 1;
 793           num++;
 794         }
 795       return num;
 796     }
 797
 798   if (! CHAR_TABLE_P (table))
 799     table = Qnil;
 800
 801   while (len > 0)
 802     {
 803       int bytes, charset, c1, c2;
 804
 805       SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
 806
 807       if (! NILP (table))
 808         {
 809           int c1 = translate_char (table, -1, charset, c1, c2);
 810           if (c1 >= 0)
 811             charset = CHAR_CHARSET (c);
 812         }
 813
 814       if (!charsets[charset])
 815         {
 816           charsets[charset] = 1;
 817           num += 1;
 818         }
 819       str += bytes;
 820       len -= bytes;
 821     }
 822   return num;
 823 }
 824
 825 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
 826        2, 3, 0,
 827   "Return a list of charsets in the region between BEG and END.\n\
 828 BEG and END are buffer positions.\n\
 829 Optional arg TABLE if non-nil is a translation table to look up.\n\
 830 \n\
 831 If the region contains invalid multiybte characters,\n\
 832 `unknown' is included in the returned list.\n\
 833 \n\
 834 If the current buffer is unibyte, the returned list contains\n\
 835 `ascii' if any 7-bit characters are found,\n\
 836 and `unknown' if any 8-bit characters are found.")
 837   (beg, end, table)
 838      Lisp_Object beg, end, table;
 839 {
 840   int charsets[MAX_CHARSET + 1];
 841   int from, from_byte, to, stop, stop_byte, i;
 842   Lisp_Object val;
 843   int undefined;
 844   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
 845
 846   validate_region (&beg, &end);
 847   from = XFASTINT (beg);
 848   stop = to = XFASTINT (end);
 849
 850   if (from < GPT && GPT < to)
 851     {
 852       stop = GPT;
 853       stop_byte = GPT_BYTE;
 854     }
 855   else
 856     stop_byte = CHAR_TO_BYTE (stop);
 857
 858   from_byte = CHAR_TO_BYTE (from);
 859
 860   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 861   while (1)
 862     {
 863       find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
 864                            charsets, table, multibyte);
 865       if (stop < to)
 866         {
 867           from = stop, from_byte = stop_byte;
 868           stop = to, stop_byte = CHAR_TO_BYTE (stop);
 869         }
 870       else
 871         break;
 872     }
 873
 874   val = Qnil;
 875   undefined = 0;
 876   for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
 877     if (charsets[i])
 878       {
 879         if (CHARSET_DEFINED_P (i))
 880           val = Fcons (CHARSET_SYMBOL (i), val);
 881         else
 882           undefined = 1;
 883       }
 884   if (undefined)
 885     val = Fcons (Qunknown, val);
 886   return val;
 887 }
 888
 889 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
 890        1, 2, 0,
 891   "Return a list of charsets in STR.\n\
 892 Optional arg TABLE if non-nil is a translation table to look up.\n\
 893 \n\
 894 If the region contains invalid multiybte characters,\n\
 895 `unknown' is included in the returned list.\n\
 896 \n\
 897 If STR is unibyte, the returned list contains\n\
 898 `ascii' if any 7-bit characters are found,\n\
 899 and `unknown' if any 8-bit characters are found.")
 900   (str, table)
 901      Lisp_Object str, table;
 902 {
 903   int charsets[MAX_CHARSET + 1];
 904   int i;
 905   Lisp_Object val;
 906   int undefined;
 907   int multibyte;
 908
 909   CHECK_STRING (str, 0);
 910   multibyte = STRING_MULTIBYTE (str);
 911
 912   bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
 913   find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
 914                        charsets, table, multibyte);
 915   val = Qnil;
 916   undefined = 0;
 917   for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
 918     if (charsets[i])
 919       {
 920         if (CHARSET_DEFINED_P (i))
 921           val = Fcons (CHARSET_SYMBOL (i), val);
 922         else
 923           undefined = 1;
 924       }
 925   if (undefined)
 926     val = Fcons (Qunknown, val);
 927   return val;
 928 }
 929 \f
 930 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
 931   "")
 932   (charset, code1, code2)
 933      Lisp_Object charset, code1, code2;
 934 {
 935   int charset_id, c1, c2;
 936
 937   CHECK_NUMBER (charset, 0);
 938   charset_id = XINT (charset);
 939   if (!CHARSET_DEFINED_P (charset_id))
 940     error ("Invalid charset ID: %d", XINT (charset));
 941
 942   if (NILP (code1))
 943     c1 = 0;
 944   else
 945     {
 946       CHECK_NUMBER (code1, 1);
 947       c1 = XINT (code1);
 948     }
 949   if (NILP (code2))
 950     c2 = 0;
 951   else
 952     {
 953       CHECK_NUMBER (code2, 2);
 954       c2 = XINT (code2);
 955     }
 956
 957   if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
 958     error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
 959   c1 &= 0x7F;
 960   c2 &= 0x7F;
 961   if (c1 == 0
 962       ? c2 != 0
 963       : (c2 == 0
 964          ? !CHAR_COMPONENTS_VALID_P (charset, c1, 0x20)
 965          : !CHAR_COMPONENTS_VALID_P (charset, c1, c2)))
 966     error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
 967
 968   return make_number (MAKE_CHAR (charset_id, c1, c2));
 969 }
 970
 971 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
 972   "Return list of charset and one or two position-codes of CHAR.\n\
 973 If CHAR is invalid as a character code,\n\
 974 return a list of symbol `unknown' and CHAR.")
 975   (ch)
 976      Lisp_Object ch;
 977 {
 978   Lisp_Object val;
 979   int c, charset, c1, c2;
 980
 981   CHECK_NUMBER (ch, 0);
 982   c = XFASTINT (ch);
 983   if (!CHAR_VALID_P (c, 1))
 984     return Fcons (Qunknown, Fcons (ch, Qnil));
 985   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
 986   return (c2 >= 0
 987           ? Fcons (CHARSET_SYMBOL (charset),
 988                    Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
 989           : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
 990 }
 991
 992 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
 993   "Return charset of CHAR.")
 994   (ch)
 995      Lisp_Object ch;
 996 {
 997   CHECK_NUMBER (ch, 0);
 998
 999   return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1000 }
1001
1002 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1003   "Return charset of a character in the current buffer at position POS.\n\
1004 If POS is nil, it defauls to the current point.\n\
1005 If POS is out of range, the value is nil.")
1006   (pos)
1007      Lisp_Object pos;
1008 {
1009   register int pos_byte, bytes, charset, c1, c2;
1010   register unsigned char *p;
1011
1012   if (NILP (pos))
1013     pos_byte = PT_BYTE;
1014   else if (MARKERP (pos))
1015     {
1016       pos_byte = marker_byte_position (pos);
1017       if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1018         return Qnil;
1019     }
1020   else
1021     {
1022       CHECK_NUMBER (pos, 0);
1023       if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1024         return Qnil;
1025       pos_byte = CHAR_TO_BYTE (XINT (pos));
1026     }
1027   p = BYTE_POS_ADDR (pos_byte);
1028   if (BASE_LEADING_CODE_P (*p))
1029     {
1030       SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1031       if (charset < 0)
1032         charset = 1;
1033     }
1034   else
1035     charset = CHARSET_ASCII;
1036
1037   return CHARSET_SYMBOL (charset);
1038 }
1039
1040 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1041   "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1042 \n\
1043 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1044 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1045 where as Emacs distinguishes them by charset symbol.\n\
1046 See the documentation of the function `charset-info' for the meanings of\n\
1047 DIMENSION, CHARS, and FINAL-CHAR.")
1048   (dimension, chars, final_char)
1049      Lisp_Object dimension, chars, final_char;
1050 {
1051   int charset;
1052
1053   CHECK_NUMBER (dimension, 0);
1054   CHECK_NUMBER (chars, 1);
1055   CHECK_NUMBER (final_char, 2);
1056
1057   if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1058     return Qnil;
1059   return CHARSET_SYMBOL (charset);
1060 }
1061
1062 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1063    generic character.  If GENERICP is zero, return nonzero iff C is a
1064    valid normal character.  Do not call this function directly,
1065    instead use macro CHAR_VALID_P.  */
1066 int
1067 char_valid_p (c, genericp)
1068      int c, genericp;
1069 {
1070   int charset, c1, c2;
1071
1072   if (c < 0)
1073     return 0;
1074   if (SINGLE_BYTE_CHAR_P (c))
1075     return 1;
1076   SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
1077   if (genericp)
1078     {
1079       if (c1)
1080         {
1081           if (c2 <= 0) c2 = 0x20;
1082         }
1083       else
1084         {
1085           if (c2 <= 0) c1 = c2 = 0x20;
1086         }
1087     }
1088   return (CHARSET_DEFINED_P (charset)
1089           && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1090 }
1091
1092 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1093   "Return t if OBJECT is a valid normal character.\n\
1094 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1095 a valid generic character.")
1096   (object, genericp)
1097      Lisp_Object object, genericp;
1098 {
1099   if (! NATNUMP (object))
1100     return Qnil;
1101   return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1102 }
1103
1104 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1105        Sunibyte_char_to_multibyte, 1, 1, 0,
1106   "Convert the unibyte character CH to multibyte character.\n\
1107 The conversion is done based on `nonascii-translation-table' (which see)\n\
1108  or `nonascii-insert-offset' (which see).")
1109   (ch)
1110      Lisp_Object ch;
1111 {
1112   int c;
1113
1114   CHECK_NUMBER (ch, 0);
1115   c = XINT (ch);
1116   if (c < 0 || c >= 0400)
1117     error ("Invalid unibyte character: %d", c);
1118   c = unibyte_char_to_multibyte (c);
1119   if (c < 0)
1120     error ("Can't convert to multibyte character: %d", XINT (ch));
1121   return make_number (c);
1122 }
1123
1124 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1125        Smultibyte_char_to_unibyte, 1, 1, 0,
1126   "Convert the multibyte character CH to unibyte character.\n\
1127 The conversion is done based on `nonascii-translation-table' (which see)\n\
1128  or `nonascii-insert-offset' (which see).")
1129   (ch)
1130      Lisp_Object ch;
1131 {
1132   int c;
1133
1134   CHECK_NUMBER (ch, 0);
1135   c = XINT (ch);
1136   if (! CHAR_VALID_P (c, 0))
1137     error ("Invalid multibyte character: %d", c);
1138   c = multibyte_char_to_unibyte (c, Qnil);
1139   if (c < 0)
1140     error ("Can't convert to unibyte character: %d", XINT (ch));
1141   return make_number (c);
1142 }
1143
1144 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1145   "Return 1 regardless of the argument CHAR.\n\
1146 This is now an obsolete function.  We keep it just for backward compatibility.")
1147   (ch)
1148      Lisp_Object ch;
1149 {
1150   Lisp_Object val;
1151
1152   CHECK_NUMBER (ch, 0);
1153   return make_number (1);
1154 }
1155
1156 /* Return how many bytes C will occupy in a multibyte buffer.
1157    Don't call this function directly, instead use macro CHAR_BYTES.  */
1158 int
1159 char_bytes (c)
1160      int c;
1161 {
1162   int charset;
1163
1164   if (SINGLE_BYTE_CHAR_P (c) || (c & ~((1 << CHARACTERBITS) - 1)))
1165     return 1;
1166
1167   charset = CHAR_CHARSET (c);
1168   return (CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1);
1169 }
1170
1171 /* Return the width of character of which multi-byte form starts with
1172    C.  The width is measured by how many columns occupied on the
1173    screen when displayed in the current buffer.  */
1174
1175 #define ONE_BYTE_CHAR_WIDTH(c)                                          \
1176   (c < 0x20                                                             \
1177    ? (c == '\t'                                                         \
1178       ? XFASTINT (current_buffer->tab_width)                            \
1179       : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2)))   \
1180    : (c < 0x7f                                                          \
1181       ? 1                                                               \
1182       : (c == 0x7F                                                      \
1183          ? (NILP (current_buffer->ctl_arrow) ? 4 : 2)                   \
1184          : ((! NILP (current_buffer->enable_multibyte_characters)       \
1185              && BASE_LEADING_CODE_P (c))                                \
1186             ? WIDTH_BY_CHAR_HEAD (c)                                    \
1187             : 4))))
1188
1189 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1190   "Return width of CHAR when displayed in the current buffer.\n\
1191 The width is measured by how many columns it occupies on the screen.")
1192   (ch)
1193        Lisp_Object ch;
1194 {
1195   Lisp_Object val, disp;
1196   int c;
1197   struct Lisp_Char_Table *dp = buffer_display_table ();
1198
1199   CHECK_NUMBER (ch, 0);
1200
1201   c = XINT (ch);
1202
1203   /* Get the way the display table would display it.  */
1204   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1205
1206   if (VECTORP (disp))
1207     XSETINT (val, XVECTOR (disp)->size);
1208   else if (SINGLE_BYTE_CHAR_P (c))
1209     XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1210   else
1211     {
1212       int charset = CHAR_CHARSET (c);
1213
1214       XSETFASTINT (val, CHARSET_WIDTH (charset));
1215     }
1216   return val;
1217 }
1218
1219 /* Return width of string STR of length LEN when displayed in the
1220    current buffer.  The width is measured by how many columns it
1221    occupies on the screen.  */
1222
1223 int
1224 strwidth (str, len)
1225      unsigned char *str;
1226      int len;
1227 {
1228   unsigned char *endp = str + len;
1229   int width = 0;
1230   struct Lisp_Char_Table *dp = buffer_display_table ();
1231
1232   while (str < endp)
1233     {
1234       Lisp_Object disp;
1235       int thislen;
1236       int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1237
1238       /* Get the way the display table would display it.  */
1239       if (dp)
1240         disp = DISP_CHAR_VECTOR (dp, c);
1241       else
1242         disp = Qnil;
1243
1244       if (VECTORP (disp))
1245         width += XVECTOR (disp)->size;
1246       else
1247         width += ONE_BYTE_CHAR_WIDTH (*str);
1248
1249       str += thislen;
1250     }
1251   return width;
1252 }
1253
1254 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1255   "Return width of STRING when displayed in the current buffer.\n\
1256 Width is measured by how many columns it occupies on the screen.\n\
1257 When calculating width of a multibyte character in STRING,\n\
1258 only the base leading-code is considered; the validity of\n\
1259 the following bytes is not checked.")
1260   (str)
1261      Lisp_Object str;
1262 {
1263   Lisp_Object val;
1264
1265   CHECK_STRING (str, 0);
1266   XSETFASTINT (val, strwidth (XSTRING (str)->data,
1267                               STRING_BYTES (XSTRING (str))));
1268   return val;
1269 }
1270
1271 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1272   "Return the direction of CHAR.\n\
1273 The returned value is 0 for left-to-right and 1 for right-to-left.")
1274   (ch)
1275      Lisp_Object ch;
1276 {
1277   int charset;
1278
1279   CHECK_NUMBER (ch, 0);
1280   charset = CHAR_CHARSET (XFASTINT (ch));
1281   if (!CHARSET_DEFINED_P (charset))
1282     invalid_character (XINT (ch));
1283   return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1284 }
1285
1286 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1287   "Return number of characters between BEG and END.")
1288   (beg, end)
1289      Lisp_Object beg, end;
1290 {
1291   int from, to;
1292
1293   CHECK_NUMBER_COERCE_MARKER (beg, 0);
1294   CHECK_NUMBER_COERCE_MARKER (end, 1);
1295
1296   from = min (XFASTINT (beg), XFASTINT (end));
1297   to = max (XFASTINT (beg), XFASTINT (end));
1298
1299   return make_number (to - from);
1300 }
1301
1302 /* Return the number of characters in the NBYTES bytes at PTR.
1303    This works by looking at the contents and checking for multibyte sequences.
1304    However, if the current buffer has enable-multibyte-characters = nil,
1305    we treat each byte as a character.  */
1306
1307 int
1308 chars_in_text (ptr, nbytes)
1309      unsigned char *ptr;
1310      int nbytes;
1311 {
1312   /* current_buffer is null at early stages of Emacs initialization.  */
1313   if (current_buffer == 0
1314       || NILP (current_buffer->enable_multibyte_characters))
1315     return nbytes;
1316
1317   return multibyte_chars_in_text (ptr, nbytes);
1318 }
1319
1320 /* Return the number of characters in the NBYTES bytes at PTR.
1321    This works by looking at the contents and checking for multibyte sequences.
1322    It ignores enable-multibyte-characters.  */
1323
1324 int
1325 multibyte_chars_in_text (ptr, nbytes)
1326      unsigned char *ptr;
1327      int nbytes;
1328 {
1329   unsigned char *endp;
1330   int chars, bytes;
1331
1332   endp = ptr + nbytes;
1333   chars = 0;
1334
1335   while (ptr < endp)
1336     {
1337       if (BASE_LEADING_CODE_P (*ptr))
1338         {
1339           PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1340           ptr += bytes;
1341           nbytes -= bytes;
1342         }
1343       else
1344         {
1345           ptr++;
1346           nbytes--;
1347         }
1348       chars++;
1349     }
1350
1351   return chars;
1352 }
1353
1354 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1355   "Concatenate all the argument characters and make the result a string.")
1356   (n, args)
1357      int n;
1358      Lisp_Object *args;
1359 {
1360   int i;
1361   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
1362   unsigned char *p = buf;
1363   Lisp_Object val;
1364   int c;
1365
1366   for (i = 0; i < n; i++)
1367     {
1368       if (!INTEGERP (args[i]))
1369         CHECK_NUMBER (args[i], 0);
1370       c = XINT (args[i]);
1371       p += CHAR_STRING (c, p);
1372     }
1373
1374   /* Here, we can't use make_string_from_bytes because of byte
1375      combining problem.  */
1376   val = make_string (buf, p - buf);
1377   return val;
1378 }
1379
1380 #endif /* emacs */
1381 \f
1382 int
1383 charset_id_internal (charset_name)
1384      char *charset_name;
1385 {
1386   Lisp_Object val;
1387
1388   val= Fget (intern (charset_name), Qcharset);
1389   if (!VECTORP (val))
1390     error ("Charset %s is not defined", charset_name);
1391
1392   return (XINT (XVECTOR (val)->contents[0]));
1393 }
1394
1395 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1396        Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1397    ()
1398 {
1399   charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
1400   charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
1401   charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
1402   charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
1403   charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
1404   charset_big5_1 = charset_id_internal ("chinese-big5-1");
1405   charset_big5_2 = charset_id_internal ("chinese-big5-2");
1406   return Qnil;
1407 }
1408
1409 void
1410 init_charset_once ()
1411 {
1412   int i, j, k;
1413
1414   staticpro (&Vcharset_table);
1415   staticpro (&Vcharset_symbol_table);
1416   staticpro (&Vgeneric_character_list);
1417
1418   /* This has to be done here, before we call Fmake_char_table.  */
1419   Qcharset_table = intern ("charset-table");
1420   staticpro (&Qcharset_table);
1421
1422   /* Intern this now in case it isn't already done.
1423      Setting this variable twice is harmless.
1424      But don't staticpro it here--that is done in alloc.c.  */
1425   Qchar_table_extra_slots = intern ("char-table-extra-slots");
1426
1427   /* Now we are ready to set up this property, so we can
1428      create the charset table.  */
1429   Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
1430   Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
1431
1432   Qunknown = intern ("unknown");
1433   staticpro (&Qunknown);
1434   Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
1435                                         Qunknown);
1436
1437   /* Setup tables.  */
1438   for (i = 0; i < 2; i++)
1439     for (j = 0; j < 2; j++)
1440       for (k = 0; k < 128; k++)
1441         iso_charset_table [i][j][k] = -1;
1442
1443   for (i = 0; i < 256; i++)
1444     BYTES_BY_CHAR_HEAD (i) = 1;
1445   for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
1446        i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
1447     BYTES_BY_CHAR_HEAD (i) = 2;
1448   for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
1449        i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
1450     BYTES_BY_CHAR_HEAD (i) = 3;
1451   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
1452   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
1453   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
1454   BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
1455   /* The followings don't reflect the actual bytes, but just to tell
1456      that it is a start of a multibyte character.  */
1457   BYTES_BY_CHAR_HEAD (0x80) = 2;
1458   BYTES_BY_CHAR_HEAD (0x9E) = 2;
1459   BYTES_BY_CHAR_HEAD (0x9F) = 2;
1460
1461   for (i = 0; i < 128; i++)
1462     WIDTH_BY_CHAR_HEAD (i) = 1;
1463   for (; i < 256; i++)
1464     WIDTH_BY_CHAR_HEAD (i) = 4;
1465   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
1466   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
1467   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
1468   WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
1469
1470   {
1471     Lisp_Object val;
1472
1473     val = Qnil;
1474     for (i = 0x81; i < 0x90; i++)
1475       val = Fcons (make_number ((i - 0x70) << 7), val);
1476     for (; i < 0x9A; i++)
1477       val = Fcons (make_number ((i - 0x8F) << 14), val);
1478     for (i = 0xA0; i < 0xF0; i++)
1479       val = Fcons (make_number ((i - 0x70) << 7), val);
1480     for (; i < 0xFF; i++)
1481       val = Fcons (make_number ((i - 0xE0) << 14), val);
1482     Vgeneric_character_list = Fnreverse (val);
1483   }
1484
1485   nonascii_insert_offset = 0;
1486   Vnonascii_translation_table = Qnil;
1487 }
1488
1489 #ifdef emacs
1490
1491 void
1492 syms_of_charset ()
1493 {
1494   Qascii = intern ("ascii");
1495   staticpro (&Qascii);
1496
1497   Qcharset = intern ("charset");
1498   staticpro (&Qcharset);
1499
1500   /* Define ASCII charset now.  */
1501   update_charset_table (make_number (CHARSET_ASCII),
1502                         make_number (1), make_number (94),
1503                         make_number (1),
1504                         make_number (0),
1505                         make_number ('B'),
1506                         make_number (0),
1507                         build_string ("ASCII"),
1508                         build_string ("ASCII"),
1509                         build_string ("ASCII (ISO646 IRV)"));
1510   CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
1511   Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
1512
1513   Qauto_fill_chars = intern ("auto-fill-chars");
1514   staticpro (&Qauto_fill_chars);
1515   Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
1516
1517   defsubr (&Sdefine_charset);
1518   defsubr (&Sgeneric_character_list);
1519   defsubr (&Sget_unused_iso_final_char);
1520   defsubr (&Sdeclare_equiv_charset);
1521   defsubr (&Sfind_charset_region);
1522   defsubr (&Sfind_charset_string);
1523   defsubr (&Smake_char_internal);
1524   defsubr (&Ssplit_char);
1525   defsubr (&Schar_charset);
1526   defsubr (&Scharset_after);
1527   defsubr (&Siso_charset);
1528   defsubr (&Schar_valid_p);
1529   defsubr (&Sunibyte_char_to_multibyte);
1530   defsubr (&Smultibyte_char_to_unibyte);
1531   defsubr (&Schar_bytes);
1532   defsubr (&Schar_width);
1533   defsubr (&Sstring_width);
1534   defsubr (&Schar_direction);
1535   defsubr (&Schars_in_region);
1536   defsubr (&Sstring);
1537   defsubr (&Ssetup_special_charsets);
1538
1539   DEFVAR_LISP ("charset-list", &Vcharset_list,
1540     "List of charsets ever defined.");
1541   Vcharset_list = Fcons (Qascii, Qnil);
1542
1543   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
1544     "Vector of cons cell of a symbol and translation table ever defined.\n\
1545 An ID of a translation table is an index of this vector.");
1546   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1547
1548   DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
1549     "Leading-code of private TYPE9N charset of column-width 1.");
1550   leading_code_private_11 = LEADING_CODE_PRIVATE_11;
1551
1552   DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
1553     "Leading-code of private TYPE9N charset of column-width 2.");
1554   leading_code_private_12 = LEADING_CODE_PRIVATE_12;
1555
1556   DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
1557     "Leading-code of private TYPE9Nx9N charset of column-width 1.");
1558   leading_code_private_21 = LEADING_CODE_PRIVATE_21;
1559
1560   DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
1561     "Leading-code of private TYPE9Nx9N charset of column-width 2.");
1562   leading_code_private_22 = LEADING_CODE_PRIVATE_22;
1563
1564   DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
1565     "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
1566 This is used for converting unibyte text to multibyte,\n\
1567 and for inserting character codes specified by number.\n\n\
1568 This serves to convert a Latin-1 or similar 8-bit character code\n\
1569 to the corresponding Emacs multibyte character code.\n\
1570 Typically the value should be (- (make-char CHARSET 0) 128),\n\
1571 for your choice of character set.\n\
1572 If `nonascii-translation-table' is non-nil, it overrides this variable.");
1573   nonascii_insert_offset = 0;
1574
1575   DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
1576     "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
1577 This is used for converting unibyte text to multibyte,\n\
1578 and for inserting character codes specified by number.\n\n\
1579 Conversion is performed only when multibyte characters are enabled,\n\
1580 and it serves to convert a Latin-1 or similar 8-bit character code\n\
1581 to the corresponding Emacs character code.\n\n\
1582 If this is nil, `nonascii-insert-offset' is used instead.\n\
1583 See also the docstring of `make-translation-table'.");
1584   Vnonascii_translation_table = Qnil;
1585
1586   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1587     "A char-table for characters which invoke auto-filling.\n\
1588 Such characters has value t in this table.");
1589   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1590   CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
1591   CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
1592 }
1593
1594 #endif /* emacs */